From 8a53d576e685b700c498d50588b4c0224711bed2 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 13 Mar 2024 17:10:20 +0200
Subject: [PATCH 01/43] fix(metrics): time individual layer flush operations
 (#7109)

Currently, the flushing operation could flush multiple frozen layers to
the disk and store the aggregate time in the histogram. The result is a
bimodal distribution with short and over 1000-second flushes. Change it
so that we record how long one layer flush takes.
---
 pageserver/src/tenant/timeline.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a733a3b1a7..f10df19b7b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2967,7 +2967,6 @@ impl Timeline {
             }
 
             trace!("waking up");
-            let timer = self.metrics.flush_time_histo.start_timer();
             let flush_counter = *layer_flush_start_rx.borrow();
             let result = loop {
                 if self.cancel.is_cancelled() {
@@ -2978,6 +2977,8 @@ impl Timeline {
                     return;
                 }
 
+                let timer = self.metrics.flush_time_histo.start_timer();
+
                 let layer_to_flush = {
                     let guard = self.layers.read().await;
                     guard.layer_map().frozen_layers.front().cloned()
@@ -2999,13 +3000,12 @@ impl Timeline {
                         break err;
                     }
                 }
+                timer.stop_and_record();
             };
             // Notify any listeners that we're done
             let _ = self
                 .layer_flush_done_tx
                 .send_replace((flush_counter, result));
-
-            timer.stop_and_record();
         }
     }
 
@@ -3073,6 +3073,7 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> Result<(), FlushLayerError> {
         debug_assert_current_span_has_tenant_and_timeline_id();
+
         // As a special case, when we have just imported an image into the repository,
         // instead of writing out a L0 delta layer, we directly write out image layer
         // files instead. This is possible as long as *all* the data imported into the

From 5309711691325c274fc34994c240feaa529cbef5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:30:29 +0100
Subject: [PATCH 02/43] Make tenant_id in TenantLocationConfigRequest optional
 (#7055)

The `tenant_id` in `TenantLocationConfigRequest` in the
`location_config` endpoint was only used in the storage
controller/attachment service, and there it was only used for assertions
and the creation part.
---
 control_plane/attachment_service/src/http.rs    |  6 +++---
 control_plane/attachment_service/src/service.rs | 13 +++++++------
 libs/pageserver_api/src/models.rs               |  2 +-
 pageserver/client/src/mgmt_api.rs               |  2 +-
 pageserver/src/http/openapi_spec.yml            |  7 ++++---
 5 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 27ba5bdb65..515c287ea9 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -174,14 +174,14 @@ async fn handle_tenant_location_config(
     service: Arc<Service>,
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
     let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
     json_response(
         StatusCode::OK,
         service
-            .tenant_location_config(tenant_id, config_req)
+            .tenant_location_config(tenant_shard_id, config_req)
             .await?,
     )
 }
@@ -587,7 +587,7 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_id/config", |r| {
             tenant_service_handler(r, handle_tenant_config_get)
         })
-        .put("/v1/tenant/:tenant_id/location_config", |r| {
+        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
             tenant_service_handler(r, handle_tenant_location_config)
         })
         .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index ea301d0372..1c4ede3d9d 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1262,6 +1262,7 @@ impl Service {
         let mut updates = Vec::new();
         let mut locked = self.inner.write().unwrap();
         let (nodes, tenants, _scheduler) = locked.parts_mut();
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
 
         // Use location config mode as an indicator of policy.
         let placement_policy = match req.config.mode {
@@ -1326,12 +1327,10 @@ impl Service {
             TenantCreateOrUpdate::Create(
                 // Synthesize a creation request
                 TenantCreateRequest {
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    new_tenant_id: tenant_shard_id,
                     generation,
                     shard_parameters: ShardParameters {
-                        // Must preserve the incoming shard_count do distinguish unsharded (0)
-                        // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
-                        count: req.tenant_id.shard_count,
+                        count: tenant_shard_id.shard_count,
                         // We only import un-sharded or single-sharded tenants, so stripe
                         // size can be made up arbitrarily here.
                         stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
@@ -1360,15 +1359,17 @@ impl Service {
     /// - Call with mode Detached to switch to PolicyMode::Detached
     pub(crate) async fn tenant_location_config(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         req: TenantLocationConfigRequest,
     ) -> Result<TenantLocationConfigResponse, ApiError> {
-        if !req.tenant_id.is_unsharded() {
+        if !tenant_shard_id.is_unsharded() {
             return Err(ApiError::BadRequest(anyhow::anyhow!(
                 "This API is for importing single-sharded or unsharded tenants"
             )));
         }
 
+        let tenant_id = tenant_shard_id.tenant_id;
+
         // First check if this is a creation or an update
         let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index a96cc09158..3aa84f8903 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -426,7 +426,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    pub tenant_id: TenantShardId,
+    pub tenant_id: Option<TenantShardId>,
     #[serde(flatten)]
     pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 732eb951c9..2f22ebd54d 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -257,7 +257,7 @@ impl Client {
         lazy: bool,
     ) -> Result<()> {
         let req_body = TenantLocationConfigRequest {
-            tenant_id: tenant_shard_id,
+            tenant_id: Some(tenant_shard_id),
             config,
         };
 
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 6a070e2135..4823710fb5 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -567,9 +567,9 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ServiceUnavailableError"
-  /v1/tenant/{tenant_id}/location_config:
+  /v1/tenant/{tenant_shard_id}/location_config:
     parameters:
-      - name: tenant_id
+      - name: tenant_shard_id
         in: path
         required: true
         schema:
@@ -1367,10 +1367,11 @@ components:
     TenantLocationConfigRequest:
       type: object
       required:
-        - tenant_id
+        - mode
       properties:
         tenant_id:
           type: string
+          description: Not used, scheduled for removal.
         mode:
           type: string
           enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]

From 69338e53e3628cd0133b27d1c079f9deeaaea725 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 13 Mar 2024 18:49:17 +0100
Subject: [PATCH 03/43] throttling: fixup interactions with
 Timeline::get_vectored (#7089)

## Problem

Before this PR, `Timeline::get_vectored` would be throttled twice if the
sequential option was enabled or if validation was enabled.

Also, `pageserver_get_vectored_seconds` included the time spent in the
throttle, which turns out to be undesirable for what we use that metric
for.

## Summary of changes

Double-throttle:

* Add `Timeline::get0` method which is unthrottled.
* Use that method from within the `Timeline::get_vectored` code path.

Metric:

* return throttled time from `throttle()` method
* deduct the value from the observed time
* globally rate-limited logging of duration subtraction errors, like in
all other places that do the throttled-time deduction from observations
---
 pageserver/src/bin/pageserver.rs  |  2 ++
 pageserver/src/metrics.rs         |  2 +-
 pageserver/src/tenant/throttle.rs |  7 ++--
 pageserver/src/tenant/timeline.rs | 59 +++++++++++++++++++++++++------
 4 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 2f172bd384..59750897ff 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,3 +1,5 @@
+#![recursion_limit = "300"]
+
 //! Main entry point for the Page Server executable.
 
 use std::env::{var, VarError};
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 814b3e1f96..03537ddb05 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -167,7 +167,7 @@ impl GetVectoredLatency {
 pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
     let inner = register_histogram_vec!(
         "pageserver_get_vectored_seconds",
-        "Time spent in get_vectored",
+        "Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
         &["task_kind"],
         CRITICAL_OP_BUCKETS.into(),
     )
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index 280773e9c3..f3f3d5e3ae 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -130,10 +130,10 @@ where
         self.inner.load().config.steady_rps()
     }
 
-    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
+    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
         let inner = self.inner.load_full(); // clones the `Inner` Arc
         if !inner.task_kinds.contains(ctx.task_kind()) {
-            return;
+            return None;
         };
         let start = std::time::Instant::now();
         let mut did_throttle = false;
@@ -170,6 +170,9 @@ where
                     });
                 }
             }
+            Some(wait_time)
+        } else {
+            None
         }
     }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f10df19b7b..d507a19de9 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -634,6 +634,8 @@ impl Timeline {
     /// If a remote layer file is needed, it is downloaded as part of this
     /// call.
     ///
+    /// This method enforces [`Self::timeline_get_throttle`] internally.
+    ///
     /// NOTE: It is considered an error to 'get' a key that doesn't exist. The
     /// abstraction above this needs to store suitable metadata to track what
     /// data exists with what keys, in separate metadata entries. If a
@@ -644,18 +646,27 @@ impl Timeline {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
+    #[inline(always)]
     pub(crate) async fn get(
         &self,
         key: Key,
         lsn: Lsn,
         ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        self.timeline_get_throttle.throttle(ctx, 1).await;
+        self.get_impl(key, lsn, ctx).await
+    }
+    /// Not subject to [`Self::timeline_get_throttle`].
+    async fn get_impl(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
         if !lsn.is_valid() {
             return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
         }
 
-        self.timeline_get_throttle.throttle(ctx, 1).await;
-
         // This check is debug-only because of the cost of hashing, and because it's a double-check: we
         // already checked the key against the shard_identity when looking up the Timeline from
         // page_service.
@@ -752,10 +763,6 @@ impl Timeline {
             return Err(GetVectoredError::Oversized(key_count));
         }
 
-        self.timeline_get_throttle
-            .throttle(ctx, key_count as usize)
-            .await;
-
         for range in &keyspace.ranges {
             let mut key = range.start;
             while key != range.end {
@@ -772,11 +779,18 @@ impl Timeline {
             self.conf.get_vectored_impl
         );
 
-        let _timer = crate::metrics::GET_VECTORED_LATENCY
+        let start = crate::metrics::GET_VECTORED_LATENCY
             .for_task_kind(ctx.task_kind())
-            .map(|t| t.start_timer());
+            .map(|metric| (metric, Instant::now()));
 
-        match self.conf.get_vectored_impl {
+        // start counting after throttle so that throttle time
+        // is always less than observation time
+        let throttled = self
+            .timeline_get_throttle
+            .throttle(ctx, key_count as usize)
+            .await;
+
+        let res = match self.conf.get_vectored_impl {
             GetVectoredImpl::Sequential => {
                 self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
             }
@@ -790,9 +804,33 @@ impl Timeline {
 
                 vectored_res
             }
+        };
+
+        if let Some((metric, start)) = start {
+            let elapsed = start.elapsed();
+            let ex_throttled = if let Some(throttled) = throttled {
+                elapsed.checked_sub(throttled)
+            } else {
+                Some(elapsed)
+            };
+
+            if let Some(ex_throttled) = ex_throttled {
+                metric.observe(ex_throttled.as_secs_f64());
+            } else {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    warn!("error deducting time spent throttled; this message is logged at a global rate limit");
+                });
+            }
         }
+
+        res
     }
 
+    /// Not subject to [`Self::timeline_get_throttle`].
     pub(super) async fn get_vectored_sequential_impl(
         &self,
         keyspace: KeySpace,
@@ -803,7 +841,7 @@ impl Timeline {
         for range in keyspace.ranges {
             let mut key = range.start;
             while key != range.end {
-                let block = self.get(key, lsn, ctx).await;
+                let block = self.get_impl(key, lsn, ctx).await;
 
                 use PageReconstructError::*;
                 match block {
@@ -853,6 +891,7 @@ impl Timeline {
         Ok(results)
     }
 
+    /// Not subject to [`Self::timeline_get_throttle`].
     pub(super) async fn validate_get_vectored_impl(
         &self,
         vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,

From 3bd6551b36be636c7497ee774c65718320093bc3 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 14 Mar 2024 08:20:56 +0000
Subject: [PATCH 04/43] proxy http cancellation safety (#7117)

## Problem

hyper auto-cancels the request futures on connection close.
`sql_over_http::handle` is not 'drop cancel safe', so we need to do some
other work to make sure connections are queries in the right way.

## Summary of changes

1. tokio::spawn the request handler to resolve the initial cancel-safety
issue
2. share a cancellation token, and cancel it when the request `Service`
is dropped.
3. Add a new log span to be able to track the HTTP connection lifecycle.
---
 proxy/src/protocol2.rs                | 18 ++++++-
 proxy/src/serverless.rs               | 74 +++++++++++++++++++--------
 proxy/src/serverless/sql_over_http.rs |  2 +-
 proxy/src/serverless/tls_listener.rs  | 29 ++++-------
 test_runner/fixtures/neon_fixtures.py |  2 +
 test_runner/regress/test_proxy.py     | 36 +++++++++++++
 6 files changed, 120 insertions(+), 41 deletions(-)

diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs
index f476cb9b37..700c8c8681 100644
--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -341,7 +341,14 @@ impl Accept for ProxyProtocolAccept {
         cx: &mut Context<'_>,
     ) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
         let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
-        tracing::info!(protocol = self.protocol, "accepted new TCP connection");
+
+        let conn_id = uuid::Uuid::new_v4();
+        let span = tracing::info_span!("http_conn", ?conn_id);
+        {
+            let _enter = span.enter();
+            tracing::info!("accepted new TCP connection");
+        }
+
         let Some(conn) = conn else {
             return Poll::Ready(None);
         };
@@ -354,6 +361,7 @@ impl Accept for ProxyProtocolAccept {
                     .with_label_values(&[self.protocol])
                     .guard(),
             )),
+            span,
         })))
     }
 }
@@ -364,6 +372,14 @@ pin_project! {
         pub inner: T,
         pub connection_id: Uuid,
         pub gauge: Mutex<Option<IntCounterPairGuard>>,
+        pub span: tracing::Span,
+    }
+
+    impl<S> PinnedDrop for WithConnectionGuard<S> {
+        fn drop(this: Pin<&mut Self>) {
+            let _enter = this.span.enter();
+            tracing::info!("HTTP connection closed")
+        }
     }
 }
 
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 68f68eaba1..be9f90acde 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -19,6 +19,7 @@ use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
+use tracing::instrument::Instrumented;
 
 use crate::context::RequestMonitoring;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
@@ -30,13 +31,12 @@ use hyper::{
     Body, Method, Request, Response,
 };
 
-use std::convert::Infallible;
 use std::net::IpAddr;
 use std::sync::Arc;
 use std::task::Poll;
 use tls_listener::TlsListener;
 use tokio::net::TcpListener;
-use tokio_util::sync::CancellationToken;
+use tokio_util::sync::{CancellationToken, DropGuard};
 use tracing::{error, info, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};
 
@@ -100,12 +100,7 @@ pub async fn task_main(
     let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
     ws_connections.close(); // allows `ws_connections.wait to complete`
 
-    let tls_listener = TlsListener::new(
-        tls_acceptor,
-        addr_incoming,
-        "http",
-        config.handshake_timeout,
-    );
+    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout);
 
     let make_svc = hyper::service::make_service_fn(
         |stream: &tokio_rustls::server::TlsStream<
@@ -121,6 +116,11 @@ pub async fn task_main(
                 .take()
                 .expect("gauge should be set on connection start");
 
+            // Cancel all current inflight HTTP requests if the HTTP connection is closed.
+            let http_cancellation_token = CancellationToken::new();
+            let cancel_connection = http_cancellation_token.clone().drop_guard();
+
+            let span = conn.span.clone();
             let client_addr = conn.inner.client_addr();
             let remote_addr = conn.inner.inner.remote_addr();
             let backend = backend.clone();
@@ -136,27 +136,43 @@ pub async fn task_main(
                 Ok(MetricService::new(
                     hyper::service::service_fn(move |req: Request<Body>| {
                         let backend = backend.clone();
-                        let ws_connections = ws_connections.clone();
+                        let ws_connections2 = ws_connections.clone();
                         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
                         let cancellation_handler = cancellation_handler.clone();
+                        let http_cancellation_token = http_cancellation_token.child_token();
 
-                        async move {
-                            Ok::<_, Infallible>(
-                                request_handler(
+                        // `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
+                        // By spawning the future, we ensure it never gets cancelled until it decides to.
+                        ws_connections.spawn(
+                            async move {
+                                // Cancel the current inflight HTTP request if the requets stream is closed.
+                                // This is slightly different to `_cancel_connection` in that
+                                // h2 can cancel individual requests with a `RST_STREAM`.
+                                let _cancel_session = http_cancellation_token.clone().drop_guard();
+
+                                let res = request_handler(
                                     req,
                                     config,
                                     backend,
-                                    ws_connections,
+                                    ws_connections2,
                                     cancellation_handler,
                                     peer_addr.ip(),
                                     endpoint_rate_limiter,
+                                    http_cancellation_token,
                                 )
                                 .await
-                                .map_or_else(|e| e.into_response(), |r| r),
-                            )
-                        }
+                                .map_or_else(|e| e.into_response(), |r| r);
+
+                                _cancel_session.disarm();
+
+                                res
+                            }
+                            .in_current_span(),
+                        )
                     }),
                     gauge,
+                    cancel_connection,
+                    span,
                 ))
             }
         },
@@ -176,11 +192,23 @@ pub async fn task_main(
 struct MetricService<S> {
     inner: S,
     _gauge: IntCounterPairGuard,
+    _cancel: DropGuard,
+    span: tracing::Span,
 }
 
 impl<S> MetricService<S> {
-    fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService<S> {
-        MetricService { inner, _gauge }
+    fn new(
+        inner: S,
+        _gauge: IntCounterPairGuard,
+        _cancel: DropGuard,
+        span: tracing::Span,
+    ) -> MetricService<S> {
+        MetricService {
+            inner,
+            _gauge,
+            _cancel,
+            span,
+        }
     }
 }
 
@@ -190,14 +218,16 @@ where
 {
     type Response = S::Response;
     type Error = S::Error;
-    type Future = S::Future;
+    type Future = Instrumented<S::Future>;
 
     fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
         self.inner.poll_ready(cx)
     }
 
     fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
-        self.inner.call(req)
+        self.span
+            .in_scope(|| self.inner.call(req))
+            .instrument(self.span.clone())
     }
 }
 
@@ -210,6 +240,8 @@ async fn request_handler(
     cancellation_handler: Arc<CancellationHandler>,
     peer_addr: IpAddr,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    // used to cancel in-flight HTTP requests. not used to cancel websockets
+    http_cancellation_token: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
     let session_id = uuid::Uuid::new_v4();
 
@@ -253,7 +285,7 @@ async fn request_handler(
         let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
         let span = ctx.span.clone();
 
-        sql_over_http::handle(config, ctx, request, backend)
+        sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
             .instrument(span)
             .await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 86c278030f..f675375ff1 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -217,8 +217,8 @@ pub async fn handle(
     mut ctx: RequestMonitoring,
     request: Request<Body>,
     backend: Arc<PoolingBackend>,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let cancel = CancellationToken::new();
     let cancel2 = cancel.clone();
     let handle = tokio::spawn(async move {
         time::sleep(config.http_config.request_timeout).await;
diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs
index cce02e3850..33f194dd59 100644
--- a/proxy/src/serverless/tls_listener.rs
+++ b/proxy/src/serverless/tls_listener.rs
@@ -13,7 +13,7 @@ use tokio::{
     time::timeout,
 };
 use tokio_rustls::{server::TlsStream, TlsAcceptor};
-use tracing::{info, warn};
+use tracing::{info, warn, Instrument};
 
 use crate::{
     metrics::TLS_HANDSHAKE_FAILURES,
@@ -29,24 +29,17 @@ pin_project! {
         tls: TlsAcceptor,
         waiting: JoinSet<Option<TlsStream<A::Conn>>>,
         timeout: Duration,
-        protocol: &'static str,
     }
 }
 
 impl<A: Accept> TlsListener<A> {
     /// Create a `TlsListener` with default options.
-    pub(crate) fn new(
-        tls: TlsAcceptor,
-        listener: A,
-        protocol: &'static str,
-        timeout: Duration,
-    ) -> Self {
+    pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self {
         TlsListener {
             listener,
             tls,
             waiting: JoinSet::new(),
             timeout,
-            protocol,
         }
     }
 }
@@ -73,7 +66,7 @@ where
                 Poll::Ready(Some(Ok(mut conn))) => {
                     let t = *this.timeout;
                     let tls = this.tls.clone();
-                    let protocol = *this.protocol;
+                    let span = conn.span.clone();
                     this.waiting.spawn(async move {
                         let peer_addr = match conn.inner.wait_for_addr().await {
                             Ok(Some(addr)) => addr,
@@ -86,21 +79,24 @@ where
 
                         let accept = tls.accept(conn);
                         match timeout(t, accept).await {
-                            Ok(Ok(conn)) => Some(conn),
+                            Ok(Ok(conn)) => {
+                                info!(%peer_addr, "accepted new TLS connection");
+                                Some(conn)
+                            },
                             // The handshake failed, try getting another connection from the queue
                             Ok(Err(e)) => {
                                 TLS_HANDSHAKE_FAILURES.inc();
-                                warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}");
+                                warn!(%peer_addr, "failed to accept TLS connection: {e:?}");
                                 None
                             }
                             // The handshake timed out, try getting another connection from the queue
                             Err(_) => {
                                 TLS_HANDSHAKE_FAILURES.inc();
-                                warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout");
+                                warn!(%peer_addr, "failed to accept TLS connection: timeout");
                                 None
                             }
                         }
-                    });
+                    }.instrument(span));
                 }
                 Poll::Ready(Some(Err(e))) => {
                     tracing::error!("error accepting TCP connection: {e}");
@@ -112,10 +108,7 @@ where
 
         loop {
             return match this.waiting.poll_join_next(cx) {
-                Poll::Ready(Some(Ok(Some(conn)))) => {
-                    info!(protocol = this.protocol, "accepted new TLS connection");
-                    Poll::Ready(Some(Ok(conn)))
-                }
+                Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))),
                 // The handshake failed to complete, try getting another connection from the queue
                 Poll::Ready(Some(Ok(None))) => continue,
                 // The handshake panicked or was cancelled. ignore and get another connection
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b3f460c7fe..5b76e808d5 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2944,6 +2944,7 @@ class NeonProxy(PgProtocol):
         user = quote(kwargs["user"])
         password = quote(kwargs["password"])
         expected_code = kwargs.get("expected_code")
+        timeout = kwargs.get("timeout")
 
         log.info(f"Executing http query: {query}")
 
@@ -2957,6 +2958,7 @@ class NeonProxy(PgProtocol):
                 "Neon-Pool-Opt-In": "true",
             },
             verify=str(self.test_output_dir / "proxy.crt"),
+            timeout=timeout,
         )
 
         if expected_code is not None:
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 078589d8eb..3e986a8f7b 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -596,3 +596,39 @@ def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
     assert (
         "duplicate key value violates unique constraint" in res["message"]
     ), "HTTP query should conflict"
+
+
+def test_sql_over_http_connection_cancel(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    static_proxy.safe_psql("create table test_table ( id int primary key )")
+
+    # insert into a table, with a unique constraint, after sleeping for n seconds
+    query = "WITH temp AS ( \
+        SELECT pg_sleep($1) as sleep, $2::int as id \
+    ) INSERT INTO test_table (id) SELECT id FROM temp"
+
+    try:
+        # The request should complete before the proxy HTTP timeout triggers.
+        # Timeout and cancel the request on the client side before the query completes.
+        static_proxy.http_query(
+            query,
+            [static_proxy.http_timeout_seconds - 1, 1],
+            user="http",
+            password="http",
+            timeout=2,
+        )
+    except requests.exceptions.ReadTimeout:
+        pass
+
+    # wait until the query _would_ have been complete
+    time.sleep(static_proxy.http_timeout_seconds)
+
+    res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200)
+    assert res["command"] == "INSERT", "HTTP query should insert"
+    assert res["rowCount"] == 1, "HTTP query should insert"
+
+    res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400)
+    assert (
+        "duplicate key value violates unique constraint" in res["message"]
+    ), "HTTP query should conflict"

From 44f42627dd32f29a3be9cfcd2d8c487c89642dc8 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 14 Mar 2024 09:11:57 +0000
Subject: [PATCH 05/43] pageserver/controller: error handling for shard
 splitting (#7074)

## Problem

Shard splits worked, but weren't safe against failures (e.g. node crash
during split) yet.

Related: #6676

## Summary of changes

- Introduce async rwlocks at the scope of Tenant and Node:
  - exclusive tenant lock is used to protect splits
- exclusive node lock is used to protect new reconciliation process that
happens when setting node active
- exclusive locks used in both cases when doing persistent updates (e.g.
node scheduling conf) where the update to DB & in-memory state needs to
be atomic.
- Add failpoints to shard splitting in control plane and pageserver
code.
- Implement error handling in control plane for shard splits: this
detaches child chards and ensures parent shards are re-attached.
- Crash-safety for storage controller restarts requires little effort:
we already reconcile with nodes over a storage controller restart, so as
long as we reset any incomplete splits in the DB on restart (added in
this PR), things are implicitly cleaned up.
- Implement reconciliation with offline nodes before they transition to
active:
- (in this context reconciliation means something like
startup_reconcile, not literally the Reconciler)
- This covers cases where split abort cannot reach a node to clean it
up: the cleanup will eventually happen when the node is marked active,
as part of reconciliation.
- This also covers the case where a node was unavailable when the
storage controller started, but becomes available later: previously this
allowed it to skip the startup reconcile.
- Storage controller now terminates on panics. We only use panics for
true "should never happen" assertions, and these cases can leave us in
an un-usable state if we keep running (e.g. panicking in a shard split).
In the unlikely event that we get into a crashloop as a result, we'll
rely on kubernetes to back us off.
- Add `test_sharding_split_failures` which exercises a variety of
failure cases during shard split.
---
 Cargo.lock                                    |   2 +
 control_plane/attachment_service/Cargo.toml   |   2 +
 control_plane/attachment_service/src/http.rs  |   5 +
 .../attachment_service/src/id_lock_map.rs     |  54 ++
 control_plane/attachment_service/src/lib.rs   |   1 +
 control_plane/attachment_service/src/main.rs  |   6 +
 control_plane/attachment_service/src/node.rs  |  35 +-
 .../attachment_service/src/persistence.rs     |  78 ++
 .../attachment_service/src/reconciler.rs      |  56 +-
 .../attachment_service/src/service.rs         | 787 +++++++++++++++---
 .../attachment_service/src/tenant_state.rs    |   7 +-
 pageserver/src/http/routes.rs                 |  10 +
 pageserver/src/tenant/mgr.rs                  |  56 +-
 test_runner/conftest.py                       |   1 +
 test_runner/fixtures/compute_reconfigure.py   |  62 ++
 test_runner/fixtures/neon_fixtures.py         |  17 +
 test_runner/fixtures/workload.py              |  76 +-
 test_runner/regress/test_sharding.py          | 340 +++++++-
 18 files changed, 1445 insertions(+), 150 deletions(-)
 create mode 100644 control_plane/attachment_service/src/id_lock_map.rs
 create mode 100644 test_runner/fixtures/compute_reconfigure.py

diff --git a/Cargo.lock b/Cargo.lock
index 7fd9053f62..45397eb4a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -282,8 +282,10 @@ dependencies = [
  "control_plane",
  "diesel",
  "diesel_migrations",
+ "fail",
  "futures",
  "git-version",
+ "hex",
  "humantime",
  "hyper",
  "metrics",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index a5fad7216c..f78f56c480 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -19,8 +19,10 @@ aws-config.workspace = true
 aws-sdk-secretsmanager.workspace = true
 camino.workspace = true
 clap.workspace = true
+fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
+hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
 once_cell.workspace = true
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 515c287ea9..d26652cc94 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -10,7 +10,9 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
+use tokio_util::sync::CancellationToken;
 use utils::auth::{Scope, SwappableJwtAuth};
+use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
 use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};
@@ -554,6 +556,9 @@ pub fn make_router(
         .post("/debug/v1/consistency_check", |r| {
             request_span(r, handle_consistency_check)
         })
+        .put("/debug/v1/failpoints", |r| {
+            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
+        })
         .get("/control/v1/tenant/:tenant_id/locate", |r| {
             tenant_service_handler(r, handle_tenant_locate)
         })
diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/control_plane/attachment_service/src/id_lock_map.rs
new file mode 100644
index 0000000000..b03700b50c
--- /dev/null
+++ b/control_plane/attachment_service/src/id_lock_map.rs
@@ -0,0 +1,54 @@
+use std::{collections::HashMap, sync::Arc};
+
+/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
+/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
+/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
+/// is needed at a tenant-wide granularity.
+pub(crate) struct IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    /// A synchronous lock for getting/setting the async locks that our callers will wait on.
+    entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
+}
+
+impl<T> IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    pub(crate) fn shared(
+        &self,
+        key: T,
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
+        let mut locked = self.entities.lock().unwrap();
+        let entry = locked.entry(key).or_default();
+        entry.clone().read_owned()
+    }
+
+    pub(crate) fn exclusive(
+        &self,
+        key: T,
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
+        let mut locked = self.entities.lock().unwrap();
+        let entry = locked.entry(key).or_default();
+        entry.clone().write_owned()
+    }
+
+    /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
+    /// periodic housekeeping to avoid the map growing indefinitely
+    pub(crate) fn housekeeping(&self) {
+        let mut locked = self.entities.lock().unwrap();
+        locked.retain(|_k, lock| lock.try_write().is_err())
+    }
+}
+
+impl<T> Default for IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    fn default() -> Self {
+        Self {
+            entities: std::sync::Mutex::new(HashMap::new()),
+        }
+    }
+}
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index 796b465c10..a017bc1ecc 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter;
 mod auth;
 mod compute_hook;
 pub mod http;
+mod id_lock_map;
 pub mod metrics;
 mod node;
 pub mod persistence;
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 333c3911e3..fb7b363c39 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -206,6 +206,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
 }
 
 fn main() -> anyhow::Result<()> {
+    let default_panic = std::panic::take_hook();
+    std::panic::set_hook(Box::new(move |info| {
+        default_panic(info);
+        std::process::exit(1);
+    }));
+
     tokio::runtime::Builder::new_current_thread()
         // We use spawn_blocking for database operations, so require approximately
         // as many blocking threads as we will open database connections.
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index 27b03608fa..dda8a155c6 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -83,29 +83,38 @@ impl Node {
         }
     }
 
-    pub(crate) fn set_availability(
-        &mut self,
-        availability: NodeAvailability,
-    ) -> AvailabilityTransition {
-        use NodeAvailability::*;
-        let transition = match (self.availability, availability) {
-            (Offline, Active) => {
+    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
+        match self.get_availability_transition(availability) {
+            AvailabilityTransition::ToActive => {
                 // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                 // users of previously-cloned copies of the node will still see the old cancellation
                 // state.  For example, Reconcilers in flight will have to complete and be spawned
                 // again to realize that the node has become available.
                 self.cancel = CancellationToken::new();
-                AvailabilityTransition::ToActive
             }
-            (Active, Offline) => {
+            AvailabilityTransition::ToOffline => {
                 // Fire the node's cancellation token to cancel any in-flight API requests to it
                 self.cancel.cancel();
-                AvailabilityTransition::ToOffline
             }
-            _ => AvailabilityTransition::Unchanged,
-        };
+            AvailabilityTransition::Unchanged => {}
+        }
         self.availability = availability;
-        transition
+    }
+
+    /// Without modifying the availability of the node, convert the intended availability
+    /// into a description of the transition.
+    pub(crate) fn get_availability_transition(
+        &self,
+        availability: NodeAvailability,
+    ) -> AvailabilityTransition {
+        use AvailabilityTransition::*;
+        use NodeAvailability::*;
+
+        match (self.availability, availability) {
+            (Offline, Active) => ToActive,
+            (Active, Offline) => ToOffline,
+            _ => Unchanged,
+        }
     }
 
     /// Whether we may send API requests to this node.
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index aa08945834..3602cf8b1f 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -11,6 +11,9 @@ use diesel::prelude::*;
 use diesel::Connection;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
+use pageserver_api::shard::ShardConfigError;
+use pageserver_api::shard::ShardIdentity;
+use pageserver_api::shard::ShardStripeSize;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
@@ -72,6 +75,14 @@ pub(crate) enum DatabaseError {
     Logical(String),
 }
 
+#[must_use]
+pub(crate) enum AbortShardSplitStatus {
+    /// We aborted the split in the database by reverting to the parent shards
+    Aborted,
+    /// The split had already been persisted.
+    Complete,
+}
+
 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
 
 impl Persistence {
@@ -570,6 +581,51 @@ impl Persistence {
         })
         .await
     }
+
+    /// Used when the remote part of a shard split failed: we will revert the database state to have only
+    /// the parent shards, with SplitState::Idle.
+    pub(crate) async fn abort_shard_split(
+        &self,
+        split_tenant_id: TenantId,
+        new_shard_count: ShardCount,
+    ) -> DatabaseResult<AbortShardSplitStatus> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
+            let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
+                // Clear the splitting state on parent shards
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.ne(new_shard_count.literal() as i32))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+
+                // Parent shards are already gone: we cannot abort.
+                if updated == 0 {
+                    return Ok(AbortShardSplitStatus::Complete);
+                }
+
+                // Sanity check: if parent shards were present, their cardinality should
+                // be less than the number of child shards.
+                if updated >= new_shard_count.count() as usize {
+                    return Err(DatabaseError::Logical(format!(
+                        "Unexpected parent shard count {updated} while aborting split to \
+                            count {new_shard_count:?} on tenant {split_tenant_id}"
+                    )));
+                }
+
+                // Erase child shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(new_shard_count.literal() as i32))
+                    .execute(conn)?;
+
+                Ok(AbortShardSplitStatus::Aborted)
+            })?;
+
+            Ok(aborted)
+        })
+        .await
+    }
 }
 
 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
@@ -604,6 +660,28 @@ pub(crate) struct TenantShardPersistence {
     pub(crate) config: String,
 }
 
+impl TenantShardPersistence {
+    pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
+        if self.shard_count == 0 {
+            Ok(ShardIdentity::unsharded())
+        } else {
+            Ok(ShardIdentity::new(
+                ShardNumber(self.shard_number as u8),
+                ShardCount::new(self.shard_count as u8),
+                ShardStripeSize(self.shard_stripe_size as u32),
+            )?)
+        }
+    }
+
+    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
+        Ok(TenantShardId {
+            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
+            shard_number: ShardNumber(self.shard_number as u8),
+            shard_count: ShardCount::new(self.shard_count as u8),
+        })
+    }
+}
+
 /// Parts of [`crate::node::Node`] that are stored durably
 #[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::nodes)]
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index 603da9bf02..7f68a65c15 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,5 +1,6 @@
 use crate::persistence::Persistence;
 use crate::service;
+use hyper::StatusCode;
 use pageserver_api::models::{
     LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
@@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
 use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
 
+const DEFAULT_HEATMAP_PERIOD: &str = "60s";
+
 /// Object with the lifetime of the background reconcile task that is created
 /// for tenants which have a difference between their intent and observed states.
 pub(super) struct Reconciler {
@@ -485,17 +488,29 @@ impl Reconciler {
                 )
                 .await
             {
-                Some(Ok(observed)) => observed,
+                Some(Ok(observed)) => Some(observed),
+                Some(Err(mgmt_api::Error::ApiError(status, _msg)))
+                    if status == StatusCode::NOT_FOUND =>
+                {
+                    None
+                }
                 Some(Err(e)) => return Err(e.into()),
                 None => return Err(ReconcileError::Cancel),
             };
             tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
-            self.observed.locations.insert(
-                attached_node.get_id(),
-                ObservedStateLocation {
-                    conf: observed_conf,
-                },
-            );
+            match observed_conf {
+                Some(conf) => {
+                    // Pageserver returned a state: update it in observed.  This may still be an indeterminate (None) state,
+                    // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
+                    self.observed
+                        .locations
+                        .insert(attached_node.get_id(), ObservedStateLocation { conf });
+                }
+                None => {
+                    // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
+                    self.observed.locations.remove(&attached_node.get_id());
+                }
+            }
         }
 
         Ok(())
@@ -525,7 +540,12 @@ impl Reconciler {
                 )));
             };
 
-            let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
+            let mut wanted_conf = attached_location_conf(
+                generation,
+                &self.shard,
+                &self.config,
+                !self.intent.secondary.is_empty(),
+            );
             match self.observed.locations.get(&node.get_id()) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     // Nothing to do
@@ -662,10 +682,26 @@ impl Reconciler {
     }
 }
 
+/// We tweak the externally-set TenantConfig while configuring
+/// locations, using our awareness of whether secondary locations
+/// are in use to automatically enable/disable heatmap uploads.
+fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
+    let mut config = config.clone();
+    if has_secondaries {
+        if config.heatmap_period.is_none() {
+            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
+        }
+    } else {
+        config.heatmap_period = None;
+    }
+    config
+}
+
 pub(crate) fn attached_location_conf(
     generation: Generation,
     shard: &ShardIdentity,
     config: &TenantConfig,
+    has_secondaries: bool,
 ) -> LocationConfig {
     LocationConfig {
         mode: LocationConfigMode::AttachedSingle,
@@ -674,7 +710,7 @@ pub(crate) fn attached_location_conf(
         shard_number: shard.number.0,
         shard_count: shard.count.literal(),
         shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: config.clone(),
+        tenant_conf: ha_aware_config(config, has_secondaries),
     }
 }
 
@@ -689,6 +725,6 @@ pub(crate) fn secondary_location_conf(
         shard_number: shard.number.0,
         shard_count: shard.count.literal(),
         shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: config.clone(),
+        tenant_conf: ha_aware_config(config, true),
     }
 }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 1c4ede3d9d..1b85081666 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -7,6 +7,7 @@ use std::{
     time::{Duration, Instant},
 };
 
+use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
 use anyhow::Context;
 use control_plane::storage_controller::{
     AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -36,6 +37,7 @@ use pageserver_api::{
     },
 };
 use pageserver_client::mgmt_api;
+use tokio::sync::OwnedRwLockWriteGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
 use utils::{
@@ -147,6 +149,18 @@ pub struct Service {
     compute_hook: Arc<ComputeHook>,
     result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
 
+    // Channel for background cleanup from failed operations that require cleanup, such as shard split
+    abort_tx: tokio::sync::mpsc::UnboundedSender<TenantShardSplitAbort>,
+
+    // Locking on a tenant granularity (covers all shards in the tenant):
+    // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split)
+    // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD)
+    tenant_op_locks: IdLockMap<TenantId>,
+
+    // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or
+    // that transition it to/from Active.
+    node_op_locks: IdLockMap<NodeId>,
+
     // Process shutdown will fire this token
     cancel: CancellationToken,
 
@@ -174,6 +188,27 @@ enum TenantCreateOrUpdate {
     Update(Vec<ShardUpdate>),
 }
 
+/// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes
+/// might not be available.  We therefore use a queue of abort operations processed in the background.
+struct TenantShardSplitAbort {
+    tenant_id: TenantId,
+    /// The target values from the request that failed
+    new_shard_count: ShardCount,
+    new_stripe_size: Option<ShardStripeSize>,
+    /// Until this abort op is complete, no other operations may be done on the tenant
+    _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>,
+}
+
+#[derive(thiserror::Error, Debug)]
+enum TenantShardSplitAbortError {
+    #[error(transparent)]
+    Database(#[from] DatabaseError),
+    #[error(transparent)]
+    Remote(#[from] mgmt_api::Error),
+    #[error("Unavailable")]
+    Unavailable,
+}
+
 struct ShardUpdate {
     tenant_shard_id: TenantShardId,
     placement_policy: PlacementPolicy,
@@ -627,8 +662,52 @@ impl Service {
         }
     }
 
+    async fn process_aborts(
+        &self,
+        mut abort_rx: tokio::sync::mpsc::UnboundedReceiver<TenantShardSplitAbort>,
+    ) {
+        loop {
+            // Wait for the next result, or for cancellation
+            let op = tokio::select! {
+                r = abort_rx.recv() => {
+                    match r {
+                        Some(op) => {op},
+                        None => {break;}
+                    }
+                }
+                _ = self.cancel.cancelled() => {
+                    break;
+                }
+            };
+
+            // Retry until shutdown: we must keep this request object alive until it is properly
+            // processed, as it holds a lock guard that prevents other operations trying to do things
+            // to the tenant while it is in a weird part-split state.
+            while !self.cancel.is_cancelled() {
+                match self.abort_tenant_shard_split(&op).await {
+                    Ok(_) => break,
+                    Err(e) => {
+                        tracing::warn!(
+                            "Failed to abort shard split on {}, will retry: {e}",
+                            op.tenant_id
+                        );
+
+                        // If a node is unavailable, we hope that it has been properly marked Offline
+                        // when we retry, so that the abort op will succeed.  If the abort op is failing
+                        // for some other reason, we will keep retrying forever, or until a human notices
+                        // and does something about it (either fixing a pageserver or restarting the controller).
+                        tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
+                            .await
+                            .ok();
+                    }
+                }
+            }
+        }
+    }
+
     pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
         let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
+        let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel();
 
         tracing::info!("Loading nodes from database...");
         let nodes = persistence
@@ -641,12 +720,62 @@ impl Service {
         tracing::info!("Loaded {} nodes from database.", nodes.len());
 
         tracing::info!("Loading shards from database...");
-        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
+        let mut tenant_shard_persistence = persistence.list_tenant_shards().await?;
         tracing::info!(
             "Loaded {} shards from database.",
             tenant_shard_persistence.len()
         );
 
+        // If any shard splits were in progress, reset the database state to abort them
+        let mut tenant_shard_count_min_max: HashMap<TenantId, (ShardCount, ShardCount)> =
+            HashMap::new();
+        for tsp in &mut tenant_shard_persistence {
+            let shard = tsp.get_shard_identity()?;
+            let tenant_shard_id = tsp.get_tenant_shard_id()?;
+            let entry = tenant_shard_count_min_max
+                .entry(tenant_shard_id.tenant_id)
+                .or_insert_with(|| (shard.count, shard.count));
+            entry.0 = std::cmp::min(entry.0, shard.count);
+            entry.1 = std::cmp::max(entry.1, shard.count);
+        }
+
+        for (tenant_id, (count_min, count_max)) in tenant_shard_count_min_max {
+            if count_min != count_max {
+                // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in
+                // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll
+                // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now.
+                tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}");
+                let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?;
+
+                // We may never see the Complete status here: if the split was complete, we wouldn't have
+                // identified this tenant has having mismatching min/max counts.
+                assert!(matches!(abort_status, AbortShardSplitStatus::Aborted));
+
+                // Clear the splitting status in-memory, to reflect that we just aborted in the database
+                tenant_shard_persistence.iter_mut().for_each(|tsp| {
+                    // Set idle split state on those shards that we will retain.
+                    let tsp_tenant_id = TenantId::from_str(tsp.tenant_id.as_str()).unwrap();
+                    if tsp_tenant_id == tenant_id
+                        && tsp.get_shard_identity().unwrap().count == count_min
+                    {
+                        tsp.splitting = SplitState::Idle;
+                    } else if tsp_tenant_id == tenant_id {
+                        // Leave the splitting state on the child shards: this will be used next to
+                        // drop them.
+                        tracing::info!(
+                            "Shard {tsp_tenant_id} will be dropped after shard split abort",
+                        );
+                    }
+                });
+
+                // Drop shards for this tenant which we didn't just mark idle (i.e. child shards of the aborted split)
+                tenant_shard_persistence.retain(|tsp| {
+                    TenantId::from_str(tsp.tenant_id.as_str()).unwrap() != tenant_id
+                        || tsp.splitting == SplitState::Idle
+                });
+            }
+        }
+
         let mut tenants = BTreeMap::new();
 
         let mut scheduler = Scheduler::new(nodes.values());
@@ -676,21 +805,8 @@ impl Service {
             }
         }
         for tsp in tenant_shard_persistence {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount::new(tsp.shard_count as u8),
-            };
-            let shard_identity = if tsp.shard_count == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                ShardIdentity::new(
-                    ShardNumber(tsp.shard_number as u8),
-                    ShardCount::new(tsp.shard_count as u8),
-                    ShardStripeSize(tsp.shard_stripe_size as u32),
-                )?
-            };
-
+            let tenant_shard_id = tsp.get_tenant_shard_id()?;
+            let shard_identity = tsp.get_shard_identity()?;
             // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
             // it with what we can infer: the node for which a generation was most recently issued.
             let mut intent = IntentState::new();
@@ -728,9 +844,12 @@ impl Service {
             persistence,
             compute_hook: Arc::new(ComputeHook::new(config)),
             result_tx,
+            abort_tx,
             startup_complete: startup_complete.clone(),
             cancel: CancellationToken::new(),
             gate: Gate::default(),
+            tenant_op_locks: Default::default(),
+            node_op_locks: Default::default(),
         });
 
         let result_task_this = this.clone();
@@ -741,6 +860,33 @@ impl Service {
             }
         });
 
+        tokio::task::spawn({
+            let this = this.clone();
+            async move {
+                // Block shutdown until we're done (we must respect self.cancel)
+                if let Ok(_gate) = this.gate.enter() {
+                    this.process_aborts(abort_rx).await
+                }
+            }
+        });
+
+        tokio::task::spawn({
+            let this = this.clone();
+            async move {
+                if let Ok(_gate) = this.gate.enter() {
+                    loop {
+                        tokio::select! {
+                            _ = this.cancel.cancelled() => {
+                                break;
+                            },
+                            _ = tokio::time::sleep(Duration::from_secs(60)) => {}
+                        };
+                        this.tenant_op_locks.housekeeping();
+                    }
+                }
+            }
+        });
+
         tokio::task::spawn({
             let this = this.clone();
             // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`]
@@ -889,6 +1035,7 @@ impl Service {
                             tenant_state.generation.unwrap(),
                             &tenant_state.shard,
                             &tenant_state.config,
+                            false,
                         )),
                     },
                 )]);
@@ -918,6 +1065,118 @@ impl Service {
         }
     }
 
+    // When the availability state of a node transitions to active, we must do a full reconciliation
+    // of LocationConfigs on that node.  This is because while a node was offline:
+    // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node
+    // - aborting a tenant shard split might have left rogue child shards behind on this node.
+    //
+    // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other
+    // Reconcilers might communicate with the node, and these must not overlap with the work we do in
+    // this function.
+    //
+    // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but
+    // for written for a single node rather than as a batch job for all nodes.
+    #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))]
+    async fn node_activate_reconcile(
+        &self,
+        mut node: Node,
+        _lock: &OwnedRwLockWriteGuard<()>,
+    ) -> Result<(), ApiError> {
+        // This Node is a mutable local copy: we will set it active so that we can use its
+        // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
+        // later.
+        node.set_availability(NodeAvailability::Active);
+
+        let configs = match node
+            .with_client_retries(
+                |client| async move { client.list_location_config().await },
+                &self.config.jwt_token,
+                1,
+                5,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await
+        {
+            None => {
+                // We're shutting down (the Node's cancellation token can't have fired, because
+                // we're the only scope that has a reference to it, and we didn't fire it).
+                return Err(ApiError::ShuttingDown);
+            }
+            Some(Err(e)) => {
+                // This node didn't succeed listing its locations: it may not proceed to active state
+                // as it is apparently unavailable.
+                return Err(ApiError::PreconditionFailed(
+                    format!("Failed to query node location configs, cannot activate ({e})").into(),
+                ));
+            }
+            Some(Ok(configs)) => configs,
+        };
+        tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len());
+
+        let mut cleanup = Vec::new();
+        {
+            let mut locked = self.inner.write().unwrap();
+
+            for (tenant_shard_id, observed_loc) in configs.tenant_shards {
+                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
+                    cleanup.push(tenant_shard_id);
+                    continue;
+                };
+                tenant_state
+                    .observed
+                    .locations
+                    .insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
+            }
+        }
+
+        for tenant_shard_id in cleanup {
+            tracing::info!("Detaching {tenant_shard_id}");
+            match node
+                .with_client_retries(
+                    |client| async move {
+                        let config = LocationConfig {
+                            mode: LocationConfigMode::Detached,
+                            generation: None,
+                            secondary_conf: None,
+                            shard_number: tenant_shard_id.shard_number.0,
+                            shard_count: tenant_shard_id.shard_count.literal(),
+                            shard_stripe_size: 0,
+                            tenant_conf: models::TenantConfig::default(),
+                        };
+                        client
+                            .location_config(tenant_shard_id, config, None, false)
+                            .await
+                    },
+                    &self.config.jwt_token,
+                    1,
+                    5,
+                    SHORT_RECONCILE_TIMEOUT,
+                    &self.cancel,
+                )
+                .await
+            {
+                None => {
+                    // We're shutting down (the Node's cancellation token can't have fired, because
+                    // we're the only scope that has a reference to it, and we didn't fire it).
+                    return Err(ApiError::ShuttingDown);
+                }
+                Some(Err(e)) => {
+                    // Do not let the node proceed to Active state if it is not responsive to requests
+                    // to detach.  This could happen if e.g. a shutdown bug in the pageserver is preventing
+                    // detach completing: we should not let this node back into the set of nodes considered
+                    // okay for scheduling.
+                    return Err(ApiError::Conflict(format!(
+                        "Node {node} failed to detach {tenant_shard_id}: {e}"
+                    )));
+                }
+                Some(Ok(_)) => {}
+            };
+        }
+
+        Ok(())
+    }
+
     pub(crate) async fn re_attach(
         &self,
         reattach_req: ReAttachRequest,
@@ -926,15 +1185,6 @@ impl Service {
             self.node_register(register_req).await?;
         }
 
-        // Take a re-attach as indication that the node is available: this is a precursor to proper
-        // heartbeating in https://github.com/neondatabase/neon/issues/6844
-        self.node_configure(NodeConfigureRequest {
-            node_id: reattach_req.node_id,
-            availability: Some(NodeAvailability::Active),
-            scheduling: None,
-        })
-        .await?;
-
         // Ordering: we must persist generation number updates before making them visible in the in-memory state
         let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;
 
@@ -946,6 +1196,7 @@ impl Service {
 
         // Apply the updated generation to our in-memory state
         let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
 
         let mut response = ReAttachResponse {
             tenants: Vec::new(),
@@ -957,7 +1208,7 @@ impl Service {
                 gen: new_gen.into().unwrap(),
             });
             // Apply the new generation number to our in-memory state
-            let shard_state = locked.tenants.get_mut(&tenant_shard_id);
+            let shard_state = tenants.get_mut(&tenant_shard_id);
             let Some(shard_state) = shard_state else {
                 // Not fatal.  This edge case requires a re-attach to happen
                 // between inserting a new tenant shard in to the database, and updating our in-memory
@@ -1008,6 +1259,25 @@ impl Service {
             // request in flight over the network: TODO handle that by making location_conf API refuse
             // to go backward in generations.
         }
+
+        // We consider a node Active once we have composed a re-attach response, but we
+        // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response
+        // implicitly synchronizes the LocationConfigs on the node.
+        //
+        // Setting a node active unblocks any Reconcilers that might write to the location config API,
+        // but those requests will not be accepted by the node until it has finished processing
+        // the re-attach response.
+        if let Some(node) = nodes.get(&reattach_req.node_id) {
+            if !node.is_available() {
+                let mut new_nodes = (**nodes).clone();
+                if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
+                    node.set_availability(NodeAvailability::Active);
+                }
+                let new_nodes = Arc::new(new_nodes);
+                *nodes = new_nodes;
+            }
+        }
+
         Ok(response)
     }
 
@@ -1048,6 +1318,12 @@ impl Service {
         &self,
         create_req: TenantCreateRequest,
     ) -> Result<TenantCreateResponse, ApiError> {
+        // Exclude any concurrent attempts to create/access the same tenant ID
+        let _tenant_lock = self
+            .tenant_op_locks
+            .exclusive(create_req.new_tenant_id.tenant_id)
+            .await;
+
         let (response, waiters) = self.do_tenant_create(create_req).await?;
 
         self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
@@ -1362,16 +1638,20 @@ impl Service {
         tenant_shard_id: TenantShardId,
         req: TenantLocationConfigRequest,
     ) -> Result<TenantLocationConfigResponse, ApiError> {
+        // We require an exclusive lock, because we are updating both persistent and in-memory state
+        let _tenant_lock = self
+            .tenant_op_locks
+            .exclusive(tenant_shard_id.tenant_id)
+            .await;
+
         if !tenant_shard_id.is_unsharded() {
             return Err(ApiError::BadRequest(anyhow::anyhow!(
                 "This API is for importing single-sharded or unsharded tenants"
             )));
         }
 
-        let tenant_id = tenant_shard_id.tenant_id;
-
         // First check if this is a creation or an update
-        let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
+        let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req);
 
         let mut result = TenantLocationConfigResponse {
             shards: Vec::new(),
@@ -1477,6 +1757,9 @@ impl Service {
     }
 
     pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
+        // We require an exclusive lock, because we are updating persistent and in-memory state
+        let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await;
+
         let tenant_id = req.tenant_id;
         let config = req.config;
 
@@ -1558,6 +1841,8 @@ impl Service {
         timestamp: Cow<'_, str>,
         done_if_after: Cow<'_, str>,
     ) -> Result<(), ApiError> {
+        let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
+
         let node = {
             let locked = self.inner.read().unwrap();
             // Just a sanity check to prevent misuse: the API expects that the tenant is fully
@@ -1643,6 +1928,8 @@ impl Service {
         &self,
         tenant_id: TenantId,
     ) -> Result<(), ApiError> {
+        let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await;
+
         // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
         let targets = {
             let locked = self.inner.read().unwrap();
@@ -1692,6 +1979,8 @@ impl Service {
     }
 
     pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
+        let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
+
         self.ensure_attached_wait(tenant_id).await?;
 
         // TODO: refactor into helper
@@ -1788,10 +2077,10 @@ impl Service {
             create_req.new_timeline_id,
         );
 
+        let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await;
+
         self.ensure_attached_wait(tenant_id).await?;
 
-        // TODO: refuse to do this if shard splitting is in progress
-        // (https://github.com/neondatabase/neon/issues/6676)
         let mut targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -1913,11 +2202,10 @@ impl Service {
         timeline_id: TimelineId,
     ) -> Result<StatusCode, ApiError> {
         tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
+        let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await;
 
         self.ensure_attached_wait(tenant_id).await?;
 
-        // TODO: refuse to do this if shard splitting is in progress
-        // (https://github.com/neondatabase/neon/issues/6676)
         let mut targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -2106,10 +2394,306 @@ impl Service {
         })
     }
 
+    #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
+    async fn abort_tenant_shard_split(
+        &self,
+        op: &TenantShardSplitAbort,
+    ) -> Result<(), TenantShardSplitAbortError> {
+        // Cleaning up a split:
+        // - Parent shards are not destroyed during a split, just detached.
+        // - Failed pageserver split API calls can leave the remote node with just the parent attached,
+        //   just the children attached, or both.
+        //
+        // Therefore our work to do is to:
+        // 1. Clean up storage controller's internal state to just refer to parents, no children
+        // 2. Call out to pageservers to ensure that children are detached
+        // 3. Call out to pageservers to ensure that parents are attached.
+        //
+        // Crash safety:
+        // - If the storage controller stops running during this cleanup *after* clearing the splitting state
+        //   from our database, then [`Self::startup_reconcile`] will regard child attachments as garbage
+        //   and detach them.
+        // - TODO: If the storage controller stops running during this cleanup *before* clearing the splitting state
+        //   from our database, then we will re-enter this cleanup routine on startup.
+
+        let TenantShardSplitAbort {
+            tenant_id,
+            new_shard_count,
+            new_stripe_size,
+            ..
+        } = op;
+
+        // First abort persistent state, if any exists.
+        match self
+            .persistence
+            .abort_shard_split(*tenant_id, *new_shard_count)
+            .await?
+        {
+            AbortShardSplitStatus::Aborted => {
+                // Proceed to roll back any child shards created on pageservers
+            }
+            AbortShardSplitStatus::Complete => {
+                // The split completed (we might hit that path if e.g. our database transaction
+                // to write the completion landed in the database, but we dropped connection
+                // before seeing the result).
+                //
+                // We must update in-memory state to reflect the successful split.
+                self.tenant_shard_split_commit_inmem(
+                    *tenant_id,
+                    *new_shard_count,
+                    *new_stripe_size,
+                );
+                return Ok(());
+            }
+        }
+
+        // Clean up in-memory state, and accumulate the list of child locations that need detaching
+        let detach_locations: Vec<(Node, TenantShardId)> = {
+            let mut detach_locations = Vec::new();
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();
+
+            for (tenant_shard_id, shard) in
+                tenants.range_mut(TenantShardId::tenant_range(op.tenant_id))
+            {
+                if shard.shard.count == op.new_shard_count {
+                    // Surprising: the phase of [`Self::do_tenant_shard_split`] which inserts child shards in-memory
+                    // is infallible, so if we got an error we shouldn't have got that far.
+                    tracing::warn!(
+                        "During split abort, child shard {tenant_shard_id} found in-memory"
+                    );
+                    continue;
+                }
+
+                // Add the children of this shard to this list of things to detach
+                if let Some(node_id) = shard.intent.get_attached() {
+                    for child_id in tenant_shard_id.split(*new_shard_count) {
+                        detach_locations.push((
+                            nodes
+                                .get(node_id)
+                                .expect("Intent references nonexistent node")
+                                .clone(),
+                            child_id,
+                        ));
+                    }
+                } else {
+                    tracing::warn!(
+                        "During split abort, shard {tenant_shard_id} has no attached location"
+                    );
+                }
+
+                tracing::info!("Restoring parent shard {tenant_shard_id}");
+                shard.splitting = SplitState::Idle;
+                self.maybe_reconcile_shard(shard, nodes);
+            }
+
+            // We don't expect any new_shard_count shards to exist here, but drop them just in case
+            tenants.retain(|_id, s| s.shard.count != *new_shard_count);
+
+            detach_locations
+        };
+
+        for (node, child_id) in detach_locations {
+            if !node.is_available() {
+                // An unavailable node cannot be cleaned up now: to avoid blocking forever, we will permit this, and
+                // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have
+                // removed child shards from our in-memory state and database, the reconciliation will implicitly remove
+                // them from the node.
+                tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated.");
+                continue;
+            }
+
+            // Detach the remote child.  If the pageserver split API call is still in progress, this call will get
+            // a 503 and retry, up to our limit.
+            tracing::info!("Detaching {child_id} on {node}...");
+            match node
+                .with_client_retries(
+                    |client| async move {
+                        let config = LocationConfig {
+                            mode: LocationConfigMode::Detached,
+                            generation: None,
+                            secondary_conf: None,
+                            shard_number: child_id.shard_number.0,
+                            shard_count: child_id.shard_count.literal(),
+                            // Stripe size and tenant config don't matter when detaching
+                            shard_stripe_size: 0,
+                            tenant_conf: TenantConfig::default(),
+                        };
+
+                        client.location_config(child_id, config, None, false).await
+                    },
+                    &self.config.jwt_token,
+                    1,
+                    10,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(_)) => {}
+                Some(Err(e)) => {
+                    // We failed to communicate with the remote node.  This is problematic: we may be
+                    // leaving it with a rogue child shard.
+                    tracing::warn!(
+                        "Failed to detach child {child_id} from node {node} during abort"
+                    );
+                    return Err(e.into());
+                }
+                None => {
+                    // Cancellation: we were shutdown or the node went offline. Shutdown is fine, we'll
+                    // clean up on restart. The node going offline requires a retry.
+                    return Err(TenantShardSplitAbortError::Unavailable);
+                }
+            };
+        }
+
+        tracing::info!("Successfully aborted split");
+        Ok(())
+    }
+
+    /// Infallible final stage of [`Self::tenant_shard_split`]: update the contents
+    /// of the tenant map to reflect the child shards that exist after the split.
+    fn tenant_shard_split_commit_inmem(
+        &self,
+        tenant_id: TenantId,
+        new_shard_count: ShardCount,
+        new_stripe_size: Option<ShardStripeSize>,
+    ) -> (
+        TenantShardSplitResponse,
+        Vec<(TenantShardId, NodeId, ShardStripeSize)>,
+    ) {
+        let mut response = TenantShardSplitResponse {
+            new_shards: Vec::new(),
+        };
+        let mut child_locations = Vec::new();
+        {
+            let mut locked = self.inner.write().unwrap();
+
+            let parent_ids = locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .map(|(shard_id, _)| *shard_id)
+                .collect::<Vec<_>>();
+
+            let (_nodes, tenants, scheduler) = locked.parts_mut();
+            for parent_id in parent_ids {
+                let child_ids = parent_id.split(new_shard_count);
+
+                let (pageserver, generation, policy, parent_ident, config) = {
+                    let mut old_state = tenants
+                        .remove(&parent_id)
+                        .expect("It was present, we just split it");
+
+                    // A non-splitting state is impossible, because [`Self::tenant_shard_split`] holds
+                    // a TenantId lock and passes it through to [`TenantShardSplitAbort`] in case of cleanup:
+                    // nothing else can clear this.
+                    assert!(matches!(old_state.splitting, SplitState::Splitting));
+
+                    let old_attached = old_state.intent.get_attached().unwrap();
+                    old_state.intent.clear(scheduler);
+                    let generation = old_state.generation.expect("Shard must have been attached");
+                    (
+                        old_attached,
+                        generation,
+                        old_state.policy,
+                        old_state.shard,
+                        old_state.config,
+                    )
+                };
+
+                for child in child_ids {
+                    let mut child_shard = parent_ident;
+                    child_shard.number = child.shard_number;
+                    child_shard.count = child.shard_count;
+                    if let Some(stripe_size) = new_stripe_size {
+                        child_shard.stripe_size = stripe_size;
+                    }
+
+                    let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
+                    child_observed.insert(
+                        pageserver,
+                        ObservedStateLocation {
+                            conf: Some(attached_location_conf(
+                                generation,
+                                &child_shard,
+                                &config,
+                                matches!(policy, PlacementPolicy::Double(n) if n > 0),
+                            )),
+                        },
+                    );
+
+                    let mut child_state = TenantState::new(child, child_shard, policy.clone());
+                    child_state.intent = IntentState::single(scheduler, Some(pageserver));
+                    child_state.observed = ObservedState {
+                        locations: child_observed,
+                    };
+                    child_state.generation = Some(generation);
+                    child_state.config = config.clone();
+
+                    // The child's TenantState::splitting is intentionally left at the default value of Idle,
+                    // as at this point in the split process we have succeeded and this part is infallible:
+                    // we will never need to do any special recovery from this state.
+
+                    child_locations.push((child, pageserver, child_shard.stripe_size));
+
+                    if let Err(e) = child_state.schedule(scheduler) {
+                        // This is not fatal, because we've implicitly already got an attached
+                        // location for the child shard.  Failure here just means we couldn't
+                        // find a secondary (e.g. because cluster is overloaded).
+                        tracing::warn!("Failed to schedule child shard {child}: {e}");
+                    }
+
+                    tenants.insert(child, child_state);
+                    response.new_shards.push(child);
+                }
+            }
+
+            (response, child_locations)
+        }
+    }
+
     pub(crate) async fn tenant_shard_split(
         &self,
         tenant_id: TenantId,
         split_req: TenantShardSplitRequest,
+    ) -> Result<TenantShardSplitResponse, ApiError> {
+        // TODO: return 503 if we get stuck waiting for this lock
+        // (issue https://github.com/neondatabase/neon/issues/7108)
+        let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
+
+        let new_shard_count = ShardCount::new(split_req.new_shard_count);
+        let new_stripe_size = split_req.new_stripe_size;
+
+        let r = self.do_tenant_shard_split(tenant_id, split_req).await;
+
+        match r {
+            Ok(r) => Ok(r),
+            Err(ApiError::BadRequest(_)) => {
+                // A request validation error does not require rollback: we rejected it before we started making any changes: just
+                // return the error
+                r
+            }
+            Err(e) => {
+                // General case error handling: split might be part-done, we must do work to abort it.
+                tracing::warn!("Enqueuing background abort of split on {tenant_id}");
+                self.abort_tx
+                    .send(TenantShardSplitAbort {
+                        tenant_id,
+                        new_shard_count,
+                        new_stripe_size,
+                        _tenant_lock,
+                    })
+                    // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
+                    .ok();
+                Err(e)
+            }
+        }
+    }
+
+    pub(crate) async fn do_tenant_shard_split(
+        &self,
+        tenant_id: TenantId,
+        split_req: TenantShardSplitRequest,
     ) -> Result<TenantShardSplitResponse, ApiError> {
         let mut policy = None;
         let mut shard_ident = None;
@@ -2121,6 +2705,10 @@ impl Service {
             child_ids: Vec<TenantShardId>,
         }
 
+        fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest(
+            anyhow::anyhow!("failpoint")
+        )));
+
         // Validate input, and calculate which shards we will create
         let (old_shard_count, targets) =
             {
@@ -2230,7 +2818,9 @@ impl Service {
             if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size {
                 return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size)));
             }
+
             shard_ident.stripe_size = new_stripe_size;
+            tracing::info!("applied  stripe size {}", shard_ident.stripe_size.0);
             shard_ident
         } else {
             shard_ident.unwrap()
@@ -2255,6 +2845,11 @@ impl Service {
                 child_shard.number = child.shard_number;
                 child_shard.count = child.shard_count;
 
+                tracing::info!(
+                    "Create child shard persistence with stripe size {}",
+                    shard_ident.stripe_size.0
+                );
+
                 this_child_tsps.push(TenantShardPersistence {
                     tenant_id: child.tenant_id.to_string(),
                     shard_number: child.shard_number.0 as i32,
@@ -2293,6 +2888,9 @@ impl Service {
                 _ => return Err(ApiError::InternalServerError(e.into())),
             }
         }
+        fail::fail_point!("shard-split-post-begin", |_| Err(
+            ApiError::InternalServerError(anyhow::anyhow!("failpoint"))
+        ));
 
         // Now that I have persisted the splitting state, apply it in-memory.  This is infallible, so
         // callers may assume that if splitting is set in memory, then it was persisted, and if splitting
@@ -2302,15 +2900,16 @@ impl Service {
             for target in &targets {
                 if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) {
                     parent_shard.splitting = SplitState::Splitting;
+                    // Put the observed state to None, to reflect that it is indeterminate once we start the
+                    // split operation.
+                    parent_shard
+                        .observed
+                        .locations
+                        .insert(target.node.get_id(), ObservedStateLocation { conf: None });
                 }
             }
         }
 
-        // FIXME: we have now committed the shard split state to the database, so any subsequent
-        // failure needs to roll it back.  We will later wrap this function in logic to roll back
-        // the split if it fails.
-        // (https://github.com/neondatabase/neon/issues/6676)
-
         // TODO: issue split calls concurrently (this only matters once we're splitting
         // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
 
@@ -2332,6 +2931,10 @@ impl Service {
                 .await
                 .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?;
 
+            fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict(
+                "failpoint".to_string()
+            )));
+
             tracing::info!(
                 "Split {} into {}",
                 parent_id,
@@ -2366,62 +2969,16 @@ impl Service {
             .complete_shard_split(tenant_id, old_shard_count)
             .await?;
 
+        fail::fail_point!("shard-split-post-complete", |_| Err(
+            ApiError::InternalServerError(anyhow::anyhow!("failpoint"))
+        ));
+
         // Replace all the shards we just split with their children: this phase is infallible.
-        let mut response = TenantShardSplitResponse {
-            new_shards: Vec::new(),
-        };
-        let mut child_locations = Vec::new();
-        {
-            let mut locked = self.inner.write().unwrap();
-            let (_nodes, tenants, scheduler) = locked.parts_mut();
-            for target in targets {
-                let SplitTarget {
-                    parent_id,
-                    node: _node,
-                    child_ids,
-                } = target;
-                let (pageserver, generation, config) = {
-                    let mut old_state = tenants
-                        .remove(&parent_id)
-                        .expect("It was present, we just split it");
-                    let old_attached = old_state.intent.get_attached().unwrap();
-                    old_state.intent.clear(scheduler);
-                    let generation = old_state.generation.expect("Shard must have been attached");
-                    (old_attached, generation, old_state.config.clone())
-                };
-
-                for child in child_ids {
-                    let mut child_shard = shard_ident;
-                    child_shard.number = child.shard_number;
-                    child_shard.count = child.shard_count;
-
-                    let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
-                    child_observed.insert(
-                        pageserver,
-                        ObservedStateLocation {
-                            conf: Some(attached_location_conf(generation, &child_shard, &config)),
-                        },
-                    );
-
-                    let mut child_state = TenantState::new(child, child_shard, policy.clone());
-                    child_state.intent = IntentState::single(scheduler, Some(pageserver));
-                    child_state.observed = ObservedState {
-                        locations: child_observed,
-                    };
-                    child_state.generation = Some(generation);
-                    child_state.config = config.clone();
-
-                    // The child's TenantState::splitting is intentionally left at the default value of Idle,
-                    // as at this point in the split process we have succeeded and this part is infallible:
-                    // we will never need to do any special recovery from this state.
-
-                    child_locations.push((child, pageserver, child_shard.stripe_size));
-
-                    tenants.insert(child, child_state);
-                    response.new_shards.push(child);
-                }
-            }
-        }
+        let (response, child_locations) = self.tenant_shard_split_commit_inmem(
+            tenant_id,
+            ShardCount::new(split_req.new_shard_count),
+            split_req.new_stripe_size,
+        );
 
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
@@ -2710,6 +3267,8 @@ impl Service {
         &self,
         register_req: NodeRegisterRequest,
     ) -> Result<(), ApiError> {
+        let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await;
+
         // Pre-check for an already-existing node
         {
             let locked = self.inner.read().unwrap();
@@ -2771,6 +3330,8 @@ impl Service {
         &self,
         config_req: NodeConfigureRequest,
     ) -> Result<(), ApiError> {
+        let _node_lock = self.node_op_locks.exclusive(config_req.node_id).await;
+
         if let Some(scheduling) = config_req.scheduling {
             // Scheduling is a persistent part of Node: we must write updates to the database before
             // applying them in memory
@@ -2779,6 +3340,37 @@ impl Service {
                 .await?;
         }
 
+        // If we're activating a node, then before setting it active we must reconcile any shard locations
+        // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup,
+        // by calling [`Self::node_activate_reconcile`]
+        //
+        // The transition we calculate here remains valid later in the function because we hold the op lock on the node:
+        // nothing else can mutate its availability while we run.
+        let availability_transition = if let Some(input_availability) = config_req.availability {
+            let (activate_node, availability_transition) = {
+                let locked = self.inner.read().unwrap();
+                let Some(node) = locked.nodes.get(&config_req.node_id) else {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Node {} not registered", config_req.node_id).into(),
+                    ));
+                };
+
+                (
+                    node.clone(),
+                    node.get_availability_transition(input_availability),
+                )
+            };
+
+            if matches!(availability_transition, AvailabilityTransition::ToActive) {
+                self.node_activate_reconcile(activate_node, &_node_lock)
+                    .await?;
+            }
+            availability_transition
+        } else {
+            AvailabilityTransition::Unchanged
+        };
+
+        // Apply changes from the request to our in-memory state for the Node
         let mut locked = self.inner.write().unwrap();
         let (nodes, tenants, scheduler) = locked.parts_mut();
 
@@ -2790,11 +3382,9 @@ impl Service {
             ));
         };
 
-        let availability_transition = if let Some(availability) = &config_req.availability {
-            node.set_availability(*availability)
-        } else {
-            AvailabilityTransition::Unchanged
-        };
+        if let Some(availability) = &config_req.availability {
+            node.set_availability(*availability);
+        }
 
         if let Some(scheduling) = config_req.scheduling {
             node.set_scheduling(scheduling);
@@ -2808,6 +3398,7 @@ impl Service {
 
         let new_nodes = Arc::new(new_nodes);
 
+        // Modify scheduling state for any Tenants that are affected by a change in the node's availability state.
         match availability_transition {
             AvailabilityTransition::ToOffline => {
                 tracing::info!("Node {} transition to offline", config_req.node_id);
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 3c91e09ac3..39e557616d 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -577,7 +577,12 @@ impl TenantState {
                 .generation
                 .expect("Attempted to enter attached state without a generation");
 
-            let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
+            let wanted_conf = attached_location_conf(
+                generation,
+                &self.shard,
+                &self.config,
+                !self.intent.secondary.is_empty(),
+            );
             match self.observed.locations.get(&node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                 Some(_) | None => {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index bb8b1bb7e5..fc67f4cf8f 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2108,6 +2108,16 @@ where
     R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
     H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
 {
+    if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
+        fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
+            "failpoint".into()
+        )));
+
+        fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
+            anyhow::anyhow!("failpoint")
+        )));
+    }
+
     // Spawn a new task to handle the request, to protect the handler from unexpected
     // async cancellations. Most pageserver functions are not async cancellation safe.
     // We arm a drop-guard, so that if Hyper drops the Future, we signal the task
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 26fcce1f38..7cf03d8fd6 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1443,6 +1443,35 @@ impl TenantManager {
         new_shard_count: ShardCount,
         new_stripe_size: Option<ShardStripeSize>,
         ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<TenantShardId>> {
+        let r = self
+            .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
+            .await;
+        if r.is_err() {
+            // Shard splitting might have left the original shard in a partially shut down state (it
+            // stops the shard's remote timeline client).  Reset it to ensure we leave things in
+            // a working state.
+            if self.get(tenant_shard_id).is_some() {
+                tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
+                if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
+                    // Log this error because our return value will still be the original error, not this one.  This is
+                    // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
+                    // (e.g. has uploads disabled).  We can't do anything else: if reset fails then shutting the tenant down or
+                    // setting it broken probably won't help either.
+                    tracing::error!("Failed to reset {tenant_shard_id}: {e}");
+                }
+            }
+        }
+
+        r
+    }
+
+    pub(crate) async fn do_shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        new_shard_count: ShardCount,
+        new_stripe_size: Option<ShardStripeSize>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Vec<TenantShardId>> {
         let tenant = get_tenant(tenant_shard_id, true)?;
 
@@ -1477,6 +1506,10 @@ impl TenantManager {
                 .join(",")
         );
 
+        fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
         let parent_shard_identity = tenant.shard_identity;
         let parent_tenant_conf = tenant.get_tenant_conf();
         let parent_generation = tenant.generation;
@@ -1490,6 +1523,10 @@ impl TenantManager {
             return Err(e);
         }
 
+        fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
         self.resources.deletion_queue_client.flush_advisory();
 
         // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
@@ -1511,11 +1548,16 @@ impl TenantManager {
                 anyhow::bail!("Detached parent shard in the middle of split!")
             }
         };
-
+        fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
         // Optimization: hardlink layers from the parent into the children, so that they don't have to
         // re-download & duplicate the data referenced in their initial IndexPart
         self.shard_split_hardlink(parent, child_shards.clone())
             .await?;
+        fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
 
         // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
         // child shards to reach this point.
@@ -1555,6 +1597,10 @@ impl TenantManager {
             .await?;
         }
 
+        fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
         // Phase 4: wait for child chards WAL ingest to catch up to target LSN
         for child_shard_id in &child_shards {
             let child_shard_id = *child_shard_id;
@@ -1587,6 +1633,10 @@ impl TenantManager {
                         timeline.timeline_id,
                         target_lsn
                     );
+
+                    fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
+                        "failpoint"
+                    )));
                     if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
                         // Failure here might mean shutdown, in any case this part is an optimization
                         // and we shouldn't hold up the split operation.
@@ -1632,6 +1682,10 @@ impl TenantManager {
             },
         );
 
+        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
+            "failpoint"
+        )));
+
         parent_slot_guard.drop_old_value()?;
 
         // Phase 6: Release the InProgress on the parent shard
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 200c9c3740..4b0c9ac71d 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -2,6 +2,7 @@ pytest_plugins = (
     "fixtures.pg_version",
     "fixtures.parametrize",
     "fixtures.httpserver",
+    "fixtures.compute_reconfigure",
     "fixtures.neon_fixtures",
     "fixtures.benchmark_fixture",
     "fixtures.pg_stats",
diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py
new file mode 100644
index 0000000000..9dd66fe636
--- /dev/null
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -0,0 +1,62 @@
+import concurrent.futures
+from typing import Any
+
+import pytest
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+from fixtures.log_helper import log
+from fixtures.types import TenantId
+
+
+class ComputeReconfigure:
+    def __init__(self, server):
+        self.server = server
+        self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
+        self.workloads = {}
+
+    def register_workload(self, workload):
+        self.workloads[workload.tenant_id] = workload
+
+
+@pytest.fixture(scope="function")
+def compute_reconfigure_listener(make_httpserver):
+    """
+    This fixture exposes an HTTP listener for the storage controller to submit
+    compute notifications to us, instead of updating neon_local endpoints itself.
+
+    Although storage controller can use neon_local directly, this causes problems when
+    the test is also concurrently modifying endpoints.  Instead, configure storage controller
+    to send notifications up to this test code, which will route all endpoint updates
+    through Workload, which has a mutex to make concurrent updates safe.
+    """
+    server = make_httpserver
+
+    self = ComputeReconfigure(server)
+
+    # Do neon_local endpoint reconfiguration in the background so that we can
+    # accept a healthy rate of calls into notify-attach.
+    reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+
+    def handler(request: Request):
+        assert request.json is not None
+        body: dict[str, Any] = request.json
+        log.info(f"notify-attach request: {body}")
+
+        try:
+            workload = self.workloads[TenantId(body["tenant_id"])]
+        except KeyError:
+            pass
+        else:
+            # This causes the endpoint to query storage controller for its location, which
+            # is redundant since we already have it here, but this avoids extending the
+            # neon_local CLI to take full lists of locations
+            reconfigure_threads.submit(lambda workload=workload: workload.reconfigure())  # type: ignore[no-any-return]
+
+        return Response(status=200)
+
+    self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
+
+    yield self
+    reconfigure_threads.shutdown()
+    server.clear()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5b76e808d5..16ebc19698 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2177,6 +2177,23 @@ class NeonStorageController(MetricsGetter):
         )
         log.info("storage controller passed consistency check")
 
+    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
+        if isinstance(config_strings, tuple):
+            pairs = [config_strings]
+        else:
+            pairs = config_strings
+
+        log.info(f"Requesting config failpoints: {repr(pairs)}")
+
+        res = self.request(
+            "PUT",
+            f"{self.env.storage_controller_api}/debug/v1/failpoints",
+            json=[{"name": name, "actions": actions} for name, actions in pairs],
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        log.info(f"Got failpoints request response code {res.status_code}")
+        res.raise_for_status()
+
     def __enter__(self) -> "NeonStorageController":
         return self
 
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 1d5394dc1d..e852281fcf 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -1,3 +1,4 @@
+import threading
 from typing import Optional
 
 from fixtures.log_helper import log
@@ -11,6 +12,10 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.types import TenantId, TimelineId
 
+# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
+# to ensure we don't do that: this enables running lots of Workloads in parallel safely.
+ENDPOINT_LOCK = threading.Lock()
+
 
 class Workload:
     """
@@ -41,17 +46,30 @@ class Workload:
 
         self._endpoint: Optional[Endpoint] = None
 
+    def reconfigure(self):
+        """
+        Request the endpoint to reconfigure based on location reported by storage controller
+        """
+        if self._endpoint is not None:
+            with ENDPOINT_LOCK:
+                self._endpoint.reconfigure()
+
     def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
-        if self._endpoint is None:
-            self._endpoint = self.env.endpoints.create(
-                self.branch_name,
-                tenant_id=self.tenant_id,
-                pageserver_id=pageserver_id,
-                endpoint_id="ep-workload",
-            )
-            self._endpoint.start(pageserver_id=pageserver_id)
-        else:
-            self._endpoint.reconfigure(pageserver_id=pageserver_id)
+        # We may be running alongside other Workloads for different tenants.  Full TTID is
+        # obnoxiously long for use here, but a cut-down version is still unique enough for tests.
+        endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"
+
+        with ENDPOINT_LOCK:
+            if self._endpoint is None:
+                self._endpoint = self.env.endpoints.create(
+                    self.branch_name,
+                    tenant_id=self.tenant_id,
+                    pageserver_id=pageserver_id,
+                    endpoint_id=endpoint_id,
+                )
+                self._endpoint.start(pageserver_id=pageserver_id)
+            else:
+                self._endpoint.reconfigure(pageserver_id=pageserver_id)
 
         connstring = self._endpoint.safe_psql(
             "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
@@ -94,7 +112,7 @@ class Workload:
         else:
             return False
 
-    def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
+    def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True):
         assert self.expect_rows >= n
 
         max_iters = 10
@@ -132,22 +150,28 @@ class Workload:
                 ]
             )
 
-        for tenant_shard_id, pageserver in tenant_get_shards(
-            self.env, self.tenant_id, pageserver_id
-        ):
-            last_flush_lsn = wait_for_last_flush_lsn(
-                self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
-            )
-            ps_http = pageserver.http_client()
-            wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
+        if ingest:
+            # Wait for written data to be ingested by the pageserver
+            for tenant_shard_id, pageserver in tenant_get_shards(
+                self.env, self.tenant_id, pageserver_id
+            ):
+                last_flush_lsn = wait_for_last_flush_lsn(
+                    self.env,
+                    endpoint,
+                    self.tenant_id,
+                    self.timeline_id,
+                    pageserver_id=pageserver_id,
+                )
+                ps_http = pageserver.http_client()
+                wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
 
-            if upload:
-                # force a checkpoint to trigger upload
-                ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
-                wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
-                log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
-            else:
-                log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
+                if upload:
+                    # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
+                    ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
+                    wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
+                    log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
+                else:
+                    log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
 
     def validate(self, pageserver_id: Optional[int] = None):
         endpoint = self.endpoint(pageserver_id)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 9309af066b..bdb9990a51 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,10 +1,14 @@
 import os
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union
 
 import pytest
+import requests
+from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    NeonEnv,
     NeonEnvBuilder,
+    StorageControllerApiException,
     tenant_get_shards,
 )
 from fixtures.remote_storage import s3_storage
@@ -495,3 +499,337 @@ def test_sharding_ingest(
 
     # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance.
     assert huge_layer_count <= shard_count
+
+
+class Failure:
+    pageserver_id: Optional[int]
+
+    def apply(self, env: NeonEnv):
+        raise NotImplementedError()
+
+    def clear(self, env: NeonEnv):
+        """
+        Clear the failure, in a way that should enable the system to proceed
+        to a totally clean state (all nodes online and reconciled)
+        """
+        raise NotImplementedError()
+
+    def expect_available(self):
+        raise NotImplementedError()
+
+    def can_mitigate(self):
+        """Whether Self.mitigate is available for use"""
+        return False
+
+    def mitigate(self, env: NeonEnv):
+        """
+        Mitigate the failure in a way that should allow shard split to
+        complete and service to resume, but does not guarantee to leave
+        the whole world in a clean state (e.g. an Offline node might have
+        junk LocationConfigs on it)
+        """
+        raise NotImplementedError()
+
+    def fails_forward(self, env: NeonEnv):
+        """
+        If true, this failure results in a state that eventualy completes the split.
+        """
+        return False
+
+    def expect_exception(self):
+        """
+        How do we expect a call to the split API to fail?
+        """
+        return StorageControllerApiException
+
+
+class PageserverFailpoint(Failure):
+    def __init__(self, failpoint, pageserver_id, mitigate):
+        self.failpoint = failpoint
+        self.pageserver_id = pageserver_id
+        self._mitigate = mitigate
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.allowed_errors.extend(
+            [".*failpoint.*", ".*Resetting.*after shard split failure.*"]
+        )
+        pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.http_client().configure_failpoints((self.failpoint, "off"))
+        if self._mitigate:
+            env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"})
+
+    def expect_available(self):
+        return True
+
+    def can_mitigate(self):
+        return self._mitigate
+
+    def mitigate(self, env):
+        env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
+
+
+class StorageControllerFailpoint(Failure):
+    def __init__(self, failpoint, action):
+        self.failpoint = failpoint
+        self.pageserver_id = None
+        self.action = action
+
+    def apply(self, env: NeonEnv):
+        env.storage_controller.configure_failpoints((self.failpoint, self.action))
+
+    def clear(self, env: NeonEnv):
+        if "panic" in self.action:
+            log.info("Restarting storage controller after panic")
+            env.storage_controller.stop()
+            env.storage_controller.start()
+        else:
+            env.storage_controller.configure_failpoints((self.failpoint, "off"))
+
+    def expect_available(self):
+        # Controller panics _do_ leave pageservers available, but our test code relies
+        # on using the locate API to update configurations in Workload, so we must skip
+        # these actions when the controller has been panicked.
+        return "panic" not in self.action
+
+    def can_mitigate(self):
+        return False
+
+    def fails_forward(self, env):
+        # Edge case: the very last failpoint that simulates a DB connection error, where
+        # the abort path will fail-forward and result in a complete split.
+        fail_forward = self.failpoint == "shard-split-post-complete"
+
+        # If the failure was a panic, then if we expect split to eventually (after restart)
+        # complete, we must restart before checking that.
+        if fail_forward and "panic" in self.action:
+            log.info("Restarting storage controller after panic")
+            env.storage_controller.stop()
+            env.storage_controller.start()
+
+        return fail_forward
+
+    def expect_exception(self):
+        if "panic" in self.action:
+            return requests.exceptions.ConnectionError
+        else:
+            return StorageControllerApiException
+
+
+class NodeKill(Failure):
+    def __init__(self, pageserver_id, mitigate):
+        self.pageserver_id = pageserver_id
+        self._mitigate = mitigate
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.stop(immediate=True)
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.start()
+
+    def expect_available(self):
+        return False
+
+    def mitigate(self, env):
+        env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
+
+
+class CompositeFailure(Failure):
+    """
+    Wrapper for failures in multiple components (e.g. a failpoint in the storage controller, *and*
+    stop a pageserver to interfere with rollback)
+    """
+
+    def __init__(self, failures: list[Failure]):
+        self.failures = failures
+
+        self.pageserver_id = None
+        for f in failures:
+            if f.pageserver_id is not None:
+                self.pageserver_id = f.pageserver_id
+                break
+
+    def apply(self, env: NeonEnv):
+        for f in self.failures:
+            f.apply(env)
+
+    def clear(self, env):
+        for f in self.failures:
+            f.clear(env)
+
+    def expect_available(self):
+        return all(f.expect_available() for f in self.failures)
+
+    def mitigate(self, env):
+        for f in self.failures:
+            f.mitigate(env)
+
+    def expect_exception(self):
+        expect = set(f.expect_exception() for f in self.failures)
+
+        # We can't give a sensible response if our failures have different expectations
+        assert len(expect) == 1
+
+        return list(expect)[0]
+
+
+@pytest.mark.parametrize(
+    "failure",
+    [
+        PageserverFailpoint("api-500", 1, False),
+        NodeKill(1, False),
+        PageserverFailpoint("api-500", 1, True),
+        NodeKill(1, True),
+        PageserverFailpoint("shard-split-pre-prepare", 1, False),
+        PageserverFailpoint("shard-split-post-prepare", 1, False),
+        PageserverFailpoint("shard-split-pre-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-child-conf", 1, False),
+        PageserverFailpoint("shard-split-lsn-wait", 1, False),
+        PageserverFailpoint("shard-split-pre-finish", 1, False),
+        StorageControllerFailpoint("shard-split-validation", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-begin", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-remote", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-complete", "return(1)"),
+        StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"),
+        CompositeFailure(
+            [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+        CompositeFailure(
+            [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+    ],
+)
+def test_sharding_split_failures(
+    neon_env_builder: NeonEnvBuilder,
+    compute_reconfigure_listener: ComputeReconfigure,
+    failure: Failure,
+):
+    neon_env_builder.num_pageservers = 4
+    neon_env_builder.control_plane_compute_hook_api = (
+        compute_reconfigure_listener.control_plane_compute_hook_api
+    )
+    initial_shard_count = 2
+    split_shard_count = 4
+
+    env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    for ps in env.pageservers:
+        # When we do node failures and abandon a shard, it will de-facto have old generation and
+        # thereby be unable to publish remote consistent LSN updates
+        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
+    # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything
+    assert (
+        failure.pageserver_id is None
+        or len(
+            env.get_pageserver(failure.pageserver_id)
+            .http_client()
+            .tenant_list_locations()["tenant_shards"]
+        )
+        > 0
+    )
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(100)
+
+    # Put the environment into a failing state (exact meaning depends on `failure`)
+    failure.apply(env)
+
+    with pytest.raises(failure.expect_exception()):
+        env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
+
+    # We expect that the overall operation will fail, but some split requests
+    # will have succeeded: the net result should be to return to a clean state, including
+    # detaching any child shards.
+    def assert_rolled_back(exclude_ps_id=None) -> None:
+        count = 0
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                assert tenant_shard_id.shard_count == initial_shard_count
+                count += 1
+        assert count == initial_shard_count
+
+    def assert_split_done(exclude_ps_id=None) -> None:
+        count = 0
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                assert tenant_shard_id.shard_count == split_shard_count
+                count += 1
+        assert count == split_shard_count
+
+    def finish_split():
+        # Having failed+rolled back, we should be able to split again
+        # No failures this time; it will succeed
+        env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)
+
+        workload.churn_rows(10)
+        workload.validate()
+
+    if failure.expect_available():
+        # Even though the split failed partway through, this should not have interrupted
+        # clients.  Disable waiting for pageservers in the workload helper, because our
+        # failpoints may prevent API access.
+        # This only applies for failure modes that leave pageserver page_service API available.
+        workload.churn_rows(10, upload=False, ingest=False)
+        workload.validate()
+
+    if failure.fails_forward(env):
+        log.info("Fail-forward failure, checking split eventually completes...")
+        # A failure type which results in eventual completion of the split
+        wait_until(30, 1, assert_split_done)
+    elif failure.can_mitigate():
+        log.info("Mitigating failure...")
+        # Mitigation phase: we expect to be able to proceed with a successful shard split
+        failure.mitigate(env)
+
+        # The split should appear to be rolled back from the point of view of all pageservers
+        # apart from the one that is offline
+        wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
+
+        finish_split()
+        wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
+
+        # Having cleared the failure, everything should converge to a pristine state
+        failure.clear(env)
+        wait_until(30, 1, assert_split_done)
+    else:
+        # Once we restore the faulty pageserver's API to good health, rollback should
+        # eventually complete.
+        log.info("Clearing failure...")
+        failure.clear(env)
+
+        wait_until(30, 1, assert_rolled_back)
+
+        # Having rolled back, the tenant should be working
+        workload.churn_rows(10)
+        workload.validate()
+
+        # Splitting again should work, since we cleared the failure
+        finish_split()
+        assert_split_done()
+
+    env.storage_controller.consistency_check()

From 8075f0965af3bba94ac0bc874dcc14b068a62261 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 14 Mar 2024 12:18:55 +0100
Subject: [PATCH 06/43] fix(test suite) `virtual_file_io_engine` and
 `get_vectored_impl` patametrization doesn't work (#7113)

# Problem

While investigating #7124, I noticed that the benchmark was always using
the `DEFAULT_*` `virtual_file_io_engine` , i.e., `tokio-epoll-uring` as
of https://github.com/neondatabase/neon/pull/7077.

The fundamental problem is that the `control_plane` code has its own
view of `PageServerConfig`, which, I believe, will always be a subset of
the real pageserver's `pageserver/src/config.rs`.

For the `virtual_file_io_engine` and `get_vectored_impl` parametrization
of the test suite, we were constructing a dict on the Python side that
contained these parameters, then handed it to
`control_plane::PageServerConfig`'s derived `serde::Deserialize`.
The default in serde is to ignore unknown fields, so, the Deserialize
impl silently ignored the fields.
In consequence, the fields weren't propagated to the `pageserver --init`
call, and the tests ended up using the
`pageserver/src/config.rs::DEFAULT_` values for the respective options
all the time.

Tests that explicitly used overrides in `env.pageserver.start()` and
similar were not affected by this.

But, it means that all the test suite runs where with parametrization
didn't properly exercise the code path.

# Changes

- use `serde(deny_unknown_fields)` to expose the problem
- With this change, the Python tests that override
`virtual_file_io_engine` and
`get_vectored_impl` fail on `pageserver --init`, exposing the problem.
- use destructuring to uncover the issue in the future
- fix the issue by adding the missing fields to the `control_plane`
crate's `PageServerConf`
- A better solution would be for control plane to re-use a struct
provided
    by the pageserver crate, so that everything is in one place in
    `pageserver/src/config.rs`, but, our config parsing code is (almost)
    beyond repair anyways.
- fix the `pageserver_virtual_file_io_engine` to be responsive to the
env var
  - => required to make parametrization work in benchmarks

# Testing

Before merging this PR, I re-ran the regression tests & CI with the full
matrix of `virtual_file_io_engine` and `tokio-epoll-uring`, see
https://github.com/neondatabase/neon/pull/7113/commits/9c7ea364e04835b894a33136ca26e0cdb8cd6e30
---
 control_plane/src/local_env.rs      |  8 +++++++-
 control_plane/src/pageserver.rs     | 30 +++++++++++++++++++++--------
 test_runner/fixtures/parametrize.py |  2 +-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 2e64489432..c7f22cc8f8 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -114,7 +114,7 @@ impl NeonBroker {
 }
 
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default)]
+#[serde(default, deny_unknown_fields)]
 pub struct PageServerConf {
     // node id
     pub id: NodeId,
@@ -126,6 +126,9 @@ pub struct PageServerConf {
     // auth type used for the PG and HTTP ports
     pub pg_auth_type: AuthType,
     pub http_auth_type: AuthType,
+
+    pub(crate) virtual_file_io_engine: String,
+    pub(crate) get_vectored_impl: String,
 }
 
 impl Default for PageServerConf {
@@ -136,6 +139,9 @@ impl Default for PageServerConf {
             listen_http_addr: String::new(),
             pg_auth_type: AuthType::Trust,
             http_auth_type: AuthType::Trust,
+            // FIXME: use the ones exposed by pageserver crate
+            virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
+            get_vectored_impl: "sequential".to_owned(),
         }
     }
 }
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 06ec942895..ab2f80fb0c 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -78,18 +78,31 @@ impl PageServerNode {
     ///
     /// These all end up on the command line of the `pageserver` binary.
     fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
-        let id = format!("id={}", self.conf.id);
         // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
         let pg_distrib_dir_param = format!(
             "pg_distrib_dir='{}'",
             self.env.pg_distrib_dir_raw().display()
         );
 
-        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
-        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
+        let PageServerConf {
+            id,
+            listen_pg_addr,
+            listen_http_addr,
+            pg_auth_type,
+            http_auth_type,
+            virtual_file_io_engine,
+            get_vectored_impl,
+        } = &self.conf;
 
-        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
-        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
+        let id = format!("id={}", id);
+
+        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
+        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
+
+        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
+        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
+        let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
+        let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
 
         let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
 
@@ -101,6 +114,8 @@ impl PageServerNode {
             listen_http_addr_param,
             listen_pg_addr_param,
             broker_endpoint_param,
+            virtual_file_io_engine,
+            get_vectored_impl,
         ];
 
         if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -111,7 +126,7 @@ impl PageServerNode {
 
             // Storage controller uses the same auth as pageserver: if JWT is enabled
             // for us, we will also need it to talk to them.
-            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
+            if matches!(http_auth_type, AuthType::NeonJWT) {
                 let jwt_token = self
                     .env
                     .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -129,8 +144,7 @@ impl PageServerNode {
             ));
         }
 
-        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
-        {
+        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
             // Keys are generated in the toplevel repo dir, pageservers' workdirs
             // are one level below that, so refer to keys with ../
             overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index b28da83508..c8ab550ad7 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -28,7 +28,7 @@ def platform() -> Optional[str]:
 
 @pytest.fixture(scope="function", autouse=True)
 def pageserver_virtual_file_io_engine() -> Optional[str]:
-    return None
+    return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE")
 
 
 def pytest_generate_tests(metafunc: Metafunc):

From 9fe0193e5154c7ac24093c02587c6669820c5fa6 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 14 Mar 2024 07:14:53 +0300
Subject: [PATCH 07/43] Bump vendor/postgres v15 v14.

---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index b980d6f090..3b09894ddb 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit b980d6f090c676e55fb2c830fb2434f532f635c0
+Subproject commit 3b09894ddb8825b50c963942059eab1a2a0b0a89
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 56f32c0e73..80cef885ad 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 56f32c0e7330d17aaeee8bf211a73995180bd133
+Subproject commit 80cef885add1af6741aa31944c7d2c84d8f9098f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 1941c235ee..ae524d70b1 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
   "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b",
-  "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133",
-  "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0"
+  "postgres-v15": "80cef885add1af6741aa31944c7d2c84d8f9098f",
+  "postgres-v14": "3b09894ddb8825b50c963942059eab1a2a0b0a89"
 }

From 38767ace686d191f6ae3f01df6ad5d682976bde2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 14 Mar 2024 15:21:36 +0000
Subject: [PATCH 08/43] storage_controller: periodic pageserver heartbeats
 (#7092)

## Problem
If a pageserver was offline when the storage controller started, there
was no mechanism to update the
storage controller state when the pageserver becomes active.

## Summary of changes
* Add a heartbeater module. The heartbeater must be driven by an
external loop.
* Integrate the heartbeater into the service.
- Extend the types used by the service and scheduler to keep track of a
nodes' utilisation score.
- Add a background loop to drive the heartbeater and update the state
based on the deltas it generated
  - Do an initial round of heartbeats at start-up
---
 Cargo.lock                                    |   1 +
 control_plane/Cargo.toml                      |   1 +
 .../attachment_service/src/heartbeater.rs     | 227 ++++++++++++++
 control_plane/attachment_service/src/http.rs  |  11 +-
 control_plane/attachment_service/src/lib.rs   |   1 +
 control_plane/attachment_service/src/main.rs  |  10 +-
 control_plane/attachment_service/src/node.rs  |  29 +-
 .../attachment_service/src/scheduler.rs       |  34 ++-
 .../attachment_service/src/service.rs         | 277 ++++++++++++++----
 control_plane/src/storage_controller.rs       |   7 +
 libs/pageserver_api/src/controller_api.rs     |  62 +++-
 libs/pageserver_api/src/models/utilization.rs |  15 +-
 pageserver/client/src/mgmt_api.rs             |   9 +
 pageserver/src/http/routes.rs                 |   4 +
 test_runner/fixtures/neon_fixtures.py         |   8 +
 .../regress/test_pageserver_generations.py    |   2 +
 test_runner/regress/test_sharding_service.py  | 169 +++++++++++
 17 files changed, 779 insertions(+), 88 deletions(-)
 create mode 100644 control_plane/attachment_service/src/heartbeater.rs

diff --git a/Cargo.lock b/Cargo.lock
index 45397eb4a2..b8b276d74f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1346,6 +1346,7 @@ dependencies = [
  "futures",
  "git-version",
  "hex",
+ "humantime",
  "hyper",
  "nix 0.27.1",
  "once_cell",
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 75e5dcb7f8..b544a8c587 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -12,6 +12,7 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 git-version.workspace = true
+humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
diff --git a/control_plane/attachment_service/src/heartbeater.rs b/control_plane/attachment_service/src/heartbeater.rs
new file mode 100644
index 0000000000..e15de28920
--- /dev/null
+++ b/control_plane/attachment_service/src/heartbeater.rs
@@ -0,0 +1,227 @@
+use futures::{stream::FuturesUnordered, StreamExt};
+use std::{
+    collections::HashMap,
+    sync::Arc,
+    time::{Duration, Instant},
+};
+use tokio_util::sync::CancellationToken;
+
+use pageserver_api::{
+    controller_api::{NodeAvailability, UtilizationScore},
+    models::PageserverUtilization,
+};
+
+use thiserror::Error;
+use utils::id::NodeId;
+
+use crate::node::Node;
+
+struct HeartbeaterTask {
+    receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
+    cancel: CancellationToken,
+
+    state: HashMap<NodeId, PageserverState>,
+
+    max_unavailable_interval: Duration,
+    jwt_token: Option<String>,
+}
+
+#[derive(Debug, Clone)]
+pub(crate) enum PageserverState {
+    Available {
+        last_seen_at: Instant,
+        utilization: PageserverUtilization,
+    },
+    Offline,
+}
+
+#[derive(Debug)]
+pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
+
+#[derive(Debug, Error)]
+pub(crate) enum HeartbeaterError {
+    #[error("Cancelled")]
+    Cancel,
+}
+
+struct HeartbeatRequest {
+    pageservers: Arc<HashMap<NodeId, Node>>,
+    reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
+}
+
+pub(crate) struct Heartbeater {
+    sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
+}
+
+impl Heartbeater {
+    pub(crate) fn new(
+        jwt_token: Option<String>,
+        max_unavailable_interval: Duration,
+        cancel: CancellationToken,
+    ) -> Self {
+        let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
+        let mut heartbeater =
+            HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
+        tokio::task::spawn(async move { heartbeater.run().await });
+
+        Self { sender }
+    }
+
+    pub(crate) async fn heartbeat(
+        &self,
+        pageservers: Arc<HashMap<NodeId, Node>>,
+    ) -> Result<AvailablityDeltas, HeartbeaterError> {
+        let (sender, receiver) = tokio::sync::oneshot::channel();
+        self.sender
+            .send(HeartbeatRequest {
+                pageservers,
+                reply: sender,
+            })
+            .unwrap();
+
+        receiver.await.unwrap()
+    }
+}
+
+impl HeartbeaterTask {
+    fn new(
+        receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
+        jwt_token: Option<String>,
+        max_unavailable_interval: Duration,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            receiver,
+            cancel,
+            state: HashMap::new(),
+            max_unavailable_interval,
+            jwt_token,
+        }
+    }
+
+    async fn run(&mut self) {
+        loop {
+            tokio::select! {
+                request = self.receiver.recv() => {
+                    match request {
+                        Some(req) => {
+                            let res = self.heartbeat(req.pageservers).await;
+                            req.reply.send(res).unwrap();
+                        },
+                        None => { return; }
+                    }
+                },
+                _ = self.cancel.cancelled() => return
+            }
+        }
+    }
+
+    async fn heartbeat(
+        &mut self,
+        pageservers: Arc<HashMap<NodeId, Node>>,
+    ) -> Result<AvailablityDeltas, HeartbeaterError> {
+        let mut new_state = HashMap::new();
+
+        let mut heartbeat_futs = FuturesUnordered::new();
+        for (node_id, node) in &*pageservers {
+            heartbeat_futs.push({
+                let jwt_token = self.jwt_token.clone();
+                let cancel = self.cancel.clone();
+
+                // Clone the node and mark it as available such that the request
+                // goes through to the pageserver even when the node is marked offline.
+                // This doesn't impact the availability observed by [`crate::service::Service`].
+                let mut node = node.clone();
+                node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+
+                async move {
+                    let response = node
+                        .with_client_retries(
+                            |client| async move { client.get_utilization().await },
+                            &jwt_token,
+                            2,
+                            3,
+                            Duration::from_secs(1),
+                            &cancel,
+                        )
+                        .await;
+
+                    let response = match response {
+                        Some(r) => r,
+                        None => {
+                            // This indicates cancellation of the request.
+                            // We ignore the node in this case.
+                            return None;
+                        }
+                    };
+
+                    let status = if let Ok(utilization) = response {
+                        PageserverState::Available {
+                            last_seen_at: Instant::now(),
+                            utilization,
+                        }
+                    } else {
+                        PageserverState::Offline
+                    };
+
+                    Some((*node_id, status))
+                }
+            });
+
+            loop {
+                let maybe_status = tokio::select! {
+                    next = heartbeat_futs.next() => {
+                        match next {
+                            Some(result) => result,
+                            None => { break; }
+                        }
+                    },
+                    _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
+                };
+
+                if let Some((node_id, status)) = maybe_status {
+                    new_state.insert(node_id, status);
+                }
+            }
+        }
+
+        let mut deltas = Vec::new();
+        let now = Instant::now();
+        for (node_id, ps_state) in new_state {
+            use std::collections::hash_map::Entry::*;
+            let entry = self.state.entry(node_id);
+
+            let mut needs_update = false;
+            match entry {
+                Occupied(ref occ) => match (occ.get(), &ps_state) {
+                    (PageserverState::Offline, PageserverState::Offline) => {}
+                    (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
+                        if now - *last_seen_at >= self.max_unavailable_interval {
+                            deltas.push((node_id, ps_state.clone()));
+                            needs_update = true;
+                        }
+                    }
+                    _ => {
+                        deltas.push((node_id, ps_state.clone()));
+                        needs_update = true;
+                    }
+                },
+                Vacant(_) => {
+                    deltas.push((node_id, ps_state.clone()));
+                }
+            }
+
+            match entry {
+                Occupied(mut occ) if needs_update => {
+                    (*occ.get_mut()) = ps_state;
+                }
+                Vacant(vac) => {
+                    vac.insert(ps_state);
+                }
+                _ => {}
+            }
+        }
+
+        Ok(AvailablityDeltas(deltas))
+    }
+}
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index d26652cc94..560a05e908 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -28,7 +28,7 @@ use utils::{
 };
 
 use pageserver_api::controller_api::{
-    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
 
@@ -389,7 +389,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
 
     json_response(
         StatusCode::OK,
-        state.service.node_configure(config_req).await?,
+        state
+            .service
+            .node_configure(
+                config_req.node_id,
+                config_req.availability.map(NodeAvailability::from),
+                config_req.scheduling,
+            )
+            .await?,
     )
 }
 
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index a017bc1ecc..4aff29f15b 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -3,6 +3,7 @@ use utils::seqwait::MonotonicCounter;
 
 mod auth;
 mod compute_hook;
+mod heartbeater;
 pub mod http;
 mod id_lock_map;
 pub mod metrics;
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index fb7b363c39..0a925a63f6 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -2,7 +2,7 @@ use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
-use attachment_service::service::{Config, Service};
+use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
 use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
@@ -54,6 +54,10 @@ struct Cli {
     /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
     #[arg(long)]
     database_url: Option<String>,
+
+    /// Grace period before marking unresponsive pageserver offline
+    #[arg(long)]
+    max_unavailable_interval: Option<humantime::Duration>,
 }
 
 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -249,6 +253,10 @@ async fn async_main() -> anyhow::Result<()> {
         jwt_token: secrets.jwt_token,
         control_plane_jwt_token: secrets.control_plane_jwt_token,
         compute_hook_url: args.compute_hook_url,
+        max_unavailable_interval: args
+            .max_unavailable_interval
+            .map(humantime::Duration::into)
+            .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
     };
 
     // After loading secrets & config, but before starting anything else, apply database migrations
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index dda8a155c6..4167782715 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -12,7 +12,7 @@ use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};
 
-use crate::persistence::NodePersistence;
+use crate::{persistence::NodePersistence, scheduler::MaySchedule};
 
 /// Represents the in-memory description of a Node.
 ///
@@ -111,8 +111,8 @@ impl Node {
         use NodeAvailability::*;
 
         match (self.availability, availability) {
-            (Offline, Active) => ToActive,
-            (Active, Offline) => ToOffline,
+            (Offline, Active(_)) => ToActive,
+            (Active(_), Offline) => ToOffline,
             _ => Unchanged,
         }
     }
@@ -123,21 +123,21 @@ impl Node {
         // a reference to the original Node's cancellation status.  Checking both of these results
         // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
         // when we cloned it, or if the original Node instance's cancellation token was fired.
-        matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
+        matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
     }
 
     /// Is this node elegible to have work scheduled onto it?
-    pub(crate) fn may_schedule(&self) -> bool {
-        match self.availability {
-            NodeAvailability::Active => {}
-            NodeAvailability::Offline => return false,
-        }
+    pub(crate) fn may_schedule(&self) -> MaySchedule {
+        let score = match self.availability {
+            NodeAvailability::Active(score) => score,
+            NodeAvailability::Offline => return MaySchedule::No,
+        };
 
         match self.scheduling {
-            NodeSchedulingPolicy::Active => true,
-            NodeSchedulingPolicy::Draining => false,
-            NodeSchedulingPolicy::Filling => true,
-            NodeSchedulingPolicy::Pause => false,
+            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Draining => MaySchedule::No,
+            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Pause => MaySchedule::No,
         }
     }
 
@@ -155,8 +155,7 @@ impl Node {
             listen_pg_addr,
             listen_pg_port,
             scheduling: NodeSchedulingPolicy::Filling,
-            // TODO: we shouldn't really call this Active until we've heartbeated it.
-            availability: NodeAvailability::Active,
+            availability: NodeAvailability::Offline,
             cancel: CancellationToken::new(),
         }
     }
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 26a2707e8d..981ba26cce 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,4 +1,5 @@
 use crate::{node::Node, tenant_state::TenantState};
+use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
@@ -19,15 +20,34 @@ impl From<ScheduleError> for ApiError {
 }
 
 #[derive(Serialize, Eq, PartialEq)]
+pub enum MaySchedule {
+    Yes(UtilizationScore),
+    No,
+}
+
+#[derive(Serialize)]
 struct SchedulerNode {
     /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
     shard_count: usize,
 
     /// Whether this node is currently elegible to have new shards scheduled (this is derived
     /// from a node's availability state and scheduling policy).
-    may_schedule: bool,
+    may_schedule: MaySchedule,
 }
 
+impl PartialEq for SchedulerNode {
+    fn eq(&self, other: &Self) -> bool {
+        let may_schedule_matches = matches!(
+            (&self.may_schedule, &other.may_schedule),
+            (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
+        );
+
+        may_schedule_matches && self.shard_count == other.shard_count
+    }
+}
+
+impl Eq for SchedulerNode {}
+
 /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
 /// on which to run.
 ///
@@ -186,13 +206,15 @@ impl Scheduler {
             return None;
         }
 
+        // TODO: When the utilization score returned by the pageserver becomes meaningful,
+        // schedule based on that instead of the shard count.
         let node = nodes
             .iter()
             .map(|node_id| {
                 let may_schedule = self
                     .nodes
                     .get(node_id)
-                    .map(|n| n.may_schedule)
+                    .map(|n| n.may_schedule != MaySchedule::No)
                     .unwrap_or(false);
                 (*node_id, may_schedule)
             })
@@ -211,7 +233,7 @@ impl Scheduler {
             .nodes
             .iter()
             .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) || !v.may_schedule {
+                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
                     None
                 } else {
                     Some((*k, v.shard_count))
@@ -230,7 +252,7 @@ impl Scheduler {
             for (node_id, node) in &self.nodes {
                 tracing::info!(
                     "Node {node_id}: may_schedule={} shards={}",
-                    node.may_schedule,
+                    node.may_schedule != MaySchedule::No,
                     node.shard_count
                 );
             }
@@ -255,6 +277,7 @@ impl Scheduler {
 pub(crate) mod test_utils {
 
     use crate::node::Node;
+    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
     use std::collections::HashMap;
     use utils::id::NodeId;
     /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -264,13 +287,14 @@ pub(crate) mod test_utils {
         (1..n + 1)
             .map(|i| {
                 (NodeId(i), {
-                    let node = Node::new(
+                    let mut node = Node::new(
                         NodeId(i),
                         format!("httphost-{i}"),
                         80 + i as u16,
                         format!("pghost-{i}"),
                         5432 + i as u16,
                     );
+                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                     assert!(node.is_available());
                     node
                 })
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 1b85081666..ac61209c38 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -16,19 +16,13 @@ use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
 use pageserver_api::{
-    controller_api::{
-        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
-        TenantShardMigrateRequest, TenantShardMigrateResponse,
-    },
-    models::TenantConfigRequest,
-};
-use pageserver_api::{
+    controller_api::UtilizationScore,
     models::{
-        self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
-        TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
-        TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
-        TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
+        self, LocationConfig, LocationConfigListResponse, LocationConfigMode,
+        PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest,
+        TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
+        TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest,
+        TimelineCreateRequest, TimelineInfo,
     },
     shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
     upcall_api::{
@@ -36,6 +30,14 @@ use pageserver_api::{
         ValidateResponse, ValidateResponseTenant,
     },
 };
+use pageserver_api::{
+    controller_api::{
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
+        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
+    },
+    models::TenantConfigRequest,
+};
 use pageserver_client::mgmt_api;
 use tokio::sync::OwnedRwLockWriteGuard;
 use tokio_util::sync::CancellationToken;
@@ -51,6 +53,7 @@ use utils::{
 
 use crate::{
     compute_hook::{self, ComputeHook},
+    heartbeater::{Heartbeater, PageserverState},
     node::{AvailabilityTransition, Node},
     persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
     reconciler::attached_location_conf,
@@ -78,6 +81,8 @@ const INITIAL_GENERATION: Generation = Generation::new(0);
 /// up on unresponsive pageservers and proceed.
 pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
+pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
+
 // Top level state available to all HTTP handlers
 struct ServiceState {
     tenants: BTreeMap<TenantShardId, TenantState>,
@@ -125,6 +130,11 @@ pub struct Config {
     /// (this URL points to the control plane in prod). If this is None, the compute hook will
     /// assume it is running in a test environment and try to update neon_local.
     pub compute_hook_url: Option<String>,
+
+    /// Grace period within which a pageserver does not respond to heartbeats, but is still
+    /// considered active. Once the grace period elapses, the next heartbeat failure will
+    /// mark the pagseserver offline.
+    pub max_unavailable_interval: Duration,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -149,6 +159,8 @@ pub struct Service {
     compute_hook: Arc<ComputeHook>,
     result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
 
+    heartbeater: Heartbeater,
+
     // Channel for background cleanup from failed operations that require cleanup, such as shard split
     abort_tx: tokio::sync::mpsc::UnboundedSender<TenantShardSplitAbort>,
 
@@ -232,8 +244,6 @@ impl Service {
         let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
             HashMap::new();
 
-        let mut nodes_online = HashSet::new();
-
         // Startup reconciliation does I/O to other services: whether they
         // are responsive or not, we should aim to finish within our deadline, because:
         // - If we don't, a k8s readiness hook watching /ready will kill us.
@@ -255,6 +265,9 @@ impl Service {
         let mut cleanup = Vec::new();
 
         let node_listings = self.scan_node_locations(node_scan_deadline).await;
+        // Send initial heartbeat requests to nodes that replied to the location listing above.
+        let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await;
+
         for (node_id, list_response) in node_listings {
             let tenant_shards = list_response.tenant_shards;
             tracing::info!(
@@ -262,7 +275,6 @@ impl Service {
                 tenant_shards.len(),
                 node_id
             );
-            nodes_online.insert(node_id);
 
             for (tenant_shard_id, conf_opt) in tenant_shards {
                 let shard_observations = observed.entry(tenant_shard_id).or_default();
@@ -281,8 +293,10 @@ impl Service {
             // Mark nodes online if they responded to us: nodes are offline by default after a restart.
             let mut new_nodes = (**nodes).clone();
             for (node_id, node) in new_nodes.iter_mut() {
-                if nodes_online.contains(node_id) {
-                    node.set_availability(NodeAvailability::Active);
+                if let Some(utilization) = nodes_online.get(node_id) {
+                    node.set_availability(NodeAvailability::Active(UtilizationScore(
+                        utilization.utilization_score,
+                    )));
                     scheduler.node_upsert(node);
                 }
             }
@@ -371,6 +385,49 @@ impl Service {
         tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
     }
 
+    async fn initial_heartbeat_round<'a>(
+        &self,
+        node_ids: impl Iterator<Item = &'a NodeId>,
+    ) -> HashMap<NodeId, PageserverUtilization> {
+        assert!(!self.startup_complete.is_ready());
+
+        let all_nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+
+        let mut nodes_to_heartbeat = HashMap::new();
+        for node_id in node_ids {
+            match all_nodes.get(node_id) {
+                Some(node) => {
+                    nodes_to_heartbeat.insert(*node_id, node.clone());
+                }
+                None => {
+                    tracing::warn!("Node {node_id} was removed during start-up");
+                }
+            }
+        }
+
+        let res = self
+            .heartbeater
+            .heartbeat(Arc::new(nodes_to_heartbeat))
+            .await;
+
+        let mut online_nodes = HashMap::new();
+        if let Ok(deltas) = res {
+            for (node_id, status) in deltas.0 {
+                match status {
+                    PageserverState::Available { utilization, .. } => {
+                        online_nodes.insert(node_id, utilization);
+                    }
+                    PageserverState::Offline => {}
+                }
+            }
+        }
+
+        online_nodes
+    }
+
     /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline.
     ///
     /// The result includes only nodes which responded within the deadline
@@ -391,7 +448,7 @@ impl Service {
             node_list_futs.push({
                 async move {
                     tracing::info!("Scanning shards on node {node}...");
-                    let timeout = Duration::from_secs(5);
+                    let timeout = Duration::from_secs(1);
                     let response = node
                         .with_client_retries(
                             |client| async move { client.list_location_config().await },
@@ -586,6 +643,56 @@ impl Service {
             }
         }
     }
+    #[instrument(skip_all)]
+    async fn spawn_heartbeat_driver(&self) {
+        self.startup_complete.clone().wait().await;
+
+        const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
+
+        let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL);
+        while !self.cancel.is_cancelled() {
+            tokio::select! {
+              _ = interval.tick() => { }
+              _ = self.cancel.cancelled() => return
+            };
+
+            let nodes = {
+                let locked = self.inner.read().unwrap();
+                locked.nodes.clone()
+            };
+
+            let res = self.heartbeater.heartbeat(nodes).await;
+            if let Ok(deltas) = res {
+                for (node_id, state) in deltas.0 {
+                    let new_availability = match state {
+                        PageserverState::Available { utilization, .. } => NodeAvailability::Active(
+                            UtilizationScore(utilization.utilization_score),
+                        ),
+                        PageserverState::Offline => NodeAvailability::Offline,
+                    };
+                    let res = self
+                        .node_configure(node_id, Some(new_availability), None)
+                        .await;
+
+                    match res {
+                        Ok(()) => {}
+                        Err(ApiError::NotFound(_)) => {
+                            // This should be rare, but legitimate since the heartbeats are done
+                            // on a snapshot of the nodes.
+                            tracing::info!("Node {} was not found after heartbeat round", node_id);
+                        }
+                        Err(err) => {
+                            tracing::error!(
+                                "Failed to update node {} after heartbeat round: {}",
+                                node_id,
+                                err
+                            );
+                        }
+                    }
+                }
+            }
+        }
+    }
 
     /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
     /// was successful, this will update the observed state of the tenant such that subsequent
@@ -836,6 +943,12 @@ impl Service {
 
         let (startup_completion, startup_complete) = utils::completion::channel();
 
+        let cancel = CancellationToken::new();
+        let heartbeater = Heartbeater::new(
+            config.jwt_token.clone(),
+            config.max_unavailable_interval,
+            cancel.clone(),
+        );
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                 nodes, tenants, scheduler,
@@ -844,9 +957,10 @@ impl Service {
             persistence,
             compute_hook: Arc::new(ComputeHook::new(config)),
             result_tx,
+            heartbeater,
             abort_tx,
             startup_complete: startup_complete.clone(),
-            cancel: CancellationToken::new(),
+            cancel,
             gate: Gate::default(),
             tenant_op_locks: Default::default(),
             node_op_locks: Default::default(),
@@ -899,13 +1013,28 @@ impl Service {
                 };
 
                 this.startup_reconcile().await;
-
                 drop(startup_completion);
+            }
+        });
 
+        tokio::task::spawn({
+            let this = this.clone();
+            let startup_complete = startup_complete.clone();
+            async move {
+                startup_complete.wait().await;
                 this.background_reconcile().await;
             }
         });
 
+        tokio::task::spawn({
+            let this = this.clone();
+            let startup_complete = startup_complete.clone();
+            async move {
+                startup_complete.wait().await;
+                this.spawn_heartbeat_driver().await;
+            }
+        });
+
         Ok(this)
     }
 
@@ -964,11 +1093,37 @@ impl Service {
         }
 
         let new_generation = if let Some(req_node_id) = attach_req.node_id {
-            Some(
-                self.persistence
-                    .increment_generation(attach_req.tenant_shard_id, req_node_id)
-                    .await?,
-            )
+            let maybe_tenant_conf = {
+                let locked = self.inner.write().unwrap();
+                locked
+                    .tenants
+                    .get(&attach_req.tenant_shard_id)
+                    .map(|t| t.config.clone())
+            };
+
+            match maybe_tenant_conf {
+                Some(conf) => {
+                    let new_generation = self
+                        .persistence
+                        .increment_generation(attach_req.tenant_shard_id, req_node_id)
+                        .await?;
+
+                    // Persist the placement policy update. This is required
+                    // when we reattaching a detached tenant.
+                    self.persistence
+                        .update_tenant_shard(
+                            attach_req.tenant_shard_id,
+                            PlacementPolicy::Single,
+                            conf,
+                            None,
+                        )
+                        .await?;
+                    Some(new_generation)
+                }
+                None => {
+                    anyhow::bail!("Attach hook handling raced with tenant removal")
+                }
+            }
         } else {
             self.persistence.detach(attach_req.tenant_shard_id).await?;
             None
@@ -983,6 +1138,7 @@ impl Service {
 
         if let Some(new_generation) = new_generation {
             tenant_state.generation = Some(new_generation);
+            tenant_state.policy = PlacementPolicy::Single;
         } else {
             // This is a detach notification.  We must update placement policy to avoid re-attaching
             // during background scheduling/reconciliation, or during storage controller restart.
@@ -1085,7 +1241,7 @@ impl Service {
         // This Node is a mutable local copy: we will set it active so that we can use its
         // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
         // later.
-        node.set_availability(NodeAvailability::Active);
+        node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
 
         let configs = match node
             .with_client_retries(
@@ -1196,7 +1352,7 @@ impl Service {
 
         // Apply the updated generation to our in-memory state
         let mut locked = self.inner.write().unwrap();
-        let (nodes, tenants, _scheduler) = locked.parts_mut();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
 
         let mut response = ReAttachResponse {
             tenants: Vec::new(),
@@ -1271,7 +1427,8 @@ impl Service {
             if !node.is_available() {
                 let mut new_nodes = (**nodes).clone();
                 if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
-                    node.set_availability(NodeAvailability::Active);
+                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+                    scheduler.node_upsert(node);
                 }
                 let new_nodes = Arc::new(new_nodes);
                 *nodes = new_nodes;
@@ -3328,16 +3485,16 @@ impl Service {
 
     pub(crate) async fn node_configure(
         &self,
-        config_req: NodeConfigureRequest,
+        node_id: NodeId,
+        availability: Option<NodeAvailability>,
+        scheduling: Option<NodeSchedulingPolicy>,
     ) -> Result<(), ApiError> {
-        let _node_lock = self.node_op_locks.exclusive(config_req.node_id).await;
+        let _node_lock = self.node_op_locks.exclusive(node_id).await;
 
-        if let Some(scheduling) = config_req.scheduling {
+        if let Some(scheduling) = scheduling {
             // Scheduling is a persistent part of Node: we must write updates to the database before
             // applying them in memory
-            self.persistence
-                .update_node(config_req.node_id, scheduling)
-                .await?;
+            self.persistence.update_node(node_id, scheduling).await?;
         }
 
         // If we're activating a node, then before setting it active we must reconcile any shard locations
@@ -3346,12 +3503,12 @@ impl Service {
         //
         // The transition we calculate here remains valid later in the function because we hold the op lock on the node:
         // nothing else can mutate its availability while we run.
-        let availability_transition = if let Some(input_availability) = config_req.availability {
+        let availability_transition = if let Some(input_availability) = availability {
             let (activate_node, availability_transition) = {
                 let locked = self.inner.read().unwrap();
-                let Some(node) = locked.nodes.get(&config_req.node_id) else {
+                let Some(node) = locked.nodes.get(&node_id) else {
                     return Err(ApiError::NotFound(
-                        anyhow::anyhow!("Node {} not registered", config_req.node_id).into(),
+                        anyhow::anyhow!("Node {} not registered", node_id).into(),
                     ));
                 };
 
@@ -3376,17 +3533,17 @@ impl Service {
 
         let mut new_nodes = (**nodes).clone();
 
-        let Some(node) = new_nodes.get_mut(&config_req.node_id) else {
+        let Some(node) = new_nodes.get_mut(&node_id) else {
             return Err(ApiError::NotFound(
                 anyhow::anyhow!("Node not registered").into(),
             ));
         };
 
-        if let Some(availability) = &config_req.availability {
+        if let Some(availability) = &availability {
             node.set_availability(*availability);
         }
 
-        if let Some(scheduling) = config_req.scheduling {
+        if let Some(scheduling) = scheduling {
             node.set_scheduling(scheduling);
 
             // TODO: once we have a background scheduling ticker for fill/drain, kick it
@@ -3401,25 +3558,23 @@ impl Service {
         // Modify scheduling state for any Tenants that are affected by a change in the node's availability state.
         match availability_transition {
             AvailabilityTransition::ToOffline => {
-                tracing::info!("Node {} transition to offline", config_req.node_id);
+                tracing::info!("Node {} transition to offline", node_id);
                 let mut tenants_affected: usize = 0;
                 for (tenant_shard_id, tenant_state) in tenants {
-                    if let Some(observed_loc) =
-                        tenant_state.observed.locations.get_mut(&config_req.node_id)
-                    {
+                    if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
                         // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
                         // not assume our knowledge of the node's configuration is accurate until it comes back online
                         observed_loc.conf = None;
                     }
 
-                    if tenant_state.intent.demote_attached(config_req.node_id) {
+                    if tenant_state.intent.demote_attached(node_id) {
                         tenant_state.sequence = tenant_state.sequence.next();
                         match tenant_state.schedule(scheduler) {
                             Err(e) => {
                                 // It is possible that some tenants will become unschedulable when too many pageservers
                                 // go offline: in this case there isn't much we can do other than make the issue observable.
                                 // TODO: give TenantState a scheduling error attribute to be queried later.
-                                tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
+                                tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
                             }
                             Ok(()) => {
                                 if self
@@ -3435,17 +3590,15 @@ impl Service {
                 tracing::info!(
                     "Launched {} reconciler tasks for tenants affected by node {} going offline",
                     tenants_affected,
-                    config_req.node_id
+                    node_id
                 )
             }
             AvailabilityTransition::ToActive => {
-                tracing::info!("Node {} transition to active", config_req.node_id);
+                tracing::info!("Node {} transition to active", node_id);
                 // When a node comes back online, we must reconcile any tenant that has a None observed
                 // location on the node.
                 for tenant_state in locked.tenants.values_mut() {
-                    if let Some(observed_loc) =
-                        tenant_state.observed.locations.get_mut(&config_req.node_id)
-                    {
+                    if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
                         if observed_loc.conf.is_none() {
                             self.maybe_reconcile_shard(tenant_state, &new_nodes);
                         }
@@ -3455,7 +3608,7 @@ impl Service {
                 // TODO: in the background, we should balance work back onto this pageserver
             }
             AvailabilityTransition::Unchanged => {
-                tracing::info!("Node {} no change during config", config_req.node_id);
+                tracing::info!("Node {} no change during config", node_id);
             }
         }
 
@@ -3534,17 +3687,23 @@ impl Service {
         )
     }
 
-    /// Check all tenants for pending reconciliation work, and reconcile those in need
+    /// Check all tenants for pending reconciliation work, and reconcile those in need.
+    /// Additionally, reschedule tenants that require it.
     ///
     /// Returns how many reconciliation tasks were started
     fn reconcile_all(&self) -> usize {
         let mut locked = self.inner.write().unwrap();
-        let pageservers = locked.nodes.clone();
-        locked
-            .tenants
-            .iter_mut()
-            .filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers))
-            .count()
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
+        let pageservers = nodes.clone();
+
+        let mut reconciles_spawned = 0;
+        for (_tenant_shard_id, shard) in tenants.iter_mut() {
+            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
+                reconciles_spawned += 1;
+            }
+        }
+
+        reconciles_spawned
     }
 
     pub async fn shutdown(&self) {
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index d7673f1b26..18014adba4 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -38,6 +38,9 @@ const COMMAND: &str = "storage_controller";
 
 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 
+// Use a shorter pageserver unavailability interval than the default to speed up tests.
+const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
@@ -269,6 +272,8 @@ impl StorageController {
         // Run migrations on every startup, in case something changed.
         let database_url = self.setup_database().await?;
 
+        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
+
         let mut args = vec![
             "-l",
             &self.listen,
@@ -276,6 +281,8 @@ impl StorageController {
             self.path.as_ref(),
             "--database-url",
             &database_url,
+            "--max-unavailable-interval",
+            &max_unavailable.to_string(),
         ]
         .into_iter()
         .map(|s| s.to_string())
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index c172354e9f..6053e8b8ed 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -35,7 +35,7 @@ pub struct NodeRegisterRequest {
 pub struct NodeConfigureRequest {
     pub node_id: NodeId,
 
-    pub availability: Option<NodeAvailability>,
+    pub availability: Option<NodeAvailabilityWrapper>,
     pub scheduling: Option<NodeSchedulingPolicy>,
 }
 
@@ -66,22 +66,76 @@ pub struct TenantShardMigrateRequest {
     pub node_id: NodeId,
 }
 
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+/// Utilisation score indicating how good a candidate a pageserver
+/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
+/// Lower values are better.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+pub struct UtilizationScore(pub u64);
+
+impl UtilizationScore {
+    pub fn worst() -> Self {
+        UtilizationScore(u64::MAX)
+    }
+}
+
+#[derive(Serialize, Clone, Copy)]
+#[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
     // Normal, happy state
-    Active,
+    Active(UtilizationScore),
     // Offline: Tenants shouldn't try to attach here, but they may assume that their
     // secondary locations on this node still exist.  Newly added nodes are in this
     // state until we successfully contact them.
     Offline,
 }
 
+impl PartialEq for NodeAvailability {
+    fn eq(&self, other: &Self) -> bool {
+        use NodeAvailability::*;
+        matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
+    }
+}
+
+impl Eq for NodeAvailability {}
+
+// This wrapper provides serde functionality and it should only be used to
+// communicate with external callers which don't know or care about the
+// utilisation score of the pageserver it is targeting.
+#[derive(Serialize, Deserialize, Clone)]
+pub enum NodeAvailabilityWrapper {
+    Active,
+    Offline,
+}
+
+impl From<NodeAvailabilityWrapper> for NodeAvailability {
+    fn from(val: NodeAvailabilityWrapper) -> Self {
+        match val {
+            // Assume the worst utilisation score to begin with. It will later be updated by
+            // the heartbeats.
+            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
+            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
+        }
+    }
+}
+
+impl From<NodeAvailability> for NodeAvailabilityWrapper {
+    fn from(val: NodeAvailability) -> Self {
+        match val {
+            NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
+            NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
+        }
+    }
+}
+
 impl FromStr for NodeAvailability {
     type Err = anyhow::Error;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         match s {
-            "active" => Ok(Self::Active),
+            // This is used when parsing node configuration requests from neon-local.
+            // Assume the worst possible utilisation score
+            // and let it get updated via the heartbeats.
+            "active" => Ok(Self::Active(UtilizationScore::worst())),
             "offline" => Ok(Self::Offline),
             _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
         }
diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index 7195a12395..f5984dff5d 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -7,7 +7,7 @@ use std::time::SystemTime;
 ///
 /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
 /// not handle full u64 values properly.
-#[derive(serde::Serialize, Debug)]
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 pub struct PageserverUtilization {
     /// Used disk space
     #[serde(serialize_with = "ser_saturating_u63")]
@@ -21,7 +21,10 @@ pub struct PageserverUtilization {
     /// When was this snapshot captured, pageserver local time.
     ///
     /// Use millis to give confidence that the value is regenerated often enough.
-    #[serde(serialize_with = "ser_rfc3339_millis")]
+    #[serde(
+        serialize_with = "ser_rfc3339_millis",
+        deserialize_with = "deser_rfc3339_millis"
+    )]
     pub captured_at: SystemTime,
 }
 
@@ -32,6 +35,14 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
     serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
 }
 
+fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
+    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
+}
+
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of newtype, use this because a newtype would get require handling deserializing values
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 2f22ebd54d..ed9f633253 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -416,4 +416,13 @@ impl Client {
             .await
             .map_err(Error::ReceiveBody)
     }
+
+    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
+        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
+        self.get(uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index fc67f4cf8f..d6fe9f6055 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2053,6 +2053,10 @@ async fn get_utilization(
     r: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
+    fail::fail_point!("get-utilization-http-handler", |_| {
+        Err(ApiError::ResourceUnavailable("failpoint".into()))
+    });
+
     // this probably could be completely public, but lets make that change later.
     check_permission(&r, None)?;
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 16ebc19698..70d3076371 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2088,6 +2088,14 @@ class NeonStorageController(MetricsGetter):
         )
         return response.json()
 
+    def tenant_list(self):
+        response = self.request(
+            "GET",
+            f"{self.env.storage_controller_api}/debug/v1/tenant",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
     def node_configure(self, node_id, body: dict[str, Any]):
         log.info(f"node_configure({node_id}, {body})")
         body["node_id"] = node_id
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 3ca13a904d..56b4548b64 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -209,10 +209,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.node_register(env.pageserver)
 
     env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
+    env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})
 
     env.neon_cli.create_tenant(
         tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
     )
+
     generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
     def parse_generation_suffix(key):
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 7a0707b564..27ea425bb1 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -769,3 +769,172 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
     assert "pitr_interval" not in readback_ps.tenant_specific_overrides
 
     env.storage_controller.consistency_check()
+
+
+class Failure:
+    pageserver_id: int
+
+    def apply(self, env: NeonEnv):
+        raise NotImplementedError()
+
+    def clear(self, env: NeonEnv):
+        raise NotImplementedError()
+
+
+class NodeStop(Failure):
+    def __init__(self, pageserver_id, immediate):
+        self.pageserver_id = pageserver_id
+        self.immediate = immediate
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.stop(immediate=self.immediate)
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.start()
+
+
+class PageserverFailpoint(Failure):
+    def __init__(self, failpoint, pageserver_id):
+        self.failpoint = failpoint
+        self.pageserver_id = pageserver_id
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.http_client().configure_failpoints((self.failpoint, "off"))
+
+
+def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]:
+    tenants = env.storage_controller.tenant_list()
+
+    node_to_tenants: dict[int, list[TenantId]] = {}
+    for t in tenants:
+        for node_id, loc_state in t["observed"]["locations"].items():
+            if (
+                loc_state is not None
+                and "conf" in loc_state
+                and loc_state["conf"] is not None
+                and loc_state["conf"]["mode"] == "AttachedSingle"
+            ):
+                crnt = node_to_tenants.get(int(node_id), [])
+                crnt.append(TenantId(t["tenant_shard_id"]))
+                node_to_tenants[int(node_id)] = crnt
+
+    return node_to_tenants
+
+
+@pytest.mark.parametrize(
+    "failure",
+    [
+        NodeStop(pageserver_id=1, immediate=False),
+        NodeStop(pageserver_id=1, immediate=True),
+        PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"),
+    ],
+)
+def test_sharding_service_heartbeats(
+    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure
+):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    # Initially we have two online pageservers
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 2
+    assert all([n["availability"] == "Active" for n in nodes])
+
+    # ... then we create two tenants and write some data into them
+    def create_tenant(tid: TenantId):
+        env.storage_controller.tenant_create(tid)
+
+        branch_name = "main"
+        env.neon_cli.create_timeline(
+            branch_name,
+            tenant_id=tid,
+        )
+
+        with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
+
+    tenant_ids = [TenantId.generate(), TenantId.generate()]
+    for tid in tenant_ids:
+        create_tenant(tid)
+
+    # ... expecting that each tenant will be placed on a different node
+    def tenants_placed():
+        node_to_tenants = build_node_to_tenants_map(env)
+        log.info(f"{node_to_tenants=}")
+
+        # Check that all the tenants have been attached
+        assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids)
+        # Check that each node got one tenant
+        assert all((len(ts) == 1 for ts in node_to_tenants.values()))
+
+    wait_until(10, 1, tenants_placed)
+
+    # ... then we apply the failure
+    offline_node_id = failure.pageserver_id
+    online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop()
+    env.get_pageserver(offline_node_id).allowed_errors.append(
+        # In the case of the failpoint failure, the impacted pageserver
+        # still believes it has the tenant attached since location
+        # config calls into it will fail due to being marked offline.
+        ".*Dropped remote consistent LSN updates.*",
+    )
+
+    failure.apply(env)
+
+    # ... expecting the heartbeats to mark it offline
+    def node_offline():
+        nodes = env.storage_controller.node_list()
+        log.info(f"{nodes=}")
+        target = next(n for n in nodes if n["id"] == offline_node_id)
+        assert target["availability"] == "Offline"
+
+    # A node is considered offline if the last successful heartbeat
+    # was more than 10 seconds ago (hardcoded in the storage controller).
+    wait_until(20, 1, node_offline)
+
+    # .. expecting the tenant on the offline node to be migrated
+    def tenant_migrated():
+        node_to_tenants = build_node_to_tenants_map(env)
+        log.info(f"{node_to_tenants=}")
+        assert set(node_to_tenants[online_node_id]) == set(tenant_ids)
+
+    wait_until(10, 1, tenant_migrated)
+
+    # ... then we clear the failure
+    failure.clear(env)
+
+    # ... expecting the offline node to become active again
+    def node_online():
+        nodes = env.storage_controller.node_list()
+        target = next(n for n in nodes if n["id"] == offline_node_id)
+        assert target["availability"] == "Active"
+
+    wait_until(10, 1, node_online)
+
+    time.sleep(5)
+
+    # ... then we create a new tenant
+    tid = TenantId.generate()
+    env.storage_controller.tenant_create(tid)
+
+    # ... expecting it to be placed on the node that just came back online
+    tenants = env.storage_controller.tenant_list()
+    newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid))
+    locations = list(newest_tenant["observed"]["locations"].keys())
+    locations = [int(node_id) for node_id in locations]
+    assert locations == [offline_node_id]
+
+    # ... expecting the storage controller to reach a consistent state
+    def storage_controller_consistent():
+        env.storage_controller.consistency_check()
+
+    wait_until(10, 1, storage_controller_consistent)

From 3d8830ac357a0b8d2084f1dfd6b0393ca912e74e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 14 Mar 2024 16:47:32 +0000
Subject: [PATCH 09/43] test_runner: re-enable large slru benchmark (#7125)

Previously disabled due to
https://github.com/neondatabase/neon/issues/7006.
---
 .../pageserver/pagebench/test_large_slru_basebackup.py       | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index c98fa44b1a..324ef0d516 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -1,6 +1,5 @@
 import asyncio
 import json
-import os
 from pathlib import Path
 from typing import Any, Dict, Tuple
 
@@ -20,10 +19,6 @@ from performance.pageserver.util import (
 @pytest.mark.parametrize("n_tenants", [10])
 @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
 @pytest.mark.timeout(1000)
-@pytest.mark.skipif(
-    os.getenv("CI", "false") == "true",
-    reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006",
-)
 def test_basebackup_with_high_slru_count(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,

From 678ed39de2446bb02572bb7d9439f4a2fa31aeef Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 14 Mar 2024 16:48:38 +0000
Subject: [PATCH 10/43] storage controller: validate DNS of registering nodes
 (#7101)

A node with a bad DNS configuration can register itself with the storage
controller, and the controller will try and schedule work onto the node,
but never succeed because it can't reach the node.

The DNS case is a special case of asymmetric network issues. The general
case isn't covered here -- but might make sense to tighten up after
#6844 merges -- then we can avoid assuming a node is immediately
available in re_attach.
---
 .../attachment_service/src/service.rs         | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index ac61209c38..8439ea5567 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -3453,6 +3453,30 @@ impl Service {
             }
         }
 
+        // We do not require that a node is actually online when registered (it will start life
+        // with it's  availability set to Offline), but we _do_ require that its DNS record exists. We're
+        // therefore not immune to asymmetric L3 connectivity issues, but we are protected against nodes
+        // that register themselves with a broken DNS config.  We check only the HTTP hostname, because
+        // the postgres hostname might only be resolvable to clients (e.g. if we're on a different VPC than clients).
+        if tokio::net::lookup_host(format!(
+            "{}:{}",
+            register_req.listen_http_addr, register_req.listen_http_port
+        ))
+        .await
+        .is_err()
+        {
+            // If we have a transient DNS issue, it's up to the caller to retry their registration.  Because
+            // we can't robustly distinguish between an intermittent issue and a totally bogus DNS situation,
+            // we return a soft 503 error, to encourage callers to retry past transient issues.
+            return Err(ApiError::ResourceUnavailable(
+                format!(
+                    "Node {} tried to register with unknown DNS name '{}'",
+                    register_req.node_id, register_req.listen_http_addr
+                )
+                .into(),
+            ));
+        }
+
         // Ordering: we must persist the new node _before_ adding it to in-memory state.
         // This ensures that before we use it for anything or expose it via any external
         // API, it is guaranteed to be available after a restart.

From 58ef78cf4174fc1802e365f2d164976ce77cde7e Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 14 Mar 2024 20:49:42 +0200
Subject: [PATCH 11/43] doc(README): note cargo-nextest usage (#7122)

We have been using #5681 for quite some time, and at least since #6931
the tests have assumed `cargo-nextest` to work around our use of global
statics. Unlike the `cargo test`, the `cargo nextest run` runs each test
as a separate process that can be timeouted.

Add a mention of using `cargo-nextest` in the top-level README.md.
Sub-crates can still declare they support `cargo test`, like
`compute_tools/README.md` does.
---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index c44ae695d6..00a90f4483 100644
--- a/README.md
+++ b/README.md
@@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop
 
 ## Running tests
 
+### Rust unit tests
+
+We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
+Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
+You can install `cargo-nextest` with `cargo install cargo-nextest`.
+
+### Integration tests
+
 Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
 
 ```sh

From 76c44dc140a61a359a1ce42f988074fdda910c3f Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 14 Mar 2024 15:45:38 -0400
Subject: [PATCH 12/43] spec: disable neon extension auto upgrade (#7128)

This pull request disables neon extension auto upgrade to help the next
compute image upgrade smooth.

## Summary of changes

We have two places to auto-upgrade neon extension: during compute spec
update, and when the compute node starts. The compute spec update logic
is always there, and the compute node start logic is added in
https://github.com/neondatabase/neon/pull/7029. In this pull request, we
disable both of them, so that we can still roll back to an older version
of compute before figuring out the best way of extension
upgrade-downgrade. https://github.com/neondatabase/neon/issues/6936

We will enable auto-upgrade in the next release following this release.

There are no other extension upgrades from release 4917 and therefore
after this pull request, it would be safe to revert to release 4917.

Impact:

* Project created after unpinning the compute image -> if we need to
roll back, **they will stuck**, because the default neon extension
version is 1.3. Need to manually pin the compute image version if such
things happen.
* Projects already stuck on staging due to not downgradeable -> I don't
know their current status, maybe they are already running the latest
compute image?
* Other projects -> can be rolled back to release 4917.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/spec.rs | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index ba3a84cda8..3b596a88ff 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
     // which may happen in two cases:
     // - extension was just installed
     // - extension was already installed and is up to date
-    let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension version with query: {}", query);
-    client.simple_query(query)?;
+    // DISABLED due to compute node unpinning epic
+    // let query = "ALTER EXTENSION neon UPDATE";
+    // info!("update neon extension version with query: {}", query);
+    // client.simple_query(query)?;
 
     Ok(())
 }
 
 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
-    info!("handle neon extension upgrade");
-    let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension version with query: {}", query);
-    client.simple_query(query)?;
+pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
+    info!("handle neon extension upgrade (not really)");
+    // DISABLED due to compute node unpinning epic
+    // let query = "ALTER EXTENSION neon UPDATE";
+    // info!("update neon extension version with query: {}", query);
+    // client.simple_query(query)?;
 
     Ok(())
 }

From 49bc734e02c4c89911168d8a6ecb823afd676967 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 15 Mar 2024 09:21:48 +0000
Subject: [PATCH 13/43] proxy: add websocket regression tests (#7121)

## Problem

We have no regression tests for websocket flow

## Summary of changes

Add a hacky implementation of the postgres protocol over websockets just
to verify the protocol behaviour does not regress over time.
---
 poetry.lock                                  |  94 +++++++--
 pyproject.toml                               |   1 +
 test_runner/regress/test_proxy_websockets.py | 189 +++++++++++++++++++
 3 files changed, 272 insertions(+), 12 deletions(-)
 create mode 100644 test_runner/regress/test_proxy_websockets.py

diff --git a/poetry.lock b/poetry.lock
index 832d7c4334..7b49daf42a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2182,7 +2182,6 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2529,6 +2528,87 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"]
 optional = ["python-socks", "wsaccel"]
 test = ["websockets"]
 
+[[package]]
+name = "websockets"
+version = "12.0"
+description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"},
+    {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"},
+    {file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"},
+    {file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"},
+    {file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"},
+    {file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"},
+    {file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"},
+    {file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"},
+    {file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"},
+    {file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"},
+    {file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"},
+    {file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"},
+    {file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"},
+    {file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"},
+    {file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"},
+    {file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"},
+    {file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"},
+    {file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"},
+    {file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"},
+    {file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"},
+    {file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"},
+    {file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"},
+    {file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"},
+    {file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"},
+    {file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"},
+    {file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"},
+    {file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"},
+    {file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"},
+    {file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"},
+    {file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"},
+    {file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"},
+    {file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"},
+    {file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"},
+    {file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"},
+    {file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"},
+    {file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"},
+    {file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"},
+    {file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"},
+    {file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"},
+    {file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"},
+    {file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"},
+    {file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"},
+    {file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"},
+    {file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"},
+    {file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"},
+    {file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"},
+    {file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"},
+    {file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"},
+    {file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"},
+    {file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"},
+    {file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"},
+    {file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"},
+    {file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"},
+    {file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"},
+    {file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"},
+    {file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"},
+    {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"},
+    {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"},
+    {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"},
+    {file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"},
+    {file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"},
+    {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"},
+    {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"},
+    {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"},
+    {file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"},
+    {file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"},
+    {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"},
+    {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"},
+    {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"},
+    {file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"},
+    {file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"},
+    {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"},
+]
+
 [[package]]
 name = "werkzeug"
 version = "3.0.1"
@@ -2572,16 +2652,6 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2819,4 +2889,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9"
+content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41"
diff --git a/pyproject.toml b/pyproject.toml
index 6dff112a5e..e347d47cbf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
 httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
+websockets = "^12.0"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py
new file mode 100644
index 0000000000..6d1cb9765a
--- /dev/null
+++ b/test_runner/regress/test_proxy_websockets.py
@@ -0,0 +1,189 @@
+import ssl
+
+import pytest
+import websockets
+from fixtures.neon_fixtures import NeonProxy
+
+
+@pytest.mark.asyncio
+async def test_websockets(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user ws_auth with password 'ws' superuser")
+
+    user = "ws_auth"
+    password = "ws"
+
+    version = b"\x00\x03\x00\x00"
+    params = {
+        "user": user,
+        "database": "postgres",
+        "client_encoding": "UTF8",
+    }
+
+    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+    ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt"))
+
+    async with websockets.connect(
+        f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+        ssl=ssl_context,
+    ) as websocket:
+        startup_message = bytearray(version)
+        for key, value in params.items():
+            startup_message.extend(key.encode("ascii"))
+            startup_message.extend(b"\0")
+            startup_message.extend(value.encode("ascii"))
+            startup_message.extend(b"\0")
+        startup_message.extend(b"\0")
+        length = (4 + len(startup_message)).to_bytes(4, byteorder="big")
+
+        await websocket.send([length, startup_message])
+
+        startup_response = await websocket.recv()
+        assert isinstance(startup_response, bytes)
+        assert startup_response[0:1] == b"R", "should be authentication message"
+        assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
+        assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext"
+
+        auth_message = password.encode("utf-8") + b"\0"
+        length = (4 + len(auth_message)).to_bytes(4, byteorder="big")
+        await websocket.send([b"p", length, auth_message])
+
+        auth_response = await websocket.recv()
+        assert isinstance(auth_response, bytes)
+        assert auth_response[0:1] == b"R", "should be authentication message"
+        assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
+        assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated"
+
+        query_message = "SELECT 1".encode("utf-8") + b"\0"
+        length = (4 + len(query_message)).to_bytes(4, byteorder="big")
+        await websocket.send([b"Q", length, query_message])
+
+        query_response = await websocket.recv()
+        assert isinstance(query_response, bytes)
+        # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00'
+        # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011'
+        # 'C\x00\x00\x00\rSELECT 1\x00'
+        # 'Z\x00\x00\x00\x05I'
+
+        assert query_response[0:1] == b"T", "should be row description message"
+        row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
+        row_description, query_response = (
+            query_response[:row_description_len],
+            query_response[row_description_len:],
+        )
+        assert row_description[5:7] == b"\x00\x01", "should have 1 column"
+        assert row_description[7:16] == b"?column?\0", "column should be named ?column?"
+        assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4"
+
+        assert query_response[0:1] == b"D", "should be data row message"
+        data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
+        data_row, query_response = query_response[:data_row_len], query_response[data_row_len:]
+        assert (
+            data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011"
+        ), "should contain 1 column with text value 1"
+
+        assert query_response[0:1] == b"C", "should be command complete message"
+        command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
+        command_complete, query_response = (
+            query_response[:command_complete_len],
+            query_response[command_complete_len:],
+        )
+        assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0"
+
+        assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)"
+
+        # close
+        await websocket.send(b"X\x00\x00\x00\x04")
+        await websocket.wait_closed()
+
+
+@pytest.mark.asyncio
+async def test_websockets_pipelined(static_proxy: NeonProxy):
+    """
+    Test whether we can send the startup + auth + query all in one go
+    """
+
+    static_proxy.safe_psql("create user ws_auth with password 'ws' superuser")
+
+    user = "ws_auth"
+    password = "ws"
+
+    version = b"\x00\x03\x00\x00"
+    params = {
+        "user": user,
+        "database": "postgres",
+        "client_encoding": "UTF8",
+    }
+
+    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+    ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt"))
+
+    async with websockets.connect(
+        f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+        ssl=ssl_context,
+    ) as websocket:
+        startup_message = bytearray(version)
+        for key, value in params.items():
+            startup_message.extend(key.encode("ascii"))
+            startup_message.extend(b"\0")
+            startup_message.extend(value.encode("ascii"))
+            startup_message.extend(b"\0")
+        startup_message.extend(b"\0")
+        length0 = (4 + len(startup_message)).to_bytes(4, byteorder="big")
+
+        auth_message = password.encode("utf-8") + b"\0"
+        length1 = (4 + len(auth_message)).to_bytes(4, byteorder="big")
+        query_message = "SELECT 1".encode("utf-8") + b"\0"
+        length2 = (4 + len(query_message)).to_bytes(4, byteorder="big")
+        await websocket.send(
+            [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message]
+        )
+
+        startup_response = await websocket.recv()
+        assert isinstance(startup_response, bytes)
+        assert startup_response[0:1] == b"R", "should be authentication message"
+        assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
+        assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext"
+
+        auth_response = await websocket.recv()
+        assert isinstance(auth_response, bytes)
+        assert auth_response[0:1] == b"R", "should be authentication message"
+        assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
+        assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated"
+
+        query_response = await websocket.recv()
+        assert isinstance(query_response, bytes)
+        # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00'
+        # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011'
+        # 'C\x00\x00\x00\rSELECT 1\x00'
+        # 'Z\x00\x00\x00\x05I'
+
+        assert query_response[0:1] == b"T", "should be row description message"
+        row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
+        row_description, query_response = (
+            query_response[:row_description_len],
+            query_response[row_description_len:],
+        )
+        assert row_description[5:7] == b"\x00\x01", "should have 1 column"
+        assert row_description[7:16] == b"?column?\0", "column should be named ?column?"
+        assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4"
+
+        assert query_response[0:1] == b"D", "should be data row message"
+        data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
+        data_row, query_response = query_response[:data_row_len], query_response[data_row_len:]
+        assert (
+            data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011"
+        ), "should contain 1 column with text value 1"
+
+        assert query_response[0:1] == b"C", "should be command complete message"
+        command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
+        command_complete, query_response = (
+            query_response[:command_complete_len],
+            query_response[command_complete_len:],
+        )
+        assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0"
+
+        assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)"
+
+        # close
+        await websocket.send(b"X\x00\x00\x00\x04")
+        await websocket.wait_closed()

From 46098ea0ea8b0a1fa2f49df76229291bdbae43b0 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Fri, 15 Mar 2024 16:13:15 +0500
Subject: [PATCH 14/43] proxy: add more missing warm logging (#7133)

## Problem

There is one more missing thing about cached connections for
`cold_start_info`.

## Summary of changes

Fix and add comments.
---
 proxy/src/console/provider/neon.rs | 3 +++
 proxy/src/serverless/backend.rs    | 1 +
 2 files changed, 4 insertions(+)

diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 3b2e0cc204..b36663518d 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -250,6 +250,8 @@ impl super::Api for Api {
         // which means that we might cache it to reduce the load and latency.
         if let Some(cached) = self.caches.node_info.get(&key) {
             info!(key = &*key, "found cached compute node info");
+            info!("cold_start_info=warm");
+            ctx.set_cold_start_info(ColdStartInfo::Warm);
             return Ok(cached);
         }
 
@@ -260,6 +262,7 @@ impl super::Api for Api {
         if permit.should_check_cache() {
             if let Some(cached) = self.caches.node_info.get(&key) {
                 info!(key = &*key, "found cached compute node info");
+                info!("cold_start_info=warm");
                 ctx.set_cold_start_info(ColdStartInfo::Warm);
                 return Ok(cached);
             }
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 9b3ca8d447..29ef641265 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -84,6 +84,7 @@ impl PoolingBackend {
         };
 
         if let Some(client) = maybe_client {
+            info!("cold_start_info=warm");
             ctx.set_cold_start_info(ColdStartInfo::Warm);
             return Ok(client);
         }

From 23416cc3580ecb2d05333e3fc9431aa27cc663a7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 11:14:25 +0000
Subject: [PATCH 15/43] docs: sharding phase 1 RFC (#5432)

We need to shard our Tenants to support larger databases without those
large databases dominating our pageservers and/or requiring dedicated
pageservers.

This RFC aims to define an initial capability that will permit creating
large-capacity databases using a static configuration
defined at time of Tenant creation.

Online re-sharding is deferred as future work, as is offloading layers
for historical reads. However, both of these capabilities would be
implementable without further changes to the control plane or compute:
this RFC aims to define the cross-component work needed to bootstrap
sharding end-to-end.
---
 docs/rfcs/031-sharding-static.md | 408 +++++++++++++++++++++++++++++++
 1 file changed, 408 insertions(+)
 create mode 100644 docs/rfcs/031-sharding-static.md

diff --git a/docs/rfcs/031-sharding-static.md b/docs/rfcs/031-sharding-static.md
new file mode 100644
index 0000000000..fe009b8660
--- /dev/null
+++ b/docs/rfcs/031-sharding-static.md
@@ -0,0 +1,408 @@
+# Sharding Phase 1: Static Key-space Sharding
+
+## Summary
+
+To enable databases with sizes approaching the capacity of a pageserver's disk,
+it is necessary to break up the storage for the database, or _shard_ it.
+
+Sharding in general is a complex area. This RFC aims to define an initial
+capability that will permit creating large-capacity databases using a static configuration
+defined at time of Tenant creation.
+
+## Motivation
+
+Currently, all data for a Tenant, including all its timelines, is stored on a single
+pageserver. The local storage required may be several times larger than the actual
+database size, due to LSM write inflation.
+
+If a database is larger than what one pageserver can hold, then it becomes impossible
+for the pageserver to hold it in local storage, as it must do to provide service to
+clients.
+
+### Prior art
+
+In Neon:
+
+- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
+- Layer File SPreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
+- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
+
+Prior art in other distributed systems is too broad to capture here: pretty much
+any scale out storage system does something like this.
+
+## Requirements
+
+- Enable creating a large (for example, 16TiB) database without requiring dedicated
+  pageserver nodes.
+- Share read/write bandwidth costs for large databases across pageservers, as well
+  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
+  that disrupt service to other tenants.
+- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
+  does not write out a single contiguous ranges of page numbers.
+
+_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
+that a user might create on a current-gen enterprise SSD should also work well on
+Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
+pageserver backend is not the limiting factor in the database size_.
+
+## Non Goals
+
+- Independently distributing timelines within the same tenant. If a tenant has many
+  timelines, then sharding may be a less efficient mechanism for distributing load than
+  sharing out timelines between pageservers.
+- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
+  based on the idea that separate mechanisms will make sense for each dimension.
+
+## Impacted Components
+
+pageserver, control plane, postgres/smgr
+
+## Terminology
+
+**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
+the page number is the key in that store. `Key` is a literal data type in existing code.
+
+**LSN dimension**: this just means the range of LSNs (history), when talking about the range
+of keys and LSNs as a two dimensional space.
+
+## Implementation
+
+### Key sharding vs. LSN sharding
+
+When we think of sharding across the two dimensional key/lsn space, this is an
+opportunity to think about how the two dimensions differ:
+
+- Sharding the key space distributes the _write_ workload of ingesting data
+  and compacting. This work must be carefully managed so that exactly one
+  node owns a given key.
+- Sharding the LSN space distributes the _historical read_ workload. This work
+  can be done by anyone without any special coordination, as long as they can
+  see the remote index and layers.
+
+The key sharding is the harder part, and also the more urgent one, to support larger
+capacity databases. Because distributing historical LSN read work is a relatively
+simpler problem that most users don't have, we defer it to future work. It is anticipated
+that some quite simple P2P offload model will enable distributing work for historical
+reads: a node which is low on space can call out to peer to ask it to download and
+serve reads from a historical layer.
+
+### Key mapping scheme
+
+Having decided to focus on key sharding, we must next decide how we will map
+keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
+between data locality and avoiding entire large relations mapping to the same shard.
+
+We will define two spaces:
+
+- Key space: unsigned integer
+- Shard space: integer from 0 to N-1, where we have N shards.
+
+### Key -> Shard mapping
+
+Keys are currently defined in the pageserver's getpage@lsn interface as follows:
+
+```
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+```
+
+_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
+shards. For distribution purposes, we only care about user data keys_
+
+The properties we want from our Key->Shard mapping are:
+
+- Locality in `blknum`, such that adjacent `blknum` will usually map to
+  the same stripe and consequently land on the same shard, even though the overall
+  collection of blocks in a relation will be spread over many stripes and therefore
+  many shards.
+- Avoid the same blknum on different relations landing on the same stripe, so that
+  with many small relations we do not end up aliasing data to the same stripe/shard.
+- Avoid vulnerability to aliasing in the values of relation identity fields, such that
+  if there are patterns in the value of `relnode`, these do not manifest as patterns
+  in data placement.
+
+To accomplish this, the blknum is used to select a stripe, and stripes are
+assigned to shards in a pseudorandom order via a hash. The motivation for
+pseudo-random distribution (rather than sequential mapping of stripe to shard)
+is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
+all relations' stripes to touch pageservers in the same order.
+
+To map a `Key` to a shard:
+
+- Hash the `Key` field 4 (relNode).
+- Divide field 6 (`blknum`) field by the stripe size in pages, and combine the
+  hash of this with the hash from the previous step.
+- The total hash modulo the shard count gives the shard holding this key.
+
+Why don't we use the other fields in the Key?
+
+- We ignore `forknum` for key mapping, because it distinguishes different classes of data
+  in the same relation, and we would like to keep the data in a relation together.
+- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
+  database's blocks differ only by spcNode and dbNode from the original. To enable running
+  this type of creation without cross-pageserver communication, we must ensure that these
+  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
+
+### Data placement examples
+
+For example, consider the extreme large databases cases of postgres data layout in a system with 8 shards
+and a stripe size of 32k pages:
+
+- A single large relation: `blknum` division will break the data up into 4096
+  stripes, which will be scattered across the shards.
+- 4096 relations of of 32k pages each: each relation will map to exactly one stripe,
+  and that stripe will be placed according to the hash of the key fields 4. The
+  data placement will be statistically uniform across shards.
+
+Data placement will be more uneven on smaller databases:
+
+- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
+  that both relations land on the same shard and no data lands on the other shard.
+- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
+  the data of the other four shards.
+
+These uneven cases for small amounts of data do not matter, as long as the stripe size
+is an order of magnitude smaller than the amount of data we are comfortable holding
+in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
+a tenant has some shards with 256MB size and some shards with 512MB size, even though
+the standard deviation of shard size within the tenant is very high. Our key mapping
+scheme provides a statistical guarantee that as the tenant's overall data size increases,
+uniformity of placement will improve.
+
+### Important Types
+
+#### `ShardIdentity`
+
+Provides the information needed to know whether a particular key belongs
+to a particular shard:
+
+- Layout version
+- Stripe size
+- Shard count
+- Shard index
+
+This structure's size is constant. Note that if we had used a differnet key
+mapping scheme such as consistent hashing with explicit hash ranges assigned
+to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
+key mapping scheme used here enables a small fixed size ShardIdentity.
+
+### Pageserver changes
+
+#### Structural
+
+Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
+`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
+of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
+covers the whole keyspace.
+
+When the pageserver writes layers and index_part.json to remote storage, it must
+include the shard index & count in the name, to avoid collisions (the count is
+necessary for future-proofing: the count will vary in time). These keys
+will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
+exactly the same for TenantShards as it does for Tenants today: each shard will have
+its own generation number.
+
+#### Storage Format: Keys
+
+For tenants with >1 shard, layer files implicitly become sparse: within the key
+range described in the layer name, the layer file for a shard will only hold the
+content relevant to stripes assigned to the shard.
+
+For this reason, the LayerFileName within a tenant is no longer unique: different shards
+may use the same LayerFileName to refer to different data. We may solve this simply
+by including the shard number in the keys used for layers.
+
+The shard number will be included as a prefix (as part of tenant ID), like this:
+
+`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
+
+`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
+
+Reasons for this particular format:
+
+- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
+  we construct a layer file name), and enables efficient listing of index_parts within
+  a particular shard-timeline prefix.
+- Including the shard _count_ as well as shard number means that in future when we implement
+  shard splitting, it will be possible for a parent shard and one of its children to write
+  the same layer file without a name collision. For example, a parent shard 0_1 might split
+  into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
+  that is distinct from what shard 0_1 would have written at the same place.
+
+In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
+and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
+for example a single-shard tenant's prefix will be `0001`.
+
+For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
+and use this as a cue to construct paths with no prefix at all.
+
+#### Storage Format: Indices
+
+In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
+when we implement shard splitting in future, it will be useful to enable shards to reference layers
+written by other shards (specifically the parent shard during a split), so that shards don't
+have to exhaustively copy all data into their own shard-prefixed keys.
+
+To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
+tuple on each layer, such that it can construct paths for layers written by other shards. This
+naturally raises the question of who "owns" such layers written by ancestral shards: this problem
+will be addressed in phase 2.
+
+For backward compatibility, any index entry without shard information will be assumed to be
+in the legacy shardidentity.
+
+#### WAL Ingest
+
+In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
+it down to the pages relevant to their shard:
+
+- For ordinary user data writes, only retain a write if it matches the ShardIdentity
+- For metadata describing relations etc, all shards retain these writes.
+
+The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
+one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
+and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
+expensive: if the safekeeper can be made shard-aware then it could be taught to use
+the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
+
+#### Compaction/GC
+
+No changes needed.
+
+The pageserver doesn't have to do anything special during compaction
+or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
+This will result in sparse layer files, containing keys only in the stripes that this
+shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
+the key range, these should be updated to ignore gaps that are due to sharding, to
+avoid spuriously splitting up layers ito stripe-sized pieces.
+
+### Compute Endpoints
+
+Compute endpoints will need to:
+
+- Accept a vector of connection strings as part of their configuration from the control plane
+- Route pageserver requests according to mapping the hash of key to the correct
+  entry in the vector of connection strings.
+
+Doing this in compute rather than routing requests via a single pageserver is
+necessary to enable sharding tenants without adding latency from extra hops.
+
+### Control Plane
+
+Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
+be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
+tenants.
+
+Tenant lifecycle operations like deletion will require fanning-out to all the shards
+in the tenant. The same goes for timeline creation and deletion: a timeline should
+not be considered created until it has been created in all shards.
+
+#### Selectively enabling sharding for large tenants
+
+Initially, we will explicitly enable sharding for large tenants only.
+
+In future, this hint mechanism will become optional when we implement automatic
+re-sharding of tenants.
+
+## Future Phases
+
+This section exists to indicate what will likely come next after this phase.
+
+Phases 2a and 2b are amenable to execution in parallel.
+
+### Phase 2a: WAL fan-out
+
+**Problem**: when all shards consume the whole WAL, the network bandwidth used
+for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
+of the shard count.
+
+Network bandwidth is not our most pressing bottleneck, but it is likely to become
+a problem if we set a modest shard count (~8) on a significant number of tenants,
+especially as those larger tenants which we shard are also likely to have higher
+write bandwidth than average.
+
+### Phase 2b: Shard Splitting
+
+**Problem**: the number of shards in a tenant is defined at creation time and cannot
+be changed. This causes excessive sharding for most small tenants, and an upper
+bound on scale for very large tenants.
+
+To address this, a _splitting_ feature will later be added. One shard can split its
+data into a number of children by doing a special compaction operation to generate
+image layers broken up child-shard-wise, and then writing out an `index_part.json` for
+each child. This will then require external coordination (by the control plane) to
+safely attach these new child shards and then move them around to distribute work.
+The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
+once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
+the risk/complexity of implementing such a rarely-encountered scenario.
+
+### Phase N (future): distributed historical reads
+
+**Problem**: while sharding based on key is good for handling changes in overall
+database size, it is less suitable for spiky/unpredictable changes in the read
+workload to historical layers. Sudden increases in historical reads could result
+in sudden increases in local disk capacity required for a TenantShard.
+
+Example: the extreme case of this would be to run a tenant for a year, then create branches
+with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
+the on-disk capacity footprint of a TenantShard, since it would be serving reads
+from all those disparate historical layers.
+
+If we can respond fast enough, then key-sharding a tenant more finely can help with
+this, but splitting may be a relatively expensive operation and the increased historical
+read load may be transient.
+
+A separate mechanism for handling heavy historical reads could be something like
+a gossip mechanism for pageservers to communicate
+about their workload, and then a getpageatlsn offload mechanism where one pageserver can
+ask another to go read the necessary layers from remote storage to serve the read. This
+requires relativly little coordination because it is read-only: any node can service any
+read. All reads to a particular shard would still flow through one node, but the
+disk capactity & I/O impact of servicing the read would be distributed.
+
+## FAQ/Alternatives
+
+### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
+
+When a database is growing under a write workload, writes may predominantly hit the
+end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
+is intensively re-writing a particular relation, if that relation lived in a particular
+shard then it would not achieve our goal of distributing the write work across shards.
+
+### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
+
+1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
+   database would still cause a load hotspot on the pageserver routing its read requests.
+2. The additional hop through the "proxy" pageserver would add latency and overall
+   resource cost (CPU, network bandwidth)
+
+### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
+
+In this model, there would be no explicit sharding of work, but the pageserver to which
+a tenant is attached would not hold all layers on its disk: instead, it would call out
+to peers to have them store some layers, and call out to those peers to request reads
+in those layers.
+
+This mechanism will work well for distributing work in the LSN dimension, but in the key
+space dimension it has the major limitation of requiring one node to handle all
+incoming writes, and compactions. Even if the write workload for a large database
+fits in one pageserver, it will still be a hotspot and such tenants may still
+de-facto require their own pageserver.

From 6443dbef90d121ad44911962d3991fec5d31b873 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 13:18:12 +0000
Subject: [PATCH 16/43] tests: extend log allow list for
 test_sharding_split_failures (#7134)

Failure types that panic the storage controller can cause unlucky
pageservers to emit log warnings that they can't reach the generation
validation API:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/8284495687/index.html

Tolerate this log message: it's an expected behavior.
---
 test_runner/regress/test_sharding.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index bdb9990a51..e8511e428e 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -729,6 +729,10 @@ def test_sharding_split_failures(
         # thereby be unable to publish remote consistent LSN updates
         ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
 
+        # If we're using a failure that will panic the storage controller, all background
+        # upcalls from the pageserver can fail
+        ps.allowed_errors.append(".*calling control plane generation validation API failed.*")
+
     # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything
     assert (
         failure.pageserver_id is None

From 516f793ab4c15e840f7990ca84fe00f13b1382d5 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 13:37:49 +0000
Subject: [PATCH 17/43] remote_storage: make last_modified and etag mandatory
 (#7126)

## Problem

These fields were only optional for the convenience of the `local_fs`
test helper -- real remote storage backends provide them. It complicated
any code that actually wanted to use them for anything.

## Summary of changes

- Make these fields non-optional
- For azure/S3 it is an error if the server doesn't provide them
- For local_fs, use random strings as etags and the file's mtime for
last_modified.
---
 libs/remote_storage/Cargo.toml            |   1 +
 libs/remote_storage/src/azure_blob.rs     |   9 ++
 libs/remote_storage/src/lib.rs            |   4 +-
 libs/remote_storage/src/local_fs.rs       | 162 ++++++++++++----------
 libs/remote_storage/src/s3_bucket.rs      |  14 +-
 libs/remote_storage/tests/test_real_s3.rs |   2 +-
 6 files changed, 111 insertions(+), 81 deletions(-)

diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 15f3cd3b80..4a53f485ca 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -18,6 +18,7 @@ camino.workspace = true
 humantime.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
+rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 12ec680cb6..1e337bc1e8 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -174,6 +174,15 @@ impl AzureBlobStorage {
                     .map_err(|e| DownloadError::Other(e.into()))?;
                 bufs.push(data);
             }
+
+            if bufs.is_empty() {
+                return Err(DownloadError::Other(anyhow::anyhow!(
+                    "Azure GET response contained no buffers"
+                )));
+            }
+            let etag = etag.unwrap();
+            let last_modified = last_modified.unwrap();
+
             Ok(Download {
                 download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
                 etag,
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index b0b69f9155..fd832eb94f 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -291,9 +291,9 @@ pub type DownloadStream =
 pub struct Download {
     pub download_stream: DownloadStream,
     /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: Option<SystemTime>,
+    pub last_modified: SystemTime,
     /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Option<String>,
+    pub etag: String,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 478ad81dc1..ea0756541b 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -10,7 +10,7 @@ use std::{
     io::ErrorKind,
     num::NonZeroU32,
     pin::Pin,
-    time::{Duration, SystemTime},
+    time::{Duration, SystemTime, UNIX_EPOCH},
 };
 
 use anyhow::{bail, ensure, Context};
@@ -406,35 +406,37 @@ impl RemoteStorage for LocalFs {
         cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         let target_path = from.with_base(&self.storage_root);
-        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let source = ReaderStream::new(
-                fs::OpenOptions::new()
-                    .read(true)
-                    .open(&target_path)
-                    .await
-                    .with_context(|| {
-                        format!("Failed to open source file {target_path:?} to use in the download")
-                    })
-                    .map_err(DownloadError::Other)?,
-            );
 
-            let metadata = self
-                .read_storage_metadata(&target_path)
+        let file_metadata = file_metadata(&target_path).await?;
+
+        let source = ReaderStream::new(
+            fs::OpenOptions::new()
+                .read(true)
+                .open(&target_path)
                 .await
-                .map_err(DownloadError::Other)?;
+                .with_context(|| {
+                    format!("Failed to open source file {target_path:?} to use in the download")
+                })
+                .map_err(DownloadError::Other)?,
+        );
 
-            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+        let metadata = self
+            .read_storage_metadata(&target_path)
+            .await
+            .map_err(DownloadError::Other)?;
 
-            Ok(Download {
-                metadata,
-                last_modified: None,
-                etag: None,
-                download_stream: Box::pin(source),
-            })
-        } else {
-            Err(DownloadError::NotFound)
-        }
+        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+
+        let etag = mock_etag(&file_metadata);
+        Ok(Download {
+            metadata,
+            last_modified: file_metadata
+                .modified()
+                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
+            etag,
+            download_stream: Box::pin(source),
+        })
     }
 
     async fn download_byte_range(
@@ -452,50 +454,51 @@ impl RemoteStorage for LocalFs {
                 return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
             }
         }
+
         let target_path = from.with_base(&self.storage_root);
-        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let mut source = tokio::fs::OpenOptions::new()
-                .read(true)
-                .open(&target_path)
-                .await
-                .with_context(|| {
-                    format!("Failed to open source file {target_path:?} to use in the download")
-                })
-                .map_err(DownloadError::Other)?;
-
-            let len = source
-                .metadata()
-                .await
-                .context("query file length")
-                .map_err(DownloadError::Other)?
-                .len();
-
-            source
-                .seek(io::SeekFrom::Start(start_inclusive))
-                .await
-                .context("Failed to seek to the range start in a local storage file")
-                .map_err(DownloadError::Other)?;
-
-            let metadata = self
-                .read_storage_metadata(&target_path)
-                .await
-                .map_err(DownloadError::Other)?;
-
-            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
-            let source = ReaderStream::new(source);
-
-            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
-
-            Ok(Download {
-                metadata,
-                last_modified: None,
-                etag: None,
-                download_stream: Box::pin(source),
+        let file_metadata = file_metadata(&target_path).await?;
+        let mut source = tokio::fs::OpenOptions::new()
+            .read(true)
+            .open(&target_path)
+            .await
+            .with_context(|| {
+                format!("Failed to open source file {target_path:?} to use in the download")
             })
-        } else {
-            Err(DownloadError::NotFound)
-        }
+            .map_err(DownloadError::Other)?;
+
+        let len = source
+            .metadata()
+            .await
+            .context("query file length")
+            .map_err(DownloadError::Other)?
+            .len();
+
+        source
+            .seek(io::SeekFrom::Start(start_inclusive))
+            .await
+            .context("Failed to seek to the range start in a local storage file")
+            .map_err(DownloadError::Other)?;
+
+        let metadata = self
+            .read_storage_metadata(&target_path)
+            .await
+            .map_err(DownloadError::Other)?;
+
+        let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
+        let source = ReaderStream::new(source);
+
+        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+
+        let etag = mock_etag(&file_metadata);
+        Ok(Download {
+            metadata,
+            last_modified: file_metadata
+                .modified()
+                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
+            etag,
+            download_stream: Box::pin(source),
+        })
     }
 
     async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
@@ -610,13 +613,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
     Ok(())
 }
 
-fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
-    if file_path.exists() {
-        ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
-        Ok(true)
-    } else {
-        Ok(false)
-    }
+async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
+    tokio::fs::metadata(&file_path).await.map_err(|e| {
+        if e.kind() == ErrorKind::NotFound {
+            DownloadError::NotFound
+        } else {
+            DownloadError::BadInput(e.into())
+        }
+    })
+}
+
+// Use mtime as stand-in for ETag.  We could calculate a meaningful one by md5'ing the contents of files we
+// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
+// quickly, with less overhead than using a mock S3 server.
+fn mock_etag(meta: &std::fs::Metadata) -> String {
+    let mtime = meta.modified().expect("Filesystem mtime missing");
+    format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis())
 }
 
 #[cfg(test)]
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 438f45fbde..56bc32ebdd 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
 
-use aws_smithy_types::byte_stream::ByteStream;
 use aws_smithy_types::{body::SdkBody, DateTime};
+use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
@@ -287,8 +287,16 @@ impl S3Bucket {
         let remaining = self.timeout.saturating_sub(started_at.elapsed());
 
         let metadata = object_output.metadata().cloned().map(StorageMetadata);
-        let etag = object_output.e_tag;
-        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
+        let etag = object_output
+            .e_tag
+            .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?;
+        let last_modified = object_output
+            .last_modified
+            .ok_or(DownloadError::Other(anyhow::anyhow!(
+                "Missing LastModified header"
+            )))?
+            .try_into()
+            .map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
 
         let body = object_output.body;
         let body = ByteStreamAsStream::from(body);
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index d8b9824d99..bc5e40e70f 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // A little check to ensure that our clock is not too far off from the S3 clock
     {
         let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
-        let last_modified = dl.last_modified.unwrap();
+        let last_modified = dl.last_modified;
         let half_wt = WAIT_TIME.mul_f32(0.5);
         let t0_hwt = t0 + half_wt;
         let t1_hwt = t1 - half_wt;

From 22c26d610b0a0cd4c7d714c25b8db1d516e3cc7f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 15:23:05 +0000
Subject: [PATCH 18/43] pageserver: remove un-needed "uninit mark" (#5717)

Switched the order; doing https://github.com/neondatabase/neon/pull/6139
first then can remove uninit marker after.

## Problem

Previously, existence of a timeline directory was treated as evidence of
the timeline's logical existence. That is no longer the case since we
treat remote storage as the source of truth on each startup: we can
therefore do without this mark file.

The mark file had also been used as a pseudo-lock to guard against
concurrent creations of the same TimelineId -- now that persistence is
no longer required, this is a bit unwieldy.

In #6139 the `Tenant::timelines_creating` was added to protect against
concurrent creations on the same TimelineId, making the uninit mark file
entirely redundant.

## Summary of changes

- Code that writes & reads mark file is removed
- Some nearby `pub` definitions are amended to `pub(crate)`
- `test_duplicate_creation` is added to demonstrate that mutual
exclusion of creations still works.
---
 pageserver/src/config.rs                      |  20 +--
 pageserver/src/http/routes.rs                 |   6 +-
 pageserver/src/lib.rs                         |  16 +--
 pageserver/src/tenant.rs                      | 122 +++++++-----------
 pageserver/src/tenant/timeline/uninit.rs      |  98 ++++----------
 .../fixtures/pageserver/allowed_errors.py     |   1 -
 test_runner/fixtures/pageserver/http.py       |   2 +-
 test_runner/regress/test_branching.py         |  58 +++++++++
 test_runner/regress/test_broken_timeline.py   |   8 +-
 test_runner/regress/test_import.py            |   1 -
 10 files changed, 147 insertions(+), 185 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 845b20c8db..7dac0ab352 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -41,7 +41,7 @@ use crate::tenant::{
 use crate::virtual_file;
 use crate::{
     IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
 };
 
 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -845,18 +845,7 @@ impl PageServerConf {
             .join(timeline_id.to_string())
     }
 
-    pub fn timeline_uninit_mark_file_path(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-    ) -> Utf8PathBuf {
-        path_with_suffix_extension(
-            self.timeline_path(&tenant_shard_id, &timeline_id),
-            TIMELINE_UNINIT_MARK_SUFFIX,
-        )
-    }
-
-    pub fn timeline_delete_mark_file_path(
+    pub(crate) fn timeline_delete_mark_file_path(
         &self,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
@@ -867,7 +856,10 @@ impl PageServerConf {
         )
     }
 
-    pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+    pub(crate) fn tenant_deleted_mark_file_path(
+        &self,
+        tenant_shard_id: &TenantShardId,
+    ) -> Utf8PathBuf {
         self.tenant_path(tenant_shard_id)
             .join(TENANT_DELETED_MARKER_FILE_NAME)
     }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index d6fe9f6055..7d3ede21ce 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -535,9 +535,9 @@ async fn timeline_create_handler(
                 )
             }
             Err(
-                tenant::CreateTimelineError::Conflict
-                | tenant::CreateTimelineError::AlreadyCreating,
-            ) => json_response(StatusCode::CONFLICT, ()),
+                e @ tenant::CreateTimelineError::Conflict
+                | e @ tenant::CreateTimelineError::AlreadyCreating,
+            ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
             Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
                 StatusCode::NOT_ACCEPTABLE,
                 HttpErrorBody::from_msg(format!("{err:#}")),
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 02a690d4e1..b00db02a1c 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -114,27 +114,27 @@ pub const METADATA_FILE_NAME: &str = "metadata";
 
 /// Per-tenant configuration file.
 /// Full path: `tenants/<tenant_id>/config`.
-pub const TENANT_CONFIG_NAME: &str = "config";
+pub(crate) const TENANT_CONFIG_NAME: &str = "config";
 
 /// Per-tenant configuration file.
 /// Full path: `tenants/<tenant_id>/config`.
-pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
+pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
 
 /// Per-tenant copy of their remote heatmap, downloaded into the local
 /// tenant path while in secondary mode.
-pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
+pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
 
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
-pub const TEMP_FILE_SUFFIX: &str = "___temp";
+pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
 
 /// A marker file to mark that a timeline directory was not fully initialized.
 /// If a timeline directory with this marker is encountered at pageserver startup,
 /// the timeline directory and the marker file are both removed.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
-pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
+pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
 
-pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
+pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
 
 /// A marker file to prevent pageserver from loading a certain tenant on restart.
 /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
@@ -161,11 +161,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
 // from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
 // from the name.
 
-pub fn is_uninit_mark(path: &Utf8Path) -> bool {
+pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
     ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
 
-pub fn is_delete_mark(path: &Utf8Path) -> bool {
+pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
     ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
 }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f0996328c0..ddfb47369b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -55,8 +55,8 @@ use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
+use self::timeline::uninit::TimelineCreateGuard;
 use self::timeline::uninit::TimelineExclusionError;
-use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
@@ -565,9 +565,8 @@ impl Tenant {
             // avoiding holding it across awaits
             let mut timelines_accessor = self.timelines.lock().unwrap();
             match timelines_accessor.entry(timeline_id) {
+                // We should never try and load the same timeline twice during startup
                 Entry::Occupied(_) => {
-                    // The uninit mark file acts as a lock that prevents another task from
-                    // initializing the timeline at the same time.
                     unreachable!(
                         "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
                     );
@@ -1064,8 +1063,7 @@ impl Tenant {
             let entry_path = entry.path();
 
             let purge = if crate::is_temporary(entry_path)
-                // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already
-                // covered by the check that the timeline must exist in remote storage.
+                // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
                 || is_uninit_mark(entry_path)
                 || crate::is_delete_mark(entry_path)
             {
@@ -1298,11 +1296,6 @@ impl Tenant {
     /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
     /// and the timeline will fail to load at a restart.
     ///
-    /// That's why we add an uninit mark file, and wrap it together witht the Timeline
-    /// in-memory object into UninitializedTimeline.
-    /// Once the caller is done setting up the timeline, they should call
-    /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark.
-    ///
     /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
     /// minimum amount of keys required to get a writable timeline.
     /// (Without it, `put` might fail due to `repartition` failing.)
@@ -1318,7 +1311,9 @@ impl Tenant {
             "Cannot create empty timelines on inactive tenant"
         );
 
-        let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
+        // Protect against concurrent attempts to use this TimelineId
+        let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
+
         let new_metadata = TimelineMetadata::new(
             // Initialize disk_consistent LSN to 0, The caller must import some data to
             // make it valid, before calling finish_creation()
@@ -1333,7 +1328,7 @@ impl Tenant {
         self.prepare_new_timeline(
             new_timeline_id,
             &new_metadata,
-            timeline_uninit_mark,
+            create_guard,
             initdb_lsn,
             None,
         )
@@ -1421,9 +1416,8 @@ impl Tenant {
             .map_err(|_| CreateTimelineError::ShuttingDown)?;
 
         // Get exclusive access to the timeline ID: this ensures that it does not already exist,
-        // and that no other creation attempts will be allowed in while we are working.  The
-        // uninit_mark is a guard.
-        let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
+        // and that no other creation attempts will be allowed in while we are working.
+        let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
             Ok(m) => m,
             Err(TimelineExclusionError::AlreadyCreating) => {
                 // Creation is in progress, we cannot create it again, and we cannot
@@ -1466,6 +1460,8 @@ impl Tenant {
             }
         };
 
+        pausable_failpoint!("timeline-creation-after-uninit");
+
         let loaded_timeline = match ancestor_timeline_id {
             Some(ancestor_timeline_id) => {
                 let ancestor_timeline = self
@@ -1513,7 +1509,7 @@ impl Tenant {
                     &ancestor_timeline,
                     new_timeline_id,
                     ancestor_start_lsn,
-                    uninit_mark,
+                    create_guard,
                     ctx,
                 )
                 .await?
@@ -1523,7 +1519,7 @@ impl Tenant {
                     new_timeline_id,
                     pg_version,
                     load_existing_initdb,
-                    uninit_mark,
+                    create_guard,
                     ctx,
                 )
                 .await?
@@ -2870,9 +2866,9 @@ impl Tenant {
         start_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
+        let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
         let tl = self
-            .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
+            .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
             .await?;
         tl.set_state(TimelineState::Active);
         Ok(tl)
@@ -2886,10 +2882,10 @@ impl Tenant {
         src_timeline: &Arc<Timeline>,
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
-        timeline_uninit_mark: TimelineUninitMark<'_>,
+        timeline_create_guard: TimelineCreateGuard<'_>,
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
+        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
             .await
     }
 
@@ -2898,7 +2894,7 @@ impl Tenant {
         src_timeline: &Arc<Timeline>,
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
-        timeline_uninit_mark: TimelineUninitMark<'_>,
+        timeline_create_guard: TimelineCreateGuard<'_>,
         _ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
         let src_id = src_timeline.timeline_id;
@@ -2982,7 +2978,7 @@ impl Tenant {
             .prepare_new_timeline(
                 dst_id,
                 &metadata,
-                timeline_uninit_mark,
+                timeline_create_guard,
                 start_lsn + 1,
                 Some(Arc::clone(src_timeline)),
             )
@@ -3014,12 +3010,12 @@ impl Tenant {
         load_existing_initdb: Option<TimelineId>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
-        let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
+        let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
         self.bootstrap_timeline(
             timeline_id,
             pg_version,
             load_existing_initdb,
-            uninit_mark,
+            create_guard,
             ctx,
         )
         .await
@@ -3083,7 +3079,7 @@ impl Tenant {
         timeline_id: TimelineId,
         pg_version: u32,
         load_existing_initdb: Option<TimelineId>,
-        timeline_uninit_mark: TimelineUninitMark<'_>,
+        timeline_create_guard: TimelineCreateGuard<'_>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
         // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
@@ -3095,13 +3091,14 @@ impl Tenant {
             TEMP_FILE_SUFFIX,
         );
 
-        // an uninit mark was placed before, nothing else can access this timeline files
-        // current initdb was not run yet, so remove whatever was left from the previous runs
+        // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
+        // we won't race with other creations or existent timelines with the same path.
         if pgdata_path.exists() {
             fs::remove_dir_all(&pgdata_path).with_context(|| {
                 format!("Failed to remove already existing initdb directory: {pgdata_path}")
             })?;
         }
+
         // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
         scopeguard::defer! {
             if let Err(e) = fs::remove_dir_all(&pgdata_path) {
@@ -3178,7 +3175,7 @@ impl Tenant {
             .prepare_new_timeline(
                 timeline_id,
                 &new_metadata,
-                timeline_uninit_mark,
+                timeline_create_guard,
                 pgdata_lsn,
                 None,
             )
@@ -3250,13 +3247,12 @@ impl Tenant {
     ///
     /// An empty layer map is initialized, and new data and WAL can be imported starting
     /// at 'disk_consistent_lsn'. After any initial data has been imported, call
-    /// `finish_creation` to insert the Timeline into the timelines map and to remove the
-    /// uninit mark file.
+    /// `finish_creation` to insert the Timeline into the timelines map.
     async fn prepare_new_timeline<'a>(
         &'a self,
         new_timeline_id: TimelineId,
         new_metadata: &TimelineMetadata,
-        uninit_mark: TimelineUninitMark<'a>,
+        create_guard: TimelineCreateGuard<'a>,
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
     ) -> anyhow::Result<UninitializedTimeline> {
@@ -3279,9 +3275,12 @@ impl Tenant {
 
         timeline_struct.init_empty_layer_map(start_lsn);
 
-        if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
+        if let Err(e) = self
+            .create_timeline_files(&create_guard.timeline_path)
+            .await
+        {
             error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
-            cleanup_timeline_directory(uninit_mark);
+            cleanup_timeline_directory(create_guard);
             return Err(e);
         }
 
@@ -3292,41 +3291,31 @@ impl Tenant {
         Ok(UninitializedTimeline::new(
             self,
             new_timeline_id,
-            Some((timeline_struct, uninit_mark)),
+            Some((timeline_struct, create_guard)),
         ))
     }
 
     async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
         crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
 
-        fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
-            anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
+        fail::fail_point!("after-timeline-dir-creation", |_| {
+            anyhow::bail!("failpoint after-timeline-dir-creation");
         });
 
         Ok(())
     }
 
-    /// Attempts to create an uninit mark file for the timeline initialization.
-    /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists.
-    ///
-    /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init.
-    fn create_timeline_uninit_mark(
+    /// Get a guard that provides exclusive access to the timeline directory, preventing
+    /// concurrent attempts to create the same timeline.
+    fn create_timeline_create_guard(
         &self,
         timeline_id: TimelineId,
-    ) -> Result<TimelineUninitMark, TimelineExclusionError> {
+    ) -> Result<TimelineCreateGuard, TimelineExclusionError> {
         let tenant_shard_id = self.tenant_shard_id;
 
-        let uninit_mark_path = self
-            .conf
-            .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
         let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
 
-        let uninit_mark = TimelineUninitMark::new(
-            self,
-            timeline_id,
-            uninit_mark_path.clone(),
-            timeline_path.clone(),
-        )?;
+        let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
 
         // At this stage, we have got exclusive access to in-memory state for this timeline ID
         // for creation.
@@ -3342,23 +3331,7 @@ impl Tenant {
             )));
         }
 
-        // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
-        // that during process runtime, colliding creations will be caught in-memory without getting
-        // as far as failing to write a file.
-        fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .open(&uninit_mark_path)
-            .context("Failed to create uninit mark file")
-            .and_then(|_| {
-                crashsafe::fsync_file_and_parent(&uninit_mark_path)
-                    .context("Failed to fsync uninit mark file")
-            })
-            .with_context(|| {
-                format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
-            })?;
-
-        Ok(uninit_mark)
+        Ok(create_guard)
     }
 
     /// Gathers inputs from all of the timelines to produce a sizing model input.
@@ -5099,15 +5072,15 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_uninit_mark_crash() -> anyhow::Result<()> {
-        let name = "test_uninit_mark_crash";
+    async fn test_create_guard_crash() -> anyhow::Result<()> {
+        let name = "test_create_guard_crash";
         let harness = TenantHarness::create(name)?;
         {
             let (tenant, ctx) = harness.load().await;
             let tline = tenant
                 .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
                 .await?;
-            // Keeps uninit mark in place
+            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
             let raw_tline = tline.raw_timeline().unwrap();
             raw_tline
                 .shutdown()
@@ -5135,11 +5108,6 @@ mod tests {
             .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
             .exists());
 
-        assert!(!harness
-            .conf
-            .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID)
-            .exists());
-
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs
index 27d6fd9c28..e1034a9fe2 100644
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
 
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use tracing::{error, info, info_span, warn};
-use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
+use tracing::{error, info, info_span};
+use utils::{fs_ext, id::TimelineId, lsn::Lsn};
 
 use crate::{context::RequestContext, import_datadir, tenant::Tenant};
 
@@ -11,22 +11,22 @@ use super::Timeline;
 
 /// A timeline with some of its files on disk, being initialized.
 /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
-/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
-/// to be removed on next restart.
+/// its local files are removed.  If we crash while this class exists, then the timeline's local
+/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage.
 ///
 /// The caller is responsible for proper timeline data filling before the final init.
 #[must_use]
 pub struct UninitializedTimeline<'t> {
     pub(crate) owning_tenant: &'t Tenant,
     timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
 }
 
 impl<'t> UninitializedTimeline<'t> {
     pub(crate) fn new(
         owning_tenant: &'t Tenant,
         timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
+        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
     ) -> Self {
         Self {
             owning_tenant,
@@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> {
         }
     }
 
-    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
-    /// uninit mark file.
+    /// Finish timeline creation: insert it into the Tenant's timelines map
     ///
     /// This function launches the flush loop if not already done.
     ///
@@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> {
             Entry::Vacant(v) => {
                 // after taking here should be no fallible operations, because the drop guard will not
                 // cleanup after and would block for example the tenant deletion
-                let (new_timeline, uninit_mark) =
+                let (new_timeline, _create_guard) =
                     self.raw_timeline.take().expect("already checked");
 
-                // this is the mutual exclusion between different retries to create the timeline;
-                // this should be an assertion.
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
-                    )
-                })?;
                 v.insert(Arc::clone(&new_timeline));
 
                 new_timeline.maybe_spawn_flush_loop();
@@ -120,8 +112,7 @@ impl<'t> UninitializedTimeline<'t> {
             .await
             .context("Failed to flush after basebackup import")?;
 
-        // All the data has been imported. Insert the Timeline into the tenant's timelines
-        // map and remove the uninit mark file.
+        // All the data has been imported. Insert the Timeline into the tenant's timelines map
         let tl = self.finish_creation()?;
         tl.activate(broker_client, None, ctx);
         Ok(tl)
@@ -143,37 +134,35 @@ impl<'t> UninitializedTimeline<'t> {
 
 impl Drop for UninitializedTimeline<'_> {
     fn drop(&mut self) {
-        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
+        if let Some((_, create_guard)) = self.raw_timeline.take() {
             let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
             error!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(uninit_mark);
+            cleanup_timeline_directory(create_guard);
         }
     }
 }
 
-pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
-    let timeline_path = &uninit_mark.timeline_path;
+pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
+    let timeline_path = &create_guard.timeline_path;
     match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
         Ok(()) => {
-            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
+            info!("Timeline dir {timeline_path:?} removed successfully")
         }
         Err(e) => {
             error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
         }
     }
-    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
+    // Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other
+    // timeline creation attempts under this TimelineId to proceed
+    drop(create_guard);
 }
 
-/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
-/// or gets removed eventually.
-///
-/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
+/// A guard for timeline creations in process: as long as this object exists, the timeline ID
+/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
 #[must_use]
-pub(crate) struct TimelineUninitMark<'t> {
+pub(crate) struct TimelineCreateGuard<'t> {
     owning_tenant: &'t Tenant,
     timeline_id: TimelineId,
-    uninit_mark_deleted: bool,
-    uninit_mark_path: Utf8PathBuf,
     pub(crate) timeline_path: Utf8PathBuf,
 }
 
@@ -190,11 +179,10 @@ pub(crate) enum TimelineExclusionError {
     Other(#[from] anyhow::Error),
 }
 
-impl<'t> TimelineUninitMark<'t> {
+impl<'t> TimelineCreateGuard<'t> {
     pub(crate) fn new(
         owning_tenant: &'t Tenant,
         timeline_id: TimelineId,
-        uninit_mark_path: Utf8PathBuf,
         timeline_path: Utf8PathBuf,
     ) -> Result<Self, TimelineExclusionError> {
         // Lock order: this is the only place we take both locks.  During drop() we only
@@ -214,56 +202,14 @@ impl<'t> TimelineUninitMark<'t> {
             Ok(Self {
                 owning_tenant,
                 timeline_id,
-                uninit_mark_deleted: false,
-                uninit_mark_path,
                 timeline_path,
             })
         }
     }
-
-    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
-        if !self.uninit_mark_deleted {
-            self.delete_mark_file_if_present()?;
-        }
-
-        Ok(())
-    }
-
-    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
-        let uninit_mark_file = &self.uninit_mark_path;
-        let uninit_mark_parent = uninit_mark_file
-            .parent()
-            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
-            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
-        })?;
-        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
-        self.uninit_mark_deleted = true;
-
-        Ok(())
-    }
 }
 
-impl Drop for TimelineUninitMark<'_> {
+impl Drop for TimelineCreateGuard<'_> {
     fn drop(&mut self) {
-        if !self.uninit_mark_deleted {
-            if self.timeline_path.exists() {
-                error!(
-                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
-                    self.uninit_mark_path, self.timeline_path
-                )
-            } else {
-                // unblock later timeline creation attempts
-                warn!(
-                    "Removing intermediate uninit mark file {}",
-                    self.uninit_mark_path
-                );
-                if let Err(e) = self.delete_mark_file_if_present() {
-                    error!("Failed to remove the uninit mark file: {e}")
-                }
-            }
-        }
-
         self.owning_tenant
             .timelines_creating
             .lock()
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 8ff4341cc0..839d4166c7 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -55,7 +55,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     # FIXME: These need investigation
     ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
     ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
-    ".*Removing intermediate uninit mark file.*",
     # Tenant::delete_timeline() can cause any of the four following errors.
     # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
     ".*could not flush frozen layer.*queue is in state Stopped",  # when schedule layer upload fails because queued got closed before compaction got killed
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 6e082374d7..4e355b73a9 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -34,7 +34,7 @@ class TimelineCreate406(PageserverApiException):
 class TimelineCreate409(PageserverApiException):
     def __init__(self, res: requests.Response):
         assert res.status_code == 409
-        super().__init__("", res.status_code)
+        super().__init__(res.json()["msg"], res.status_code)
 
 
 @dataclass
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 9a0b91b54e..2a7a3c41ac 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -347,6 +347,64 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB
         ps_http.timeline_detail(env.initial_tenant, branch_id)
 
 
+def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_configs()
+    env.start()
+    env.pageserver.tenant_create(env.initial_tenant)
+
+    success_timeline = TimelineId.generate()
+    log.info(f"Creating timeline {success_timeline}")
+    ps_http = env.pageserver.http_client()
+    success_result = ps_http.timeline_create(
+        env.pg_version, env.initial_tenant, success_timeline, timeout=60
+    )
+
+    ps_http.configure_failpoints(("timeline-creation-after-uninit", "pause"))
+
+    def start_creating_timeline():
+        log.info(f"Creating (expect failure) timeline {env.initial_timeline}")
+        with pytest.raises(RequestException):
+            ps_http.timeline_create(
+                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
+            )
+
+    t = threading.Thread(target=start_creating_timeline)
+    try:
+        t.start()
+
+        wait_until_paused(env, "timeline-creation-after-uninit")
+
+        # While timeline creation is in progress, trying to create a timeline
+        # again with the same ID should return 409
+        with pytest.raises(
+            PageserverApiException, match="creation of timeline with the given ID is in progress"
+        ):
+            ps_http.timeline_create(
+                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
+            )
+
+        # Creation of a timeline already successfully created is idempotent, and is not impeded by some
+        # other timeline creation with a different TimelineId being stuck.
+        repeat_result = ps_http.timeline_create(
+            env.pg_version, env.initial_tenant, success_timeline, timeout=60
+        )
+        assert repeat_result == success_result
+    finally:
+        env.pageserver.stop(immediate=True)
+        t.join()
+
+    # now without a failpoint
+    env.pageserver.start()
+
+    wait_until_tenant_active(ps_http, env.initial_tenant)
+
+    with pytest.raises(PageserverApiException, match="not found"):
+        ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+
+    # The one successfully created timeline should still be there.
+    assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1
+
+
 def wait_until_paused(env: NeonEnv, failpoint: str):
     found = False
     msg = f"at failpoint {failpoint}"
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index b046ed7f1b..804ad135ce 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -204,7 +204,7 @@ def test_timeline_init_break_before_checkpoint_recreate(
     assert timeline_id == new_timeline_id
 
 
-def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder):
+def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
@@ -214,9 +214,9 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde
     old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
     initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
 
-    # Introduce failpoint when creating a new timeline uninit mark, before any other files were created
-    pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
-    with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
+    # Introduce failpoint when creating a new timeline, right after creating its directory
+    pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return"))
+    with pytest.raises(Exception, match="after-timeline-dir-creation"):
         _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
 
     # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index ec57860033..132427ba2d 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -90,7 +90,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
         [
             ".*error importing base backup .*",
             ".*Timeline got dropped without initializing, cleaning its files.*",
-            ".*Removing intermediate uninit mark file.*",
             ".*InternalServerError.*timeline not found.*",
             ".*InternalServerError.*Tenant .* not found.*",
             ".*InternalServerError.*Timeline .* not found.*",

From bf187aa13f8527ea81de370d14984a6cc5889ecd Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 15 Mar 2024 17:30:13 +0200
Subject: [PATCH 19/43] fix(layer): metric miscalculations (#7137)

Split off from #7030:
- each early exit is counted as canceled init, even though it most
likely was just `LayerInner::keep_resident` doing the no-download repair
check
- `downloaded_after` could had been accounted for multiple times, and
also when repairing to match on-disk state

Cc: #5331
---
 pageserver/src/tenant/storage_layer/layer.rs | 63 ++++++++++++++------
 1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 959065bc4c..0200ff8cf4 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -710,10 +710,6 @@ impl LayerInner {
                     // disable any scheduled but not yet running eviction deletions for this
                     let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
 
-                    // count cancellations, which currently remain largely unexpected
-                    let init_cancelled =
-                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
                     // no need to make the evict_and_wait wait for the actual download to complete
                     drop(self.status.send(Status::Downloaded));
 
@@ -722,7 +718,9 @@ impl LayerInner {
                         .upgrade()
                         .ok_or_else(|| DownloadError::TimelineShutdown)?;
 
-                    // FIXME: grab a gate
+                    // count cancellations, which currently remain largely unexpected
+                    let init_cancelled =
+                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
 
                     let can_ever_evict = timeline.remote_client.as_ref().is_some();
 
@@ -731,9 +729,17 @@ impl LayerInner {
                     let needs_download = self
                         .needs_download()
                         .await
-                        .map_err(DownloadError::PreStatFailed)?;
+                        .map_err(DownloadError::PreStatFailed);
 
-                    let permit = if let Some(reason) = needs_download {
+                    let needs_download = match needs_download {
+                        Ok(reason) => reason,
+                        Err(e) => {
+                            scopeguard::ScopeGuard::into_inner(init_cancelled);
+                            return Err(e);
+                        }
+                    };
+
+                    let (permit, downloaded) = if let Some(reason) = needs_download {
                         if let NeedsDownload::NotFile(ft) = reason {
                             return Err(DownloadError::NotFile(ft));
                         }
@@ -744,36 +750,59 @@ impl LayerInner {
                         self.wanted_evicted.store(false, Ordering::Release);
 
                         if !can_ever_evict {
+                            scopeguard::ScopeGuard::into_inner(init_cancelled);
                             return Err(DownloadError::NoRemoteStorage);
                         }
 
                         if let Some(ctx) = ctx {
-                            self.check_expected_download(ctx)?;
+                            let res = self.check_expected_download(ctx);
+                            if let Err(e) = res {
+                                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                                return Err(e);
+                            }
                         }
 
                         if !allow_download {
                             // this does look weird, but for LayerInner the "downloading" means also changing
                             // internal once related state ...
+                            scopeguard::ScopeGuard::into_inner(init_cancelled);
                             return Err(DownloadError::DownloadRequired);
                         }
 
                         tracing::info!(%reason, "downloading on-demand");
 
-                        self.spawn_download_and_wait(timeline, permit).await?
+                        let permit = self.spawn_download_and_wait(timeline, permit).await;
+
+                        let permit = match permit {
+                            Ok(permit) => permit,
+                            Err(e) => {
+                                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                                return Err(e);
+                            }
+                        };
+
+                        (permit, true)
                     } else {
                         // the file is present locally, probably by a previous but cancelled call to
                         // get_or_maybe_download. alternatively we might be running without remote storage.
                         LAYER_IMPL_METRICS.inc_init_needed_no_download();
 
-                        permit
+                        (permit, false)
                     };
 
-                    let since_last_eviction =
-                        self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
-                    if let Some(since_last_eviction) = since_last_eviction {
-                        // FIXME: this will not always be recorded correctly until #6028 (the no
-                        // download needed branch above)
-                        LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+                    if downloaded {
+                        let since_last_eviction = self
+                            .last_evicted_at
+                            .lock()
+                            .unwrap()
+                            .take()
+                            .map(|ts| ts.elapsed());
+
+                        if let Some(since_last_eviction) = since_last_eviction {
+                            LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                        }
                     }
 
                     let res = Arc::new(DownloadedLayer {
@@ -795,8 +824,6 @@ impl LayerInner {
                         );
                     }
 
-                    scopeguard::ScopeGuard::into_inner(init_cancelled);
-
                     Ok((ResidentOrWantedEvicted::Resident(res), permit))
                 }
                 .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))

From 59b6cce4189eab1f66c805dc6a8f73b9f37f7063 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 15 Mar 2024 17:54:28 +0200
Subject: [PATCH 20/43] heavier_once_cell: add detached init support (#7135)

Aiming for the design where `heavier_once_cell::OnceCell` is initialized
by a future factory lead to awkwardness with how
`LayerInner::get_or_maybe_download` looks right now with the `loop`. The
loop helps with two situations:

- an eviction has been scheduled but has not yet happened, and a read
access should cancel the eviction
- a previous `LayerInner::get_or_maybe_download` that canceled a pending
eviction was canceled leaving the `heavier_once_cell::OnceCell`
uninitialized but needing repair by the next
`LayerInner::get_or_maybe_download`

By instead supporting detached initialization in
`heavier_once_cell::OnceCell` via an `OnceCell::get_or_detached_init`,
we can fix what the monolithic #7030 does:
- spawned off download task initializes the
`heavier_once_cell::OnceCell` regardless of the download starter being
canceled
- a canceled `LayerInner::get_or_maybe_download` no longer stops
eviction but can win it if not canceled

Split off from #7030.

Cc: #5331
---
 libs/utils/src/sync/heavier_once_cell.rs | 78 ++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 703a6dfd52..a3aee45b58 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -110,6 +110,49 @@ impl<T> OnceCell<T> {
         }
     }
 
+    /// Returns a guard to an existing initialized value, or returns an unique initialization
+    /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
+    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
+        // It looks like OnceCell::get_or_init could be implemented using this method instead of
+        // duplication. However, that makes the future be !Send due to possibly holding on to the
+        // MutexGuard over an await point.
+        loop {
+            let sem = {
+                let guard = self.inner.lock().unwrap();
+                if guard.value.is_some() {
+                    return Ok(Guard(guard));
+                }
+                guard.init_semaphore.clone()
+            };
+
+            {
+                let permit = {
+                    // increment the count for the duration of queued
+                    let _guard = CountWaitingInitializers::start(self);
+                    sem.acquire().await
+                };
+
+                let Ok(permit) = permit else {
+                    let guard = self.inner.lock().unwrap();
+                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
+                        // there was a take_and_deinit in between
+                        continue;
+                    }
+                    assert!(
+                        guard.value.is_some(),
+                        "semaphore got closed, must be initialized"
+                    );
+                    return Ok(Guard(guard));
+                };
+
+                permit.forget();
+            }
+
+            let permit = InitPermit(sem);
+            return Err(permit);
+        }
+    }
+
     /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
     /// to complete initializing the inner value.
     ///
@@ -481,4 +524,39 @@ mod tests {
 
         assert_eq!("t1", *cell.get().unwrap());
     }
+
+    #[tokio::test(start_paused = true)]
+    async fn detached_init_smoke() {
+        let target = OnceCell::default();
+
+        let Err(permit) = target.get_or_init_detached().await else {
+            unreachable!("it is not initialized")
+        };
+
+        tokio::time::timeout(
+            std::time::Duration::from_secs(3600 * 24 * 7 * 365),
+            target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
+        )
+        .await
+        .expect_err("should timeout since we are already holding the permit");
+
+        target.set(42, permit);
+
+        let (_answer, permit) = {
+            let mut guard = target
+                .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
+                .await
+                .unwrap();
+
+            assert_eq!(*guard, 42);
+
+            guard.take_and_deinit()
+        };
+
+        assert!(target.get().is_none());
+
+        target.set(11, permit);
+
+        assert_eq!(*target.get().unwrap(), 11);
+    }
 }

From 7d32af5ad5ec316761cb29d6a8141f4baf68735b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 15:57:01 +0000
Subject: [PATCH 21/43] .github: apply timeout to pytest `regress` (#7142)

These test runs usually take 20-30 minutes. if something hangs, we see
actions proceeding for several hours: it's more convenient to have them
time out sooner so that we notice that something has hung faster.
---
 .github/workflows/build_and_test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 810c61de2d..2bcda7cc8e 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -461,6 +461,7 @@ jobs:
 
       - name: Pytest regression tests
         uses: ./.github/actions/run-python-test-set
+        timeout-minutes: 60
         with:
           build_type: ${{ matrix.build_type }}
           test_selection: regress

From 67522ce83d1b6b754e6e50dec79425ba13fdc993 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 16:00:04 +0000
Subject: [PATCH 22/43] docs: shard splitting RFC (#6358)

Extend the previous sharding RFC with functionality for dynamically splitting shards to increase the total shard count on existing tenants.
---
 docs/rfcs/032-shard-splitting.md | 479 +++++++++++++++++++++++++++++++
 1 file changed, 479 insertions(+)
 create mode 100644 docs/rfcs/032-shard-splitting.md

diff --git a/docs/rfcs/032-shard-splitting.md b/docs/rfcs/032-shard-splitting.md
new file mode 100644
index 0000000000..d5fbda8415
--- /dev/null
+++ b/docs/rfcs/032-shard-splitting.md
@@ -0,0 +1,479 @@
+# Shard splitting
+
+## Summary
+
+This RFC describes a new pageserver API for splitting an existing tenant shard into
+multiple shards, and describes how to use this API to safely increase the total
+shard count of a tenant.
+
+## Motivation
+
+In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
+tenants beyond the capacity of a single pageserver by breaking up the key space
+into stripes, and distributing these stripes across many pageservers. However,
+the shard count was defined once at tenant creation time and not varied thereafter.
+
+In practice, the expected size of a database is rarely known at creation time, and
+it is inefficient to enable sharding for very small tenants: we need to be
+able to create a tenant with a small number of shards (such as 1), and later expand
+when it becomes clear that the tenant has grown in size to a point where sharding
+is beneficial.
+
+### Prior art
+
+Many distributed systems have the problem of choosing how many shards to create for
+tenants that do not specify an expected size up-front. There are a couple of general
+approaches:
+
+- Write to a key space in order, and start a new shard when the highest key advances
+  past some point. This doesn't work well for Neon, because we write to our key space
+  in many different contiguous ranges (per relation), rather than in one contiguous
+  range. To adapt to this kind of model, we would need a sharding scheme where each
+  relation had its own range of shards, which would be inefficient for the common
+  case of databases with many small relations.
+- Monitor the system, and automatically re-shard at some size threshold. For
+  example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
+  component monitors the size of each RADOS Pool, and adjusts the number of Placement
+  Groups (Ceph's shard equivalent).
+
+## Requirements
+
+- A configurable capacity limit per-shard is enforced.
+- Changes in shard count do not interrupt service beyond requiring postgres
+  to reconnect (i.e. milliseconds).
+- Human being does not have to choose shard count
+
+## Non Goals
+
+- Shard splitting is always a tenant-global operation: we will not enable splitting
+  one shard while leaving others intact.
+- The inverse operation (shard merging) is not described in this RFC. This is a lower
+  priority than splitting, because databases grow more often than they shrink, and
+  a database with many shards will still work properly if the stored data shrinks, just
+  with slightly more overhead (e.g. redundant WAL replication)
+- Shard splitting is only initiated based on capacity bounds, not load. Splitting
+  a tenant based on load will make sense for some medium-capacity, high-load workloads,
+  but is more complex to reason about and likely is not desirable until we have
+  shard merging to reduce the shard count again if the database becomes less busy.
+
+## Impacted Components
+
+pageserver, storage controller
+
+(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
+
+## Terminology
+
+**Parent** shards are the shards that exist before a split. **Child** shards are
+the new shards created during a split.
+
+**Shard** is synonymous with _tenant shard_.
+
+**Shard Index** is the 2-tuple of shard number and shard count, written in
+paths as {:02x}{:02x}, e.g. `0001`.
+
+## Background
+
+In the implementation section, a couple of existing aspects of sharding are important
+to remember:
+
+- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
+  a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
+  storage paths, and remote index metadata.
+- Remote layer file paths contain the shard index of the shard that created them, and
+  remote indices contain the same index to enable building the layer file path. A shard's
+  index may reference layers that were created by another shard.
+- Local tenant shard directories include the shard index. All layers downloaded by
+  a tenant shard are stored in this shard-prefixed path, even if those layers were
+  initially created by another shard: tenant shards do not read and write one anothers'
+  paths.
+- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
+  This is for historical reasons and will be cleaned up in future, but the existing
+  name is used here to help comprehension when reading code.
+
+## Implementation
+
+Note: this section focuses on the correctness of the core split process. This will
+be fairly inefficient in a naive implementation, and several important optimizations
+are described in a later section.
+
+There are broadly two parts to the implementation:
+
+1. The pageserver split API, which splits one shard on one pageserver
+2. The overall tenant split proccess which is coordinated by the storage controller,
+   and calls into the pageserver split API as needed.
+
+### Pageserver Split API
+
+The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
+that takes the new total shard count in the body.
+
+The pageserver split API operates on one tenant shard, on one pageserver. External
+coordination is required to use it safely, this is described in the later
+'Split procedure' section.
+
+#### Preparation
+
+First identify the shard indices for the new child shards. These are deterministic,
+calculated from the parent shard's index, and the number of children being created (this
+is an input to the API, and validated to be a power of two). In a trivial example, splitting
+0001 in two always results in 0002 and 0102.
+
+Child shard indices are chosen such that the childrens' parts of the keyspace will
+be subsets of the parent's parts of the keyspace.
+
+#### Step 1: write new remote indices
+
+In remote storage, splitting is very simple: we may just write new index_part.json
+objects for each child shard, containing exactly the same layers as the parent shard.
+
+The children will have more data than they need, but this avoids any exhausive
+re-writing or copying of layer files.
+
+The index key path includes a generation number: the parent shard's current
+attached generation number will also be used for the child shards' indices. This
+makes the operation safely retryable: if everything crashes and restarts, we may
+call the split API again on the parent shard, and the result will be some new remote
+indices for the child shards, under a higher generation number.
+
+#### Step 2: start new `Tenant` objects
+
+A new `Tenant` object may be instantiated for each child shard, while the parent
+shard still exists. When calling the tenant_spawn function for this object,
+the remote index from step 1 will be read, and the child shard will start
+to ingest WAL to catch up from whatever was in the remote storage at step 1.
+
+We now wait for child shards' WAL ingestion to catch up with the parent shard,
+so that we can safely tear down the parent shard without risking an availability
+gap to clients reading recent LSNs.
+
+#### Step 3: tear down parent `Tenant` object
+
+Once child shards are running and have caught up with WAL ingest, we no longer
+need the parent shard. Note that clients may still be using it -- when we
+shut it down, any page_service handlers will also shut down, causing clients
+to disconnect. When the client reconnects, it will re-lookup the tenant,
+and hit the child shard instead of the parent (shard lookup from page_service
+should bias toward higher ShardCount shards).
+
+Note that at this stage the page service client has not yet been notified of
+any split. In the trivial single split example:
+
+- Shard 0001 is gone: Tenant object torn down
+- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
+- Clients will continue to connect to that server thinking that shard 0001 is there,
+  and all requests will work, because any key that was in shard 0001 is definitely
+  available in either shard 0002 or shard 0102.
+- Eventually, the storage controller (not the pageserver) will decide to migrate
+  some child shards away: at that point it will do a live migration, ensuring
+  that the client has an updated configuration before it detaches anything
+  from the original server.
+
+#### Complete
+
+When we send a 200 response to the split request, we are promising the caller:
+
+- That the child shards are persistent in remote storage
+- That the parent shard has been shut down
+
+This enables the caller to proceed with the overall shard split operation, which
+may involve other shards on other pageservers.
+
+### Storage Controller Split procedure
+
+Splitting a tenant requires calling the pageserver split API, and tracking
+enough state to ensure recovery + completion in the event of any component (pageserver
+or storage controller) crashing (or request timing out) during the split.
+
+1. call the split API on all existing shards. Ensure that the resulting
+   child shards are pinned to their pageservers until _all_ the split calls are done.
+   This pinning may be implemented as a "split bit" on the tenant shards, that
+   blocks any migrations, and also acts as a sign that if we restart, we must go
+   through some recovery steps to resume the split.
+2. Once all the split calls are done, we may unpin the child shards (clear
+   the split bit). The split is now complete: subsequent steps are just migrations,
+   not strictly part of the split.
+3. Try to schedule new pageserver locations for the child shards, using
+   a soft anti-affinity constraint to place shards from the same tenant onto different
+   pageservers.
+
+Updating computes about the new shard count is not necessary until we migrate
+any of the child shards away from the parent's location.
+
+### Recovering from failures
+
+#### Rolling back an incomplete split
+
+An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
+and detaching child shards. This will lose any WAL ingested into the children after the parents
+were detached earlier, but the parents will catch up.
+
+No special pageserver API is needed for this. From the storage controllers point of view, the
+procedure is:
+
+1. For all parent shards in the tenant, ensure they are attached
+2. For all child shards, ensure they are not attached
+3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
+
+Any remote storage content for child shards is left behind. This is similar to other cases where
+we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
+index that references it). Future online scrub/cleanup functionality can remove these objects, or
+they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
+which would include any child shards that were rolled back.
+
+If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
+this, we will **block timeline creation during splitting**, so that we can safely roll back until
+the split is complete, without risking losing timelines.
+
+Rolling back an incomplete split will happen automatically if a split fails due to some fatal
+reason, and will not be accessible via an API:
+
+- A pageserver fails to complete its split API request after too many retries
+- A pageserver returns a fatal unexpected error such as 400 or 500
+- The storage controller database returns a non-retryable error
+- Some internal invariant is violated in the storage controller split code
+
+#### Rolling back a complete split
+
+A complete shard split may be rolled back similarly to an incomplete split, with the following
+modifications:
+
+- The parent shards will no longer exist in the storage controller database, so these must
+  be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This
+  may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
+  shards in the storage controller database.
+- Any timelines that were created after the split complete will disappear when rolling back
+  to the tenant shards. For this reason, rolling back after a complete split should only
+  be done due to serious issues where loss of recently created timelines is acceptable, or
+  in cases where we have confirmed that no timelines were created in the intervening period.
+- Parent shards' layers must not have been deleted: this property will come "for free" when
+  we first roll out sharding, by simply not implementing deletion of parent layers after
+  a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
+  Optimizations section), it should apply a TTL to layers such that we have a
+  defined walltime window in which rollback will be possible.
+
+The storage controller will expose an API for rolling back a complete split, for use
+in the field if we encounter some critical bug with a post-split tenant.
+
+#### Retrying API calls during Pageserver Restart
+
+When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
+child shards from an ongoing split. This does not intrinsically break anything, and the
+pageserver may include all these shards in its `/re-attach` request to the storage controller.
+
+In order to support such restarts, it is important that the storage controller stores
+persistent records of each child shard before it calls into a pageserver, as these child shards
+may require generation increments via a `/re-attach` request.
+
+The pageserver restart will also result in a failed API call from the storage controller's point
+of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
+complete, and all shards must remain pinned to their current pageserver locations until the
+split is done.
+
+The pageserver API calls during splitting will retry on transient errors, so that
+short availability gaps do not result in a failure of the overall operation. The
+split in progress will be automatically rolled back if the threshold for API
+retries is reached (e.g. if a pageserver stays offline for longer than a typical
+restart).
+
+#### Rollback on Storage Controller Restart
+
+On startup, the storage controller will inspect the split bit for tenant shards that
+it loads from the database. If any splits are in progress:
+
+- Database content will be reverted to the parent shards
+- Child shards will be dropped from memory
+- The parent and child shards will be included in the general startup reconciliation that
+  the storage controller does: any child shards will be detached from pageservers because
+  they don't exist in the storage controller's expected set of shards, and parent shards
+  will be attached if they aren't already.
+
+#### Storage controller API request failures/retries
+
+The split request handler will implement idempotency: if the [`Tenant`] requested to split
+doesn't exist, we will check for the would-be child shards, and if they already exist,
+we consider the request complete.
+
+If a request is retried while the original request is still underway, then the split
+request handler will notice an InProgress marker in TenantManager, and return 503
+to encourage the client to backoff/retry. This is the same as the general pageserver
+API handling for calls that try to act on an InProgress shard.
+
+#### Compute start/restart during a split
+
+If a compute starts up during split, it will be configured with the old sharding
+configuration. This will work for reads irrespective of the progress of the split
+as long as no child hards have been migrated away from their original location, and
+this is guaranteed in the split procedure (see earlier section).
+
+#### Pageserver fails permanently during a split
+
+If a pageserver permanently fails (i.e. the storage controller availability state for it
+goes to Offline) while a split is in progress, the splitting operation will roll back, and
+during the roll back it will skip any API calls to the offline pageserver. If the offline
+pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
+
+### Handling secondary locations
+
+For correctness, it is not necessary to split secondary locations. We can simply detach
+the secondary locations for parent shards, and then attach new secondary locations
+for child shards.
+
+Clearly this is not optimal, as it will result in re-downloads of layer files that
+were already present on disk. See "Splitting secondary locations"
+
+### Conditions to trigger a split
+
+The pageserver will expose a new API for reporting on shards that are candidates
+for split: this will return a top-N report of the largest tenant shards by
+physical size (remote size). This should exclude any tenants that are already
+at the maximum configured shard count.
+
+The API would look something like:
+`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
+
+The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
+
+A split operation will be started when the tenant exceeds some threshold. This threshold
+should be _less than_ how large we actually want shards to be, perhaps much less. That's to
+minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
+wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
+tenant size distribution may be useful here: if we can make a statement like "usually, if
+a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
+make our policy to split a tenant at 20GiB.
+
+The finest split we can do is by factors of two, but we can do higher-cardinality splits
+too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
+as it grows. An example of a very simple heuristic for early deployment of the splitting
+feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
+would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
+split a tenant, it will not need re-splitting soon after.
+
+## Optimizations
+
+### Flush parent shard to remote storage during split
+
+Any data that is in WAL but not remote storage at time of split will need
+to be replayed by child shards when they start for the first time. To minimize
+this work, we may flush the parent shard to remote storage before writing the
+remote indices for child shards.
+
+It is important that this flush is subject to some time bounds: we may be splitting
+in response to a surge of write ingest, so it may be time-critical to split. A
+few seconds to flush latest data should be sufficient to optimize common cases without
+running the risk of holding up a split for a harmful length of time when a parent
+shard is being written heavily. If the flush doesn't complete in time, we may proceed
+to shut down the parent shard and carry on with the split.
+
+### Hard linking parent layers into child shard directories
+
+Before we start the Tenant objects for child shards, we may pre-populate their
+local storage directories with hard links to the layer files already present
+in the parent shard's local directory. When the child shard starts and downloads
+its remote index, it will find all those layer files already present on local disk.
+
+This avoids wasting download capacity and makes splitting faster, but more importantly
+it avoids taking up a factor of N more disk space when splitting 1 shard into N.
+
+This mechanism will work well in typical flows where shards are migrated away
+promptly after a split, but for the general case including what happens when
+layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
+section below.
+
+### Filtering during compaction
+
+Compaction, especially image layer generation, should skip any keys that are
+present in a shard's layer files, but do not match the shard's ShardIdentity's
+is_key_local() check. This avoids carrying around data for longer than necessary
+in post-split compactions.
+
+This was already implemented in https://github.com/neondatabase/neon/pull/6246
+
+### Proactive compaction
+
+In remote storage, there is little reason to rewrite any data on a shard split:
+all the children can reference parent layers via the very cheap write of the child
+index_part.json.
+
+In local storage, things are more nuanced. During the initial split there is no
+capacity cost to duplicating parent layers, if we implement the hard linking
+optimization described above. However, as soon as any layers are evicted from
+local disk and re-downloaded, the downloaded layers will not be hard-links any more:
+they'll have real capacity footprint. That isn't a problem if we migrate child shards
+away from the parent node swiftly, but it risks a significant over-use of local disk
+space if we do not.
+
+For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
+the shards elsewhere, then churned all the layers in all the shards via eviction,
+then we would blow up the storage capacity used on the node by 8x. If we're splitting
+a 100GB shard, that could take the pageserver to the point of exhausting disk space.
+
+To avoid this scenario, we could implement a special compaction mode where we just
+read historic layers, drop unwanted keys, and write back the layer file. This
+is pretty expensive, but useful if we have split a large shard and are not going to
+migrate the child shards away.
+
+The heuristic conditions for triggering such a compaction are:
+
+- A) eviction plus time: if a child shard
+  has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
+- B) resident size plus time: we may inspect the resident layers and calculate how
+  many of them include the overhead of storing pre-split keys. After some time
+  threshold (different to the one in case A) we still have such layers occupying
+  local disk space, then we should proactively compact them.
+
+### Cleaning up parent-shard layers
+
+It is functionally harmless to leave parent shard layers in remote storage indefinitely.
+They would be cleaned up in the event of the tenant's deletion.
+
+As an optimization to avoid leaking remote storage capacity (which costs money), we may
+lazily clean up parent shard layers once no child shards reference them.
+
+This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
+
+- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
+  which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard*count < max(shard_count) over all shards)`, and those shard prefixes which do have the latest shard count (_current shards_)
+- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
+  may drop out now.
+- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
+- for all ancestral shards, list objects in the prefix and delete any layer which was not
+  referenced by a current shard.
+
+If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
+
+The cleanup may be done by the scrubber (external process), or we may choose to have
+the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
+reading the other shard's indices at runtime, and we do not require visibility of the
+latest index writes.
+
+Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
+that we retain the option to roll back a split in case of bugs.
+
+### Splitting secondary locations
+
+We may implement a pageserver API similar to the main splitting API, which does a simpler
+operation for secondary locations: it would not write anything to S3, instead it would simply
+create the child shard directory on local disk, hard link in directories from the parent,
+and set up the in memory (TenantSlot) state for the children.
+
+Similar to attached locations, a subset of secondary locations will probably need re-locating
+after the split is complete, to avoid leaving multiple child shards on the same pageservers,
+where they may use excessive space for the tenant.
+
+## FAQ/Alternatives
+
+### What should the thresholds be set to?
+
+Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
+
+Max shard count:
+
+- The safekeeper overhead to sharding is currently O(N) network bandwidth because
+  the un-filtered WAL is sent to all shards. To avoid this growing out of control,
+  a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
+  on the safekeeper.
+- there is also little benefit to increasing the shard count beyond the number
+  of pageservers in a region.
+
+### Is it worth just rewriting all the data during a split to simplify reasoning about space?

From bc1efa827f794fee9ac3755b4d51b344dd2cefbf Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 16:07:36 +0000
Subject: [PATCH 23/43] pageserver: exclude gc_horizon from synthetic size
 calculation (#6407)

## Problem

See:
- https://github.com/neondatabase/neon/issues/6374

## Summary of changes

Whereas previously we calculated synthetic size from the gc_horizon or
the pitr_interval (whichever is the lower LSN), now we ignore gc_horizon
and exclusively start from the `pitr_interval`. This is a more generous
calculation for billing, where we do not charge users for data retained
due to gc_horizon.
---
 pageserver/src/tenant/size.rs           |  8 +++-
 pageserver/src/tenant/timeline.rs       |  5 ++-
 test_runner/regress/test_tenant_size.py | 54 +++++++++++++++++++------
 3 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index e0b1652d98..ad79b74d8b 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -183,7 +183,13 @@ pub(super) async fn gather_inputs(
         // new gc run, which we have no control over. however differently from `Timeline::gc`
         // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
         // actually removing files.
-        let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
+        //
+        // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
+        // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
+        // than a space bound (horizon cutoff).  This means that if someone drops a database and waits for their
+        // PITR interval, they will see synthetic size decrease, even if we are still storing data inside
+        // horizon_cutoff.
+        let mut next_gc_cutoff = gc_info.pitr_cutoff;
 
         // If the caller provided a shorter retention period, use that instead of the GC cutoff.
         let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d507a19de9..2ab7301cce 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3784,8 +3784,11 @@ impl Timeline {
                         // The timestamp is in the future. That sounds impossible,
                         // but what it really means is that there hasn't been
                         // any commits since the cutoff timestamp.
+                        //
+                        // In this case we should use the LSN of the most recent commit,
+                        // which is implicitly the last LSN in the log.
                         debug!("future({})", lsn);
-                        cutoff_horizon
+                        self.get_last_record_lsn()
                     }
                     LsnForTimestamp::Past(lsn) => {
                         debug!("past({})", lsn);
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 7cea301a9c..025cc930d7 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from typing import List, Tuple
 
@@ -326,7 +327,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa
     size_debug_file.write(size_debug)
 
 
-@pytest.mark.xfail
+@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
 def test_single_branch_get_tenant_size_grows(
     neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion
 ):
@@ -349,10 +350,21 @@ def test_single_branch_get_tenant_size_grows(
     # adjust the gc_horizon accordingly.
     if pg_version == PgVersion.V14:
         gc_horizon = 0x4A000
+    elif pg_version == PgVersion.V15:
+        gc_horizon = 0x3BA00
+    elif pg_version == PgVersion.V16:
+        gc_horizon = 210000
+    else:
+        raise NotImplementedError(pg_version)
 
-    neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
+    tenant_config = {
+        "compaction_period": "0s",
+        "gc_period": "0s",
+        "pitr_interval": "0s",
+        "gc_horizon": gc_horizon,
+    }
 
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_config)
 
     tenant_id = env.initial_tenant
     branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
@@ -405,6 +417,7 @@ def test_single_branch_get_tenant_size_grows(
             current_lsn = after_lsn
         size_debug_file.write(size_debug)
         assert size > 0
+        log.info(f"size: {size} at lsn {current_lsn}")
         return (current_lsn, size)
 
     with env.endpoints.create_start(
@@ -492,24 +505,41 @@ def test_single_branch_get_tenant_size_grows(
 
             collected_responses.append(("DELETE", current_lsn, size))
 
+        size_before_drop = get_current_consistent_size(
+            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+        )[1]
+
         with endpoint.cursor() as cur:
             cur.execute("DROP TABLE t0")
 
+        # Without setting a PITR interval, dropping the table doesn't reclaim any space
+        # from the user's point of view, because the DROP transaction is too small
+        # to fall out of gc_horizon.
+        (current_lsn, size) = get_current_consistent_size(
+            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+        )
+        prev_size = collected_responses[-1][2]
+        check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+
+        # Set a tiny PITR interval to allow the DROP to impact the synthetic size
+        # Because synthetic size calculation uses pitr interval when available,
+        # when our tenant is configured with a tiny pitr interval, dropping a table should
+        # cause synthetic size to go down immediately
+        tenant_config["pitr_interval"] = "1ms"
+        env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
+        (current_lsn, size) = get_current_consistent_size(
+            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+        )
+        assert size < size_before_drop
+
         # The size of the tenant should still be as large as before we dropped
         # the table, because the drop operation can still be undone in the PITR
         # defined by gc_horizon.
-        (current_lsn, size) = get_current_consistent_size(
-            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
-        )
-
-        prev_size = collected_responses[-1][2]
-
-        check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
-
         collected_responses.append(("DROP", current_lsn, size))
 
     # Should have gone past gc_horizon, otherwise gc_horizon is too large
-    assert current_lsn - initdb_lsn > gc_horizon
+    bytes_written = current_lsn - initdb_lsn
+    assert bytes_written > gc_horizon
 
     # this isn't too many lines to forget for a while. observed while
     # developing these tests that locally the value is a bit more than what we

From 60f30000ef4c9b4c06aa0ba0455af4c4f0ba3940 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 15 Mar 2024 18:46:04 +0100
Subject: [PATCH 24/43] tokio-epoll-uring: fallback to std-fs if not available
 & not explicitly requested (#7120)

fixes https://github.com/neondatabase/neon/issues/7116

Changes:

- refactor PageServerConfigBuilder: support not-set values
- implement runtime feature test
- use runtime feature test to determine `virtual_file_io_engine` if not
explicitly configured in the config
- log the effective engine at startup
- drive-by: improve assertion messages in `test_pageserver_init_node_id`

This needed a tiny bit of tokio-epoll-uring work, hence bumping it.
Changelog:

```
    git log --no-decorate --oneline --reverse 868d2c42b5d54ca82fead6e8f2f233b69a540d3e..342ddd197a060a8354e8f11f4d12994419fff939
    c7a74c6 Bump mio from 0.8.8 to 0.8.11
    4df3466 Bump mio from 0.8.8 to 0.8.11 (#47)
    342ddd1 lifecycle: expose `LaunchResult` enum (#49)
```
---
 Cargo.lock                                 |   4 +-
 pageserver/src/bin/pageserver.rs           |   3 +
 pageserver/src/config.rs                   | 235 ++++++++++-----------
 pageserver/src/virtual_file.rs             |   2 +
 pageserver/src/virtual_file/io_engine.rs   |  76 +++++++
 test_runner/regress/test_pageserver_api.py |  15 +-
 6 files changed, 195 insertions(+), 140 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b8b276d74f..99ba8b1cb3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5887,7 +5887,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
  "futures",
  "nix 0.26.4",
@@ -6424,7 +6424,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
  "bytes",
  "io-uring",
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 59750897ff..6380a4c6c1 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -120,6 +120,9 @@ fn main() -> anyhow::Result<()> {
         &[("node_id", &conf.id.to_string())],
     );
 
+    // after setting up logging, log the effective IO engine choice
+    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
+
     let tenants_path = conf.tenants_path();
     if !tenants_path.exists() {
         utils::crashsafe::create_dir_all(conf.tenants_path())
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 7dac0ab352..8ad9ade4a9 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -30,15 +30,14 @@ use utils::{
     logging::LogFormat,
 };
 
-use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
-use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
-use crate::virtual_file;
+use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
+use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{
     IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
     TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
@@ -291,16 +290,23 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
 
 // use dedicated enum for builder to better indicate the intention
 // and avoid possible confusion with nested options
+#[derive(Clone, Default)]
 pub enum BuilderValue<T> {
     Set(T),
+    #[default]
     NotSet,
 }
 
-impl<T> BuilderValue<T> {
-    pub fn ok_or<E>(self, err: E) -> Result<T, E> {
+impl<T: Clone> BuilderValue<T> {
+    pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
         match self {
-            Self::Set(v) => Ok(v),
-            Self::NotSet => Err(err),
+            Self::Set(v) => Ok(v.clone()),
+            Self::NotSet => match default {
+                BuilderValue::Set(v) => Ok(v.clone()),
+                BuilderValue::NotSet => {
+                    anyhow::bail!("missing config value {field_name:?}")
+                }
+            },
         }
     }
 }
@@ -326,6 +332,7 @@ pub(crate) struct NodeMetadata {
 }
 
 // needed to simplify config construction
+#[derive(Default)]
 struct PageServerConfigBuilder {
     listen_pg_addr: BuilderValue<String>,
 
@@ -393,8 +400,9 @@ struct PageServerConfigBuilder {
     validate_vectored_get: BuilderValue<bool>,
 }
 
-impl Default for PageServerConfigBuilder {
-    fn default() -> Self {
+impl PageServerConfigBuilder {
+    #[inline(always)]
+    fn default_values() -> Self {
         use self::BuilderValue::*;
         use defaults::*;
         Self {
@@ -647,125 +655,96 @@ impl PageServerConfigBuilder {
     }
 
     pub fn build(self) -> anyhow::Result<PageServerConf> {
-        let concurrent_tenant_warmup = self
-            .concurrent_tenant_warmup
-            .ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
-        let concurrent_tenant_size_logical_size_queries = self
-            .concurrent_tenant_size_logical_size_queries
-            .ok_or(anyhow!(
-                "missing concurrent_tenant_size_logical_size_queries"
-            ))?;
-        Ok(PageServerConf {
-            listen_pg_addr: self
-                .listen_pg_addr
-                .ok_or(anyhow!("missing listen_pg_addr"))?,
-            listen_http_addr: self
-                .listen_http_addr
-                .ok_or(anyhow!("missing listen_http_addr"))?,
-            availability_zone: self
-                .availability_zone
-                .ok_or(anyhow!("missing availability_zone"))?,
-            wait_lsn_timeout: self
-                .wait_lsn_timeout
-                .ok_or(anyhow!("missing wait_lsn_timeout"))?,
-            wal_redo_timeout: self
-                .wal_redo_timeout
-                .ok_or(anyhow!("missing wal_redo_timeout"))?,
-            superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
-            page_cache_size: self
-                .page_cache_size
-                .ok_or(anyhow!("missing page_cache_size"))?,
-            max_file_descriptors: self
-                .max_file_descriptors
-                .ok_or(anyhow!("missing max_file_descriptors"))?,
-            workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
-            pg_distrib_dir: self
-                .pg_distrib_dir
-                .ok_or(anyhow!("missing pg_distrib_dir"))?,
-            http_auth_type: self
-                .http_auth_type
-                .ok_or(anyhow!("missing http_auth_type"))?,
-            pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
-            auth_validation_public_key_path: self
-                .auth_validation_public_key_path
-                .ok_or(anyhow!("missing auth_validation_public_key_path"))?,
-            remote_storage_config: self
-                .remote_storage_config
-                .ok_or(anyhow!("missing remote_storage_config"))?,
-            id: self.id.ok_or(anyhow!("missing id"))?,
-            // TenantConf is handled separately
-            default_tenant_conf: TenantConf::default(),
-            broker_endpoint: self
-                .broker_endpoint
-                .ok_or(anyhow!("No broker endpoints provided"))?,
-            broker_keepalive_interval: self
-                .broker_keepalive_interval
-                .ok_or(anyhow!("No broker keepalive interval provided"))?,
-            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
-            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
-            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
-                concurrent_tenant_size_logical_size_queries,
-            ),
-            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
-                concurrent_tenant_size_logical_size_queries,
-            ),
-            metric_collection_interval: self
-                .metric_collection_interval
-                .ok_or(anyhow!("missing metric_collection_interval"))?,
-            cached_metric_collection_interval: self
-                .cached_metric_collection_interval
-                .ok_or(anyhow!("missing cached_metric_collection_interval"))?,
-            metric_collection_endpoint: self
-                .metric_collection_endpoint
-                .ok_or(anyhow!("missing metric_collection_endpoint"))?,
-            synthetic_size_calculation_interval: self
-                .synthetic_size_calculation_interval
-                .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
-            disk_usage_based_eviction: self
-                .disk_usage_based_eviction
-                .ok_or(anyhow!("missing disk_usage_based_eviction"))?,
-            test_remote_failures: self
-                .test_remote_failures
-                .ok_or(anyhow!("missing test_remote_failuers"))?,
-            ondemand_download_behavior_treat_error_as_warn: self
-                .ondemand_download_behavior_treat_error_as_warn
-                .ok_or(anyhow!(
-                    "missing ondemand_download_behavior_treat_error_as_warn"
-                ))?,
-            background_task_maximum_delay: self
-                .background_task_maximum_delay
-                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
-            control_plane_api: self
-                .control_plane_api
-                .ok_or(anyhow!("missing control_plane_api"))?,
-            control_plane_api_token: self
-                .control_plane_api_token
-                .ok_or(anyhow!("missing control_plane_api_token"))?,
-            control_plane_emergency_mode: self
-                .control_plane_emergency_mode
-                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
-            heatmap_upload_concurrency: self
-                .heatmap_upload_concurrency
-                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
-            secondary_download_concurrency: self
-                .secondary_download_concurrency
-                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
-            ingest_batch_size: self
-                .ingest_batch_size
-                .ok_or(anyhow!("missing ingest_batch_size"))?,
-            virtual_file_io_engine: self
-                .virtual_file_io_engine
-                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
-            get_vectored_impl: self
-                .get_vectored_impl
-                .ok_or(anyhow!("missing get_vectored_impl"))?,
-            max_vectored_read_bytes: self
-                .max_vectored_read_bytes
-                .ok_or(anyhow!("missing max_vectored_read_bytes"))?,
-            validate_vectored_get: self
-                .validate_vectored_get
-                .ok_or(anyhow!("missing validate_vectored_get"))?,
-        })
+        let default = Self::default_values();
+
+        macro_rules! conf {
+            (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
+                PageServerConf {
+                    $(
+                        $field: self.$field.ok_or(stringify!($field), default.$field)?,
+                    )*
+                    $(
+                        $custom_field: $custom_value,
+                    )*
+                }
+            };
+        }
+
+        Ok(conf!(
+            USING DEFAULT
+            {
+                listen_pg_addr,
+                listen_http_addr,
+                availability_zone,
+                wait_lsn_timeout,
+                wal_redo_timeout,
+                superuser,
+                page_cache_size,
+                max_file_descriptors,
+                workdir,
+                pg_distrib_dir,
+                http_auth_type,
+                pg_auth_type,
+                auth_validation_public_key_path,
+                remote_storage_config,
+                id,
+                broker_endpoint,
+                broker_keepalive_interval,
+                log_format,
+                metric_collection_interval,
+                cached_metric_collection_interval,
+                metric_collection_endpoint,
+                synthetic_size_calculation_interval,
+                disk_usage_based_eviction,
+                test_remote_failures,
+                ondemand_download_behavior_treat_error_as_warn,
+                background_task_maximum_delay,
+                control_plane_api,
+                control_plane_api_token,
+                control_plane_emergency_mode,
+                heatmap_upload_concurrency,
+                secondary_download_concurrency,
+                ingest_batch_size,
+                get_vectored_impl,
+                max_vectored_read_bytes,
+                validate_vectored_get,
+            }
+            CUSTOM LOGIC
+            {
+                // TenantConf is handled separately
+                default_tenant_conf: TenantConf::default(),
+                concurrent_tenant_warmup: ConfigurableSemaphore::new({
+                    self
+                        .concurrent_tenant_warmup
+                        .ok_or("concurrent_tenant_warmpup",
+                               default.concurrent_tenant_warmup)?
+                }),
+                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
+                    self
+                        .concurrent_tenant_size_logical_size_queries
+                        .ok_or("concurrent_tenant_size_logical_size_queries",
+                               default.concurrent_tenant_size_logical_size_queries.clone())?
+                ),
+                eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
+                    // re-use `concurrent_tenant_size_logical_size_queries`
+                    self
+                        .concurrent_tenant_size_logical_size_queries
+                        .ok_or("eviction_task_immitated_concurrent_logical_size_queries",
+                               default.concurrent_tenant_size_logical_size_queries.clone())?,
+                ),
+                virtual_file_io_engine: match self.virtual_file_io_engine {
+                    BuilderValue::Set(v) => v,
+                    BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
+                        io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
+                        io_engine::FeatureTestResult::Worse { engine, remark } => {
+                            // TODO: bubble this up to the caller so we can tracing::warn! it.
+                            eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
+                            engine
+                        }
+                    },
+                },
+            }
+        ))
     }
 }
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 6d4774cf75..ae44e9edc4 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -28,6 +28,8 @@ use tokio::time::Instant;
 
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
+pub use io_engine::feature_test as io_engine_feature_test;
+pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 pub(crate) use io_engine::IoEngineKind;
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index e369d28711..7f2342e76e 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -253,3 +253,79 @@ impl IoEngine {
         }
     }
 }
+
+pub enum FeatureTestResult {
+    PlatformPreferred(IoEngineKind),
+    Worse {
+        engine: IoEngineKind,
+        remark: String,
+    },
+}
+
+impl FeatureTestResult {
+    #[cfg(target_os = "linux")]
+    const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring;
+    #[cfg(not(target_os = "linux"))]
+    const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs;
+}
+
+impl From<FeatureTestResult> for IoEngineKind {
+    fn from(val: FeatureTestResult) -> Self {
+        match val {
+            FeatureTestResult::PlatformPreferred(e) => e,
+            FeatureTestResult::Worse { engine, .. } => engine,
+        }
+    }
+}
+
+/// Somewhat costly under the hood, do only once.
+/// Panics if we can't set up the feature test.
+pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
+    std::thread::spawn(|| {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+        #[cfg(not(target_os = "linux"))]
+        {
+            return Ok(FeatureTestResult::PlatformPreferred(
+                FeatureTestResult::PLATFORM_DEFAULT,
+            ));
+        }
+        #[cfg(target_os = "linux")]
+        Ok(match rt.block_on(tokio_epoll_uring::System::launch()) {
+            Ok(_) => FeatureTestResult::PlatformPreferred({
+                assert!(matches!(
+                    IoEngineKind::TokioEpollUring,
+                    FeatureTestResult::PLATFORM_PREFERRED
+                ));
+                FeatureTestResult::PLATFORM_PREFERRED
+            }),
+            Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => {
+                let remark = match e.raw_os_error() {
+                    Some(nix::libc::EPERM) => {
+                        // fall back
+                        "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled "
+                            .to_string()
+                    }
+                   Some(nix::libc::EFAULT) => {
+                        // fail feature test
+                        anyhow::bail!(
+                            "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory"
+                        );
+                    }
+                    Some(_) | None => {
+                        // fall back
+                        format!("creating tokio-epoll-uring fails with error: {e:#}")
+                    }
+               };
+                FeatureTestResult::Worse {
+                    engine: IoEngineKind::StdFs,
+                    remark,
+                }
+            }
+        })
+    })
+    .join()
+    .unwrap()
+}
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 877deee08f..81aed704bb 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -37,23 +37,18 @@ def test_pageserver_init_node_id(
     assert (
         bad_init.returncode == 1
     ), "pageserver should not be able to init new config without the node id"
-    assert "missing id" in bad_init.stderr
+    assert 'missing config value "id"' in bad_init.stderr
     assert not pageserver_config.exists(), "config file should not be created after init error"
 
-    completed_init = run_pageserver(
-        ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
-    )
+    good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
+    completed_init = run_pageserver(good_init_cmd)
     assert (
         completed_init.returncode == 0
     ), "pageserver should be able to create a new config with the node id given"
     assert pageserver_config.exists(), "config file should be created successfully"
 
-    bad_reinit = run_pageserver(
-        ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
-    )
-    assert (
-        bad_reinit.returncode == 1
-    ), "pageserver should not be able to init new config without the node id"
+    bad_reinit = run_pageserver(good_init_cmd)
+    assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists"
     assert "already exists, cannot init it" in bad_reinit.stderr
 
     bad_update = run_pageserver(["--update-config", "-c", "id = 3"])

From 1aa159accac6f80b2565cf79d30ac584959fe32a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 18:03:49 +0000
Subject: [PATCH 25/43] pageserver: cancellation for remote ops in tenant
 deletion on shutdown (#6105)

## Problem

Tenant deletion had a couple of TODOs where we weren't using proper
cancellation tokens that would have aborted the deletions during process
shutdown.

## Summary of changes

- Refactor enough that deletion/shutdown code has access to the
TenantManager's cancellation toke
- Use that cancellation token in tenant deletion instead of dummy
tokens.
---
 pageserver/src/bin/pageserver.rs            |  4 +-
 pageserver/src/lib.rs                       |  9 ++++-
 pageserver/src/task_mgr.rs                  |  4 +-
 pageserver/src/tenant/delete.rs             | 21 +++++------
 pageserver/src/tenant/mgr.rs                | 42 +++++++++++++--------
 test_runner/regress/test_tenant_delete.py   |  4 +-
 test_runner/regress/test_timeline_delete.py |  6 +--
 7 files changed, 51 insertions(+), 39 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 6380a4c6c1..1fd7c775d5 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -317,6 +317,7 @@ fn start_pageserver(
     let http_listener = tcp_listener::bind(http_addr)?;
 
     let pg_addr = &conf.listen_pg_addr;
+
     info!("Starting pageserver pg protocol handler on {pg_addr}");
     let pageserver_listener = tcp_listener::bind(pg_addr)?;
 
@@ -549,7 +550,7 @@ fn start_pageserver(
         let router_state = Arc::new(
             http::routes::State::new(
                 conf,
-                tenant_manager,
+                tenant_manager.clone(),
                 http_auth.clone(),
                 remote_storage.clone(),
                 broker_client.clone(),
@@ -693,6 +694,7 @@ fn start_pageserver(
                 let bg_remote_storage = remote_storage.clone();
                 let bg_deletion_queue = deletion_queue.clone();
                 BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                    &tenant_manager,
                     bg_remote_storage.map(|_| bg_deletion_queue),
                     0,
                 ));
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index b00db02a1c..f947a75f61 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -31,6 +31,7 @@ pub mod walredo;
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
+use tenant::mgr::TenantManager;
 use tracing::info;
 
 /// Current storage format version
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 pub use crate::metrics::preinitialize_metrics;
 
 #[tracing::instrument(skip_all, fields(%exit_code))]
-pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
+pub async fn shutdown_pageserver(
+    tenant_manager: &TenantManager,
+    deletion_queue: Option<DeletionQueue>,
+    exit_code: i32,
+) {
     use std::time::Duration;
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
     // Shut down all the tenants. This flushes everything to disk and kills
     // the checkpoint and GC tasks.
     timed(
-        tenant::mgr::shutdown_all_tenants(),
+        tenant_manager.shutdown(),
         "shutdown all tenants",
         Duration::from_secs(5),
     )
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 275a72c0b0..69e163effa 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
 
 use utils::id::TimelineId;
 
-use crate::shutdown_pageserver;
-
 //
 // There are four runtimes:
 //
@@ -453,7 +451,7 @@ async fn task_finish(
     }
 
     if shutdown_process {
-        shutdown_pageserver(None, 1).await;
+        std::process::exit(1);
     }
 }
 
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index ffb7206b1e..cab60c3111 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -296,6 +296,7 @@ impl DeleteTenantFlow {
         remote_storage: Option<GenericRemoteStorage>,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
+        cancel: &CancellationToken,
     ) -> Result<(), DeleteTenantError> {
         span::debug_assert_current_span_has_tenant_id();
 
@@ -303,7 +304,9 @@ impl DeleteTenantFlow {
 
         let mut guard = Self::prepare(&tenant).await?;
 
-        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
+        if let Err(e) =
+            Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
+        {
             tenant.set_broken(format!("{e:#}")).await;
             return Err(e);
         }
@@ -322,6 +325,7 @@ impl DeleteTenantFlow {
         conf: &'static PageServerConf,
         remote_storage: Option<&GenericRemoteStorage>,
         tenant: &Tenant,
+        cancel: &CancellationToken,
     ) -> Result<(), DeleteTenantError> {
         guard.mark_in_progress()?;
 
@@ -335,15 +339,9 @@ impl DeleteTenantFlow {
         // Though sounds scary, different mark name?
         // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
         if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(
-                conf,
-                remote_storage,
-                &tenant.tenant_shard_id,
-                // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
-                &CancellationToken::new(),
-            )
-            .await
-            .context("remote_mark")?
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
+                .await
+                .context("remote_mark")?
         }
 
         fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -546,8 +544,7 @@ impl DeleteTenantFlow {
             conf,
             remote_storage.as_ref(),
             &tenant.tenant_shard_id,
-            // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
-            &CancellationToken::new(),
+            &task_mgr::shutdown_token(),
         )
         .await?;
 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 7cf03d8fd6..3aaab6e4ef 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -102,7 +102,7 @@ pub(crate) enum TenantsMap {
     /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
     /// New tenants can be added using [`tenant_map_acquire_slot`].
     Open(BTreeMap<TenantShardId, TenantSlot>),
-    /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
+    /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
     /// Existing tenants are still accessible, but no new tenants can be created.
     ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }
@@ -261,6 +261,12 @@ pub struct TenantManager {
     // See https://github.com/neondatabase/neon/issues/5796
     tenants: &'static std::sync::RwLock<TenantsMap>,
     resources: TenantSharedResources,
+
+    // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
+    // This is for edge cases like tenant deletion.  In normal cases (within a Tenant lifetime),
+    // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
+    // when the tenant detaches.
+    cancel: CancellationToken,
 }
 
 fn emergency_generations(
@@ -620,6 +626,7 @@ pub async fn init_tenant_mgr(
         conf,
         tenants: &TENANTS,
         resources,
+        cancel: CancellationToken::new(),
     })
 }
 
@@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn(
     Ok(tenant)
 }
 
-///
-/// Shut down all tenants. This runs as part of pageserver shutdown.
-///
-/// NB: We leave the tenants in the map, so that they remain accessible through
-/// the management API until we shut it down. If we removed the shut-down tenants
-/// from the tenants map, the management API would return 404 for these tenants,
-/// because TenantsMap::get() now returns `None`.
-/// That could be easily misinterpreted by control plane, the consumer of the
-/// management API. For example, it could attach the tenant on a different pageserver.
-/// We would then be in split-brain once this pageserver restarts.
-#[instrument(skip_all)]
-pub(crate) async fn shutdown_all_tenants() {
-    shutdown_all_tenants0(&TENANTS).await
-}
-
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
     let mut join_set = JoinSet::new();
 
@@ -1428,6 +1420,7 @@ impl TenantManager {
             self.resources.remote_storage.clone(),
             &TENANTS,
             tenant,
+            &self.cancel,
         )
         .await;
 
@@ -1817,6 +1810,23 @@ impl TenantManager {
 
         Ok(())
     }
+
+    ///
+    /// Shut down all tenants. This runs as part of pageserver shutdown.
+    ///
+    /// NB: We leave the tenants in the map, so that they remain accessible through
+    /// the management API until we shut it down. If we removed the shut-down tenants
+    /// from the tenants map, the management API would return 404 for these tenants,
+    /// because TenantsMap::get() now returns `None`.
+    /// That could be easily misinterpreted by control plane, the consumer of the
+    /// management API. For example, it could attach the tenant on a different pageserver.
+    /// We would then be in split-brain once this pageserver restarts.
+    #[instrument(skip_all)]
+    pub(crate) async fn shutdown(&self) {
+        self.cancel.cancel();
+
+        shutdown_all_tenants0(self.tenants).await
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 52de889084..a164c7f60a 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
             # allow errors caused by failpoints
             f".*failpoint: {failpoint}",
             # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # We may leave some upload tasks in the queue. They're likely deletes.
             # For uploads we explicitly wait with `last_flush_lsn_upload` below.
             # So by ignoring these instead of waiting for empty upload queue
@@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach(
             # From deletion polling
             f".*NotFound: tenant {env.initial_tenant}.*",
             # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # error from http response is also logged
             ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
             '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 96a5cc491a..0eb1327c9e 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
         [
             f".*{timeline_id}.*failpoint: {failpoint}",
             # It appears when we stopped flush loop during deletion and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # This happens when we fail before scheduling background operation.
             # Timeline is left in stopping state and retry tries to stop it again.
             ".*Ignoring new state, equal to the existing one: Stopping",
@@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
             ".*failpoint: timeline-delete-before-rm",
             ".*Ignoring new state, equal to the existing one: Stopping",
             # this happens, because the stuck timeline is visible to shutdown
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
         ]
     )
 
@@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach(
             # allow errors caused by failpoints
             f".*failpoint: {failpoint}",
             # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # error from http response is also logged
             ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
             # Polling after attach may fail with this

From ad6f538aefe3286d827663c2ebdc28f4c2ba9613 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 15 Mar 2024 19:57:05 +0100
Subject: [PATCH 26/43] tokio-epoll-uring: use it for on-demand downloads
 (#6992)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Problem

On-demand downloads are still using `tokio::fs`, which we know is
inefficient.

# Changes

- Add `pagebench ondemand-download-churn` to quantify on-demand download
throughput
- Requires dumping layer map, which required making `history_buffer`
impl `Deserialize`
- Implement an equivalent of `tokio::io::copy_buf` for owned buffers =>
`owned_buffers_io` module and children.
- Make layer file download sensitive to `io_engine::get()`, using
VirtualFile + above copy loop
- For this, I had to move some code into the `retry_download`, e.g.,
`sync_all()` call.

Drive-by:
- fix missing escaping in `scripts/ps_ec2_setup_instance_store`
- if we failed in retry_download to create a file, we'd try to remove
it, encounter `NotFound`, and `abort()` the process using
`on_fatal_io_error`. This PR adds treats `NotFound` as a success.

# Testing

Functional

- The copy loop is generic & unit tested.

Performance

- Used the `ondemand-download-churn` benchmark to manually test against
real S3.
- Results (public Notion page):
https://neondatabase.notion.site/Benchmarking-tokio-epoll-uring-on-demand-downloads-2024-04-15-newer-code-03c0fdc475c54492b44d9627b6e4e710?pvs=4
- Performance is equivalent at low concurrency. Jumpier situation at
high concurrency, but, still less CPU / throughput with
tokio-epoll-uring.
  - It’s a win.

# Future Work

Turn the manual performance testing described in the above results
document into a performance regression test:
https://github.com/neondatabase/neon/issues/7146
---
 libs/pageserver_api/src/models.rs             |  39 ++-
 libs/utils/src/history_buffer.rs              |  39 ++-
 pageserver/client/src/mgmt_api.rs             |  77 ++++-
 .../src/cmd/ondemand_download_churn.rs        | 272 ++++++++++++++++++
 pageserver/pagebench/src/main.rs              |   3 +
 .../tenant/remote_timeline_client/download.rs | 178 ++++++++----
 pageserver/src/tenant/storage_layer.rs        |   5 +-
 pageserver/src/virtual_file.rs                |  17 ++
 pageserver/src/virtual_file/io_engine.rs      |  75 ++---
 .../util/size_tracking_writer.rs              |  34 +++
 .../virtual_file/owned_buffers_io/write.rs    | 206 +++++++++++++
 scripts/ps_ec2_setup_instance_store           |   2 +-
 12 files changed, 845 insertions(+), 102 deletions(-)
 create mode 100644 pageserver/pagebench/src/cmd/ondemand_download_churn.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/write.rs

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 3aa84f8903..0d0702e38e 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -4,6 +4,7 @@ pub mod utilization;
 pub use utilization::PageserverUtilization;
 
 use std::{
+    borrow::Cow,
     collections::HashMap,
     io::{BufRead, Read},
     num::{NonZeroU64, NonZeroUsize},
@@ -577,7 +578,7 @@ pub struct TimelineInfo {
     pub walreceiver_status: String,
 }
 
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerMapInfo {
     pub in_memory_layers: Vec<InMemoryLayerInfo>,
     pub historic_layers: Vec<HistoricLayerInfo>,
@@ -595,7 +596,7 @@ pub enum LayerAccessKind {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStatFullDetails {
     pub when_millis_since_epoch: u64,
-    pub task_kind: &'static str,
+    pub task_kind: Cow<'static, str>,
     pub access_kind: LayerAccessKind,
 }
 
@@ -654,23 +655,23 @@ impl LayerResidenceEvent {
     }
 }
 
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStats {
     pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
-    pub task_kind_access_flag: Vec<&'static str>,
+    pub task_kind_access_flag: Vec<Cow<'static, str>>,
     pub first: Option<LayerAccessStatFullDetails>,
     pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
     pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }
 
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
     Open { lsn_start: Lsn },
     Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }
 
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
     Delta {
@@ -692,6 +693,32 @@ pub enum HistoricLayerInfo {
     },
 }
 
+impl HistoricLayerInfo {
+    pub fn layer_file_name(&self) -> &str {
+        match self {
+            HistoricLayerInfo::Delta {
+                layer_file_name, ..
+            } => layer_file_name,
+            HistoricLayerInfo::Image {
+                layer_file_name, ..
+            } => layer_file_name,
+        }
+    }
+    pub fn is_remote(&self) -> bool {
+        match self {
+            HistoricLayerInfo::Delta { remote, .. } => *remote,
+            HistoricLayerInfo::Image { remote, .. } => *remote,
+        }
+    }
+    pub fn set_remote(&mut self, value: bool) {
+        let field = match self {
+            HistoricLayerInfo::Delta { remote, .. } => remote,
+            HistoricLayerInfo::Image { remote, .. } => remote,
+        };
+        *field = value;
+    }
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 pub struct DownloadRemoteLayersTaskSpawnRequest {
     pub max_concurrent_downloads: NonZeroUsize,
diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs
index 1f07f5560f..bd35e2bad6 100644
--- a/libs/utils/src/history_buffer.rs
+++ b/libs/utils/src/history_buffer.rs
@@ -47,9 +47,10 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
     }
 }
 
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 struct SerdeRepr<T> {
     buffer: Vec<T>,
+    buffer_size: usize,
     drop_count: u64,
 }
 
@@ -61,6 +62,7 @@ where
         let HistoryBufferWithDropCounter { buffer, drop_count } = value;
         SerdeRepr {
             buffer: buffer.iter().cloned().collect(),
+            buffer_size: L,
             drop_count: *drop_count,
         }
     }
@@ -78,19 +80,52 @@ where
     }
 }
 
+impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
+where
+    T: Clone + serde::Deserialize<'de>,
+{
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let SerdeRepr {
+            buffer: des_buffer,
+            drop_count,
+            buffer_size,
+        } = SerdeRepr::<T>::deserialize(deserializer)?;
+        if buffer_size != L {
+            use serde::de::Error;
+            return Err(D::Error::custom(format!(
+                "invalid buffer_size, expecting {L} got {buffer_size}"
+            )));
+        }
+        let mut buffer = HistoryBuffer::new();
+        buffer.extend(des_buffer);
+        Ok(HistoryBufferWithDropCounter { buffer, drop_count })
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::HistoryBufferWithDropCounter;
 
     #[test]
     fn test_basics() {
-        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+        let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
         b.write(1);
         b.write(2);
         b.write(3);
         assert!(b.iter().any(|e| *e == 2));
         assert!(b.iter().any(|e| *e == 3));
         assert!(!b.iter().any(|e| *e == 1));
+
+        // round-trip serde
+        let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
+            serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
+        assert_eq!(
+            round_tripped.iter().cloned().collect::<Vec<_>>(),
+            b.iter().cloned().collect::<Vec<_>>()
+        );
     }
 
     #[test]
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index ed9f633253..1a8f7e0524 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -169,7 +169,7 @@ impl Client {
         self.request(Method::GET, uri, ()).await
     }
 
-    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
+    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
         &self,
         method: Method,
         uri: U,
@@ -181,7 +181,16 @@ impl Client {
         } else {
             req
         };
-        let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
+        req.json(&body).send().await.map_err(Error::ReceiveBody)
+    }
+
+    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        let res = self.request_noerror(method, uri, body).await?;
         let response = res.error_from_body().await?;
         Ok(response)
     }
@@ -425,4 +434,68 @@ impl Client {
             .await
             .map_err(Error::ReceiveBody)
     }
+
+    pub async fn layer_map_info(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<LayerMapInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/layer",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
+        );
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn layer_evict(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        layer_file_name: &str,
+    ) -> Result<bool> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/layer/{}",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
+        );
+        let resp = self.request_noerror(Method::DELETE, &uri, ()).await?;
+        match resp.status() {
+            StatusCode::OK => Ok(true),
+            StatusCode::NOT_MODIFIED => Ok(false),
+            // TODO: dedupe this pattern / introduce separate error variant?
+            status => Err(match resp.json::<HttpErrorBody>().await {
+                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+                Err(_) => {
+                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
+                }
+            }),
+        }
+    }
+
+    pub async fn layer_ondemand_download(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        layer_file_name: &str,
+    ) -> Result<bool> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/layer/{}",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
+        );
+        let resp = self.request_noerror(Method::GET, &uri, ()).await?;
+        match resp.status() {
+            StatusCode::OK => Ok(true),
+            StatusCode::NOT_MODIFIED => Ok(false),
+            // TODO: dedupe this pattern / introduce separate error variant?
+            status => Err(match resp.json::<HttpErrorBody>().await {
+                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+                Err(_) => {
+                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
+                }
+            }),
+        }
+    }
 }
diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
new file mode 100644
index 0000000000..197e782dca
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -0,0 +1,272 @@
+use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
+
+use pageserver_client::mgmt_api;
+use rand::seq::SliceRandom;
+use tracing::{debug, info};
+use utils::id::{TenantTimelineId, TimelineId};
+
+use tokio::{
+    sync::{mpsc, OwnedSemaphorePermit},
+    task::JoinSet,
+};
+
+use std::{
+    num::NonZeroUsize,
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    },
+    time::{Duration, Instant},
+};
+
+/// Evict & on-demand download random layers.
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    mgmt_api_endpoint: String,
+    #[clap(long)]
+    pageserver_jwt: Option<String>,
+    #[clap(long)]
+    runtime: Option<humantime::Duration>,
+    #[clap(long, default_value = "1")]
+    tasks_per_target: NonZeroUsize,
+    #[clap(long, default_value = "1")]
+    concurrency_per_target: NonZeroUsize,
+    /// Probability for sending `latest=true` in the request (uniform distribution).
+    #[clap(long)]
+    limit_to_first_n_targets: Option<usize>,
+    /// Before starting the benchmark, live-reconfigure the pageserver to use the given
+    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
+    #[clap(long)]
+    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
+    targets: Option<Vec<TenantTimelineId>>,
+}
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?;
+    let task = rt.spawn(main_impl(args));
+    rt.block_on(task).unwrap().unwrap();
+    Ok(())
+}
+
+#[derive(Debug, Default)]
+struct LiveStats {
+    evictions: AtomicU64,
+    downloads: AtomicU64,
+    timeline_restarts: AtomicU64,
+}
+
+impl LiveStats {
+    fn eviction_done(&self) {
+        self.evictions.fetch_add(1, Ordering::Relaxed);
+    }
+    fn download_done(&self) {
+        self.downloads.fetch_add(1, Ordering::Relaxed);
+    }
+    fn timeline_restart_done(&self) {
+        self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+async fn main_impl(args: Args) -> anyhow::Result<()> {
+    let args: &'static Args = Box::leak(Box::new(args));
+
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        args.mgmt_api_endpoint.clone(),
+        args.pageserver_jwt.as_deref(),
+    ));
+
+    if let Some(engine_str) = &args.set_io_engine {
+        mgmt_api_client.put_io_engine(engine_str).await?;
+    }
+
+    // discover targets
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: args.limit_to_first_n_targets,
+            targets: args.targets.clone(),
+        },
+    )
+    .await?;
+
+    let mut tasks = JoinSet::new();
+
+    let live_stats = Arc::new(LiveStats::default());
+    tasks.spawn({
+        let live_stats = Arc::clone(&live_stats);
+        async move {
+            let mut last_at = Instant::now();
+            loop {
+                tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
+                let now = Instant::now();
+                let delta: Duration = now - last_at;
+                last_at = now;
+
+                let LiveStats {
+                    evictions,
+                    downloads,
+                    timeline_restarts,
+                } = &*live_stats;
+                let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
+                let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
+                let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
+                info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
+            }
+        }
+    });
+
+    for tl in timelines {
+        for _ in 0..args.tasks_per_target.get() {
+            tasks.spawn(timeline_actor(
+                args,
+                Arc::clone(&mgmt_api_client),
+                tl,
+                Arc::clone(&live_stats),
+            ));
+        }
+    }
+
+    while let Some(res) = tasks.join_next().await {
+        res.unwrap();
+    }
+    Ok(())
+}
+
+async fn timeline_actor(
+    args: &'static Args,
+    mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
+    timeline: TenantTimelineId,
+    live_stats: Arc<LiveStats>,
+) {
+    // TODO: support sharding
+    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
+
+    struct Timeline {
+        joinset: JoinSet<()>,
+        layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
+        concurrency: Arc<tokio::sync::Semaphore>,
+    }
+    loop {
+        debug!("restarting timeline");
+        let layer_map_info = mgmt_api_client
+            .layer_map_info(tenant_shard_id, timeline.timeline_id)
+            .await
+            .unwrap();
+        let concurrency = Arc::new(tokio::sync::Semaphore::new(
+            args.concurrency_per_target.get(),
+        ));
+
+        let mut joinset = JoinSet::new();
+        let layers = layer_map_info
+            .historic_layers
+            .into_iter()
+            .map(|historic_layer| {
+                let (tx, rx) = mpsc::channel(1);
+                joinset.spawn(layer_actor(
+                    tenant_shard_id,
+                    timeline.timeline_id,
+                    historic_layer,
+                    rx,
+                    Arc::clone(&mgmt_api_client),
+                    Arc::clone(&live_stats),
+                ));
+                tx
+            })
+            .collect::<Vec<_>>();
+
+        let mut timeline = Timeline {
+            joinset,
+            layers,
+            concurrency,
+        };
+
+        live_stats.timeline_restart_done();
+
+        loop {
+            assert!(!timeline.joinset.is_empty());
+            if let Some(res) = timeline.joinset.try_join_next() {
+                debug!(?res, "a layer actor exited, should not happen");
+                timeline.joinset.shutdown().await;
+                break;
+            }
+
+            let mut permit = Some(
+                Arc::clone(&timeline.concurrency)
+                    .acquire_owned()
+                    .await
+                    .unwrap(),
+            );
+
+            loop {
+                let layer_tx = {
+                    let mut rng = rand::thread_rng();
+                    timeline.layers.choose_mut(&mut rng).expect("no layers")
+                };
+                match layer_tx.try_send(permit.take().unwrap()) {
+                    Ok(_) => break,
+                    Err(e) => match e {
+                        mpsc::error::TrySendError::Full(back) => {
+                            // TODO: retrying introduces bias away from slow downloaders
+                            permit.replace(back);
+                        }
+                        mpsc::error::TrySendError::Closed(_) => panic!(),
+                    },
+                }
+            }
+        }
+    }
+}
+
+async fn layer_actor(
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    mut layer: HistoricLayerInfo,
+    mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
+    mgmt_api_client: Arc<mgmt_api::Client>,
+    live_stats: Arc<LiveStats>,
+) {
+    #[derive(Clone, Copy)]
+    enum Action {
+        Evict,
+        OnDemandDownload,
+    }
+
+    while let Some(_permit) = rx.recv().await {
+        let action = if layer.is_remote() {
+            Action::OnDemandDownload
+        } else {
+            Action::Evict
+        };
+
+        let did_it = match action {
+            Action::Evict => {
+                let did_it = mgmt_api_client
+                    .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
+                    .await
+                    .unwrap();
+                live_stats.eviction_done();
+                did_it
+            }
+            Action::OnDemandDownload => {
+                let did_it = mgmt_api_client
+                    .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
+                    .await
+                    .unwrap();
+                live_stats.download_done();
+                did_it
+            }
+        };
+        if !did_it {
+            debug!("local copy of layer map appears out of sync, re-downloading");
+            return;
+        }
+        debug!("did it");
+        layer.set_remote(match action {
+            Action::Evict => true,
+            Action::OnDemandDownload => false,
+        });
+    }
+}
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
index 5d688ed2d1..743102d853 100644
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -16,6 +16,7 @@ mod util {
 mod cmd {
     pub(super) mod basebackup;
     pub(super) mod getpage_latest_lsn;
+    pub(super) mod ondemand_download_churn;
     pub(super) mod trigger_initial_size_calculation;
 }
 
@@ -25,6 +26,7 @@ enum Args {
     Basebackup(cmd::basebackup::Args),
     GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
     TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
+    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
 }
 
 fn main() {
@@ -43,6 +45,7 @@ fn main() {
         Args::TriggerInitialSizeCalculation(args) => {
             cmd::trigger_initial_size_calculation::main(args)
         }
+        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
     }
     .unwrap()
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 6fff6e78e2..6ee8ad7155 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
-use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
+use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::TimelineId;
 
@@ -73,55 +73,13 @@ pub async fn download_layer_file<'a>(
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
     let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
 
-    let (mut destination_file, bytes_amount) = download_retry(
-        || async {
-            let destination_file = tokio::fs::File::create(&temp_file_path)
-                .await
-                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
-                .map_err(DownloadError::Other)?;
-
-            let download = storage.download(&remote_path, cancel).await?;
-
-            let mut destination_file =
-                tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
-
-            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
-
-            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
-
-            match bytes_amount {
-                Ok(bytes_amount) => {
-                    let destination_file = destination_file.into_inner();
-                    Ok((destination_file, bytes_amount))
-                }
-                Err(e) => {
-                    if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
-                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
-                    }
-
-                    Err(e.into())
-                }
-            }
-        },
+    let bytes_amount = download_retry(
+        || async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
         &format!("download {remote_path:?}"),
         cancel,
     )
     .await?;
 
-    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
-    // A file will not be closed immediately when it goes out of scope if there are any IO operations
-    // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
-    // you should call flush before dropping it.
-    //
-    // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
-    // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
-    // But for additional safety lets check/wait for any pending operations.
-    destination_file
-        .flush()
-        .await
-        .with_context(|| format!("flush source file at {temp_file_path}"))
-        .map_err(DownloadError::Other)?;
-
     let expected = layer_metadata.file_size();
     if expected != bytes_amount {
         return Err(DownloadError::Other(anyhow!(
@@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>(
         )));
     }
 
-    // not using sync_data because it can lose file size update
-    destination_file
-        .sync_all()
-        .await
-        .with_context(|| format!("failed to fsync source file at {temp_file_path}"))
-        .map_err(DownloadError::Other)?;
-    drop(destination_file);
-
     fail::fail_point!("remote-storage-download-pre-rename", |_| {
         Err(DownloadError::Other(anyhow!(
             "remote-storage-download-pre-rename failpoint triggered"
@@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>(
     Ok(bytes_amount)
 }
 
+/// Download the object `src_path` in the remote `storage` to local path `dst_path`.
+///
+/// If Ok() is returned, the download succeeded and the inode & data have been made durable.
+/// (Note that the directory entry for the inode is not made durable.)
+/// The file size in bytes is returned.
+///
+/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
+/// The unlinking has _not_ been made durable.
+async fn download_object<'a>(
+    storage: &'a GenericRemoteStorage,
+    src_path: &RemotePath,
+    dst_path: &Utf8PathBuf,
+    cancel: &CancellationToken,
+) -> Result<u64, DownloadError> {
+    let res = match crate::virtual_file::io_engine::get() {
+        crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
+        crate::virtual_file::io_engine::IoEngine::StdFs => {
+            async {
+                let destination_file = tokio::fs::File::create(dst_path)
+                    .await
+                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
+                    .map_err(DownloadError::Other)?;
+
+                let download = storage.download(src_path, cancel).await?;
+
+                let mut buf_writer =
+                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
+
+                let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+
+                let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
+                buf_writer.flush().await?;
+
+                let mut destination_file = buf_writer.into_inner();
+
+                // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
+                // A file will not be closed immediately when it goes out of scope if there are any IO operations
+                // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
+                // you should call flush before dropping it.
+                //
+                // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
+                // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
+                // But for additional safety lets check/wait for any pending operations.
+                destination_file
+                    .flush()
+                    .await
+                    .with_context(|| format!("flush source file at {dst_path}"))
+                    .map_err(DownloadError::Other)?;
+
+                // not using sync_data because it can lose file size update
+                destination_file
+                    .sync_all()
+                    .await
+                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
+                    .map_err(DownloadError::Other)?;
+
+                Ok(bytes_amount)
+            }
+            .await
+        }
+        #[cfg(target_os = "linux")]
+        crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
+            use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
+            async {
+                let destination_file = VirtualFile::create(dst_path)
+                    .await
+                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
+                    .map_err(DownloadError::Other)?;
+
+                let mut download = storage.download(src_path, cancel).await?;
+
+                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
+                // There's chunks_vectored() on the stream.
+                let (bytes_amount, destination_file) = async {
+                    let size_tracking = size_tracking_writer::Writer::new(destination_file);
+                    let mut buffered = owned_buffers_io::write::BufferedWriter::<
+                        { super::BUFFER_SIZE },
+                        _,
+                    >::new(size_tracking);
+                    while let Some(res) =
+                        futures::StreamExt::next(&mut download.download_stream).await
+                    {
+                        let chunk = match res {
+                            Ok(chunk) => chunk,
+                            Err(e) => return Err(e),
+                        };
+                        buffered
+                            .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
+                            .await?;
+                    }
+                    let size_tracking = buffered.flush_and_into_inner().await?;
+                    Ok(size_tracking.into_inner())
+                }
+                .await?;
+
+                // not using sync_data because it can lose file size update
+                destination_file
+                    .sync_all()
+                    .await
+                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
+                    .map_err(DownloadError::Other)?;
+
+                Ok(bytes_amount)
+            }
+            .await
+        }
+    };
+
+    // in case the download failed, clean up
+    match res {
+        Ok(bytes_amount) => Ok(bytes_amount),
+        Err(e) => {
+            if let Err(e) = tokio::fs::remove_file(dst_path).await {
+                if e.kind() != std::io::ErrorKind::NotFound {
+                    on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
+                }
+            }
+            Err(e)
+        }
+    }
+}
+
 const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
 
 pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 299950cc21..5c3bab9868 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -20,6 +20,7 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::models::{
     LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
+use std::borrow::Cow;
 use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
@@ -427,7 +428,7 @@ impl LayerAccessStatFullDetails {
         } = self;
         pageserver_api::models::LayerAccessStatFullDetails {
             when_millis_since_epoch: system_time_to_millis_since_epoch(when),
-            task_kind: task_kind.into(), // into static str, powered by strum_macros
+            task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
             access_kind: *access_kind,
         }
     }
@@ -525,7 +526,7 @@ impl LayerAccessStats {
                 .collect(),
             task_kind_access_flag: task_kind_flag
                 .iter()
-                .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
+                .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
                 .collect(),
             first: first_access.as_ref().map(|a| a.as_api_model()),
             accesses_history: last_accesses.map(|m| m.as_api_model()),
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index ae44e9edc4..dee36d8afd 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -36,6 +36,23 @@ pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
 
+#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
+pub(crate) mod owned_buffers_io {
+    //! Abstractions for IO with owned buffers.
+    //!
+    //! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary
+    //! reason we need this abstraction.
+    //!
+    //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`,
+    //! but for the time being we're proving out the primitives in the neon.git repo
+    //! for faster iteration.
+
+    pub(crate) mod write;
+    pub(crate) mod util {
+        pub(crate) mod size_tracking_writer;
+    }
+}
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 7f2342e76e..55fa59e53b 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -282,49 +282,52 @@ impl From<FeatureTestResult> for IoEngineKind {
 /// Panics if we can't set up the feature test.
 pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
     std::thread::spawn(|| {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
+
         #[cfg(not(target_os = "linux"))]
         {
-            return Ok(FeatureTestResult::PlatformPreferred(
-                FeatureTestResult::PLATFORM_DEFAULT,
-            ));
+            Ok(FeatureTestResult::PlatformPreferred(
+                FeatureTestResult::PLATFORM_PREFERRED,
+            ))
         }
         #[cfg(target_os = "linux")]
-        Ok(match rt.block_on(tokio_epoll_uring::System::launch()) {
-            Ok(_) => FeatureTestResult::PlatformPreferred({
-                assert!(matches!(
-                    IoEngineKind::TokioEpollUring,
+        {
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .unwrap();
+            Ok(match rt.block_on(tokio_epoll_uring::System::launch()) {
+                Ok(_) => FeatureTestResult::PlatformPreferred({
+                    assert!(matches!(
+                        IoEngineKind::TokioEpollUring,
+                        FeatureTestResult::PLATFORM_PREFERRED
+                    ));
                     FeatureTestResult::PLATFORM_PREFERRED
-                ));
-                FeatureTestResult::PLATFORM_PREFERRED
-            }),
-            Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => {
-                let remark = match e.raw_os_error() {
-                    Some(nix::libc::EPERM) => {
-                        // fall back
-                        "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled "
-                            .to_string()
+                }),
+                Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => {
+                    let remark = match e.raw_os_error() {
+                        Some(nix::libc::EPERM) => {
+                            // fall back
+                            "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled "
+                                .to_string()
+                        }
+                    Some(nix::libc::EFAULT) => {
+                            // fail feature test
+                            anyhow::bail!(
+                                "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory"
+                            );
+                        }
+                        Some(_) | None => {
+                            // fall back
+                            format!("creating tokio-epoll-uring fails with error: {e:#}")
+                        }
+                };
+                    FeatureTestResult::Worse {
+                        engine: IoEngineKind::StdFs,
+                        remark,
                     }
-                   Some(nix::libc::EFAULT) => {
-                        // fail feature test
-                        anyhow::bail!(
-                            "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory"
-                        );
-                    }
-                    Some(_) | None => {
-                        // fall back
-                        format!("creating tokio-epoll-uring fails with error: {e:#}")
-                    }
-               };
-                FeatureTestResult::Worse {
-                    engine: IoEngineKind::StdFs,
-                    remark,
                 }
-            }
-        })
+            })
+        }
     })
     .join()
     .unwrap()
diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
new file mode 100644
index 0000000000..7505b7487e
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
@@ -0,0 +1,34 @@
+use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
+use tokio_epoll_uring::{BoundedBuf, IoBuf};
+
+pub struct Writer {
+    dst: VirtualFile,
+    bytes_amount: u64,
+}
+
+impl Writer {
+    pub fn new(dst: VirtualFile) -> Self {
+        Self {
+            dst,
+            bytes_amount: 0,
+        }
+    }
+    /// Returns the wrapped `VirtualFile` object as well as the number
+    /// of bytes that were written to it through this object.
+    pub fn into_inner(self) -> (u64, VirtualFile) {
+        (self.bytes_amount, self.dst)
+    }
+}
+
+impl OwnedAsyncWriter for Writer {
+    #[inline(always)]
+    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        buf: B,
+    ) -> std::io::Result<(usize, B::Buf)> {
+        let (buf, res) = self.dst.write_all(buf).await;
+        let nwritten = res?;
+        self.bytes_amount += u64::try_from(nwritten).unwrap();
+        Ok((nwritten, buf))
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs
new file mode 100644
index 0000000000..f1812d9b51
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -0,0 +1,206 @@
+use bytes::BytesMut;
+use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+
+/// A trait for doing owned-buffer write IO.
+/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
+pub trait OwnedAsyncWriter {
+    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        buf: B,
+    ) -> std::io::Result<(usize, B::Buf)>;
+}
+
+/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers
+/// into `BUFFER_SIZE`-sized writes.
+///
+/// # Passthrough Of Large Writers
+///
+/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
+/// buffer to be flushed, even if it is not full yet. Then, the large
+/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`].
+///
+/// This pass-through is generally beneficial for throughput, but if
+/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
+/// unlimited large writes may cause latency or fairness issues.
+///
+/// In such cases, a different implementation that always buffers in memory
+/// may be preferable.
+pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
+    writer: W,
+    // invariant: always remains Some(buf)
+    // with buf.capacity() == BUFFER_SIZE except
+    // - while IO is ongoing => goes back to Some() once the IO completed successfully
+    // - after an IO error => stays `None` forever
+    // In these exceptional cases, it's `None`.
+    buf: Option<BytesMut>,
+}
+
+impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
+where
+    W: OwnedAsyncWriter,
+{
+    pub fn new(writer: W) -> Self {
+        Self {
+            writer,
+            buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
+        }
+    }
+
+    pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
+        self.flush().await?;
+        let Self { buf, writer } = self;
+        assert!(buf.is_some());
+        Ok(writer)
+    }
+
+    pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
+    where
+        B: IoBuf + Send,
+    {
+        // avoid memcpy for the middle of the chunk
+        if chunk.len() >= BUFFER_SIZE {
+            self.flush().await?;
+            // do a big write, bypassing `buf`
+            assert_eq!(
+                self.buf
+                    .as_ref()
+                    .expect("must not use after an error")
+                    .len(),
+                0
+            );
+            let chunk_len = chunk.len();
+            let (nwritten, chunk) = self.writer.write_all(chunk).await?;
+            assert_eq!(nwritten, chunk_len);
+            drop(chunk);
+            return Ok(());
+        }
+        // in-memory copy the < BUFFER_SIZED tail of the chunk
+        assert!(chunk.len() < BUFFER_SIZE);
+        let mut chunk = &chunk[..];
+        while !chunk.is_empty() {
+            let buf = self.buf.as_mut().expect("must not use after an error");
+            let need = BUFFER_SIZE - buf.len();
+            let have = chunk.len();
+            let n = std::cmp::min(need, have);
+            buf.extend_from_slice(&chunk[..n]);
+            chunk = &chunk[n..];
+            if buf.len() >= BUFFER_SIZE {
+                assert_eq!(buf.len(), BUFFER_SIZE);
+                self.flush().await?;
+            }
+        }
+        assert!(chunk.is_empty(), "by now we should have drained the chunk");
+        Ok(())
+    }
+
+    async fn flush(&mut self) -> std::io::Result<()> {
+        let buf = self.buf.take().expect("must not use after an error");
+        if buf.is_empty() {
+            self.buf = Some(buf);
+            return std::io::Result::Ok(());
+        }
+        let buf_len = buf.len();
+        let (nwritten, mut buf) = self.writer.write_all(buf).await?;
+        assert_eq!(nwritten, buf_len);
+        buf.clear();
+        self.buf = Some(buf);
+        Ok(())
+    }
+}
+
+impl OwnedAsyncWriter for Vec<u8> {
+    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        buf: B,
+    ) -> std::io::Result<(usize, B::Buf)> {
+        let nbytes = buf.bytes_init();
+        if nbytes == 0 {
+            return Ok((0, Slice::into_inner(buf.slice_full())));
+        }
+        let buf = buf.slice(0..nbytes);
+        self.extend_from_slice(&buf[..]);
+        Ok((buf.len(), Slice::into_inner(buf)))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[derive(Default)]
+    struct RecorderWriter {
+        writes: Vec<Vec<u8>>,
+    }
+    impl OwnedAsyncWriter for RecorderWriter {
+        async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+            &mut self,
+            buf: B,
+        ) -> std::io::Result<(usize, B::Buf)> {
+            let nbytes = buf.bytes_init();
+            if nbytes == 0 {
+                self.writes.push(vec![]);
+                return Ok((0, Slice::into_inner(buf.slice_full())));
+            }
+            let buf = buf.slice(0..nbytes);
+            self.writes.push(Vec::from(&buf[..]));
+            Ok((buf.len(), Slice::into_inner(buf)))
+        }
+    }
+
+    macro_rules! write {
+        ($writer:ident, $data:literal) => {{
+            $writer
+                .write_buffered(::bytes::Bytes::from_static($data).slice_full())
+                .await?;
+        }};
+    }
+
+    #[tokio::test]
+    async fn test_buffered_writes_only() -> std::io::Result<()> {
+        let recorder = RecorderWriter::default();
+        let mut writer = BufferedWriter::<2, _>::new(recorder);
+        write!(writer, b"a");
+        write!(writer, b"b");
+        write!(writer, b"c");
+        write!(writer, b"d");
+        write!(writer, b"e");
+        let recorder = writer.flush_and_into_inner().await?;
+        assert_eq!(
+            recorder.writes,
+            vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
+        );
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_passthrough_writes_only() -> std::io::Result<()> {
+        let recorder = RecorderWriter::default();
+        let mut writer = BufferedWriter::<2, _>::new(recorder);
+        write!(writer, b"abc");
+        write!(writer, b"de");
+        write!(writer, b"");
+        write!(writer, b"fghijk");
+        let recorder = writer.flush_and_into_inner().await?;
+        assert_eq!(
+            recorder.writes,
+            vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
+        );
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
+        let recorder = RecorderWriter::default();
+        let mut writer = BufferedWriter::<2, _>::new(recorder);
+        write!(writer, b"a");
+        write!(writer, b"bc");
+        write!(writer, b"d");
+        write!(writer, b"e");
+        let recorder = writer.flush_and_into_inner().await?;
+        assert_eq!(
+            recorder.writes,
+            vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
+        );
+        Ok(())
+    }
+}
diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store
index 4cca3a9857..1f88f252eb 100755
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -40,7 +40,7 @@ To run your local neon.git build on the instance store volume,
 run the following commands from the top of the neon.git checkout
 
     # raise file descriptor limit of your shell and its child processes
-    sudo prlimit -p $$ --nofile=800000:800000
+    sudo prlimit -p \$\$ --nofile=800000:800000
 
     # test suite run
     export TEST_OUTPUT="$TEST_OUTPUT"

From 9752ad8489896f9df9f5bf0c7b1d31ae3fbafd9a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Mar 2024 19:45:58 +0000
Subject: [PATCH 27/43] pageserver, controller: improve secondary download APIs
 for large shards (#7131)

## Problem

The existing secondary download API relied on the caller to wait as long
as it took to complete -- for large shards that could be a long time, so
typical clients that might have a baked-in ~30s timeout would have a
problem.

## Summary of changes

- Take a `wait_ms` query parameter to instruct the pageserver how long
to wait: if the download isn't complete in this duration, then 201 is
returned instead of 200.
- For both 200 and 201 responses, include response body describing
download progress, in terms of layers and bytes. This is sufficient for
the caller to track how much data is being transferred and log/present
that status.
- In storage controller live migrations, use this API to apply a much
longer outer timeout, with smaller individual per-request timeouts, and
log the progress of the downloads.
- Add a test that injects layer download delays to exercise the new
behavior
---
 control_plane/attachment_service/src/http.rs  |   8 +-
 .../attachment_service/src/reconciler.rs      |  93 +++++--
 .../attachment_service/src/service.rs         |  86 +++++--
 control_plane/src/pageserver.rs               |   7 -
 libs/pageserver_api/src/models.rs             |  46 ++++
 libs/remote_storage/src/azure_blob.rs         |   4 +-
 libs/remote_storage/src/lib.rs                |   5 +-
 libs/remote_storage/src/local_fs.rs           |   5 +-
 libs/remote_storage/src/s3_bucket.rs          |   3 +-
 pageserver/client/src/mgmt_api.rs             |  23 +-
 pageserver/src/http/openapi_spec.yml          |  47 ++++
 pageserver/src/http/routes.rs                 |  41 ++-
 pageserver/src/tenant/secondary.rs            |  15 +-
 pageserver/src/tenant/secondary/downloader.rs | 237 +++++++++++++++---
 pageserver/src/tenant/secondary/heatmap.rs    |  22 ++
 test_runner/fixtures/neon_fixtures.py         |   4 +
 test_runner/fixtures/pageserver/http.py       |  10 +-
 .../regress/test_pageserver_secondary.py      | 101 ++++++++
 18 files changed, 647 insertions(+), 110 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 560a05e908..45ee354822 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -14,7 +14,7 @@ use tokio_util::sync::CancellationToken;
 use utils::auth::{Scope, SwappableJwtAuth};
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
-use utils::http::request::{must_get_query_param, parse_request_param};
+use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};
 
 use utils::{
@@ -248,8 +248,10 @@ async fn handle_tenant_secondary_download(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    service.tenant_secondary_download(tenant_id).await?;
-    json_response(StatusCode::OK, ())
+    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
+
+    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
+    json_response(status, progress)
 }
 
 async fn handle_tenant_delete(
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index 7f68a65c15..3bf23275bd 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -8,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_client::mgmt_api;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
@@ -258,22 +258,81 @@ impl Reconciler {
         tenant_shard_id: TenantShardId,
         node: &Node,
     ) -> Result<(), ReconcileError> {
-        match node
-            .with_client_retries(
-                |client| async move { client.tenant_secondary_download(tenant_shard_id).await },
-                &self.service_config.jwt_token,
-                1,
-                1,
-                Duration::from_secs(60),
-                &self.cancel,
-            )
-            .await
-        {
-            None => Err(ReconcileError::Cancel),
-            Some(Ok(_)) => Ok(()),
-            Some(Err(e)) => {
-                tracing::info!("  (skipping destination download: {})", e);
-                Ok(())
+        // This is not the timeout for a request, but the total amount of time we're willing to wait
+        // for a secondary location to get up to date before
+        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
+
+        // This the long-polling interval for the secondary download requests we send to destination pageserver
+        // during a migration.
+        const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
+
+        let started_at = Instant::now();
+
+        loop {
+            let (status, progress) = match node
+                .with_client_retries(
+                    |client| async move {
+                        client
+                            .tenant_secondary_download(
+                                tenant_shard_id,
+                                Some(REQUEST_DOWNLOAD_TIMEOUT),
+                            )
+                            .await
+                    },
+                    &self.service_config.jwt_token,
+                    1,
+                    3,
+                    REQUEST_DOWNLOAD_TIMEOUT * 2,
+                    &self.cancel,
+                )
+                .await
+            {
+                None => Err(ReconcileError::Cancel),
+                Some(Ok(v)) => Ok(v),
+                Some(Err(e)) => {
+                    // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
+                    // attaching, but we should not let an issue with a secondary location stop us proceeding
+                    // with a live migration.
+                    tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
+                    return Ok(());
+                }
+            }?;
+
+            if status == StatusCode::OK {
+                tracing::info!(
+                    "Downloads to {} complete: {}/{} layers, {}/{} bytes",
+                    node,
+                    progress.layers_downloaded,
+                    progress.layers_total,
+                    progress.bytes_downloaded,
+                    progress.bytes_total
+                );
+                return Ok(());
+            } else if status == StatusCode::ACCEPTED {
+                let total_runtime = started_at.elapsed();
+                if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
+                    tracing::warn!("Timed out after {}ms downloading layers to {node}.  Progress so far: {}/{} layers, {}/{} bytes",
+                        total_runtime.as_millis(),
+                        progress.layers_downloaded,
+                        progress.layers_total,
+                        progress.bytes_downloaded,
+                        progress.bytes_total
+                    );
+                    // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
+                    // it just makes the I/O performance for users less good.
+                    return Ok(());
+                }
+
+                // Log and proceed around the loop to retry.  We don't sleep between requests, because our HTTP call
+                // to the pageserver is a long-poll.
+                tracing::info!(
+                    "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
+                    node,
+                    progress.layers_downloaded,
+                    progress.layers_total,
+                    progress.bytes_downloaded,
+                    progress.bytes_total
+                );
             }
         }
     }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 8439ea5567..29f87021b2 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -16,7 +16,15 @@ use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
 use pageserver_api::{
-    controller_api::UtilizationScore,
+    controller_api::{
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
+        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+    },
+    models::{SecondaryProgress, TenantConfigRequest},
+};
+
+use pageserver_api::{
     models::{
         self, LocationConfig, LocationConfigListResponse, LocationConfigMode,
         PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest,
@@ -30,14 +38,6 @@ use pageserver_api::{
         ValidateResponse, ValidateResponseTenant,
     },
 };
-use pageserver_api::{
-    controller_api::{
-        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
-        TenantShardMigrateRequest, TenantShardMigrateResponse,
-    },
-    models::TenantConfigRequest,
-};
 use pageserver_client::mgmt_api;
 use tokio::sync::OwnedRwLockWriteGuard;
 use tokio_util::sync::CancellationToken;
@@ -2084,7 +2084,8 @@ impl Service {
     pub(crate) async fn tenant_secondary_download(
         &self,
         tenant_id: TenantId,
-    ) -> Result<(), ApiError> {
+        wait: Option<Duration>,
+    ) -> Result<(StatusCode, SecondaryProgress), ApiError> {
         let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await;
 
         // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
@@ -2107,32 +2108,71 @@ impl Service {
             targets
         };
 
-        // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
-        // downloads, they can return a clean 202 response instead of the HTTP client timing out.
-
         // Issue concurrent requests to all shards' locations
         let mut futs = FuturesUnordered::new();
         for (tenant_shard_id, node) in targets {
             let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
             futs.push(async move {
-                let result = client.tenant_secondary_download(tenant_shard_id).await;
-                (result, node)
+                let result = client
+                    .tenant_secondary_download(tenant_shard_id, wait)
+                    .await;
+                (result, node, tenant_shard_id)
             })
         }
 
         // Handle any errors returned by pageservers.  This includes cases like this request racing with
         // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
         // well as more general cases like 503s, 500s, or timeouts.
-        while let Some((result, node)) = futs.next().await {
-            let Err(e) = result else { continue };
-
-            // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
-            // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
-            // than they had hoped for.
-            tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",);
+        let mut aggregate_progress = SecondaryProgress::default();
+        let mut aggregate_status: Option<StatusCode> = None;
+        let mut error: Option<mgmt_api::Error> = None;
+        while let Some((result, node, tenant_shard_id)) = futs.next().await {
+            match result {
+                Err(e) => {
+                    // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
+                    // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
+                    // than they had hoped for.
+                    tracing::warn!("Secondary download error from pageserver {node}: {e}",);
+                    error = Some(e)
+                }
+                Ok((status_code, progress)) => {
+                    tracing::info!(%tenant_shard_id, "Shard status={status_code} progress: {progress:?}");
+                    aggregate_progress.layers_downloaded += progress.layers_downloaded;
+                    aggregate_progress.layers_total += progress.layers_total;
+                    aggregate_progress.bytes_downloaded += progress.bytes_downloaded;
+                    aggregate_progress.bytes_total += progress.bytes_total;
+                    aggregate_progress.heatmap_mtime =
+                        std::cmp::max(aggregate_progress.heatmap_mtime, progress.heatmap_mtime);
+                    aggregate_status = match aggregate_status {
+                        None => Some(status_code),
+                        Some(StatusCode::OK) => Some(status_code),
+                        Some(cur) => {
+                            // Other status codes (e.g. 202) -- do not overwrite.
+                            Some(cur)
+                        }
+                    };
+                }
+            }
         }
 
-        Ok(())
+        // If any of the shards return 202, indicate our result as 202.
+        match aggregate_status {
+            None => {
+                match error {
+                    Some(e) => {
+                        // No successes, and an error: surface it
+                        Err(ApiError::Conflict(format!("Error from pageserver: {e}")))
+                    }
+                    None => {
+                        // No shards found
+                        Err(ApiError::NotFound(
+                            anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
+                        ))
+                    }
+                }
+            }
+            Some(aggregate_status) => Ok((aggregate_status, aggregate_progress)),
+        }
     }
 
     pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index ab2f80fb0c..2603515681 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -568,13 +568,6 @@ impl PageServerNode {
         Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
     }
 
-    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .tenant_secondary_download(*tenant_id)
-            .await?)
-    }
-
     pub async fn timeline_create(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 0d0702e38e..aad4cc97fc 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -751,6 +751,52 @@ pub struct WalRedoManagerStatus {
     pub pid: Option<u32>,
 }
 
+/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
+/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
+/// what's happening.
+#[derive(Default, Debug, Serialize, Deserialize, Clone)]
+pub struct SecondaryProgress {
+    /// The remote storage LastModified time of the heatmap object we last downloaded.
+    #[serde(
+        serialize_with = "opt_ser_rfc3339_millis",
+        deserialize_with = "opt_deser_rfc3339_millis"
+    )]
+    pub heatmap_mtime: Option<SystemTime>,
+
+    /// The number of layers currently on-disk
+    pub layers_downloaded: usize,
+    /// The number of layers in the most recently seen heatmap
+    pub layers_total: usize,
+
+    /// The number of layer bytes currently on-disk
+    pub bytes_downloaded: u64,
+    /// The number of layer bytes in the most recently seen heatmap
+    pub bytes_total: u64,
+}
+
+fn opt_ser_rfc3339_millis<S: serde::Serializer>(
+    ts: &Option<SystemTime>,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    match ts {
+        Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
+        None => serializer.serialize_none(),
+    }
+}
+
+fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
+    match s {
+        None => Ok(None),
+        Some(s) => humantime::parse_rfc3339(&s)
+            .map_err(serde::de::Error::custom)
+            .map(Some),
+    }
+}
+
 pub mod virtual_file {
     #[derive(
         Copy,
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 1e337bc1e8..5fff3e25c9 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -157,9 +157,8 @@ impl AzureBlobStorage {
             let mut bufs = Vec::new();
             while let Some(part) = response.next().await {
                 let part = part?;
-                let etag_str: &str = part.blob.properties.etag.as_ref();
                 if etag.is_none() {
-                    etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
+                    etag = Some(part.blob.properties.etag);
                 }
                 if last_modified.is_none() {
                     last_modified = Some(part.blob.properties.last_modified.into());
@@ -180,6 +179,7 @@ impl AzureBlobStorage {
                     "Azure GET response contained no buffers"
                 )));
             }
+            // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
             let etag = etag.unwrap();
             let last_modified = last_modified.unwrap();
 
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index fd832eb94f..ab2035f19a 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -42,6 +42,9 @@ pub use self::{
 };
 use s3_bucket::RequestKind;
 
+/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
+pub use azure_core::Etag;
+
 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
@@ -293,7 +296,7 @@ pub struct Download {
     /// The last time the file was modified (`last-modified` HTTP header)
     pub last_modified: SystemTime,
     /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: String,
+    pub etag: Etag,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index ea0756541b..313d8226b1 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -30,6 +30,7 @@ use crate::{
 };
 
 use super::{RemoteStorage, StorageMetadata};
+use crate::Etag;
 
 const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
 
@@ -626,9 +627,9 @@ async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, Downlo
 // Use mtime as stand-in for ETag.  We could calculate a meaningful one by md5'ing the contents of files we
 // read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
 // quickly, with less overhead than using a mock S3 server.
-fn mock_etag(meta: &std::fs::Metadata) -> String {
+fn mock_etag(meta: &std::fs::Metadata) -> Etag {
     let mtime = meta.modified().expect("Filesystem mtime missing");
-    format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis())
+    format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
 }
 
 #[cfg(test)]
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 56bc32ebdd..1cb85cfb1b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -289,7 +289,8 @@ impl S3Bucket {
         let metadata = object_output.metadata().cloned().map(StorageMetadata);
         let etag = object_output
             .e_tag
-            .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?;
+            .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
+            .into();
         let last_modified = object_output
             .last_modified
             .ok_or(DownloadError::Other(anyhow::anyhow!(
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 1a8f7e0524..ab55d2b0a3 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -249,13 +249,26 @@ impl Client {
         Ok(())
     }
 
-    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
-        let uri = format!(
+    pub async fn tenant_secondary_download(
+        &self,
+        tenant_id: TenantShardId,
+        wait: Option<std::time::Duration>,
+    ) -> Result<(StatusCode, SecondaryProgress)> {
+        let mut path = reqwest::Url::parse(&format!(
             "{}/v1/tenant/{}/secondary/download",
             self.mgmt_api_endpoint, tenant_id
-        );
-        self.request(Method::POST, &uri, ()).await?;
-        Ok(())
+        ))
+        .expect("Cannot build URL");
+
+        if let Some(wait) = wait {
+            path.query_pairs_mut()
+                .append_pair("wait_ms", &format!("{}", wait.as_millis()));
+        }
+
+        let response = self.request(Method::POST, path, ()).await?;
+        let status = response.status();
+        let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
+        Ok((status, progress))
     }
 
     pub async fn location_config(
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 4823710fb5..0771229845 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -965,12 +965,28 @@ paths:
         required: true
         schema:
           type: string
+      - name: wait_ms
+        description: If set, we will wait this long for download to complete, and if it isn't complete then return 202
+        in: query
+        required: false
+        schema:
+          type: integer
     post:
       description: |
         If the location is in secondary mode, download latest heatmap and layers
       responses:
         "200":
           description: Success
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SecondaryProgress"
+        "202":
+          description: Download has started but not yet finished
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SecondaryProgress"
         "500":
           description: Generic operation error
           content:
@@ -1623,6 +1639,37 @@ components:
             Lower is better score for how good this pageserver would be for the next tenant.
             The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
 
+    SecondaryProgress:
+      type: object
+      required:
+        - heatmap_mtime
+        - layers_downloaded
+        - layers_total
+        - bytes_downloaded
+        - bytes_total
+      properties:
+        heatmap_mtime:
+          type: string
+          format: date-time
+          description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
+        layers_downloaded:
+          type: integer
+          format: int64
+          description: How many layers from the latest layer heatmap are present on disk
+        bytes_downloaded:
+          type: integer
+          format: int64
+          description: How many bytes of layer content from the latest layer heatmap are present on disk
+        layers_total:
+          type: integer
+          format: int64
+          description: How many layers were in the latest layer heatmap
+        bytes_total:
+          type: integer
+          format: int64
+          description: How many bytes of layer content were in the latest layer heatmap
+
+
     Error:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7d3ede21ce..6d98d3f746 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1987,13 +1987,42 @@ async fn secondary_download_handler(
 ) -> Result<Response<Body>, ApiError> {
     let state = get_state(&request);
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    state
-        .secondary_controller
-        .download_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+    let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
 
-    json_response(StatusCode::OK, ())
+    // We don't need this to issue the download request, but:
+    // - it enables us to cleanly return 404 if we get a request for an absent shard
+    // - we will use this to provide status feedback in the response
+    let Some(secondary_tenant) = state
+        .tenant_manager
+        .get_secondary_tenant_shard(tenant_shard_id)
+    else {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
+        ));
+    };
+
+    let timeout = wait.unwrap_or(Duration::MAX);
+
+    let status = match tokio::time::timeout(
+        timeout,
+        state.secondary_controller.download_tenant(tenant_shard_id),
+    )
+    .await
+    {
+        // Download job ran to completion.
+        Ok(Ok(())) => StatusCode::OK,
+        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
+        // okay.  We could get an error here in the unlikely edge case that the tenant
+        // was detached between our check above and executing the download job.
+        Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
+        // A timeout is not an error: we have started the download, we're just not done
+        // yet.  The caller will get a response body indicating status.
+        Err(_) => StatusCode::ACCEPTED,
+    };
+
+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
+    json_response(status, progress)
 }
 
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 14e88b836e..19f36c722e 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant {
     shard_identity: ShardIdentity,
     tenant_conf: std::sync::Mutex<TenantConfOpt>,
 
+    // Internal state used by the Downloader.
     detail: std::sync::Mutex<SecondaryDetail>,
+
+    // Public state indicating overall progress of downloads relative to the last heatmap seen
+    pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
 }
 
 impl SecondaryTenant {
@@ -118,6 +122,8 @@ impl SecondaryTenant {
             tenant_conf: std::sync::Mutex::new(tenant_conf),
 
             detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
+
+            progress: std::sync::Mutex::default(),
         })
     }
 
@@ -247,9 +253,12 @@ impl SecondaryTenant {
 }
 
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
-/// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
-/// where we want to immediately upload/download for a particular tenant.  In normal operation
-/// uploads & downloads are autonomous and not driven by this interface.
+/// and heatmap uploads.  This is not a hot data path: it's used for:
+/// - Live migrations, where we want to ensure a migration destination has the freshest possible
+///   content before trying to cut over.
+/// - Tests, where we want to immediately upload/download for a particular tenant.
+///
+/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
     upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
     download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index b679077358..a595096133 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -41,14 +41,16 @@ use crate::tenant::{
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
+use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use rand::Rng;
-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
 
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
-    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
+    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
+    id::TimelineId,
 };
 
 use super::{
@@ -128,6 +130,7 @@ pub(super) struct SecondaryDetail {
     pub(super) config: SecondaryLocationConfig,
 
     last_download: Option<Instant>,
+    last_etag: Option<Etag>,
     next_download: Option<Instant>,
     pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
@@ -138,11 +141,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
     datetime.format("%d/%m/%Y %T")
 }
 
+/// Information returned from download function when it detects the heatmap has changed
+struct HeatMapModified {
+    etag: Etag,
+    last_modified: SystemTime,
+    bytes: Vec<u8>,
+}
+
+enum HeatMapDownload {
+    // The heatmap's etag has changed: return the new etag, mtime and the body bytes
+    Modified(HeatMapModified),
+    // The heatmap's etag is unchanged
+    Unmodified,
+}
+
 impl SecondaryDetail {
     pub(super) fn new(config: SecondaryLocationConfig) -> Self {
         Self {
             config,
             last_download: None,
+            last_etag: None,
             next_download: None,
             timelines: HashMap::new(),
         }
@@ -477,11 +495,31 @@ impl<'a> TenantDownloader<'a> {
         };
 
         let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+
+        // We will use the etag from last successful download to make the download conditional on changes
+        let last_etag = self
+            .secondary_state
+            .detail
+            .lock()
+            .unwrap()
+            .last_etag
+            .clone();
+
         // Download the tenant's heatmap
-        let heatmap_bytes = tokio::select!(
-            bytes = self.download_heatmap() => {bytes?},
+        let HeatMapModified {
+            last_modified: heatmap_mtime,
+            etag: heatmap_etag,
+            bytes: heatmap_bytes,
+        } = match tokio::select!(
+            bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
             _ = self.secondary_state.cancel.cancelled() => return Ok(())
-        );
+        ) {
+            HeatMapDownload::Unmodified => {
+                tracing::info!("Heatmap unchanged since last successful download");
+                return Ok(());
+            }
+            HeatMapDownload::Modified(m) => m,
+        };
 
         let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
 
@@ -498,6 +536,14 @@ impl<'a> TenantDownloader<'a> {
 
         tracing::debug!("Wrote local heatmap to {}", heatmap_path);
 
+        // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
+        // principle that deletions should be done before writes wherever possible, and so that we can use this
+        // phase to initialize our SecondaryProgress.
+        {
+            *self.secondary_state.progress.lock().unwrap() =
+                self.prepare_timelines(&heatmap, heatmap_mtime).await?;
+        }
+
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
             if self.secondary_state.cancel.is_cancelled() {
@@ -515,30 +561,159 @@ impl<'a> TenantDownloader<'a> {
                 .await?;
         }
 
+        // Only update last_etag after a full successful download: this way will not skip
+        // the next download, even if the heatmap's actual etag is unchanged.
+        self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
+
         Ok(())
     }
 
-    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
+    /// Do any fast local cleanup that comes before the much slower process of downloading
+    /// layers from remote storage.  In the process, initialize the SecondaryProgress object
+    /// that will later be updated incrementally as we download layers.
+    async fn prepare_timelines(
+        &self,
+        heatmap: &HeatMapTenant,
+        heatmap_mtime: SystemTime,
+    ) -> Result<SecondaryProgress, UpdateError> {
+        let heatmap_stats = heatmap.get_stats();
+        // We will construct a progress object, and then populate its initial "downloaded" numbers
+        // while iterating through local layer state in [`Self::prepare_timelines`]
+        let mut progress = SecondaryProgress {
+            layers_total: heatmap_stats.layers,
+            bytes_total: heatmap_stats.bytes,
+            heatmap_mtime: Some(heatmap_mtime),
+            layers_downloaded: 0,
+            bytes_downloaded: 0,
+        };
+        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
+        let mut delete_layers = Vec::new();
+        let mut delete_timelines = Vec::new();
+        {
+            let mut detail = self.secondary_state.detail.lock().unwrap();
+            for (timeline_id, timeline_state) in &mut detail.timelines {
+                let Some(heatmap_timeline_index) = heatmap
+                    .timelines
+                    .iter()
+                    .position(|t| t.timeline_id == *timeline_id)
+                else {
+                    // This timeline is no longer referenced in the heatmap: delete it locally
+                    delete_timelines.push(*timeline_id);
+                    continue;
+                };
+
+                let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
+
+                let layers_in_heatmap = heatmap_timeline
+                    .layers
+                    .iter()
+                    .map(|l| &l.name)
+                    .collect::<HashSet<_>>();
+                let layers_on_disk = timeline_state
+                    .on_disk_layers
+                    .iter()
+                    .map(|l| l.0)
+                    .collect::<HashSet<_>>();
+
+                let mut layer_count = layers_on_disk.len();
+                let mut layer_byte_count: u64 = timeline_state
+                    .on_disk_layers
+                    .values()
+                    .map(|l| l.metadata.file_size())
+                    .sum();
+
+                // Remove on-disk layers that are no longer present in heatmap
+                for layer in layers_on_disk.difference(&layers_in_heatmap) {
+                    layer_count -= 1;
+                    layer_byte_count -= timeline_state
+                        .on_disk_layers
+                        .get(layer)
+                        .unwrap()
+                        .metadata
+                        .file_size();
+
+                    delete_layers.push((*timeline_id, (*layer).clone()));
+                }
+
+                progress.bytes_downloaded += layer_byte_count;
+                progress.layers_downloaded += layer_count;
+            }
+        }
+
+        // Execute accumulated deletions
+        for (timeline_id, layer_name) in delete_layers {
+            let timeline_path = self
+                .conf
+                .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
+            let local_path = timeline_path.join(layer_name.to_string());
+            tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
+
+            tokio::fs::remove_file(&local_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+                .maybe_fatal_err("Removing secondary layer")?;
+
+            // Update in-memory housekeeping to reflect the absence of the deleted layer
+            let mut detail = self.secondary_state.detail.lock().unwrap();
+            let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
+                continue;
+            };
+            timeline_state.on_disk_layers.remove(&layer_name);
+        }
+
+        for timeline_id in delete_timelines {
+            let timeline_path = self
+                .conf
+                .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
+            tracing::info!(timeline_id=%timeline_id,
+                "Timeline no longer in heatmap, removing from secondary location"
+            );
+            tokio::fs::remove_dir_all(&timeline_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+                .maybe_fatal_err("Removing secondary timeline")?;
+        }
+
+        Ok(progress)
+    }
+
+    /// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object
+    /// still matches `prev_etag`.
+    async fn download_heatmap(
+        &self,
+        prev_etag: Option<&Etag>,
+    ) -> Result<HeatMapDownload, UpdateError> {
         debug_assert_current_span_has_tenant_id();
         let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        // TODO: make download conditional on ETag having changed since last download
+        // TODO: pull up etag check into the request, to do a conditional GET rather than
+        // issuing a GET and then maybe ignoring the response body
         // (https://github.com/neondatabase/neon/issues/6199)
         tracing::debug!("Downloading heatmap for secondary tenant",);
 
         let heatmap_path = remote_heatmap_path(tenant_shard_id);
         let cancel = &self.secondary_state.cancel;
 
-        let heatmap_bytes = backoff::retry(
+        backoff::retry(
             || async {
                 let download = self
                     .remote_storage
                     .download(&heatmap_path, cancel)
                     .await
                     .map_err(UpdateError::from)?;
-                let mut heatmap_bytes = Vec::new();
-                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
-                Ok(heatmap_bytes)
+
+                if Some(&download.etag) == prev_etag {
+                    Ok(HeatMapDownload::Unmodified)
+                } else {
+                    let mut heatmap_bytes = Vec::new();
+                    let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+                    let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
+                    SECONDARY_MODE.download_heatmap.inc();
+                    Ok(HeatMapDownload::Modified(HeatMapModified {
+                        etag: download.etag,
+                        last_modified: download.last_modified,
+                        bytes: heatmap_bytes,
+                    }))
+                }
             },
             |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
             FAILED_DOWNLOAD_WARN_THRESHOLD,
@@ -548,11 +723,7 @@ impl<'a> TenantDownloader<'a> {
         )
         .await
         .ok_or_else(|| UpdateError::Cancelled)
-        .and_then(|x| x)?;
-
-        SECONDARY_MODE.download_heatmap.inc();
-
-        Ok(heatmap_bytes)
+        .and_then(|x| x)
     }
 
     async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
@@ -593,27 +764,6 @@ impl<'a> TenantDownloader<'a> {
             }
         };
 
-        let layers_in_heatmap = timeline
-            .layers
-            .iter()
-            .map(|l| &l.name)
-            .collect::<HashSet<_>>();
-        let layers_on_disk = timeline_state
-            .on_disk_layers
-            .iter()
-            .map(|l| l.0)
-            .collect::<HashSet<_>>();
-
-        // Remove on-disk layers that are no longer present in heatmap
-        for layer in layers_on_disk.difference(&layers_in_heatmap) {
-            let local_path = timeline_path.join(layer.to_string());
-            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
-            tokio::fs::remove_file(&local_path)
-                .await
-                .or_else(fs_ext::ignore_not_found)
-                .maybe_fatal_err("Removing secondary layer")?;
-        }
-
         // Download heatmap layers that are not present on local disk, or update their
         // access time if they are already present.
         for layer in timeline.layers {
@@ -662,6 +812,12 @@ impl<'a> TenantDownloader<'a> {
                 }
             }
 
+            // Failpoint for simulating slow remote storage
+            failpoint_support::sleep_millis_async!(
+                "secondary-layer-download-sleep",
+                &self.secondary_state.cancel
+            );
+
             // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
             let downloaded_bytes = match download_layer_file(
                 self.conf,
@@ -701,6 +857,11 @@ impl<'a> TenantDownloader<'a> {
                 tokio::fs::remove_file(&local_path)
                     .await
                     .or_else(fs_ext::ignore_not_found)?;
+            } else {
+                tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
+                let mut progress = self.secondary_state.progress.lock().unwrap();
+                progress.bytes_downloaded += downloaded_bytes;
+                progress.layers_downloaded += 1;
             }
 
             SECONDARY_MODE.download_layer.inc();
diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 99aaaeb8c8..73cdf6c6d4 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -62,3 +62,25 @@ impl HeatMapTimeline {
         }
     }
 }
+
+pub(crate) struct HeatMapStats {
+    pub(crate) bytes: u64,
+    pub(crate) layers: usize,
+}
+
+impl HeatMapTenant {
+    pub(crate) fn get_stats(&self) -> HeatMapStats {
+        let mut stats = HeatMapStats {
+            bytes: 0,
+            layers: 0,
+        };
+        for timeline in &self.timelines {
+            for layer in &timeline.layers {
+                stats.layers += 1;
+                stats.bytes += layer.metadata.file_size;
+            }
+        }
+
+        stats
+    }
+}
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 70d3076371..56b23cef59 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1525,6 +1525,7 @@ class NeonCli(AbstractNeonCli):
         conf: Optional[Dict[str, Any]] = None,
         shard_count: Optional[int] = None,
         shard_stripe_size: Optional[int] = None,
+        placement_policy: Optional[str] = None,
         set_default: bool = False,
     ) -> Tuple[TenantId, TimelineId]:
         """
@@ -1558,6 +1559,9 @@ class NeonCli(AbstractNeonCli):
         if shard_stripe_size is not None:
             args.extend(["--shard-stripe-size", str(shard_stripe_size)])
 
+        if placement_policy is not None:
+            args.extend(["--placement-policy", str(placement_policy)])
+
         res = self.raw_cli(args)
         res.check_returncode()
         return tenant_id, timeline_id
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 4e355b73a9..99ec894106 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -357,9 +357,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
         self.verbose_error(res)
 
-    def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
+    def tenant_secondary_download(
+        self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None
+    ) -> tuple[int, dict[Any, Any]]:
+        url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download"
+        if wait_ms is not None:
+            url = url + f"?wait_ms={wait_ms}"
+        res = self.post(url)
         self.verbose_error(res)
+        return (res.status_code, res.json())
 
     def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
         assert "tenant_id" not in config.keys()
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 79145f61b3..8ef75414a3 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -1,4 +1,5 @@
 import json
+import os
 import random
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -553,3 +554,103 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
             )
         ),
     )
+
+
+@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
+@pytest.mark.parametrize("via_controller", [True, False])
+def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):
+    """
+    Test use of secondary download API for slow downloads, where slow means either a healthy
+    system with a large capacity shard, or some unhealthy remote storage.
+
+    The download API is meant to respect a client-supplied time limit, and return 200 or 202
+    selectively based on whether the download completed.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}'
+    )
+
+    attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
+    ps_attached = env.get_pageserver(attached_to_id)
+    ps_secondary = next(p for p in env.pageservers if p != ps_attached)
+
+    # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis)
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+
+    # Expect lots of layers
+    assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10
+
+    # Simulate large data by making layer downloads artifically slow
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
+
+    # Upload a heatmap, so that secondaries have something to download
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+
+    if via_controller:
+        http_client = env.storage_controller.pageserver_api()
+        http_client.tenant_location_conf(
+            tenant_id,
+            {
+                "mode": "Secondary",
+                "secondary_conf": {"warm": True},
+                "tenant_conf": {},
+                "generation": None,
+            },
+        )
+    else:
+        http_client = ps_secondary.http_client()
+
+    # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms
+    (status, progress_1) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000)
+    assert status == 202
+    assert progress_1["heatmap_mtime"] is not None
+    assert progress_1["layers_downloaded"] > 0
+    assert progress_1["bytes_downloaded"] > 0
+    assert progress_1["layers_total"] > progress_1["layers_downloaded"]
+    assert progress_1["bytes_total"] > progress_1["bytes_downloaded"]
+
+    # Multiple polls should work: use a shorter wait period this time
+    (status, progress_2) = http_client.tenant_secondary_download(tenant_id, wait_ms=1000)
+    assert status == 202
+    assert progress_2["heatmap_mtime"] is not None
+    assert progress_2["layers_downloaded"] > 0
+    assert progress_2["bytes_downloaded"] > 0
+    assert progress_2["layers_total"] > progress_2["layers_downloaded"]
+    assert progress_2["bytes_total"] > progress_2["bytes_downloaded"]
+
+    # Progress should be >= the first poll: this can only go backward if we see a new heatmap,
+    # and the heatmap period on the attached node is much longer than the runtime of this test, so no
+    # new heatmap should have been uploaded.
+    assert progress_2["layers_downloaded"] >= progress_1["layers_downloaded"]
+    assert progress_2["bytes_downloaded"] >= progress_1["bytes_downloaded"]
+    assert progress_2["layers_total"] == progress_1["layers_total"]
+    assert progress_2["bytes_total"] == progress_1["bytes_total"]
+
+    # Make downloads fast again: when the download completes within this last request, we
+    # get a 200 instead of a 202
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")])
+    (status, progress_3) = http_client.tenant_secondary_download(tenant_id, wait_ms=20000)
+    assert status == 200
+    assert progress_3["heatmap_mtime"] is not None
+    assert progress_3["layers_total"] == progress_3["layers_downloaded"]
+    assert progress_3["bytes_total"] == progress_3["bytes_downloaded"]

From 0694ee9531ffa2f4391cf0f27c89b5afab1ba052 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 15 Mar 2024 20:46:15 +0100
Subject: [PATCH 28/43] tokio-epoll-uring: retry on launch failures due to
 locked memory (#7141)

refs https://github.com/neondatabase/neon/issues/7136

Problem
-------

Before this PR, we were using
`tokio_epoll_uring::thread_local_system()`,
which panics on tokio_epoll_uring::System::launch() failure

As we've learned in [the

past](https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391),
some older Linux kernels account io_uring instances as locked memory.

And while we've raised the limit in prod considerably, we did hit it
once on 2024-03-11 16:30 UTC.
That was after we enabled tokio-epoll-uring fleet-wide, but before
we had shipped release-5090 (c6ed86d3d0690b52e7014b6a696effa95714e8cb)
which did away with the last mass-creation of tokio-epoll-uring
instances as per

    commit 3da410c8fee05b0cd65a5c0b83fffa3d5680cd77
    Author: Christian Schwarz <christian@neon.tech>
    Date:   Tue Mar 5 10:03:54 2024 +0100

tokio-epoll-uring: use it on the layer-creating code paths (#6378)

Nonetheless, it highlighted that panicking in this situation is probably
not ideal, as it can leave the pageserver process in a semi-broken
state.

Further, due to low sampling rate of Prometheus metrics, we don't know
much about the circumstances of this failure instance.

Solution
--------

This PR implements a custom thread_local_system() that is
pageserver-aware
and will do the following on failure:
- dump relevant stats to `tracing!`, hopefully they will be useful to
  understand the circumstances better
- if it's the locked memory failure (or any other ENOMEM): abort() the
  process
- if it's ENOMEM, retry with exponential back-off, capped at 3s.
- add metric counters so we can create an alert

This makes sense in the production environment where we know that
_usually_, there's ample locked memory allowance available, and we know
the failure rate is rare.
---
 Cargo.lock                                    |   2 +
 clippy.toml                                   |   2 +
 pageserver/Cargo.toml                         |   1 +
 pageserver/src/metrics.rs                     |  27 ++-
 pageserver/src/virtual_file/io_engine.rs      |  14 +-
 .../io_engine/tokio_epoll_uring_ext.rs        | 194 ++++++++++++++++++
 pageserver/src/virtual_file/open_options.rs   |   2 +-
 workspace_hack/Cargo.toml                     |   2 +
 8 files changed, 234 insertions(+), 10 deletions(-)
 create mode 100644 pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs

diff --git a/Cargo.lock b/Cargo.lock
index 99ba8b1cb3..022dc11f07 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3530,6 +3530,7 @@ dependencies = [
  "postgres_connection",
  "postgres_ffi",
  "pq_proto",
+ "procfs",
  "rand 0.8.5",
  "regex",
  "remote_storage",
@@ -6987,6 +6988,7 @@ dependencies = [
  "axum",
  "base64 0.21.1",
  "base64ct",
+ "byteorder",
  "bytes",
  "cc",
  "chrono",
diff --git a/clippy.toml b/clippy.toml
index 5f7dc66152..4c0c04f9a1 100644
--- a/clippy.toml
+++ b/clippy.toml
@@ -2,6 +2,8 @@ disallowed-methods = [
     "tokio::task::block_in_place",
     # Allow this for now, to deny it later once we stop using Handle::block_on completely
     # "tokio::runtime::Handle::block_on",
+    # use tokio_epoll_uring_ext instead
+    "tokio_epoll_uring::thread_local_system",
 ]
 
 disallowed-macros = [
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 5adeaffe1a..2702a2040a 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -48,6 +48,7 @@ postgres.workspace = true
 postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
+procfs.workspace = true
 rand.workspace = true
 regex.workspace = true
 scopeguard.workspace = true
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 03537ddb05..075bb76a1b 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2465,7 +2465,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }
 
 pub mod tokio_epoll_uring {
-    use metrics::UIntGauge;
+    use metrics::{register_int_counter, UIntGauge};
+    use once_cell::sync::Lazy;
 
     pub struct Collector {
         descs: Vec<metrics::core::Desc>,
@@ -2473,15 +2474,13 @@ pub mod tokio_epoll_uring {
         systems_destroyed: UIntGauge,
     }
 
-    const NMETRICS: usize = 2;
-
     impl metrics::core::Collector for Collector {
         fn desc(&self) -> Vec<&metrics::core::Desc> {
             self.descs.iter().collect()
         }
 
         fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
-            let mut mfs = Vec::with_capacity(NMETRICS);
+            let mut mfs = Vec::with_capacity(Self::NMETRICS);
             let tokio_epoll_uring::metrics::Metrics {
                 systems_created,
                 systems_destroyed,
@@ -2495,6 +2494,8 @@ pub mod tokio_epoll_uring {
     }
 
     impl Collector {
+        const NMETRICS: usize = 2;
+
         #[allow(clippy::new_without_default)]
         pub fn new() -> Self {
             let mut descs = Vec::new();
@@ -2528,6 +2529,22 @@ pub mod tokio_epoll_uring {
             }
         }
     }
+
+    pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
+        register_int_counter!(
+            "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
+            "Number of times where thread_local_system creation spanned multiple executor threads",
+        )
+        .unwrap()
+    });
+
+    pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
+        register_int_counter!(
+            "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
+            "Number of times thread_local_system creation failed and was retried after back-off.",
+        )
+        .unwrap()
+    });
 }
 
 pub(crate) mod tenant_throttling {
@@ -2656,6 +2673,8 @@ pub fn preinitialize_metrics() {
         &WALRECEIVER_BROKER_UPDATES,
         &WALRECEIVER_CANDIDATES_ADDED,
         &WALRECEIVER_CANDIDATES_REMOVED,
+        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
+        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
     ]
     .into_iter()
     .for_each(|c| {
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 55fa59e53b..2dd0ce64d6 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -6,6 +6,10 @@
 //! Initialize using [`init`].
 //!
 //! Then use [`get`] and  [`super::OpenOptions`].
+//!
+//!
+
+pub(super) mod tokio_epoll_uring_ext;
 
 use tokio_epoll_uring::{IoBuf, Slice};
 use tracing::Instrument;
@@ -145,7 +149,7 @@ impl IoEngine {
             }
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
-                let system = tokio_epoll_uring::thread_local_system().await;
+                let system = tokio_epoll_uring_ext::thread_local_system().await;
                 let (resources, res) = system.read(file_guard, offset, buf).await;
                 (resources, res.map_err(epoll_uring_error_to_std))
             }
@@ -160,7 +164,7 @@ impl IoEngine {
             }
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
-                let system = tokio_epoll_uring::thread_local_system().await;
+                let system = tokio_epoll_uring_ext::thread_local_system().await;
                 let (resources, res) = system.fsync(file_guard).await;
                 (resources, res.map_err(epoll_uring_error_to_std))
             }
@@ -178,7 +182,7 @@ impl IoEngine {
             }
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
-                let system = tokio_epoll_uring::thread_local_system().await;
+                let system = tokio_epoll_uring_ext::thread_local_system().await;
                 let (resources, res) = system.fdatasync(file_guard).await;
                 (resources, res.map_err(epoll_uring_error_to_std))
             }
@@ -197,7 +201,7 @@ impl IoEngine {
             }
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
-                let system = tokio_epoll_uring::thread_local_system().await;
+                let system = tokio_epoll_uring_ext::thread_local_system().await;
                 let (resources, res) = system.statx(file_guard).await;
                 (
                     resources,
@@ -220,7 +224,7 @@ impl IoEngine {
             }
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
-                let system = tokio_epoll_uring::thread_local_system().await;
+                let system = tokio_epoll_uring_ext::thread_local_system().await;
                 let (resources, res) = system.write(file_guard, offset, buf).await;
                 (resources, res.map_err(epoll_uring_error_to_std))
             }
diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
new file mode 100644
index 0000000000..c4b10f3a24
--- /dev/null
+++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
@@ -0,0 +1,194 @@
+//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific
+//! handling in case the instance can't launched.
+//!
+//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation
+//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series.
+//! See <https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391> for more details.
+
+use std::sync::atomic::AtomicU32;
+use std::sync::Arc;
+
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, info_span, warn, Instrument};
+use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
+
+use tokio_epoll_uring::{System, SystemHandle};
+
+use crate::virtual_file::on_fatal_io_error;
+
+use crate::metrics::tokio_epoll_uring as metrics;
+
+#[derive(Clone)]
+struct ThreadLocalState(Arc<ThreadLocalStateInner>);
+
+struct ThreadLocalStateInner {
+    cell: tokio::sync::OnceCell<SystemHandle>,
+    launch_attempts: AtomicU32,
+}
+
+impl ThreadLocalState {
+    pub fn new() -> Self {
+        Self(Arc::new(ThreadLocalStateInner {
+            cell: tokio::sync::OnceCell::default(),
+            launch_attempts: AtomicU32::new(0),
+        }))
+    }
+    pub fn make_id_string(&self) -> String {
+        format!("0x{:p}", Arc::as_ptr(&self.0))
+    }
+}
+
+impl Drop for ThreadLocalState {
+    fn drop(&mut self) {
+        info!(parent: None, id=%self.make_id_string(), "tokio-epoll-uring_ext: ThreadLocalState is being dropped and id might be re-used in the future");
+    }
+}
+
+thread_local! {
+    static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new();
+}
+
+/// Panics if we cannot [`System::launch`].
+pub async fn thread_local_system() -> Handle {
+    let fake_cancel = CancellationToken::new();
+    loop {
+        let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone());
+        let inner = &thread_local_state.0;
+        let get_or_init_res = inner
+            .cell
+            .get_or_try_init(|| async {
+                let attempt_no = inner
+                    .launch_attempts
+                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no);
+                async {
+                    // Rate-limit retries per thread-local.
+                    // NB: doesn't yield to executor at attempt_no=0.
+                    utils::backoff::exponential_backoff(
+                        attempt_no,
+                        DEFAULT_BASE_BACKOFF_SECONDS,
+                        DEFAULT_MAX_BACKOFF_SECONDS,
+                        &fake_cancel,
+                    )
+                    .await;
+                    let res = System::launch()
+                    // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
+                    .await;
+                    match res {
+                        Ok(system) => {
+                            info!("successfully launched system");
+                            metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
+                            Ok(system)
+                        }
+                        Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
+                            warn!("not enough locked memory to tokio-epoll-uring, will retry");
+                            info_span!("stats").in_scope(|| {
+                                emit_launch_failure_process_stats();
+                            });
+                            metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
+                            Err(())
+                        }
+                        // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
+                        // This is equivalent to a fatal IO error.
+                        Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => {
+                            error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process");
+                            info_span!("stats").in_scope(|| {
+                                emit_launch_failure_process_stats();
+                            });
+                            on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring");
+                        },
+                    }
+                }
+                .instrument(span)
+                .await
+            })
+            .await;
+        if get_or_init_res.is_ok() {
+            return Handle(thread_local_state);
+        }
+    }
+}
+
+fn emit_launch_failure_process_stats() {
+    // tokio-epoll-uring stats
+    // vmlck + rlimit
+    // number of threads
+    // rss / system memory usage generally
+
+    let tokio_epoll_uring::metrics::Metrics {
+        systems_created,
+        systems_destroyed,
+    } = tokio_epoll_uring::metrics::global();
+    info!(systems_created, systems_destroyed, "tokio-epoll-uring");
+
+    match procfs::process::Process::myself() {
+        Ok(myself) => {
+            match myself.limits() {
+                Ok(limits) => {
+                    info!(?limits.max_locked_memory, "/proc/self/limits");
+                }
+                Err(error) => {
+                    info!(%error, "no limit stats due to error");
+                }
+            }
+
+            match myself.status() {
+                Ok(status) => {
+                    let procfs::process::Status {
+                        vmsize,
+                        vmlck,
+                        vmpin,
+                        vmrss,
+                        rssanon,
+                        rssfile,
+                        rssshmem,
+                        vmdata,
+                        vmstk,
+                        vmexe,
+                        vmlib,
+                        vmpte,
+                        threads,
+                        ..
+                    } = status;
+                    info!(
+                        vmsize,
+                        vmlck,
+                        vmpin,
+                        vmrss,
+                        rssanon,
+                        rssfile,
+                        rssshmem,
+                        vmdata,
+                        vmstk,
+                        vmexe,
+                        vmlib,
+                        vmpte,
+                        threads,
+                        "/proc/self/status"
+                    );
+                }
+                Err(error) => {
+                    info!(%error, "no status status due to error");
+                }
+            }
+        }
+        Err(error) => {
+            info!(%error, "no process stats due to error");
+        }
+    };
+}
+
+#[derive(Clone)]
+pub struct Handle(ThreadLocalState);
+
+impl std::ops::Deref for Handle {
+    type Target = SystemHandle;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+             .0
+            .cell
+            .get()
+            .expect("must be already initialized when using this")
+    }
+}
diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs
index f75edb0bac..7f951270d1 100644
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -98,7 +98,7 @@ impl OpenOptions {
             OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
             #[cfg(target_os = "linux")]
             OpenOptions::TokioEpollUring(x) => {
-                let system = tokio_epoll_uring::thread_local_system().await;
+                let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await;
                 system.open(path, x).await.map_err(|e| match e {
                     tokio_epoll_uring::Error::Op(e) => e,
                     tokio_epoll_uring::Error::System(system) => {
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 8593b752c2..0646091006 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -24,6 +24,7 @@ aws-smithy-types = { version = "1", default-features = false, features = ["byte-
 axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
+byteorder = { version = "1", features = ["i128"] }
 bytes = { version = "1", features = ["serde"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 clap = { version = "4", features = ["derive", "string"] }
@@ -86,6 +87,7 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std
 
 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
+byteorder = { version = "1", features = ["i128"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }

From 5cec5cb3cf4c1fc3004ae6fc412f96dccd91014e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 15 Mar 2024 20:48:51 +0100
Subject: [PATCH 29/43] fixup(#7120): the macOS code used an outdated constant
 name, broke the build (#7150)


From 30a3d80d2fe881f375483080f1370913d47704bf Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 18 Mar 2024 11:28:45 +0200
Subject: [PATCH 30/43] build: make procfs linux only dependency (#7156)

the dependency refuses to build on macos so builds on `main` are broken
right now, including the `release` PR.
---
 Cargo.lock                               | 1 -
 pageserver/Cargo.toml                    | 4 +++-
 pageserver/src/virtual_file/io_engine.rs | 1 +
 workspace_hack/Cargo.toml                | 2 --
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 022dc11f07..c4f925e3c7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6988,7 +6988,6 @@ dependencies = [
  "axum",
  "base64 0.21.1",
  "base64ct",
- "byteorder",
  "bytes",
  "cc",
  "chrono",
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 2702a2040a..f304294591 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -48,7 +48,6 @@ postgres.workspace = true
 postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
-procfs.workspace = true
 rand.workspace = true
 regex.workspace = true
 scopeguard.workspace = true
@@ -90,6 +89,9 @@ enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true
 
+[target.'cfg(target_os = "linux")'.dependencies]
+procfs.workspace = true
+
 [dev-dependencies]
 criterion.workspace = true
 hex-literal.workspace = true
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 2dd0ce64d6..7a27be2ca1 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -9,6 +9,7 @@
 //!
 //!
 
+#[cfg(target_os = "linux")]
 pub(super) mod tokio_epoll_uring_ext;
 
 use tokio_epoll_uring::{IoBuf, Slice};
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 0646091006..8593b752c2 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -24,7 +24,6 @@ aws-smithy-types = { version = "1", default-features = false, features = ["byte-
 axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
-byteorder = { version = "1", features = ["i128"] }
 bytes = { version = "1", features = ["serde"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 clap = { version = "4", features = ["derive", "string"] }
@@ -87,7 +86,6 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std
 
 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-byteorder = { version = "1", features = ["i128"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }

From 1d3ae57f18462e150a74d044c4f59c34a6c27e69 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 18 Mar 2024 10:37:20 +0000
Subject: [PATCH 31/43] pageserver: refactoring in TenantManager to reduce
 duplication (#6732)

## Problem

Followup to https://github.com/neondatabase/neon/pull/6725

In that PR, code for purging local files from a tenant shard was
duplicated.

## Summary of changes

- Refactor detach code into TenantManager
- `spawn_background_purge` method can now be common between detach and
split operations
---
 pageserver/src/http/routes.rs |  37 ++---
 pageserver/src/tenant/mgr.rs  | 259 +++++++++++++++++-----------------
 2 files changed, 151 insertions(+), 145 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 6d98d3f746..97ffb99465 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -885,14 +885,16 @@ async fn tenant_detach_handler(
 
     let state = get_state(&request);
     let conf = state.conf;
-    mgr::detach_tenant(
-        conf,
-        tenant_shard_id,
-        detach_ignored.unwrap_or(false),
-        &state.deletion_queue_client,
-    )
-    .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
-    .await?;
+    state
+        .tenant_manager
+        .detach_tenant(
+            conf,
+            tenant_shard_id,
+            detach_ignored.unwrap_or(false),
+            &state.deletion_queue_client,
+        )
+        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+        .await?;
 
     json_response(StatusCode::OK, ())
 }
@@ -1403,7 +1405,9 @@ async fn update_tenant_config_handler(
         TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
 
     let state = get_state(&request);
-    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
+    state
+        .tenant_manager
+        .set_new_tenant_config(tenant_conf, tenant_id)
         .instrument(info_span!("tenant_config", %tenant_id))
         .await?;
 
@@ -1428,13 +1432,14 @@ async fn put_tenant_location_config_handler(
     // The `Detached` state is special, it doesn't upsert a tenant, it removes
     // its local disk content and drops it from memory.
     if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) =
-            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
-                .instrument(info_span!("tenant_detach",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                ))
-                .await
+        if let Err(e) = state
+            .tenant_manager
+            .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
+            .instrument(info_span!("tenant_detach",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()
+            ))
+            .await
         {
             match e {
                 TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 3aaab6e4ef..facaaa2ad7 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -633,7 +633,7 @@ pub async fn init_tenant_mgr(
 /// Wrapper for Tenant::spawn that checks invariants before running, and inserts
 /// a broken tenant in the map if Tenant::spawn fails.
 #[allow(clippy::too_many_arguments)]
-pub(crate) fn tenant_spawn(
+fn tenant_spawn(
     conf: &'static PageServerConf,
     tenant_shard_id: TenantShardId,
     tenant_path: &Utf8Path,
@@ -825,40 +825,6 @@ pub(crate) enum SetNewTenantConfigError {
     Other(anyhow::Error),
 }
 
-pub(crate) async fn set_new_tenant_config(
-    conf: &'static PageServerConf,
-    new_tenant_conf: TenantConfOpt,
-    tenant_id: TenantId,
-) -> Result<(), SetNewTenantConfigError> {
-    // Legacy API: does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    info!("configuring tenant {tenant_id}");
-    let tenant = get_tenant(tenant_shard_id, true)?;
-
-    if !tenant.tenant_shard_id().shard_count.is_unsharded() {
-        // Note that we use ShardParameters::default below.
-        return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
-            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
-        )));
-    }
-
-    // This is a legacy API that only operates on attached tenants: the preferred
-    // API to use is the location_config/ endpoint, which lets the caller provide
-    // the full LocationConf.
-    let location_conf = LocationConf::attached_single(
-        new_tenant_conf.clone(),
-        tenant.generation,
-        &ShardParameters::default(),
-    );
-
-    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
-        .await
-        .map_err(SetNewTenantConfigError::Persist)?;
-    tenant.set_new_tenant_config(new_tenant_conf);
-    Ok(())
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum UpsertLocationError {
     #[error("Bad config request: {0}")]
@@ -1661,19 +1627,7 @@ impl TenantManager {
         let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
             .await
             .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::MgmtRequest,
-            None,
-            None,
-            "tenant_files_delete",
-            false,
-            async move {
-                fs::remove_dir_all(tmp_path.as_path())
-                    .await
-                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-            },
-        );
+        self.spawn_background_purge(tmp_path);
 
         fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
             "failpoint"
@@ -1827,6 +1781,134 @@ impl TenantManager {
 
         shutdown_all_tenants0(self.tenants).await
     }
+
+    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+    /// the background, and thereby avoid blocking any API requests on this deletion completing.
+    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
+        // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+        // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+        let task_tenant_id = None;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MgmtRequest,
+            task_tenant_id,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+    }
+
+    pub(crate) async fn detach_tenant(
+        &self,
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
+        deletion_queue_client: &DeletionQueueClient,
+    ) -> Result<(), TenantStateError> {
+        let tmp_path = self
+            .detach_tenant0(
+                conf,
+                &TENANTS,
+                tenant_shard_id,
+                detach_ignored,
+                deletion_queue_client,
+            )
+            .await?;
+        self.spawn_background_purge(tmp_path);
+
+        Ok(())
+    }
+
+    async fn detach_tenant0(
+        &self,
+        conf: &'static PageServerConf,
+        tenants: &std::sync::RwLock<TenantsMap>,
+        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
+        deletion_queue_client: &DeletionQueueClient,
+    ) -> Result<Utf8PathBuf, TenantStateError> {
+        let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
+            let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
+            safe_rename_tenant_dir(&local_tenant_directory)
+                .await
+                .with_context(|| {
+                    format!("local tenant directory {local_tenant_directory:?} rename")
+                })
+        };
+
+        let removal_result = remove_tenant_from_memory(
+            tenants,
+            tenant_shard_id,
+            tenant_dir_rename_operation(tenant_shard_id),
+        )
+        .await;
+
+        // Flush pending deletions, so that they have a good chance of passing validation
+        // before this tenant is potentially re-attached elsewhere.
+        deletion_queue_client.flush_advisory();
+
+        // Ignored tenants are not present in memory and will bail the removal from memory operation.
+        // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
+        if detach_ignored
+            && matches!(
+                removal_result,
+                Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
+            )
+        {
+            let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+            if tenant_ignore_mark.exists() {
+                info!("Detaching an ignored tenant");
+                let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
+                    .await
+                    .with_context(|| {
+                        format!("Ignored tenant {tenant_shard_id} local directory rename")
+                    })?;
+                return Ok(tmp_path);
+            }
+        }
+
+        removal_result
+    }
+
+    pub(crate) async fn set_new_tenant_config(
+        &self,
+        new_tenant_conf: TenantConfOpt,
+        tenant_id: TenantId,
+    ) -> Result<(), SetNewTenantConfigError> {
+        // Legacy API: does not support sharding
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+        info!("configuring tenant {tenant_id}");
+        let tenant = get_tenant(tenant_shard_id, true)?;
+
+        if !tenant.tenant_shard_id().shard_count.is_unsharded() {
+            // Note that we use ShardParameters::default below.
+            return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
+            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
+        )));
+        }
+
+        // This is a legacy API that only operates on attached tenants: the preferred
+        // API to use is the location_config/ endpoint, which lets the caller provide
+        // the full LocationConf.
+        let location_conf = LocationConf::attached_single(
+            new_tenant_conf.clone(),
+            tenant.generation,
+            &ShardParameters::default(),
+        );
+
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
+            .await
+            .map_err(SetNewTenantConfigError::Persist)?;
+        tenant.set_new_tenant_config(new_tenant_conf);
+        Ok(())
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -2028,87 +2110,6 @@ pub(crate) enum TenantStateError {
     Other(#[from] anyhow::Error),
 }
 
-pub(crate) async fn detach_tenant(
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
-) -> Result<(), TenantStateError> {
-    let tmp_path = detach_tenant0(
-        conf,
-        &TENANTS,
-        tenant_shard_id,
-        detach_ignored,
-        deletion_queue_client,
-    )
-    .await?;
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
-    Ok(())
-}
-
-async fn detach_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &std::sync::RwLock<TenantsMap>,
-    tenant_shard_id: TenantShardId,
-    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
-) -> Result<Utf8PathBuf, TenantStateError> {
-    let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
-        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
-        safe_rename_tenant_dir(&local_tenant_directory)
-            .await
-            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
-    };
-
-    let removal_result = remove_tenant_from_memory(
-        tenants,
-        tenant_shard_id,
-        tenant_dir_rename_operation(tenant_shard_id),
-    )
-    .await;
-
-    // Flush pending deletions, so that they have a good chance of passing validation
-    // before this tenant is potentially re-attached elsewhere.
-    deletion_queue_client.flush_advisory();
-
-    // Ignored tenants are not present in memory and will bail the removal from memory operation.
-    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
-    if detach_ignored
-        && matches!(
-            removal_result,
-            Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
-        )
-    {
-        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-        if tenant_ignore_mark.exists() {
-            info!("Detaching an ignored tenant");
-            let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
-                .await
-                .with_context(|| {
-                    format!("Ignored tenant {tenant_shard_id} local directory rename")
-                })?;
-            return Ok(tmp_path);
-        }
-    }
-
-    removal_result
-}
-
 pub(crate) async fn load_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,

From db749914d852382bf3ee579105ecfb35b020f7e5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 18 Mar 2024 13:29:20 +0100
Subject: [PATCH 32/43] fixup(#7141 / tokio_epoll_uring_ext): high frequency
 log message (#7160)

The PR #7141 added log message

```
ThreadLocalState is being dropped and id might be re-used in the future
```

which was supposed to be emitted when the thread-local is destroyed.
Instead, it was emitted on _each_ call to `thread_local_system()`,
ie.., on each tokio-epoll-uring operation.

Testing
-------

Reproduced the issue locally and verified that this PR fixes the issue.
---
 .../io_engine/tokio_epoll_uring_ext.rs        | 27 ++++++++++++++-----
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
index c4b10f3a24..9b2efef5d4 100644
--- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
+++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
@@ -6,7 +6,7 @@
 //! See <https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391> for more details.
 
 use std::sync::atomic::AtomicU32;
-use std::sync::Arc;
+use std::sync::{Arc, Weak};
 
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -24,23 +24,36 @@ struct ThreadLocalState(Arc<ThreadLocalStateInner>);
 struct ThreadLocalStateInner {
     cell: tokio::sync::OnceCell<SystemHandle>,
     launch_attempts: AtomicU32,
+    weak_self: Weak<ThreadLocalStateInner>,
 }
 
 impl ThreadLocalState {
     pub fn new() -> Self {
-        Self(Arc::new(ThreadLocalStateInner {
+        Self(Arc::new_cyclic(|weak| ThreadLocalStateInner {
             cell: tokio::sync::OnceCell::default(),
             launch_attempts: AtomicU32::new(0),
+            weak_self: Weak::clone(weak),
         }))
     }
-    pub fn make_id_string(&self) -> String {
-        format!("0x{:p}", Arc::as_ptr(&self.0))
-    }
 }
 
-impl Drop for ThreadLocalState {
+impl ThreadLocalStateInner {
+    pub fn make_id_string(&self) -> String {
+        format!("0x{:p}", self.weak_self.as_ptr())
+    }
+}
+
+impl Drop for ThreadLocalStateInner {
     fn drop(&mut self) {
-        info!(parent: None, id=%self.make_id_string(), "tokio-epoll-uring_ext: ThreadLocalState is being dropped and id might be re-used in the future");
+        info!(parent: None, id=%self.make_id_string(), "tokio_epoll_uring_ext: thread-local state is being dropped and id might be re-used in the future");
+    }
+}
+
+impl std::ops::Deref for ThreadLocalState {
+    type Target = ThreadLocalStateInner;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
     }
 }
 

From 877fd144012d43b078772f90610b77eb80723c89 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 18 Mar 2024 16:27:53 +0200
Subject: [PATCH 33/43] fix: spanless log message (#7155)

with `immediate_gc` the span only covered the `gc_iteration`, make it
cover the whole needless spawned task, which also does waiting for layer
drops and stray logging in tests.

also clarify some comments while we are here.

Fixes: #6910
---
 pageserver/src/http/routes.rs |  3 +--
 pageserver/src/tenant/mgr.rs  | 12 +++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 97ffb99465..229f3ae98f 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1653,8 +1653,7 @@ async fn timeline_gc_handler(
     let gc_req: TimelineGcRequest = json_request(&mut request).await?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done =
-        mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
     let gc_result = wait_task_done
         .await
         .context("wait for gc task")
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index facaaa2ad7..f456ca3006 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2730,7 +2730,7 @@ use {
     utils::http::error::ApiError,
 };
 
-pub(crate) async fn immediate_gc(
+pub(crate) fn immediate_gc(
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     gc_req: TimelineGcRequest,
@@ -2752,6 +2752,8 @@ pub(crate) async fn immediate_gc(
     // Run in task_mgr to avoid race with tenant_detach operation
     let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
     let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
+
     // TODO: spawning is redundant now, need to hold the gate
     task_mgr::spawn(
         &tokio::runtime::Handle::current(),
@@ -2766,16 +2768,15 @@ pub(crate) async fn immediate_gc(
             #[allow(unused_mut)]
             let mut result = tenant
                 .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-                .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
                 .await;
                 // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
                 // better once the types support it.
 
             #[cfg(feature = "testing")]
             {
+                // we need to synchronize with drop completion for python tests without polling for
+                // log messages
                 if let Ok(result) = result.as_mut() {
-                    // why not futures unordered? it seems it needs very much the same task structure
-                    // but would only run on single task.
                     let mut js = tokio::task::JoinSet::new();
                     for layer in std::mem::take(&mut result.doomed_layers) {
                         js.spawn(layer.wait_drop());
@@ -2791,7 +2792,7 @@ pub(crate) async fn immediate_gc(
 
                 if let Some(rtc) = rtc {
                     // layer drops schedule actions on remote timeline client to actually do the
-                    // deletions; don't care just exit fast about the shutdown error
+                    // deletions; don't care about the shutdown error, just exit fast
                     drop(rtc.wait_completion().await);
                 }
             }
@@ -2802,6 +2803,7 @@ pub(crate) async fn immediate_gc(
             }
             Ok(())
         }
+        .instrument(span)
     );
 
     // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task

From 2bc2fd9cfd722b354a905df737cb92643aabfd13 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 18 Mar 2024 16:12:01 +0100
Subject: [PATCH 34/43] fixup(#7160 / tokio_epoll_uring_ext): double-panic
 caused by info! in thread-local's drop() (#7164)

Manual testing of the changes in #7160 revealed that, if the
thread-local destructor ever runs (it apparently doesn't in our test
suite runs, otherwise #7160 would not have auto-merged), we can
encounter an `abort()` due to a double-panic in the tracing code.

This github comment here contains the stack trace:
https://github.com/neondatabase/neon/pull/7160#issuecomment-2003778176

This PR reverts #7160 and uses a atomic counter to identify the
thread-local in log messages, instead of the memory address of the
thread local, which may be re-used.
---
 .../io_engine/tokio_epoll_uring_ext.rs        | 29 +++++--------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
index 9b2efef5d4..6ea19d6b2d 100644
--- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
+++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
@@ -5,8 +5,8 @@
 //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series.
 //! See <https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391> for more details.
 
-use std::sync::atomic::AtomicU32;
-use std::sync::{Arc, Weak};
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::sync::Arc;
 
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -24,38 +24,25 @@ struct ThreadLocalState(Arc<ThreadLocalStateInner>);
 struct ThreadLocalStateInner {
     cell: tokio::sync::OnceCell<SystemHandle>,
     launch_attempts: AtomicU32,
-    weak_self: Weak<ThreadLocalStateInner>,
+    /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
+    thread_local_state_id: u64,
 }
 
 impl ThreadLocalState {
     pub fn new() -> Self {
-        Self(Arc::new_cyclic(|weak| ThreadLocalStateInner {
+        Self(Arc::new(ThreadLocalStateInner {
             cell: tokio::sync::OnceCell::default(),
             launch_attempts: AtomicU32::new(0),
-            weak_self: Weak::clone(weak),
+            thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed),
         }))
     }
-}
 
-impl ThreadLocalStateInner {
     pub fn make_id_string(&self) -> String {
-        format!("0x{:p}", self.weak_self.as_ptr())
+        format!("{}", self.0.thread_local_state_id)
     }
 }
 
-impl Drop for ThreadLocalStateInner {
-    fn drop(&mut self) {
-        info!(parent: None, id=%self.make_id_string(), "tokio_epoll_uring_ext: thread-local state is being dropped and id might be re-used in the future");
-    }
-}
-
-impl std::ops::Deref for ThreadLocalState {
-    type Target = ThreadLocalStateInner;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
+static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0);
 
 thread_local! {
     static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new();

From ad5efb49ee9783f7c30ba38e60c40f6ab0761b89 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Mon, 18 Mar 2024 22:54:44 +0100
Subject: [PATCH 35/43] Support backpressure for sharding (#7100)

Add shard_number to PageserverFeedback and parse it on the compute side.
When compute receives a new ps_feedback, it calculates min LSNs among
feedbacks from all shards, and uses those LSNs for backpressure.

Add `test_sharding_backpressure` to verify that backpressure slows down
compute to wait for the slowest shard.
---
 libs/utils/src/pageserver_feedback.rs         |  38 +++--
 libs/walproposer/src/api_bindings.rs          |   4 +-
 libs/walproposer/src/walproposer.rs           |   2 +-
 .../walreceiver/walreceiver_connection.rs     |   1 +
 pageserver/src/walingest.rs                   |   2 +
 pgxn/neon/walproposer.c                       | 120 +++++++--------
 pgxn/neon/walproposer.h                       |  20 ++-
 pgxn/neon/walproposer_pg.c                    | 139 ++++++++++--------
 .../tests/walproposer_sim/walproposer_api.rs  |  26 +++-
 test_runner/fixtures/workload.py              |   5 +-
 test_runner/regress/test_sharding.py          | 128 ++++++++++++++++
 11 files changed, 336 insertions(+), 149 deletions(-)

diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs
index bc8fa7362e..3ddfa44f41 100644
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -29,12 +29,10 @@ pub struct PageserverFeedback {
     // Serialize with RFC3339 format.
     #[serde(with = "serde_systemtime")]
     pub replytime: SystemTime,
+    /// Used to track feedbacks from different shards. Always zero for unsharded tenants.
+    pub shard_number: u32,
 }
 
-// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
-// Do not remove previously available fields because this might be backwards incompatible.
-pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
-
 impl PageserverFeedback {
     pub fn empty() -> PageserverFeedback {
         PageserverFeedback {
@@ -43,6 +41,7 @@ impl PageserverFeedback {
             remote_consistent_lsn: Lsn::INVALID,
             disk_consistent_lsn: Lsn::INVALID,
             replytime: *PG_EPOCH,
+            shard_number: 0,
         }
     }
 
@@ -59,17 +58,26 @@ impl PageserverFeedback {
     //
     // TODO: change serialized fields names once all computes migrate to rename.
     pub fn serialize(&self, buf: &mut BytesMut) {
-        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
+        let buf_ptr = buf.len();
+        buf.put_u8(0); // # of keys, will be filled later
+        let mut nkeys = 0;
+
+        nkeys += 1;
         buf.put_slice(b"current_timeline_size\0");
         buf.put_i32(8);
         buf.put_u64(self.current_timeline_size);
 
+        nkeys += 1;
         buf.put_slice(b"ps_writelsn\0");
         buf.put_i32(8);
         buf.put_u64(self.last_received_lsn.0);
+
+        nkeys += 1;
         buf.put_slice(b"ps_flushlsn\0");
         buf.put_i32(8);
         buf.put_u64(self.disk_consistent_lsn.0);
+
+        nkeys += 1;
         buf.put_slice(b"ps_applylsn\0");
         buf.put_i32(8);
         buf.put_u64(self.remote_consistent_lsn.0);
@@ -80,9 +88,19 @@ impl PageserverFeedback {
             .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
             .as_micros() as i64;
 
+        nkeys += 1;
         buf.put_slice(b"ps_replytime\0");
         buf.put_i32(8);
         buf.put_i64(timestamp);
+
+        if self.shard_number > 0 {
+            nkeys += 1;
+            buf.put_slice(b"shard_number\0");
+            buf.put_i32(4);
+            buf.put_u32(self.shard_number);
+        }
+
+        buf[buf_ptr] = nkeys;
     }
 
     // Deserialize PageserverFeedback message
@@ -125,9 +143,8 @@ impl PageserverFeedback {
                 }
                 b"shard_number" => {
                     let len = buf.get_i32();
-                    // TODO: this will be implemented in the next update,
-                    //  for now, we just skip the value.
-                    buf.advance(len as usize);
+                    assert_eq!(len, 4);
+                    rf.shard_number = buf.get_u32();
                 }
                 _ => {
                     let len = buf.get_i32();
@@ -200,10 +217,7 @@ mod tests {
         rf.serialize(&mut data);
 
         // Add an extra field to the buffer and adjust number of keys
-        if let Some(first) = data.first_mut() {
-            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
-        }
-
+        data[0] += 1;
         data.put_slice(b"new_field_one\0");
         data.put_i32(8);
         data.put_u64(42);
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index f5ed6ebb97..906302e46e 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
     }
 }
 
-extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
     unsafe {
         let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).process_safekeeper_feedback(&mut (*wp))
+        (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
     }
 }
 
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 734967da3f..14cc3e05a2 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -142,7 +142,7 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
+    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
         todo!()
     }
 
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 8297ca6563..d9f780cfd1 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -448,6 +448,7 @@ pub(super) async fn handle_walreceiver_connection(
                 disk_consistent_lsn,
                 remote_consistent_lsn,
                 replytime: ts,
+                shard_number: timeline.tenant_shard_id.shard_number.0 as u32,
             };
 
             debug!("neon_status_update {status_update:?}");
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 63a2b30d09..9c7e8748d5 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -109,6 +109,8 @@ impl WalIngest {
             self.checkpoint_modified = true;
         }
 
+        failpoint_support::sleep_millis_async!("wal-ingest-record-sleep");
+
         match decoded.xl_rmid {
             pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                 // Heap AM records need some special handling, because they modify VM pages
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 9ff0493352..d7987954d4 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk);
 static bool RecvAppendResponses(Safekeeper *sk);
 static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
 static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
-static void HandleSafekeeperResponse(WalProposer *wp);
+static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
 static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
 static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
 static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state);
@@ -1405,7 +1405,6 @@ static bool
 RecvAppendResponses(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
-	XLogRecPtr	newCommitLsn;
 	bool		readAnything = false;
 
 	while (true)
@@ -1425,6 +1424,8 @@ RecvAppendResponses(Safekeeper *sk)
 			   LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
 			   sk->host, sk->port);
 
+		readAnything = true;
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1438,35 +1439,28 @@ RecvAppendResponses(Safekeeper *sk)
 				   sk->appendResponse.term, wp->propTerm);
 		}
 
-		readAnything = true;
+		HandleSafekeeperResponse(wp, sk);
 	}
 
 	if (!readAnything)
 		return sk->state == SS_ACTIVE;
 
-	/* update commit_lsn */
-	newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
-	/*
-	 * Send the new value to all safekeepers.
-	 */
-	if (newCommitLsn > wp->commitLsn)
-	{
-		wp->commitLsn = newCommitLsn;
-		BroadcastAppendRequest(wp);
-	}
-
-	HandleSafekeeperResponse(wp);
-
 	return sk->state == SS_ACTIVE;
 }
 
+#define psfeedback_log(fmt, key, ...) \
+	wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__)
+
 /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
 static void
-ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf)
+ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback)
 {
 	uint8		nkeys;
 	int			i;
-	int32		len;
+
+	/* initialize the struct before parsing */
+	memset(ps_feedback, 0, sizeof(PageserverFeedback));
+	ps_feedback->present = true;
 
 	/* get number of custom keys */
 	nkeys = pq_getmsgbyte(reply_message);
@@ -1474,66 +1468,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 	for (i = 0; i < nkeys; i++)
 	{
 		const char *key = pq_getmsgstring(reply_message);
+		unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32));
 
 		if (strcmp(key, "current_timeline_size") == 0)
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->currentClusterSize = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-				   rf->currentClusterSize);
+			Assert(value_len == sizeof(int64));
+			ps_feedback->currentClusterSize = pq_getmsgint64(reply_message);
+			psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->last_received_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->last_received_lsn));
+			Assert(value_len == sizeof(int64));
+			ps_feedback->last_received_lsn = pq_getmsgint64(reply_message);
+			psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+			Assert(value_len == sizeof(int64));
+			ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message);
+			psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+			Assert(value_len == sizeof(int64));
+			ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message);
+			psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->replytime = pq_getmsgint64(reply_message);
-			{
-				char	   *replyTimeStr;
-
-				/* Copy because timestamptz_to_str returns a static buffer */
-				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
-				wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-					   rf->replytime, replyTimeStr);
-
-				pfree(replyTimeStr);
-			}
+			Assert(value_len == sizeof(int64));
+			ps_feedback->replytime = pq_getmsgint64(reply_message);
+			psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime));
+		}
+		else if (strcmp(key, "shard_number") == 0)
+		{
+			Assert(value_len == sizeof(uint32));
+			ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32));
+			psfeedback_log("%u", key, ps_feedback->shard_number);
 		}
 		else
 		{
-			len = pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-
 			/*
 			 * Skip unknown keys to support backward compatibile protocol
 			 * changes
 			 */
-			wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
-			pq_getmsgbytes(reply_message, len);
+			wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len);
+			pq_getmsgbytes(reply_message, value_len);
 		};
 	}
 }
@@ -1630,12 +1610,30 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
 	return donor;
 }
 
+/*
+ * Process AppendResponse message from safekeeper.
+ */
 static void
-HandleSafekeeperResponse(WalProposer *wp)
+HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
 {
 	XLogRecPtr	candidateTruncateLsn;
+	XLogRecPtr  newCommitLsn;
 
-	wp->api.process_safekeeper_feedback(wp);
+	newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
+	if (newCommitLsn > wp->commitLsn)
+	{
+		wp->commitLsn = newCommitLsn;
+		/* Send new value to all safekeepers. */
+		BroadcastAppendRequest(wp);
+	}
+
+	/* 
+	 * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown().
+	 * The last one will terminate the process if the shutdown is requested
+	 * and WAL is committed by the quorum. BroadcastAppendRequest() should be
+	 * called to notify safekeepers about the new commitLsn.
+	 */
+	wp->api.process_safekeeper_feedback(wp, sk);
 
 	/*
 	 * Try to advance truncateLsn -- the last record flushed to all
@@ -1811,8 +1809,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 				msg->hs.ts = pq_getmsgint64_le(&s);
 				msg->hs.xmin.value = pq_getmsgint64_le(&s);
 				msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
-				if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
-					ParsePageserverFeedbackMessage(wp, &s, &msg->rf);
+				if (s.len > s.cursor)
+					ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
+				else
+					msg->ps_feedback.present = false;
 				pq_getmsgend(&s);
 				return true;
 			}
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index bc674fd979..28585eb4e7 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -10,6 +10,7 @@
 
 #include "libpqwalproposer.h"
 #include "neon_walreader.h"
+#include "pagestore_client.h"
 
 #define SK_MAGIC 0xCafeCeefu
 #define SK_PROTOCOL_VERSION 2
@@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback
 
 typedef struct PageserverFeedback
 {
+	/* true if AppendResponse contains this feedback */
+	bool		present;
 	/* current size of the timeline on pageserver */
 	uint64		currentClusterSize;
 	/* standby_status_update fields that safekeeper received from pageserver */
@@ -276,14 +279,21 @@ typedef struct PageserverFeedback
 	XLogRecPtr	disk_consistent_lsn;
 	XLogRecPtr	remote_consistent_lsn;
 	TimestampTz replytime;
+	uint32		shard_number;
 } PageserverFeedback;
 
 typedef struct WalproposerShmemState
 {
 	slock_t		mutex;
-	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
+
+	/* last feedback from each shard */
+	PageserverFeedback shard_ps_feedback[MAX_SHARDS];
+	int num_shards;
+
+	/* aggregated feedback with min LSNs across shards */
+	PageserverFeedback min_ps_feedback;
 } WalproposerShmemState;
 
 /*
@@ -307,12 +317,12 @@ typedef struct AppendResponse
 	/* Feedback received from pageserver includes standby_status_update fields */
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
-	PageserverFeedback rf;
+	PageserverFeedback ps_feedback;
 } AppendResponse;
 
 /*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
-#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
+#define APPENDRESPONSE_FIXEDPART_SIZE 56
 
 struct WalProposer;
 typedef struct WalProposer WalProposer;
@@ -560,11 +570,11 @@ typedef struct walproposer_api
 	void		(*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
 
 	/*
-	 * Called after every new message from the safekeeper. Used to propagate
+	 * Called after every AppendResponse from the safekeeper. Used to propagate
 	 * backpressure feedback and to confirm WAL persistence (has been commited
 	 * on the quorum of safekeepers).
 	 */
-	void		(*process_safekeeper_feedback) (WalProposer *wp);
+	void		(*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk);
 
 	/*
 	 * Write a log message to the internal log processor. This is used only
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 8eec2f02c1..c46fd9b3ec 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -63,7 +63,6 @@ char	   *wal_acceptors_list = "";
 int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;
 
-static AppendResponse quorumFeedback;
 static WalproposerShmemState *walprop_shared;
 static WalProposerConfig walprop_config;
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;
@@ -71,6 +70,10 @@ static const walproposer_api walprop_pg;
 static volatile sig_atomic_t got_SIGUSR2 = false;
 static bool reported_sigusr2 = false;
 
+static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr;
+static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr;
+static HotStandbyFeedback agg_hs_feedback;
+
 static void nwp_shmem_startup_hook(void);
 static void nwp_register_gucs(void);
 static void nwp_prepare_shmem(void);
@@ -402,21 +405,58 @@ walprop_pg_get_shmem_state(WalProposer *wp)
 	return walprop_shared;
 }
 
-static void
-replication_feedback_set(PageserverFeedback *rf)
+/*
+ * Record new ps_feedback in the array with shards and update min_feedback.
+ */
+static PageserverFeedback
+record_pageserver_feedback(PageserverFeedback *ps_feedback)
 {
+	PageserverFeedback min_feedback;
+
+	Assert(ps_feedback->present);
+	Assert(ps_feedback->shard_number < MAX_SHARDS);
+
 	SpinLockAcquire(&walprop_shared->mutex);
-	memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
+
+	/* Update the number of shards */
+	if (ps_feedback->shard_number + 1 > walprop_shared->num_shards)
+		walprop_shared->num_shards = ps_feedback->shard_number + 1;
+
+	/* Update the feedback */
+	memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback));
+
+	/* Calculate min LSNs */
+	memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback));
+	for (int i = 0; i < walprop_shared->num_shards; i++)
+	{
+		PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i];
+		if (feedback->present)
+		{
+			if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn)
+				min_feedback.last_received_lsn = feedback->last_received_lsn;
+			
+			if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn)
+				min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn;
+			
+			if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn)
+				min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn;
+		}
+	}
+	/* Copy min_feedback back to shmem */
+	memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback));
+
 	SpinLockRelease(&walprop_shared->mutex);
+
+	return min_feedback;
 }
 
 void
 replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
-	*writeLsn = walprop_shared->feedback.last_received_lsn;
-	*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
-	*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
+	*writeLsn = walprop_shared->min_ps_feedback.last_received_lsn;
+	*flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn;
+	*applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn;
 	SpinLockRelease(&walprop_shared->mutex);
 }
 
@@ -1869,39 +1909,6 @@ CheckGracefulShutdown(WalProposer *wp)
 	}
 }
 
-/*
- * Choose most advanced PageserverFeedback and set it to *rf.
- */
-static void
-GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
-{
-	int			latest_safekeeper = 0;
-	XLogRecPtr	last_received_lsn = InvalidXLogRecPtr;
-
-	for (int i = 0; i < wp->n_safekeepers; i++)
-	{
-		if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
-		{
-			latest_safekeeper = i;
-			last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn;
-		}
-	}
-
-	rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
-	rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
-	rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
-	rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
-	rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
-
-	wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
-			" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
-			rf->currentClusterSize,
-			LSN_FORMAT_ARGS(rf->last_received_lsn),
-			LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
-			LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
-			rf->replytime);
-}
-
 /*
  * Combine hot standby feedbacks from all safekeepers.
  */
@@ -1949,26 +1956,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
  * None of that is functional in sync-safekeepers.
  */
 static void
-walprop_pg_process_safekeeper_feedback(WalProposer *wp)
+walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 {
-	HotStandbyFeedback hsFeedback;
-	XLogRecPtr	oldDiskConsistentLsn;
+	HotStandbyFeedback	hsFeedback;
+	bool				needToAdvanceSlot = false;
 
 	if (wp->config->syncSafekeepers)
 		return;
 
-	oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
-
-	/* Get PageserverFeedback fields from the most advanced safekeeper */
-	GetLatestNeonFeedback(&quorumFeedback.rf, wp);
-	replication_feedback_set(&quorumFeedback.rf);
-	SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
-
-	if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
+	/* handle fresh ps_feedback */
+	if (sk->appendResponse.ps_feedback.present)
 	{
-		if (wp->commitLsn > quorumFeedback.flushLsn)
-			quorumFeedback.flushLsn = wp->commitLsn;
+		PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback);
 
+		/* Only one main shard sends non-zero currentClusterSize */
+		if (sk->appendResponse.ps_feedback.currentClusterSize > 0)
+			SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
+
+		if (min_feedback.disk_consistent_lsn != standby_apply_lsn)
+		{
+			standby_apply_lsn = min_feedback.disk_consistent_lsn;
+			needToAdvanceSlot = true;
+		}
+	}
+
+	if (wp->commitLsn > standby_flush_lsn)
+	{
+		standby_flush_lsn = wp->commitLsn;
+		needToAdvanceSlot = true;
+	}
+
+	if (needToAdvanceSlot)
+	{
 		/*
 		 * Advance the replication slot to commitLsn. WAL before it is
 		 * hardened and will be fetched from one of safekeepers by
@@ -1977,23 +1996,23 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp)
 		 * Also wakes up syncrep waiters.
 		 */
 		ProcessStandbyReply(
-		/* write_lsn -  This is what durably stored in WAL service. */
-							quorumFeedback.flushLsn,
-		/* flush_lsn - This is what durably stored in WAL service. */
-							quorumFeedback.flushLsn,
+		/* write_lsn -  This is what durably stored in safekeepers quorum. */
+							standby_flush_lsn,
+		/* flush_lsn - This is what durably stored in safekeepers quorum. */
+							standby_flush_lsn,
 
 		/*
 		 * apply_lsn - This is what processed and durably saved at*
 		 * pageserver.
 		 */
-							quorumFeedback.rf.disk_consistent_lsn,
+							standby_apply_lsn,
 							walprop_pg_get_current_timestamp(wp), false);
 	}
 
 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
-	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
+	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
-		quorumFeedback.hs = hsFeedback;
+		agg_hs_feedback = hsFeedback;
 		ProcessStandbyHSFeedback(hsFeedback.ts,
 								 XidFromFullTransactionId(hsFeedback.xmin),
 								 EpochFromFullTransactionId(hsFeedback.xmin),
diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs
index 5c79e9082b..42340ba1df 100644
--- a/safekeeper/tests/walproposer_sim/walproposer_api.rs
+++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs
@@ -224,6 +224,16 @@ impl SimulationApi {
             })
             .collect::<Vec<_>>();
 
+        let empty_feedback = PageserverFeedback {
+            present: false,
+            currentClusterSize: 0,
+            last_received_lsn: 0,
+            disk_consistent_lsn: 0,
+            remote_consistent_lsn: 0,
+            replytime: 0,
+            shard_number: 0,
+        };
+
         Self {
             os: args.os,
             safekeepers: RefCell::new(sk_conns),
@@ -232,15 +242,11 @@ impl SimulationApi {
             last_logged_commit_lsn: 0,
             shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
                 mutex: 0,
-                feedback: PageserverFeedback {
-                    currentClusterSize: 0,
-                    last_received_lsn: 0,
-                    disk_consistent_lsn: 0,
-                    remote_consistent_lsn: 0,
-                    replytime: 0,
-                },
                 mineLastElectedTerm: 0,
                 backpressureThrottlingTime: pg_atomic_uint64 { value: 0 },
+                shard_ps_feedback: [empty_feedback; 128],
+                num_shards: 0,
+                min_ps_feedback: empty_feedback,
             }),
             config: args.config,
             event_set: RefCell::new(None),
@@ -598,7 +604,11 @@ impl ApiImpl for SimulationApi {
         }
     }
 
-    fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) {
+    fn process_safekeeper_feedback(
+        &mut self,
+        wp: &mut walproposer::bindings::WalProposer,
+        _sk: &mut walproposer::bindings::Safekeeper,
+    ) {
         debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn);
         if wp.commitLsn > self.last_logged_commit_lsn {
             self.os.log_event(format!("commit_lsn;{}", wp.commitLsn));
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index e852281fcf..ab8717de54 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -1,5 +1,5 @@
 import threading
-from typing import Optional
+from typing import Any, Optional
 
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -32,6 +32,7 @@ class Workload:
         tenant_id: TenantId,
         timeline_id: TimelineId,
         branch_name: Optional[str] = None,
+        endpoint_opts: Optional[dict[str, Any]] = None,
     ):
         self.env = env
         self.tenant_id = tenant_id
@@ -45,6 +46,7 @@ class Workload:
         self.churn_cursor = 0
 
         self._endpoint: Optional[Endpoint] = None
+        self._endpoint_opts = endpoint_opts or {}
 
     def reconfigure(self):
         """
@@ -66,6 +68,7 @@ class Workload:
                     tenant_id=self.tenant_id,
                     pageserver_id=pageserver_id,
                     endpoint_id=endpoint_id,
+                    **self._endpoint_opts,
                 )
                 self._endpoint.start(pageserver_id=pageserver_id)
             else:
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index e8511e428e..9e62933f7e 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,4 +1,5 @@
 import os
+import time
 from typing import Dict, List, Optional, Union
 
 import pytest
@@ -837,3 +838,130 @@ def test_sharding_split_failures(
         assert_split_done()
 
     env.storage_controller.consistency_check()
+
+
+def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
+    """
+    Check a scenario when one of the shards is much slower than others.
+    Without backpressure, this would lead to the slow shard falling behind
+    and eventually causing WAL timeouts.
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+
+    # 256KiB stripes: enable getting some meaningful data distribution without
+    # writing large quantities of data in this test.  The stripe size is given
+    # in number of 8KiB pages.
+    stripe_size = 32
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+    shards = env.storage_controller.locate(tenant_id)
+
+    # Slow down one of the shards, around ~1MB/s
+    pageservers[4].http_client().configure_failpoints(("wal-ingest-record-sleep", "5%sleep(1)"))
+
+    def shards_info():
+        infos = []
+        for shard in shards:
+            node_id = int(shard["node_id"])
+            pageserver = pageservers[node_id]
+            shard_info = pageserver.http_client().timeline_detail(shard["shard_id"], timeline_id)
+            infos.append(shard_info)
+            last_record_lsn = shard_info["last_record_lsn"]
+            current_physical_size = shard_info["current_physical_size"]
+            log.info(
+                f"Shard on pageserver {node_id}: lsn={last_record_lsn}, size={current_physical_size}"
+            )
+        return infos
+
+    shards_info()
+
+    workload = Workload(
+        env,
+        tenant_id,
+        timeline_id,
+        branch_name="main",
+        endpoint_opts={
+            "config_lines": [
+                # Tip: set to 100MB to make the test fail
+                "max_replication_write_lag=1MB",
+            ],
+        },
+    )
+    workload.init()
+
+    endpoint = workload.endpoint()
+
+    # on 2024-03-05, the default config on prod was [15MB, 10GB, null]
+    res = endpoint.safe_psql_many(
+        [
+            "SHOW max_replication_write_lag",
+            "SHOW max_replication_flush_lag",
+            "SHOW max_replication_apply_lag",
+        ]
+    )
+    log.info(f"backpressure config: {res}")
+
+    last_flush_lsn = None
+    last_timestamp = None
+
+    def update_write_lsn():
+        nonlocal last_flush_lsn
+        nonlocal last_timestamp
+
+        res = endpoint.safe_psql(
+            """
+            SELECT
+                pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag,
+                received_lsn,
+                pg_current_wal_flush_lsn() as flush_lsn,
+                neon.backpressure_throttling_time() as throttling_time
+            FROM neon.backpressure_lsns();
+            """,
+            dbname="postgres",
+        )[0]
+        log.info(
+            f"received_lsn_lag = {res[0]}, received_lsn = {res[1]}, flush_lsn = {res[2]}, throttling_time = {res[3]}"
+        )
+
+        lsn = Lsn(res[2])
+        now = time.time()
+
+        if last_timestamp is not None:
+            delta = now - last_timestamp
+            delta_bytes = lsn - last_flush_lsn
+            avg_speed = delta_bytes / delta / 1024 / 1024
+            log.info(
+                f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s"
+            )
+
+        last_flush_lsn = lsn
+        last_timestamp = now
+
+    update_write_lsn()
+
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.validate()
+
+    update_write_lsn()
+    shards_info()
+
+    for _write_iter in range(30):
+        # approximately 1MB of data
+        workload.write_rows(8000, upload=False)
+        update_write_lsn()
+        infos = shards_info()
+        min_lsn = min(Lsn(info["last_record_lsn"]) for info in infos)
+        max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos)
+        diff = max_lsn - min_lsn
+        assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure"

From 49be446d95482c31febacf6b87b11aa47afb2bc2 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 18 Mar 2024 22:57:32 +0000
Subject: [PATCH 36/43] async password validation (#7171)

## Problem

password hashing can block main thread

## Summary of changes

spawn_blocking the password hash call
---
 proxy/src/auth/backend.rs       |  2 +-
 proxy/src/auth/flow.rs          |  7 ++++---
 proxy/src/scram.rs              | 15 ++++++++-------
 proxy/src/scram/exchange.rs     | 11 +++++++++--
 proxy/src/serverless/backend.rs |  2 +-
 5 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 11af85caa4..bc307230dd 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -254,7 +254,7 @@ async fn authenticate_with_secret(
     config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
     if let Some(password) = unauthenticated_password {
-        let auth_outcome = validate_password_and_exchange(&password, secret)?;
+        let auth_outcome = validate_password_and_exchange(&password, secret).await?;
         let keys = match auth_outcome {
             crate::sasl::Outcome::Success(key) => key,
             crate::sasl::Outcome::Failure(reason) => {
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 788381b6c0..f26dcb7c9a 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -126,7 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
             .strip_suffix(&[0])
             .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
 
-        let outcome = validate_password_and_exchange(password, self.state.0)?;
+        let outcome = validate_password_and_exchange(password, self.state.0).await?;
 
         if let sasl::Outcome::Success(_) = &outcome {
             self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -180,7 +180,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     }
 }
 
-pub(crate) fn validate_password_and_exchange(
+pub(crate) async fn validate_password_and_exchange(
     password: &[u8],
     secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -200,7 +200,8 @@ pub(crate) fn validate_password_and_exchange(
                 &scram_secret,
                 sasl_client,
                 crate::config::TlsServerEndPoint::Undefined,
-            )?;
+            )
+            .await?;
 
             let client_key = match outcome {
                 sasl::Outcome::Success(client_key) => client_key,
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index a95e734d06..df4b3ec8d7 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -113,7 +113,7 @@ mod tests {
         );
     }
 
-    fn run_round_trip_test(server_password: &str, client_password: &str) {
+    async fn run_round_trip_test(server_password: &str, client_password: &str) {
         let scram_secret = ServerSecret::build(server_password).unwrap();
         let sasl_client =
             ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
@@ -123,6 +123,7 @@ mod tests {
             sasl_client,
             crate::config::TlsServerEndPoint::Undefined,
         )
+        .await
         .unwrap();
 
         match outcome {
@@ -131,14 +132,14 @@ mod tests {
         }
     }
 
-    #[test]
-    fn round_trip() {
-        run_round_trip_test("pencil", "pencil")
+    #[tokio::test]
+    async fn round_trip() {
+        run_round_trip_test("pencil", "pencil").await
     }
 
-    #[test]
+    #[tokio::test]
     #[should_panic(expected = "password doesn't match")]
-    fn failure() {
-        run_round_trip_test("pencil", "eraser")
+    async fn failure() {
+        run_round_trip_test("pencil", "eraser").await
     }
 }
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 9af7db5201..16575d5d98 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -71,7 +71,7 @@ impl<'a> Exchange<'a> {
     }
 }
 
-pub fn exchange(
+pub async fn exchange(
     secret: &ServerSecret,
     mut client: ScramSha256,
     tls_server_end_point: config::TlsServerEndPoint,
@@ -86,7 +86,14 @@ pub fn exchange(
         .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
     let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
         Continue(sent, server_first) => {
-            client.update(server_first.as_bytes())?;
+            // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread.
+            // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl
+            client = tokio::task::spawn_blocking(move || {
+                client.update(server_first.as_bytes())?;
+                Ok::<ScramSha256, std::io::Error>(client)
+            })
+            .await
+            .expect("should not panic while performing password hash")?;
             sent
         }
         Success(x, _) => match x {},
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 29ef641265..72b55c45f0 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -50,7 +50,7 @@ impl PoolingBackend {
             }
         };
         let auth_outcome =
-            crate::auth::validate_password_and_exchange(&conn_info.password, secret)?;
+            crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
         let res = match auth_outcome {
             crate::sasl::Outcome::Success(key) => Ok(key),
             crate::sasl::Outcome::Failure(reason) => {

From b80704cd34baae1746a98c43db8dcb672e08dcf5 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 19 Mar 2024 10:30:33 +0000
Subject: [PATCH 37/43] tests: log hygiene checks for storage controller
 (#6710)

## Problem

As with the pageserver, we should fail tests that emit unexpected log
errors/warnings.

## Summary of changes

- Refactor existing log checks to be reusable
- Run log checks for attachment_service
- Add allow lists as needed.
---
 .../attachment_service/src/service.rs         | 25 ++++++++++++-
 test_runner/fixtures/neon_fixtures.py         | 26 ++++++-------
 .../fixtures/pageserver/allowed_errors.py     | 10 +++++
 test_runner/fixtures/utils.py                 | 37 +++++++++++++++++++
 test_runner/regress/test_branch_and_gc.py     | 12 +++---
 test_runner/regress/test_branch_behind.py     |  9 +++--
 test_runner/regress/test_compatibility.py     |  4 ++
 test_runner/regress/test_sharding.py          | 14 +++++++
 test_runner/regress/test_sharding_service.py  | 10 +++++
 test_runner/regress/test_tenants.py           |  4 +-
 10 files changed, 126 insertions(+), 25 deletions(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 29f87021b2..addfd9c232 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -7,7 +7,9 @@ use std::{
     time::{Duration, Instant},
 };
 
-use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
+use crate::{
+    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
+};
 use anyhow::Context;
 use control_plane::storage_controller::{
     AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -733,7 +735,19 @@ impl Service {
                 tenant.waiter.advance(result.sequence);
             }
             Err(e) => {
-                tracing::warn!("Reconcile error: {}", e);
+                match e {
+                    ReconcileError::Cancel => {
+                        tracing::info!("Reconciler was cancelled");
+                    }
+                    ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
+                        // This might be due to the reconciler getting cancelled, or it might
+                        // be due to the `Node` being marked offline.
+                        tracing::info!("Reconciler cancelled during pageserver API call");
+                    }
+                    _ => {
+                        tracing::warn!("Reconcile error: {}", e);
+                    }
+                }
 
                 // Ordering: populate last_error before advancing error_seq,
                 // so that waiters will see the correct error after waiting.
@@ -3631,6 +3645,13 @@ impl Service {
                         observed_loc.conf = None;
                     }
 
+                    if new_nodes.len() == 1 {
+                        // Special case for single-node cluster: there is no point trying to reschedule
+                        // any tenant shards: avoid doing so, in order to avoid spewing warnings about
+                        // failures to schedule them.
+                        continue;
+                    }
+
                     if tenant_state.intent.demote_attached(node_id) {
                         tenant_state.sequence = tenant_state.sequence.next();
                         match tenant_state.schedule(scheduler) {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 56b23cef59..3ecd343224 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -51,7 +51,7 @@ from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
-    scan_pageserver_log_for_errors,
+    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.types import IndexPartDump
@@ -77,6 +77,7 @@ from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
     allure_add_grafana_links,
     allure_attach_from_dir,
+    assert_no_errors,
     get_self_dir,
     subprocess_capture,
     wait_until,
@@ -944,6 +945,8 @@ class NeonEnvBuilder:
             for pageserver in self.env.pageservers:
                 pageserver.assert_no_errors()
 
+            self.env.storage_controller.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -1961,6 +1964,7 @@ class NeonStorageController(MetricsGetter):
         self.env = env
         self.running = False
         self.auth_enabled = auth_enabled
+        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
 
     def start(self):
         assert not self.running
@@ -1985,6 +1989,11 @@ class NeonStorageController(MetricsGetter):
                 msg = ""
             raise StorageControllerApiException(msg, res.status_code) from e
 
+    def assert_no_errors(self):
+        assert_no_errors(
+            self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
+        )
+
     def pageserver_api(self) -> PageserverHttpClient:
         """
         The storage controller implements a subset of the pageserver REST API, for mapping
@@ -2357,18 +2366,9 @@ class NeonPageserver(PgProtocol):
         return self.env.repo_dir / f"pageserver_{self.id}"
 
     def assert_no_errors(self):
-        logfile = self.workdir / "pageserver.log"
-        if not logfile.exists():
-            log.warning(f"Skipping log check: {logfile} does not exist")
-            return
-
-        with logfile.open("r") as f:
-            errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
-
-        for _lineno, error in errors:
-            log.info(f"not allowed error: {error.strip()}")
-
-        assert not errors
+        assert_no_errors(
+            self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors
+        )
 
     def assert_no_metric_errors(self):
         """
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 839d4166c7..ec0f81b380 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -89,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
 )
 
 
+DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
+    # Many tests will take pageservers offline, resulting in log warnings on the controller
+    # failing to connect to them.
+    ".*Call to node.*management API.*failed.*receive body.*",
+    ".*Call to node.*management API.*failed.*ReceiveBody.*",
+    # Many tests will start up with a node offline
+    ".*startup_reconcile: Could not scan node.*",
+]
+
+
 def _check_allowed_errors(input):
     allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
 
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 7fc3bae3af..9365d65fc9 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -11,6 +11,7 @@ from typing import (
     Any,
     Callable,
     Dict,
+    Iterable,
     List,
     Optional,
     Tuple,
@@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float:
             )
 
     return round(total_ms, 3)
+
+
+def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
+    error_or_warn = re.compile(r"\s(ERROR|WARN)")
+    errors = []
+    for lineno, line in enumerate(input, start=1):
+        if len(line) == 0:
+            continue
+
+        if error_or_warn.search(line):
+            # Is this a torn log line?  This happens when force-killing a process and restarting
+            # Example: "2023-10-25T09:38:31.752314Z  WARN deletion executo2023-10-25T09:38:31.875947Z  INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
+            if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
+                continue
+
+            # It's an ERROR or WARN. Is it in the allow-list?
+            for a in allowed_errors:
+                if re.match(a, line):
+                    break
+            else:
+                errors.append((lineno, line))
+    return errors
+
+
+def assert_no_errors(log_file, service, allowed_errors):
+    if not log_file.exists():
+        log.warning(f"Skipping {service} log check: {log_file} does not exist")
+        return
+
+    with log_file.open("r") as f:
+        errors = scan_log_for_errors(f, allowed_errors)
+
+    for _lineno, error in errors:
+        log.info(f"not allowed {service} error: {error.strip()}")
+
+    assert not errors, f"Log errors on {service}: {errors[0]}"
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index bdc944f352..ddd02238ea 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     env = neon_simple_env
     pageserver_http_client = env.pageserver.http_client()
 
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*invalid branch start lsn: less than latest GC cutoff.*",
-            ".*invalid branch start lsn: less than planned GC cutoff.*",
-        ]
-    )
+    error_regexes = [
+        ".*invalid branch start lsn: less than latest GC cutoff.*",
+        ".*invalid branch start lsn: less than planned GC cutoff.*",
+    ]
+    env.pageserver.allowed_errors.extend(error_regexes)
+    env.storage_controller.allowed_errors.extend(error_regexes)
 
     # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
     tenant, _ = env.neon_cli.create_tenant(
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index 46c74a26b8..b79cad979f 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
     env = neon_env_builder.init_start()
 
-    env.pageserver.allowed_errors.extend(
-        [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
-    )
+    error_regexes = [
+        ".*invalid branch start lsn.*",
+        ".*invalid start lsn .* for ancestor timeline.*",
+    ]
+    env.pageserver.allowed_errors.extend(error_regexes)
+    env.storage_controller.allowed_errors.extend(error_regexes)
 
     # Branch at the point where only 100 rows were inserted
     branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 5f815d3e6c..e0bb4c2062 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -238,6 +238,10 @@ def test_forward_compatibility(
             pg_distrib_dir=compatibility_postgres_distrib_dir,
         )
 
+        # TODO: remove this workaround after release-5090 is no longer the most recent release.
+        # There was a bug in that code that generates a warning in the storage controller log.
+        env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")
+
         # Use current neon_local even though we're using old binaries for
         # everything else: our test code is written for latest CLI args.
         env.neon_local_binpath = neon_local_binpath
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 9e62933f7e..3470d2e609 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -725,6 +725,20 @@ def test_sharding_split_failures(
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when then enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # We exercise failure cases where abort itself will also fail (node offline)
+            ".*abort_tenant_shard_split.*",
+            ".*Failed to abort.*",
+            # Tolerate any error lots that mention a failpoint
+            ".*failpoint.*",
+            # Node offline cases will fail to send requests
+            ".*Reconcile error: receive body: error sending request for url.*",
+        ]
+    )
+
     for ps in env.pageservers:
         # When we do node failures and abandon a shard, it will de-facto have old generation and
         # thereby be unable to publish remote consistent LSN updates
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 27ea425bb1..a6b0f76c96 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -177,6 +177,7 @@ def test_node_status_after_restart(
     assert len(nodes) == 2
 
     env.pageservers[1].stop()
+    env.storage_controller.allowed_errors.extend([".*Could not scan node"])
 
     env.storage_controller.stop()
     env.storage_controller.start()
@@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
 
+    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
+    env.storage_controller.allowed_errors.append(".*Forbidden.*")
+
     # No token
     with pytest.raises(
         StorageControllerApiException,
@@ -843,6 +847,12 @@ def test_sharding_service_heartbeats(
     env = neon_env_builder.init_configs()
     env.start()
 
+    # Default log allow list permits connection errors, but this test will use error responses on
+    # the utilization endpoint.
+    env.storage_controller.allowed_errors.append(
+        ".*Call to node.*management API.*failed.*failpoint.*"
+    )
+
     # Initially we have two online pageservers
     nodes = env.storage_controller.node_list()
     assert len(nodes) == 2
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 1e13a2f20f..f8701b65d7 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
     )
     [d for d in tenants_dir.iterdir()]
 
-    neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
+    error_regexes = [".*tenant-config-before-write.*"]
+    neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
+    neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)
 
     pageserver_http = neon_simple_env.pageserver.http_client()
     pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))

From a8384a074e658193d1c4005763153898358b9d18 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Tue, 19 Mar 2024 10:43:24 -0400
Subject: [PATCH 38/43] fixup(#7168): neon_local: use pageserver defaults for
 known but unspecified config overrides (#7166)

e2e tests cannot run on macOS unless the file engine env var is
supplied.

```
./scripts/pytest test_runner/regress/test_neon_superuser.py -s
```

will fail with tokio-epoll-uring not supported.

This is because we persist the file engine config by default. In this
pull request, we only persist when someone specifies it, so that it can
use the default platform-variant config in the page server.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 control_plane/src/local_env.rs  |  9 ++++-----
 control_plane/src/pageserver.rs | 12 ++++++++++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index c7f22cc8f8..bd3dbef453 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -127,8 +127,8 @@ pub struct PageServerConf {
     pub pg_auth_type: AuthType,
     pub http_auth_type: AuthType,
 
-    pub(crate) virtual_file_io_engine: String,
-    pub(crate) get_vectored_impl: String,
+    pub(crate) virtual_file_io_engine: Option<String>,
+    pub(crate) get_vectored_impl: Option<String>,
 }
 
 impl Default for PageServerConf {
@@ -139,9 +139,8 @@ impl Default for PageServerConf {
             listen_http_addr: String::new(),
             pg_auth_type: AuthType::Trust,
             http_auth_type: AuthType::Trust,
-            // FIXME: use the ones exposed by pageserver crate
-            virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
-            get_vectored_impl: "sequential".to_owned(),
+            virtual_file_io_engine: None,
+            get_vectored_impl: None,
         }
     }
 }
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 2603515681..c5eabc46db 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -101,8 +101,16 @@ impl PageServerNode {
 
         let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
         let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
-        let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
-        let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
+        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
+            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
+        } else {
+            String::new()
+        };
+        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
+            format!("get_vectored_impl='{get_vectored_impl}'")
+        } else {
+            String::new()
+        };
 
         let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
 

From 64c6dfd3e44c4550604e6b97678140afa93f4409 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 14 Mar 2024 14:35:34 -0500
Subject: [PATCH 39/43] Move functions for creating/extracting tarballs into
 utils

Useful for other code paths which will handle zstd compression and
decompression.
---
 Cargo.lock                       |  3 ++
 libs/utils/Cargo.toml            |  3 ++
 libs/utils/src/lib.rs            |  2 +
 libs/utils/src/zstd.rs           | 78 ++++++++++++++++++++++++++++++++
 pageserver/src/import_datadir.rs | 72 +----------------------------
 pageserver/src/tenant.rs         | 13 ++++--
 6 files changed, 97 insertions(+), 74 deletions(-)
 create mode 100644 libs/utils/src/zstd.rs

diff --git a/Cargo.lock b/Cargo.lock
index c4f925e3c7..70f427f97d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6468,6 +6468,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "arc-swap",
+ "async-compression",
  "async-trait",
  "bincode",
  "byteorder",
@@ -6506,12 +6507,14 @@ dependencies = [
  "thiserror",
  "tokio",
  "tokio-stream",
+ "tokio-tar",
  "tokio-util",
  "tracing",
  "tracing-error",
  "tracing-subscriber",
  "url",
  "uuid",
+ "walkdir",
  "workspace_hack",
 ]
 
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 983e94d963..c2d9d9d396 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -13,6 +13,7 @@ testing = ["fail/failpoints"]
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
+async-compression.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
@@ -36,6 +37,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-tar.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
@@ -46,6 +48,7 @@ strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
 uuid.workspace = true
+walkdir.workspace = true
 
 pq_proto.workspace = true
 postgres_connection.workspace = true
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 890061dc59..04ce0626c8 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -87,6 +87,8 @@ pub mod failpoint_support;
 
 pub mod yielding_loop;
 
+pub mod zstd;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs
new file mode 100644
index 0000000000..be2dcc00f5
--- /dev/null
+++ b/libs/utils/src/zstd.rs
@@ -0,0 +1,78 @@
+use std::io::SeekFrom;
+
+use anyhow::{Context, Result};
+use async_compression::{
+    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
+    zstd::CParameter,
+    Level,
+};
+use camino::Utf8Path;
+use nix::NixPath;
+use tokio::{
+    fs::{File, OpenOptions},
+    io::AsyncBufRead,
+    io::AsyncSeekExt,
+    io::AsyncWriteExt,
+};
+use tokio_tar::{Archive, Builder, HeaderMode};
+use walkdir::WalkDir;
+
+/// Creates a Zstandard tarball.
+pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(&tarball)
+        .await
+        .with_context(|| format!("tempfile creation {tarball}"))?;
+
+    let mut paths = Vec::new();
+    for entry in WalkDir::new(path) {
+        let entry = entry?;
+        let metadata = entry.metadata().expect("error getting dir entry metadata");
+        // Also allow directories so that we also get empty directories
+        if !(metadata.is_file() || metadata.is_dir()) {
+            continue;
+        }
+        let path = entry.into_path();
+        paths.push(path);
+    }
+    // Do a sort to get a more consistent listing
+    paths.sort_unstable();
+    let zstd = ZstdEncoder::with_quality_and_params(
+        file,
+        Level::Default,
+        &[CParameter::enable_long_distance_matching(true)],
+    );
+    let mut builder = Builder::new(zstd);
+    // Use reproducible header mode
+    builder.mode(HeaderMode::Deterministic);
+    for p in paths {
+        let rel_path = p.strip_prefix(path)?;
+        if rel_path.is_empty() {
+            // The top directory should not be compressed,
+            // the tar crate doesn't like that
+            continue;
+        }
+        builder.append_path_with_name(&p, rel_path).await?;
+    }
+    let mut zstd = builder.into_inner().await?;
+    zstd.shutdown().await?;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
+}
+
+/// Creates a Zstandard tarball.
+pub async fn extract_zst_tarball(
+    path: &Utf8Path,
+    tarball: impl AsyncBufRead + Unpin,
+) -> Result<()> {
+    let decoder = Box::pin(ZstdDecoder::new(tarball));
+    let mut archive = Archive::new(decoder);
+    archive.unpack(path).await?;
+    Ok(())
+}
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index d66df36b3a..343dec2ca1 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,28 +2,20 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
-use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
 
 use anyhow::{bail, ensure, Context, Result};
-use async_compression::tokio::bufread::ZstdDecoder;
-use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
-use nix::NixPath;
-use tokio::fs::{File, OpenOptions};
-use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
+use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
-use tokio_tar::Builder;
-use tokio_tar::HeaderMode;
 use tracing::*;
 use walkdir::WalkDir;
 
 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
-use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
@@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
     reader.read_to_end(&mut buf).await?;
     Ok(Bytes::from(buf))
 }
-
-pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
-    let file = OpenOptions::new()
-        .create(true)
-        .truncate(true)
-        .read(true)
-        .write(true)
-        .open(&tmp_path)
-        .await
-        .with_context(|| format!("tempfile creation {tmp_path}"))?;
-
-    let mut paths = Vec::new();
-    for entry in WalkDir::new(pgdata_path) {
-        let entry = entry?;
-        let metadata = entry.metadata().expect("error getting dir entry metadata");
-        // Also allow directories so that we also get empty directories
-        if !(metadata.is_file() || metadata.is_dir()) {
-            continue;
-        }
-        let path = entry.into_path();
-        paths.push(path);
-    }
-    // Do a sort to get a more consistent listing
-    paths.sort_unstable();
-    let zstd = ZstdEncoder::with_quality_and_params(
-        file,
-        Level::Default,
-        &[CParameter::enable_long_distance_matching(true)],
-    );
-    let mut builder = Builder::new(zstd);
-    // Use reproducible header mode
-    builder.mode(HeaderMode::Deterministic);
-    for path in paths {
-        let rel_path = path.strip_prefix(pgdata_path)?;
-        if rel_path.is_empty() {
-            // The top directory should not be compressed,
-            // the tar crate doesn't like that
-            continue;
-        }
-        builder.append_path_with_name(&path, rel_path).await?;
-    }
-    let mut zstd = builder.into_inner().await?;
-    zstd.shutdown().await?;
-    let mut compressed = zstd.into_inner();
-    let compressed_len = compressed.metadata().await?.len();
-    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
-    if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
-        warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
-    }
-    compressed.seek(SeekFrom::Start(0)).await?;
-    Ok((compressed, compressed_len))
-}
-
-pub async fn extract_tar_zst(
-    pgdata_path: &Utf8Path,
-    tar_zst: impl AsyncBufRead + Unpin,
-) -> Result<()> {
-    let tar = Box::pin(ZstdDecoder::new(tar_zst));
-    let mut archive = Archive::new(tar);
-    archive.unpack(pgdata_path).await?;
-    Ok(())
-}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ddfb47369b..7a6ddd6a4e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -43,6 +43,8 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
+use utils::zstd::create_zst_tarball;
+use utils::zstd::extract_zst_tarball;
 
 use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
@@ -3042,8 +3044,13 @@ impl Tenant {
             }
         }
 
-        let (pgdata_zstd, tar_zst_size) =
-            import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
+        let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
+        const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
+        if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
+            warn!(
+                "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
+            );
+        }
 
         pausable_failpoint!("before-initdb-upload");
 
@@ -3143,7 +3150,7 @@ impl Tenant {
 
             let buf_read =
                 BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
-            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
+            extract_zst_tarball(&pgdata_path, buf_read)
                 .await
                 .context("extract initdb tar")?;
         } else {

From a5d5c2a6a0c0e9da4ccbcd8e44dc97559eeee8c9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 19 Mar 2024 16:08:20 +0000
Subject: [PATCH 40/43] storage controller: tech debt (#7165)

This is a mixed bag of changes split out for separate review while
working on other things, and batched together to reduce load on CI
runners. Each commits stands alone for review purposes:
- do_tenant_shard_split was a long function and had a synchronous
validation phase at the start that could readily be pulled out into a
separate function. This also avoids the special casing of
ApiError::BadRequest when deciding whether an abort is needed on errors
- Add a 'describe' API (GET on tenant ID) that will enable storcon-cli
to see what's going on with a tenant
- the 'locate' API wasn't really meant for use in the field. It's for
tests: demote it to the /debug/ prefix
- The `Single` placement policy was a redundant duplicate of Double(0),
and Double was a bad name. Rename it Attached.
(https://github.com/neondatabase/neon/issues/7107)
- Some neon_local commands were added for debug/demos, which are now
replaced by commands in storcon-cli (#7114 ). Even though that's not
merged yet, we don't need the neon_local ones any more.

Closes https://github.com/neondatabase/neon/issues/7107

## Backward compat of Single/Double -> `Attached(n)` change

A database migration is used to convert any existing values.
---
 .../2024-03-18-184429_rename_policy/down.sql  |   3 +
 .../2024-03-18-184429_rename_policy/up.sql    |   3 +
 control_plane/attachment_service/src/http.rs  |  19 +-
 .../attachment_service/src/persistence.rs     |  13 +-
 .../attachment_service/src/reconciler.rs      |   2 +-
 .../attachment_service/src/service.rs         | 184 +++++++++++++-----
 .../attachment_service/src/tenant_state.rs    |  21 +-
 control_plane/src/bin/neon_local.rs           |  97 +--------
 control_plane/src/storage_controller.rs       |   2 +-
 libs/pageserver_api/src/controller_api.rs     |  45 ++++-
 test_runner/fixtures/neon_fixtures.py         |  15 +-
 test_runner/fixtures/types.py                 |   3 +
 .../regress/test_pageserver_secondary.py      |   2 +-
 test_runner/regress/test_sharding.py          |   4 +-
 14 files changed, 206 insertions(+), 207 deletions(-)
 create mode 100644 control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql

diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
new file mode 100644
index 0000000000..897c7e0d01
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
@@ -0,0 +1,3 @@
+
+UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
+UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';
\ No newline at end of file
diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
new file mode 100644
index 0000000000..c898ac9aee
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
@@ -0,0 +1,3 @@
+
+UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
+UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';
\ No newline at end of file
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 45ee354822..076b3a2f70 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -353,6 +353,16 @@ async fn handle_tenant_locate(
     json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
+async fn handle_tenant_describe(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
+}
+
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -559,6 +569,9 @@ pub fn make_router(
             request_span(r, handle_node_drop)
         })
         .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
+        .get("/debug/v1/tenant/:tenant_id/locate", |r| {
+            tenant_service_handler(r, handle_tenant_locate)
+        })
         .get("/debug/v1/scheduler", |r| {
             request_span(r, handle_scheduler_dump)
         })
@@ -568,9 +581,6 @@ pub fn make_router(
         .put("/debug/v1/failpoints", |r| {
             request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
         })
-        .get("/control/v1/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(r, handle_tenant_locate)
-        })
         // Node operations
         .post("/control/v1/node", |r| {
             request_span(r, handle_node_register)
@@ -586,6 +596,9 @@ pub fn make_router(
         .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
             tenant_service_handler(r, handle_tenant_shard_split)
         })
+        .get("/control/v1/tenant/:tenant_id", |r| {
+            tenant_service_handler(r, handle_tenant_describe)
+        })
         // Tenant operations
         // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
         // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 3602cf8b1f..209d8ff075 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -211,15 +211,10 @@ impl Persistence {
 
         let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
             .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
-        for (tenant_id, tenant) in &mut decoded.tenants {
-            // Backward compat: an old attachments.json from before PR #6251, replace
-            // empty strings with proper defaults.
-            if tenant.tenant_id.is_empty() {
-                tenant.tenant_id = tenant_id.to_string();
-                tenant.config = serde_json::to_string(&TenantConfig::default())
-                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
-                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+        for shard in decoded.tenants.values_mut() {
+            if shard.placement_policy == "\"Single\"" {
+                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
+                shard.placement_policy = "{\"Attached\":0}".to_string();
             }
         }
 
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index 3bf23275bd..f00f35c74b 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -475,7 +475,7 @@ impl Reconciler {
             }
         }
 
-        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
         // this location will be deleted in the general case reconciliation that runs after this.
         let origin_secondary_conf = build_location_config(
             &self.shard,
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index addfd9c232..e38007c7af 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -20,8 +20,9 @@ use hyper::StatusCode;
 use pageserver_api::{
     controller_api::{
         NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
-        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
+        TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
+        TenantShardMigrateResponse, UtilizationScore,
     },
     models::{SecondaryProgress, TenantConfigRequest},
 };
@@ -202,6 +203,29 @@ enum TenantCreateOrUpdate {
     Update(Vec<ShardUpdate>),
 }
 
+struct ShardSplitParams {
+    old_shard_count: ShardCount,
+    new_shard_count: ShardCount,
+    new_stripe_size: Option<ShardStripeSize>,
+    targets: Vec<ShardSplitTarget>,
+    policy: PlacementPolicy,
+    shard_ident: ShardIdentity,
+}
+
+// When preparing for a shard split, we may either choose to proceed with the split,
+// or find that the work is already done and return NoOp.
+enum ShardSplitAction {
+    Split(ShardSplitParams),
+    NoOp(TenantShardSplitResponse),
+}
+
+// A parent shard which will be split
+struct ShardSplitTarget {
+    parent_id: TenantShardId,
+    node: Node,
+    child_ids: Vec<TenantShardId>,
+}
+
 /// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes
 /// might not be available.  We therefore use a queue of abort operations processed in the background.
 struct TenantShardSplitAbort {
@@ -1071,7 +1095,7 @@ impl Service {
                 shard_stripe_size: 0,
                 generation: Some(0),
                 generation_pageserver: None,
-                placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(),
+                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                 splitting: SplitState::default(),
             };
@@ -1098,7 +1122,7 @@ impl Service {
                         TenantState::new(
                             attach_req.tenant_shard_id,
                             ShardIdentity::unsharded(),
-                            PlacementPolicy::Single,
+                            PlacementPolicy::Attached(0),
                         ),
                     );
                     tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
@@ -1127,7 +1151,7 @@ impl Service {
                     self.persistence
                         .update_tenant_shard(
                             attach_req.tenant_shard_id,
-                            PlacementPolicy::Single,
+                            PlacementPolicy::Attached(0),
                             conf,
                             None,
                         )
@@ -1152,7 +1176,7 @@ impl Service {
 
         if let Some(new_generation) = new_generation {
             tenant_state.generation = Some(new_generation);
-            tenant_state.policy = PlacementPolicy::Single;
+            tenant_state.policy = PlacementPolicy::Attached(0);
         } else {
             // This is a detach notification.  We must update placement policy to avoid re-attaching
             // during background scheduling/reconciliation, or during storage controller restart.
@@ -1505,11 +1529,11 @@ impl Service {
         &self,
         create_req: TenantCreateRequest,
     ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
-        // As a default, single is convenient for tests that don't choose a policy.
         let placement_policy = create_req
             .placement_policy
             .clone()
-            .unwrap_or(PlacementPolicy::Single);
+            // As a default, zero secondaries is convenient for tests that don't choose a policy.
+            .unwrap_or(PlacementPolicy::Attached(0));
 
         // This service expects to handle sharding itself: it is an error to try and directly create
         // a particular shard here.
@@ -1719,11 +1743,11 @@ impl Service {
             | LocationConfigMode::AttachedSingle
             | LocationConfigMode::AttachedStale => {
                 if nodes.len() > 1 {
-                    PlacementPolicy::Double(1)
+                    PlacementPolicy::Attached(1)
                 } else {
                     // Convenience for dev/test: if we just have one pageserver, import
-                    // tenants into Single mode so that scheduling will succeed.
-                    PlacementPolicy::Single
+                    // tenants into non-HA mode so that scheduling will succeed.
+                    PlacementPolicy::Attached(0)
                 }
             }
         };
@@ -2541,9 +2565,6 @@ impl Service {
         let locked = self.inner.read().unwrap();
         tracing::info!("Locating shards for tenant {tenant_id}");
 
-        // Take a snapshot of pageservers
-        let pageservers = locked.nodes.clone();
-
         let mut result = Vec::new();
         let mut shard_params: Option<ShardParameters> = None;
 
@@ -2557,7 +2578,8 @@ impl Service {
                         "Cannot locate a tenant that is not attached"
                     )))?;
 
-            let node = pageservers
+            let node = locked
+                .nodes
                 .get(&node_id)
                 .expect("Pageservers may not be deleted while referenced");
 
@@ -2605,6 +2627,47 @@ impl Service {
         })
     }
 
+    pub(crate) fn tenant_describe(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<TenantDescribeResponse, ApiError> {
+        let locked = self.inner.read().unwrap();
+
+        let mut shard_zero = None;
+        let mut shards = Vec::new();
+
+        for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+        {
+            if tenant_shard_id.is_zero() {
+                shard_zero = Some(shard);
+            }
+
+            let response_shard = TenantDescribeResponseShard {
+                tenant_shard_id: *tenant_shard_id,
+                node_attached: *shard.intent.get_attached(),
+                node_secondary: shard.intent.get_secondary().to_vec(),
+                last_error: shard.last_error.lock().unwrap().clone(),
+                is_reconciling: shard.reconciler.is_some(),
+                is_pending_compute_notification: shard.pending_compute_notification,
+                is_splitting: matches!(shard.splitting, SplitState::Splitting),
+            };
+            shards.push(response_shard);
+        }
+
+        let Some(shard_zero) = shard_zero else {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
+            ));
+        };
+
+        Ok(TenantDescribeResponse {
+            shards,
+            stripe_size: shard_zero.shard.stripe_size,
+            policy: shard_zero.policy.clone(),
+            config: shard_zero.config.clone(),
+        })
+    }
+
     #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
     async fn abort_tenant_shard_split(
         &self,
@@ -2828,7 +2891,7 @@ impl Service {
                                 generation,
                                 &child_shard,
                                 &config,
-                                matches!(policy, PlacementPolicy::Double(n) if n > 0),
+                                matches!(policy, PlacementPolicy::Attached(n) if n > 0),
                             )),
                         },
                     );
@@ -2875,17 +2938,23 @@ impl Service {
         let new_shard_count = ShardCount::new(split_req.new_shard_count);
         let new_stripe_size = split_req.new_stripe_size;
 
-        let r = self.do_tenant_shard_split(tenant_id, split_req).await;
+        // Validate the request and construct parameters.  This phase is fallible, but does not require
+        // rollback on errors, as it does no I/O and mutates no state.
+        let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? {
+            ShardSplitAction::NoOp(resp) => return Ok(resp),
+            ShardSplitAction::Split(params) => params,
+        };
+
+        // Execute this split: this phase mutates state and does remote I/O on pageservers.  If it fails,
+        // we must roll back.
+        let r = self
+            .do_tenant_shard_split(tenant_id, shard_split_params)
+            .await;
 
         match r {
             Ok(r) => Ok(r),
-            Err(ApiError::BadRequest(_)) => {
-                // A request validation error does not require rollback: we rejected it before we started making any changes: just
-                // return the error
-                r
-            }
             Err(e) => {
-                // General case error handling: split might be part-done, we must do work to abort it.
+                // Split might be part-done, we must do work to abort it.
                 tracing::warn!("Enqueuing background abort of split on {tenant_id}");
                 self.abort_tx
                     .send(TenantShardSplitAbort {
@@ -2901,25 +2970,17 @@ impl Service {
         }
     }
 
-    pub(crate) async fn do_tenant_shard_split(
+    fn prepare_tenant_shard_split(
         &self,
         tenant_id: TenantId,
         split_req: TenantShardSplitRequest,
-    ) -> Result<TenantShardSplitResponse, ApiError> {
-        let mut policy = None;
-        let mut shard_ident = None;
-
-        // A parent shard which will be split
-        struct SplitTarget {
-            parent_id: TenantShardId,
-            node: Node,
-            child_ids: Vec<TenantShardId>,
-        }
-
+    ) -> Result<ShardSplitAction, ApiError> {
         fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest(
             anyhow::anyhow!("failpoint")
         )));
 
+        let mut policy = None;
+        let mut shard_ident = None;
         // Validate input, and calculate which shards we will create
         let (old_shard_count, targets) =
             {
@@ -2995,7 +3056,7 @@ impl Service {
 
                     // TODO: if any reconciliation is currently in progress for this shard, wait for it.
 
-                    targets.push(SplitTarget {
+                    targets.push(ShardSplitTarget {
                         parent_id: *tenant_shard_id,
                         node: node.clone(),
                         child_ids: tenant_shard_id
@@ -3005,9 +3066,9 @@ impl Service {
 
                 if targets.is_empty() {
                     if children_found.len() == split_req.new_shard_count as usize {
-                        return Ok(TenantShardSplitResponse {
+                        return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse {
                             new_shards: children_found,
-                        });
+                        }));
                     } else {
                         // No shards found to split, and no existing children found: the
                         // tenant doesn't exist at all.
@@ -3038,12 +3099,36 @@ impl Service {
         };
         let policy = policy.unwrap();
 
+        Ok(ShardSplitAction::Split(ShardSplitParams {
+            old_shard_count,
+            new_shard_count: ShardCount::new(split_req.new_shard_count),
+            new_stripe_size: split_req.new_stripe_size,
+            targets,
+            policy,
+            shard_ident,
+        }))
+    }
+
+    async fn do_tenant_shard_split(
+        &self,
+        tenant_id: TenantId,
+        params: ShardSplitParams,
+    ) -> Result<TenantShardSplitResponse, ApiError> {
         // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
         // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
         // parent shards exist as expected, but it would be neater to do the above pre-checks within the
         // same database transaction rather than pre-check in-memory and then maybe-fail the database write.
         // (https://github.com/neondatabase/neon/issues/6676)
 
+        let ShardSplitParams {
+            old_shard_count,
+            new_shard_count,
+            new_stripe_size,
+            targets,
+            policy,
+            shard_ident,
+        } = params;
+
         // Before creating any new child shards in memory or on the pageservers, persist them: this
         // enables us to ensure that we will always be able to clean up if something goes wrong.  This also
         // acts as the protection against two concurrent attempts to split: one of them will get a database
@@ -3125,7 +3210,7 @@ impl Service {
         // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
 
         for target in &targets {
-            let SplitTarget {
+            let ShardSplitTarget {
                 parent_id,
                 node,
                 child_ids,
@@ -3135,8 +3220,8 @@ impl Service {
                 .tenant_shard_split(
                     *parent_id,
                     TenantShardSplitRequest {
-                        new_shard_count: split_req.new_shard_count,
-                        new_stripe_size: split_req.new_stripe_size,
+                        new_shard_count: new_shard_count.literal(),
+                        new_stripe_size,
                     },
                 )
                 .await
@@ -3185,11 +3270,8 @@ impl Service {
         ));
 
         // Replace all the shards we just split with their children: this phase is infallible.
-        let (response, child_locations) = self.tenant_shard_split_commit_inmem(
-            tenant_id,
-            ShardCount::new(split_req.new_shard_count),
-            split_req.new_stripe_size,
-        );
+        let (response, child_locations) =
+            self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
 
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
@@ -3254,17 +3336,15 @@ impl Service {
                 let old_attached = *shard.intent.get_attached();
 
                 match shard.policy {
-                    PlacementPolicy::Single => {
-                        shard.intent.clear_secondary(scheduler);
-                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
-                    }
-                    PlacementPolicy::Double(_n) => {
+                    PlacementPolicy::Attached(n) => {
                         // If our new attached node was a secondary, it no longer should be.
                         shard.intent.remove_secondary(scheduler, migrate_req.node_id);
 
                         // If we were already attached to something, demote that to a secondary
                         if let Some(old_attached) = old_attached {
-                            shard.intent.push_secondary(scheduler, old_attached);
+                            if n > 0 {
+                                shard.intent.push_secondary(scheduler, old_attached);
+                            }
                         }
 
                         shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 39e557616d..9dd368bf41 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -457,22 +457,7 @@ impl TenantState {
         // Add/remove nodes to fulfil policy
         use PlacementPolicy::*;
         match self.policy {
-            Single => {
-                // Should have exactly one attached, and zero secondaries
-                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
-                    modified = true;
-                }
-
-                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
-                modified |= modified_attached;
-
-                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
-                    modified = true;
-                }
-            }
-            Double(secondary_count) => {
+            Attached(secondary_count) => {
                 let retain_secondaries = if self.intent.attached.is_none()
                     && scheduler.node_preferred(&self.intent.secondary).is_some()
                 {
@@ -895,7 +880,7 @@ pub(crate) mod tests {
 
         let mut scheduler = Scheduler::new(nodes.values());
 
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
         tenant_state
             .schedule(&mut scheduler)
             .expect("we have enough nodes, scheduling should work");
@@ -943,7 +928,7 @@ pub(crate) mod tests {
         let nodes = make_test_nodes(3);
         let mut scheduler = Scheduler::new(nodes.values());
 
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
 
         tenant_state.observed.locations.insert(
             NodeId(3),
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 6c722f36b4..401feae706 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -437,7 +437,7 @@ async fn handle_tenant(
 
             let placement_policy = match create_match.get_one::<String>("placement-policy") {
                 Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
-                _ => PlacementPolicy::Single,
+                _ => PlacementPolicy::Attached(0),
             };
 
             let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -523,88 +523,6 @@ async fn handle_tenant(
                 .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
             println!("tenant {tenant_id} successfully configured on the pageserver");
         }
-        Some(("migrate", matches)) => {
-            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
-            let new_pageserver = get_pageserver(env, matches)?;
-            let new_pageserver_id = new_pageserver.conf.id;
-
-            let storage_controller = StorageController::from_env(env);
-            storage_controller
-                .tenant_migrate(tenant_shard_id, new_pageserver_id)
-                .await?;
-
-            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
-        }
-        Some(("status", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
-
-            let mut shard_table = comfy_table::Table::new();
-            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
-
-            let mut tenant_synthetic_size = None;
-
-            let storage_controller = StorageController::from_env(env);
-            for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
-                let pageserver =
-                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
-
-                let size = pageserver
-                    .http_client
-                    .tenant_details(shard.shard_id)
-                    .await?
-                    .tenant_info
-                    .current_physical_size
-                    .unwrap();
-
-                shard_table.add_row([
-                    format!("{}", shard.shard_id.shard_slug()),
-                    format!("{}", shard.node_id.0),
-                    format!("{} MiB", size / (1024 * 1024)),
-                ]);
-
-                if shard.shard_id.is_zero() {
-                    tenant_synthetic_size =
-                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
-                }
-            }
-
-            let Some(synthetic_size) = tenant_synthetic_size else {
-                bail!("Shard 0 not found")
-            };
-
-            let mut tenant_table = comfy_table::Table::new();
-            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
-            tenant_table.add_row([
-                "Synthetic size".to_string(),
-                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
-            ]);
-
-            println!("{tenant_table}");
-            println!("{shard_table}");
-        }
-        Some(("shard-split", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
-            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
-            let shard_stripe_size: Option<ShardStripeSize> = matches
-                .get_one::<Option<ShardStripeSize>>("shard-stripe-size")
-                .cloned()
-                .unwrap();
-
-            let storage_controller = StorageController::from_env(env);
-            let result = storage_controller
-                .tenant_split(tenant_id, shard_count, shard_stripe_size)
-                .await?;
-            println!(
-                "Split tenant {} into shards {}",
-                tenant_id,
-                result
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-        }
 
         Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
         None => bail!("no tenant subcommand provided"),
@@ -1578,19 +1496,6 @@ fn cli() -> Command {
             .subcommand(Command::new("config")
                 .arg(tenant_id_arg.clone())
                 .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
-            .subcommand(Command::new("migrate")
-                .about("Migrate a tenant from one pageserver to another")
-                .arg(tenant_id_arg.clone())
-                .arg(pageserver_id_arg.clone()))
-            .subcommand(Command::new("status")
-                .about("Human readable summary of the tenant's shards and attachment locations")
-                .arg(tenant_id_arg.clone()))
-            .subcommand(Command::new("shard-split")
-                .about("Increase the number of shards in the tenant")
-                .arg(tenant_id_arg.clone())
-                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
-                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
-                )
         )
         .subcommand(
             Command::new("pageserver")
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 18014adba4..e7697ecac8 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -475,7 +475,7 @@ impl StorageController {
     pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
         self.dispatch::<(), _>(
             Method::GET,
-            format!("control/v1/tenant/{tenant_id}/locate"),
+            format!("debug/v1/tenant/{tenant_id}/locate"),
             None,
         )
         .await
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 6053e8b8ed..e33bd0f486 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -6,7 +6,10 @@ use std::str::FromStr;
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;
 
-use crate::{models::ShardParameters, shard::TenantShardId};
+use crate::{
+    models::{ShardParameters, TenantConfig},
+    shard::{ShardStripeSize, TenantShardId},
+};
 
 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
@@ -57,6 +60,31 @@ pub struct TenantLocateResponse {
     pub shard_params: ShardParameters,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct TenantDescribeResponse {
+    pub shards: Vec<TenantDescribeResponseShard>,
+    pub stripe_size: ShardStripeSize,
+    pub policy: PlacementPolicy,
+    pub config: TenantConfig,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantDescribeResponseShard {
+    pub tenant_shard_id: TenantShardId,
+
+    pub node_attached: Option<NodeId>,
+    pub node_secondary: Vec<NodeId>,
+
+    pub last_error: String,
+
+    /// A task is currently running to reconcile this tenant's intent state with the state on pageservers
+    pub is_reconciling: bool,
+    /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
+    pub is_pending_compute_notification: bool,
+    /// A shard split is currently underway
+    pub is_splitting: bool,
+}
+
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -181,11 +209,8 @@ impl From<NodeSchedulingPolicy> for String {
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 pub enum PlacementPolicy {
-    /// Cheapest way to attach a tenant: just one pageserver, no secondary
-    Single,
-    /// Production-ready way to attach a tenant: one attached pageserver and
-    /// some number of secondaries.
-    Double(usize),
+    /// Normal live state: one attached pageserver and zero or more secondaries.
+    Attached(usize),
     /// Create one secondary mode locations. This is useful when onboarding
     /// a tenant, or for an idle tenant that we might want to bring online quickly.
     Secondary,
@@ -207,14 +232,14 @@ mod test {
     /// Check stability of PlacementPolicy's serialization
     #[test]
     fn placement_policy_encoding() -> anyhow::Result<()> {
-        let v = PlacementPolicy::Double(1);
+        let v = PlacementPolicy::Attached(1);
         let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "{\"Double\":1}");
+        assert_eq!(encoded, "{\"Attached\":1}");
         assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
 
-        let v = PlacementPolicy::Single;
+        let v = PlacementPolicy::Detached;
         let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "\"Single\"");
+        assert_eq!(encoded, "\"Detached\"");
         assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
         Ok(())
     }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3ecd343224..1d30c45278 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1892,19 +1892,6 @@ class NeonCli(AbstractNeonCli):
 
         return self.raw_cli(args, check_return_code=True)
 
-    def tenant_migrate(
-        self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
-    ):
-        args = [
-            "tenant",
-            "migrate",
-            "--tenant-id",
-            str(tenant_shard_id),
-            "--id",
-            str(new_pageserver),
-        ]
-        return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
-
     def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
         return self.raw_cli(["start"], check_return_code=check_return_code)
 
@@ -2156,7 +2143,7 @@ class NeonStorageController(MetricsGetter):
         """
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
+            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
             headers=self.headers(TokenScope.ADMIN),
         )
         body = response.json()
diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py
index ea648e460d..80c9b9ce9a 100644
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -158,6 +158,9 @@ class TenantShardId:
     def __str__(self):
         return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
 
+    def __repr__(self):
+        return self.__str__()
+
     def _tuple(self) -> tuple[TenantId, int, int]:
         return (self.tenant_id, self.shard_number, self.shard_count)
 
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 8ef75414a3..e664547b69 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -576,7 +576,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll
     timeline_id = TimelineId.generate()
 
     env.neon_cli.create_tenant(
-        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}'
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
     )
 
     attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 3470d2e609..cb58c640c3 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -264,7 +264,7 @@ def test_sharding_split_smoke(
         destination = migrate_to_pageserver_ids.pop()
 
         log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
-        env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
+        env.storage_controller.tenant_shard_migrate(migrate_shard, destination)
 
     workload.validate()
 
@@ -299,7 +299,7 @@ def test_sharding_split_smoke(
         locations = pageserver.http_client().tenant_list_locations()
         shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
 
-    log.info("Shards after split: {shards_exist}")
+    log.info(f"Shards after split: {shards_exist}")
     assert len(shards_exist) == split_shard_count
 
     # Ensure post-split pageserver locations survive a restart (i.e. the child shards

From 4ba3f3518eddd8e5eebca90c564857e0d285932d Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 20 Mar 2024 10:24:59 +0000
Subject: [PATCH 41/43] test: fix on demand activation test flakyness (#7180)

Warm-up (and the "tenant startup complete" metric update) happens in
a background tokio task. The tenant map is eagerly updated (can happen
before the task finishes).

The test assumed that if the tenant map was updated, then the metric
should reflect that. That's not the case, so we tweak the test to wait
for the metric.

Fixes https://github.com/neondatabase/neon/issues/7158
---
 test_runner/regress/test_timeline_size.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 205ca18050..628c484fbd 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,6 +20,7 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
@@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
+def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
+    def condition():
+        assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
+
+    wait_until(5, 1.0, condition)
+
+
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     """
     Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
@@ -767,10 +775,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     # That one that we successfully accessed is now Active
     expect_activated += 1
     assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
 
     # The ones we didn't touch are still in Attaching
     assert (
@@ -790,10 +795,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         == n_tenants - expect_activated
     )
 
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
 
     # When we unblock logical size calculation, all tenants should proceed to active state via
     # the warmup route.
@@ -813,7 +815,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     assert (
         pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
     )
-    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
+    wait_for_tenant_startup_completions(pageserver_http, count=n_tenants)
 
     # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
     # body of the test because it will disrupt tenant counts

From 6d996427b19ae20b0be30651838586307537b2b4 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 20 Mar 2024 12:26:31 +0000
Subject: [PATCH 42/43] proxy: enable sha2 asm support (#7184)

## Problem

faster sha2 hashing.

## Summary of changes

enable asm feature for sha2. this feature will be default in sha2 0.11,
so we might as well lean into it now. It provides a noticeable speed
boost on macos aarch64. Haven't tested on x86 though
---
 Cargo.lock                | 15 +++++++++++++--
 proxy/Cargo.toml          |  2 +-
 workspace_hack/Cargo.toml |  1 +
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 70f427f97d..cdbabf2f76 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5346,13 +5346,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
 
 [[package]]
 name = "sha2"
-version = "0.10.6"
+version = "0.10.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
+checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
 dependencies = [
  "cfg-if",
  "cpufeatures",
  "digest",
+ "sha2-asm",
+]
+
+[[package]]
+name = "sha2-asm"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
+dependencies = [
+ "cc",
 ]
 
 [[package]]
@@ -7032,6 +7042,7 @@ dependencies = [
  "scopeguard",
  "serde",
  "serde_json",
+ "sha2",
  "smallvec",
  "subtle",
  "syn 1.0.109",
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index d8112c8bf0..b3a5bf873e 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -59,7 +59,7 @@ rustls.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-sha2.workspace = true
+sha2 = { workspace = true, features = ["asm"] }
 smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 8593b752c2..152c452dd4 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -64,6 +64,7 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
+sha2 = { version = "0.10", features = ["asm"] }
 smallvec = { version = "1", default-features = false, features = ["write"] }
 subtle = { version = "2" }
 time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }

From fb66a3dd857bec7f99bba9c5dd5ee80213761878 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 20 Mar 2024 16:08:03 +0200
Subject: [PATCH 43/43] fix: ResidentLayer::load_keys should not create INFO
 level span (#7174)

Since #6115 with more often used get_value_reconstruct_data and friends,
we should not have needless INFO level span creation near hot paths. In
our prod configuration, INFO spans are always created, but in practice,
very rarely anything at INFO level is logged underneath.
`ResidentLayer::load_keys` is only used during compaction so it is not
that hot, but this aligns the access paths and their span usage.

PR changes the span level to debug to align with others, and adds the
layer name to the error which was missing.

Split off from #7030.
---
 pageserver/src/tenant/storage_layer/layer.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 0200ff8cf4..f37d7e6449 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1484,7 +1484,7 @@ impl ResidentLayer {
     }
 
     /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
     pub(crate) async fn load_keys<'a>(
         &'a self,
         ctx: &RequestContext,
@@ -1504,9 +1504,9 @@ impl ResidentLayer {
                 // while it's being held.
                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
                     .await
-                    .context("Layer index is corrupted")
+                    .with_context(|| format!("Layer index is corrupted for {self}"))
             }
-            Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
+            Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")),
         }
     }