From 7c53fd0d56083cb4e0becff87292d3d0406943eb Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Fri, 28 Feb 2025 13:31:52 +0100
Subject: [PATCH 01/61] refactor(page_service / timeline::handle): the
 GateGuard need not be a special case (#11030)

# Changes

While working on

- https://github.com/neondatabase/neon/pull/7202

I found myself needing to cache another expensive Arc::clone inside the
timeline::handle::Cache by wrapping it in another Arc.

Before this PR, it seemed like the only expensive thing we were caching was
the connection handler tasks' clone of `Arc<Timeline>`. But in fact the
GateGuard was another such thing; it was merely special-cased in the
implementation.

So, this refactoring PR de-special-cases the GateGuard.

# Performance

With this PR we are doing strictly _fewer_ operations per `Cache::get`.
The reason is that we wrap the entire `Types::Timeline` into one Arc.
Before this PR, there were two separate Arcs: one around the cached
`Arc<Timeline>` clone and one around the GateGuard.
With this PR, we avoid an allocation per cached item, namely, the separate
Arc around the GateGuard.

This PR does not change the amount of shared mutable state. So, all in all,
it should be a net positive, albeit probably not noticeable with our small
non-NUMA instances and generally high CPU usage per request.

# Reviewing

To understand the refactoring logistics, look at the changes to the unit
test types first. Then read the improved module doc comment. Then the
remaining changes.

In the future, we could rename things to be even more generic. For example,
`Types::TenantMgr` could really be a `Types::Resolver`. And
`Types::Timeline` should, to avoid constant confusion in the doc comment,
be called `Types::Cached` or `Types::Resolved`, because the `handle`
module, after this PR, really doesn't care that we're using it for storing
Arcs and GateGuards. Then again, specificity is sometimes more useful than
being generic, and writing the module doc comment in a totally generic way
would probably also be more confusing than helpful.
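The shape of the change is easiest to see in miniature. Below is a minimal, self-contained Rust sketch of the caching pattern this patch moves to: one struct bundles the expensive-to-obtain pieces (the `Arc<Timeline>` clone and the `GateGuard`), the per-connection cache wraps that struct in a single `Arc`, and handing out a handle is then just a connection-private refcount bump. `Timeline` and `GateGuard` here are empty stand-ins and `CacheItem` is an illustrative name, not the real pageserver types; the actual counterpart (`TenantManagerCacheItem`) appears in the diff below.

```rust
use std::ops::Deref;
use std::sync::Arc;

// Empty stand-ins for the real pageserver types.
struct Timeline;
struct GateGuard;

// One cached item bundles everything that was expensive to obtain on a
// cache miss: the `Arc<Timeline>` clone and the gate guard.
struct CacheItem {
    timeline: Arc<Timeline>,
    #[allow(dead_code)] // held only to keep the (toy) gate open
    gate_guard: GateGuard,
}

impl Deref for CacheItem {
    type Target = Timeline;
    fn deref(&self) -> &Self::Target {
        &self.timeline
    }
}

fn main() {
    // Cache miss: pay for the expensive pieces once, then wrap the whole
    // bundle in a single Arc owned by this connection's cache.
    let cached = Arc::new(CacheItem {
        timeline: Arc::new(Timeline),
        gate_guard: GateGuard,
    });

    // Cache hit: a handle is just another clone of the connection-private
    // Arc; no globally shared (tenant manager / gate) refcounts are touched.
    let handle = Arc::clone(&cached);
    let _timeline: &Timeline = &handle; // deref coercion through CacheItem
}
```

In the real code the `Deref` target is `Arc<Timeline>` rather than `Timeline`, but the effect is the same: callers dereference straight to the timeline without touching any globally shared refcount.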
--- pageserver/src/page_service.rs | 46 ++++-- pageserver/src/tenant/timeline/handle.rs | 195 ++++++++++++----------- 2 files changed, 132 insertions(+), 109 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8972515163..603a5f65aa 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -392,10 +392,6 @@ impl TimelineHandles { .await .map_err(|e| match e { timeline::handle::GetError::TenantManager(e) => e, - timeline::handle::GetError::TimelineGateClosed => { - trace!("timeline gate closed"); - GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) - } timeline::handle::GetError::PerTimelineStateShutDown => { trace!("per-timeline state shut down"); GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) @@ -422,24 +418,33 @@ pub(crate) struct TenantManagerTypes; impl timeline::handle::Types for TenantManagerTypes { type TenantManagerError = GetActiveTimelineError; type TenantManager = TenantManagerWrapper; - type Timeline = Arc; + type Timeline = TenantManagerCacheItem; } -impl timeline::handle::ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } +pub(crate) struct TenantManagerCacheItem { + pub(crate) timeline: Arc, + #[allow(dead_code)] // we store it to keep the gate open + pub(crate) gate_guard: GateGuard, +} +impl std::ops::Deref for TenantManagerCacheItem { + type Target = Arc; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl timeline::handle::Timeline for TenantManagerCacheItem { fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { - Timeline::shard_timeline_id(self) + Timeline::shard_timeline_id(&self.timeline) } fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { - &self.handles + &self.timeline.handles } fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { - Timeline::get_shard_identity(self) + Timeline::get_shard_identity(&self.timeline) } } @@ -448,7 +453,7 @@ impl timeline::handle::TenantManager for TenantManagerWrappe &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { + ) -> Result { let tenant_id = self.tenant_id.get().expect("we set this in get()"); let timeout = ACTIVE_TENANT_TIMEOUT; let wait_start = Instant::now(); @@ -491,7 +496,20 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - Ok(timeline) + + let gate_guard = match timeline.gate.enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetActiveTimelineError::Timeline( + GetTimelineError::ShuttingDown, + )); + } + }; + + Ok(TenantManagerCacheItem { + timeline, + gate_guard, + }) } } diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 67fb89c433..809b350f38 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -1,5 +1,4 @@ -//! An efficient way to keep the timeline gate open without preventing -//! timeline shutdown for longer than a single call to a timeline method. +//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`. //! //! # Motivation //! @@ -19,27 +18,32 @@ //! we hold the Timeline gate open while we're invoking the method on the //! Timeline object. //! -//! However, we want to avoid the overhead of entering the gate for every -//! method invocation. -//! -//! 
Further, for shard routing, we want to avoid calling the tenant manager to -//! resolve the shard for every request. Instead, we want to cache the -//! routing result so we can bypass the tenant manager for all subsequent requests -//! that get routed to that shard. +//! We want to avoid the overhead of doing, for each incoming request, +//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! - cloning the `Arc` out of the tenant manager so we can +//! release the mgr rwlock before doing any request processing work +//! - re-entering the Timeline gate for each Timeline method invocation. //! //! Regardless of how we accomplish the above, it should not //! prevent the Timeline from shutting down promptly. //! +//! //! # Design //! //! ## Data Structures //! -//! There are three user-facing data structures: +//! There are two concepts expressed as associated types in the `Types` trait: +//! - `TenantManager`: the thing that performs the expensive work. It produces +//! a `Timeline` object, which is the other associated type. +//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup. +//! +//! There are three user-facing data structures exposed by this module: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. -//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! - `Handle`: a smart pointer that derefs to the Types::Timeline. //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows -//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. +//! trying to ugprade back to a `Handle`. If successful, a re-upgraded Handle will always +//! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`. //! //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. @@ -64,11 +68,14 @@ //! //! To dispatch a request, the page service connection calls `Cache::get`. //! -//! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and store it in the the -//! `Arc>>`. A weak ref is stored in the `Cache` +//! A cache miss means we call Types::TenantManager::resolve for shard routing, +//! cloning the `Arc` out of it, and entering the gate. The result of +//! resolve() is the object we want to cache, and return `Handle`s to for subseqent `Cache::get` calls. +//! +//! We wrap the object returned from resolve() in an `Arc` and store that inside the +//! `Arc>>`. A weak ref to the HandleInner is stored in the `Cache` //! and a strong ref in the `PerTimelineState`. -//! A strong ref is returned wrapped in a `Handle`. +//! Another strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing //! and find the weak ref in the cache. @@ -78,51 +85,51 @@ //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` //! and the request handler dispatches the request to the right `>::$request_method`. -//! It then drops the `Handle`, which drops the `Arc`. +//! It then drops the `Handle`, and thus the `Arc>` inside it. //! //! 
# Performance //! //! Remember from the introductory section: //! -//! > However, we want to avoid the overhead of entering the gate for every -//! > method invocation. +//! > We want to avoid the overhead of doing, for each incoming request, +//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! > - cloning the `Arc` out of the tenant manager so we can +//! > release the mgr rwlock before doing any request processing work +//! > - re-entering the Timeline gate for each Timeline method invocation. //! -//! Why do we want to avoid that? -//! Because the gate is a shared location in memory and entering it involves -//! bumping refcounts, which leads to cache contention if done frequently -//! from multiple cores in parallel. +//! All of these boil down to some state that is either globally shared among all shards +//! or state shared among all tasks that serve a particular timeline. +//! It is either protected by RwLock or manipulated via atomics. +//! Even atomics are costly when shared across multiple cores. +//! So, we want to avoid any permanent need for coordination between page_service tasks. //! -//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. -//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! The solution is to add indirection: we wrap the Types::Timeline object that is +//! returned by Types::TenantManager into an Arc that is rivate to the `HandleInner` +//! and hence to the single Cache / page_service connection. //! (Review the "Data Structures" section if that is unclear to you.) //! -//! A `WeakHandle` is a weak ref to the `HandleInner`. -//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and -//! further acquire an additional strong ref to the `Arc` inside it. -//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. //! -//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. -//! Again, this is cheap because the `Arc` is private to the connection. +//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex`), +//! lock the mutex, take out a clone of the `Arc`, and drop the Mutex. +//! The Mutex is not contended because it is private to the connection. +//! And again, the `Arc` clone is cheap because that wrapper +//! Arc's refcounts are private to the connection. +//! +//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection. //! -//! In addition to the GateGuard, we need to provide `Deref` impl. -//! For this, both `Handle` need infallible access to an `Arc`. -//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention -//! on the shared memory location that trakcs the refcount of the `Arc`. -//! Instead, we wrap the `Arc` into another `Arc`. -//! so that we can clone it cheaply when upgrading a `WeakHandle`. //! //! # Shutdown //! //! The attentive reader may have noticed the following reference cycle around the `Arc`: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline //! ``` //! //! Further, there is this cycle: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! 
Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline //! ``` //! //! The former cycle is a memory leak if not broken. @@ -135,9 +142,12 @@ //! - Timeline shutdown (=> `PerTimelineState::shutdown`) //! - Connection shutdown (=> dropping the `Cache`). //! -//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to -//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the -//! `Arc`. +//! Both transition the `HandleInner` from [`HandleInner::Open`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived +//! `Arc`. Once the last short-lived Arc +//! is dropped, the `Types::Timeline` gets dropped and thereby +//! the `GateGuard` and the `Arc` that it stores, +//! thereby breaking both cycles. //! //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, //! thereby breaking the cycle. @@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector; pub(crate) trait Types: Sized + std::fmt::Debug { type TenantManagerError: Sized + std::fmt::Debug; type TenantManager: TenantManager + Sized; - type Timeline: ArcTimeline + Sized; + type Timeline: Timeline + Sized; } /// Uniquely identifies a [`Cache`] instance over the lifetime of the process. @@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId { /// See module-level comment. pub(crate) struct Handle { - timeline: Arc, - #[allow(dead_code)] // the field exists to keep the gate open - gate_guard: Arc, inner: Arc>>, + open: Arc, } pub(crate) struct WeakHandle { inner: Weak>>, } + enum HandleInner { - KeepingTimelineGateOpen { - #[allow(dead_code)] - gate_guard: Arc, - timeline: Arc, - }, + Open(Arc), ShutDown, } @@ -307,8 +312,7 @@ pub(crate) trait TenantManager { } /// Abstract view of an [`Arc`], for testability. 
-pub(crate) trait ArcTimeline: Clone { - fn gate(&self) -> &utils::sync::gate::Gate; +pub(crate) trait Timeline { fn shard_timeline_id(&self) -> ShardTimelineId; fn get_shard_identity(&self) -> &ShardIdentity; fn per_timeline_state(&self) -> &PerTimelineState; @@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline: Clone { #[derive(Debug)] pub(crate) enum GetError { TenantManager(T::TenantManagerError), - TimelineGateClosed, PerTimelineStateShutDown, } @@ -434,21 +437,9 @@ impl Cache { } trace!("creating new HandleInner"); - let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { - gate_guard: Arc::new( - // this enter() is expensive in production code because - // it hits the global Arc::gate refcounts - match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }, - ), - // this clone is expensive in production code because - // it hits the global Arc::clone refcounts - timeline: Arc::new(timeline.clone()), - })); + let timeline = Arc::new(timeline); + let handle_inner_arc = + Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline)))); let handle_weak = WeakHandle { inner: Arc::downgrade(&handle_inner_arc), }; @@ -503,18 +494,10 @@ impl WeakHandle { }; let lock_guard = inner.lock().expect("poisoned"); match &*lock_guard { - HandleInner::KeepingTimelineGateOpen { - timeline, - gate_guard, - } => { - let gate_guard = Arc::clone(gate_guard); - let timeline = Arc::clone(timeline); + HandleInner::Open(open) => { + let open = Arc::clone(open); drop(lock_guard); - Ok(Handle { - timeline, - gate_guard, - inner, - }) + Ok(Handle { open, inner }) } HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), } @@ -528,7 +511,7 @@ impl WeakHandle { impl std::ops::Deref for Handle { type Target = T::Timeline; fn deref(&self) -> &Self::Target { - &self.timeline + &self.open } } @@ -545,7 +528,7 @@ impl PerTimelineState { /// to the [`Types::Timeline`] that embeds this per-timeline state. /// Even if [`TenantManager::resolve`] would still resolve to it. /// - /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive. /// That's ok because they're short-lived. See module-level comment for details. #[instrument(level = "trace", skip_all)] pub(super) fn shutdown(&self) { @@ -611,7 +594,7 @@ impl Drop for Cache { impl HandleInner { fn shutdown(&mut self) -> Option> { match std::mem::replace(self, HandleInner::ShutDown) { - HandleInner::KeepingTimelineGateOpen { timeline, .. 
} => Some(timeline), + HandleInner::Open(timeline) => Some(timeline), HandleInner::ShutDown => { // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown // may do it concurrently, but locking rules disallow holding per-timeline-state lock and @@ -631,6 +614,7 @@ mod tests { use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; + use utils::sync::gate::GateGuard; use super::*; @@ -641,7 +625,7 @@ mod tests { impl Types for TestTypes { type TenantManagerError = anyhow::Error; type TenantManager = StubManager; - type Timeline = Arc; + type Timeline = Entered; } struct StubManager { @@ -656,17 +640,19 @@ mod tests { myself: Weak, } + struct Entered { + timeline: Arc, + #[allow(dead_code)] // it's stored here to keep the gate open + gate_guard: Arc, + } + impl StubTimeline { fn getpage(&self) { // do nothing } } - impl ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } - + impl Timeline for Entered { fn shard_timeline_id(&self) -> ShardTimelineId { ShardTimelineId { shard_index: self.shard.shard_index(), @@ -688,20 +674,34 @@ mod tests { &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> anyhow::Result> { + ) -> anyhow::Result { for timeline in &self.shards { if timeline.id == timeline_id { + let enter_gate = || { + let gate_guard = timeline.gate.enter()?; + let gate_guard = Arc::new(gate_guard); + anyhow::Ok(gate_guard) + }; match &shard_selector { ShardSelector::Zero if timeline.shard.is_shard_zero() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Zero => continue, ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Page(_) => continue, ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Known(_) => continue, } @@ -711,6 +711,13 @@ mod tests { } } + impl std::ops::Deref for Entered { + type Target = StubTimeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } + } + #[tokio::test(start_paused = true)] async fn test_timeline_shutdown() { crate::tenant::harness::setup_logging(); @@ -1038,7 +1045,6 @@ mod tests { let key = DBDIR_KEY; // Simulate 10 connections that's opened, used, and closed - let mut used_handles = vec![]; for _ in 0..10 { let mut cache = Cache::::default(); let handle = { @@ -1050,7 +1056,6 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.timeline)); } // No handles exist, thus gates are closed and don't require shutdown. From c7ff3c4c9bd8b8f3aec9eb69551527ad3b105cce Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 28 Feb 2025 15:06:56 +0100 Subject: [PATCH 02/61] safekeeper: downgrade interpreted reader errors (#11034) ## Problem This `critical!` could fire on IO errors, which is just noisy. Resolves #11027. ## Summary of changes Downgrade to error, except for decode errors. These could be either data corruption or a bug, but seem worth investigating either way. 
--- safekeeper/src/send_interpreted_wal.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index e196f91d3c..2c1c73c25c 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -287,7 +287,13 @@ impl InterpretedWalReader { reader .run_impl(start_pos) .await - .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) + .inspect_err(|err| match err { + // TODO: we may want to differentiate these errors further. + InterpretedWalReaderError::Decode(_) => { + critical!("failed to decode WAL record: {err:?}"); + } + err => error!("failed to read WAL record: {err}"), + }) } .instrument(info_span!("interpreted wal reader")), ); @@ -347,10 +353,12 @@ impl InterpretedWalReader { metric.dec(); } - if let Err(err) = self.run_impl(start_pos).await { - critical!("failed to read WAL record: {err:?}"); - } else { - info!("interpreted wal reader exiting"); + match self.run_impl(start_pos).await { + Err(err @ InterpretedWalReaderError::Decode(_)) => { + critical!("failed to decode WAL record: {err:?}"); + } + Err(err) => error!("failed to read WAL record: {err}"), + Ok(()) => info!("interpreted wal reader exiting"), } Err(CopyStreamHandlerEnd::Other(anyhow!( From d9ced89ec074dd49c9f36177df9bc6c36c315133 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 28 Feb 2025 14:20:25 +0000 Subject: [PATCH 03/61] feat(proxy): require TLS to compute if prompted by cplane (#10717) https://github.com/neondatabase/cloud/issues/23008 For TLS between proxy and compute, we are using an internally provisioned CA to sign the compute certificates. This change ensures that proxy will load them from a supplied env var pointing to the correct file - this file and env var will be configured later, using a kubernetes secret. Control plane responds with a `server_name` field if and only if the compute uses TLS. This server name is the name we use to validate the certificate. Control plane still sends us the IP to connect to as well (to support overlay IP). To support this change, I'd had to split `host` and `host_addr` into separate fields. Using `host_addr` and bypassing `lookup_addr` if possible (which is what happens in production). `host` then is only used for the TLS connection. There's no blocker to merging this. The code paths will not be triggered until the new control plane is deployed and the `enableTLS` compute flag is enabled on a project. 
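The recurring pattern in the diff below is: if control plane already returned an IP address, connect to it directly and keep the hostname only for TLS certificate validation; otherwise fall back to DNS. Here is a minimal sketch of that address-selection step, assuming tokio is available; `candidate_addrs` and `compute.example` are illustrative names, not the proxy's actual API. Analogously, the new `server_name` field selects `SslMode::Require` when present and `SslMode::Disable` otherwise.

```rust
use std::net::{IpAddr, SocketAddr};

use tokio::net::lookup_host;

// If the control plane handed us an IP (`host_addr`), connect to it directly
// and skip DNS entirely; `host` is then only needed as the TLS server name.
// Otherwise resolve `host` as before.
async fn candidate_addrs(
    host_addr: Option<IpAddr>,
    host: &str,
    port: u16,
) -> std::io::Result<Vec<SocketAddr>> {
    match host_addr {
        Some(addr) => Ok(vec![SocketAddr::new(addr, port)]),
        None => Ok(lookup_host((host, port)).await?.collect()),
    }
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // With a pre-resolved address there is no DNS round trip at all.
    let addrs =
        candidate_addrs(Some("127.0.0.1".parse().unwrap()), "compute.example", 5432).await?;
    println!("{addrs:?}");
    Ok(())
}
```

The actual change threads `host_addr` through `connect_socket`, `ConnCfg::connect_raw`, and `connect_http2`, as shown in the diff that follows.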
--- .../proxy/tokio-postgres2/src/cancel_query.rs | 9 ++- libs/proxy/tokio-postgres2/src/client.rs | 2 + libs/proxy/tokio-postgres2/src/config.rs | 12 ++++ libs/proxy/tokio-postgres2/src/connect.rs | 8 ++- .../tokio-postgres2/src/connect_socket.rs | 12 +++- proxy/src/auth/backend/local.rs | 1 + proxy/src/compute.rs | 35 ++++++---- .../control_plane/client/cplane_proxy_v1.rs | 22 +++++- proxy/src/control_plane/client/mock.rs | 22 ++++-- proxy/src/control_plane/messages.rs | 6 ++ proxy/src/proxy/connect_compute.rs | 5 +- proxy/src/proxy/tests/mod.rs | 1 + proxy/src/serverless/backend.rs | 69 ++++++++++++++++--- proxy/src/serverless/conn_pool.rs | 1 + proxy/src/serverless/http_conn_pool.rs | 5 +- proxy/src/tls/client_config.rs | 40 +++++++++-- test_runner/fixtures/neon_fixtures.py | 2 + 17 files changed, 206 insertions(+), 46 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index b65fb571e6..0bdad0b554 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -34,8 +34,13 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - let socket = - connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?; + let socket = connect_socket::connect_socket( + config.host_addr, + &config.host, + config.port, + config.connect_timeout, + ) + .await?; cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 39b1db75da..c70cb598de 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::fmt; +use std::net::IpAddr; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; @@ -137,6 +138,7 @@ impl InnerClient { #[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { + pub host_addr: Option, pub host: Host, pub port: u16, pub connect_timeout: Option, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 4c25491b67..978d348741 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,5 +1,6 @@ //! Connection configuration. +use std::net::IpAddr; use std::time::Duration; use std::{fmt, str}; @@ -65,6 +66,7 @@ pub enum AuthKeys { /// Connection configuration. #[derive(Clone, PartialEq, Eq)] pub struct Config { + pub(crate) host_addr: Option, pub(crate) host: Host, pub(crate) port: u16, @@ -83,6 +85,7 @@ impl Config { /// Creates a new configuration. pub fn new(host: String, port: u16) -> Config { Config { + host_addr: None, host: Host::Tcp(host), port, password: None, @@ -163,6 +166,15 @@ impl Config { self } + pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config { + self.host_addr = Some(addr); + self + } + + pub fn get_host_addr(&self) -> Option { + self.host_addr + } + /// Sets the SSL configuration. /// /// Defaults to `prefer`. 
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index d2bd0dfbcd..7c3a358bba 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,5 @@ +use std::net::IpAddr; + use postgres_protocol2::message::backend::Message; use tokio::net::TcpStream; use tokio::sync::mpsc; @@ -25,13 +27,14 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - match connect_once(&config.host, config.port, tls, config).await { + match connect_once(config.host_addr, &config.host, config.port, tls, config).await { Ok((client, connection)) => Ok((client, connection)), Err(e) => Err(e), } } async fn connect_once( + host_addr: Option, host: &Host, port: u16, tls: T, @@ -40,7 +43,7 @@ async fn connect_once( where T: TlsConnect, { - let socket = connect_socket(host, port, config.connect_timeout).await?; + let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?; let RawConnection { stream, parameters, @@ -50,6 +53,7 @@ where } = connect_raw(socket, tls, config).await?; let socket_config = SocketConfig { + host_addr, host: host.clone(), port, connect_timeout: config.connect_timeout, diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 15411f7ef3..8c7d300451 100644 --- a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,5 +1,6 @@ use std::future::Future; use std::io; +use std::net::{IpAddr, SocketAddr}; use std::time::Duration; use tokio::net::{self, TcpStream}; @@ -9,15 +10,20 @@ use crate::Error; use crate::config::Host; pub(crate) async fn connect_socket( + host_addr: Option, host: &Host, port: u16, connect_timeout: Option, ) -> Result { match host { Host::Tcp(host) => { - let addrs = net::lookup_host((&**host, port)) - .await - .map_err(Error::connect)?; + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => net::lookup_host((&**host, port)) + .await + .map_err(Error::connect)? 
+ .collect(), + }; let mut last_err = None; diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 9c3a3772cd..7a6dceb194 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -35,6 +35,7 @@ impl LocalBackend { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), project_id: ProjectIdTag::get_interner().get_or_intern("local"), branch_id: BranchIdTag::get_interner().get_or_intern("local"), + compute_id: "local".into(), cold_start_info: ColdStartInfo::WarmCached, }, }, diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 5447a4a4c0..2560187608 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,3 +1,4 @@ +use std::fmt::Debug; use std::io; use std::net::SocketAddr; use std::time::Duration; @@ -10,7 +11,7 @@ use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; -use tokio::net::TcpStream; +use tokio::net::{TcpStream, lookup_host}; use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; @@ -180,21 +181,19 @@ impl ConnCfg { use postgres_client::config::Host; // wrap TcpStream::connect with timeout - let connect_with_timeout = |host, port| { - tokio::time::timeout(timeout, TcpStream::connect((host, port))).map( - move |res| match res { - Ok(tcpstream_connect_res) => tcpstream_connect_res, - Err(_) => Err(io::Error::new( - io::ErrorKind::TimedOut, - format!("exceeded connection timeout {timeout:?}"), - )), - }, - ) + let connect_with_timeout = |addrs| { + tokio::time::timeout(timeout, TcpStream::connect(addrs)).map(move |res| match res { + Ok(tcpstream_connect_res) => tcpstream_connect_res, + Err(_) => Err(io::Error::new( + io::ErrorKind::TimedOut, + format!("exceeded connection timeout {timeout:?}"), + )), + }) }; - let connect_once = |host, port| { - debug!("trying to connect to compute node at {host}:{port}"); - connect_with_timeout(host, port).and_then(|stream| async { + let connect_once = |addrs| { + debug!("trying to connect to compute node at {addrs:?}"); + connect_with_timeout(addrs).and_then(|stream| async { let socket_addr = stream.peer_addr()?; let socket = socket2::SockRef::from(&stream); // Disable Nagle's algorithm to not introduce latency between @@ -216,7 +215,12 @@ impl ConnCfg { Host::Tcp(host) => host.as_str(), }; - match connect_once(host, port).await { + let addrs = match self.0.get_host_addr() { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)).await?.collect(), + }; + + match connect_once(&*addrs).await { Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), Err(err) => { warn!("couldn't connect to compute node at {host}:{port}: {err}"); @@ -277,6 +281,7 @@ impl ConnCfg { } = connection; tracing::Span::current().record("pid", tracing::field::display(process_id)); + tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id)); let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index 977fcf4727..2765aaa462 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -1,5 +1,7 @@ //! Production console backend. 
+use std::net::IpAddr; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -274,11 +276,27 @@ impl NeonControlPlaneClient { Some(x) => x, }; + let host_addr = IpAddr::from_str(host).ok(); + + let ssl_mode = match &body.server_name { + Some(_) => SslMode::Require, + None => SslMode::Disable, + }; + let host_name = match body.server_name { + Some(host) => host, + None => host.to_owned(), + }; + // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(host.to_owned(), port); - config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host_name, port); + + if let Some(addr) = host_addr { + config.set_host_addr(addr); + } + + config.ssl_mode(ssl_mode); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 7da5464aa5..ee722e839e 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -1,5 +1,6 @@ //! Mock console backend which relies on a user-provided postgres instance. +use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; @@ -167,10 +168,22 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new( - self.endpoint.host_str().unwrap_or("localhost").to_owned(), - self.endpoint.port().unwrap_or(5432), - ); + let port = self.endpoint.port().unwrap_or(5432); + let mut config = match self.endpoint.host_str() { + None => { + let mut config = compute::ConnCfg::new("localhost".to_string(), port); + config.set_host_addr(IpAddr::V4(Ipv4Addr::LOCALHOST)); + config + } + Some(host) => { + let mut config = compute::ConnCfg::new(host.to_string(), port); + if let Ok(addr) = IpAddr::from_str(host) { + config.set_host_addr(addr); + } + config + } + }; + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { @@ -179,6 +192,7 @@ impl MockControlPlane { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8d6b2e96f5..ec4554eab5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -2,6 +2,7 @@ use std::fmt::{self, Display}; use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use crate::auth::IpPattern; use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; @@ -239,6 +240,7 @@ pub(crate) struct GetEndpointAccessControl { #[derive(Debug, Deserialize)] pub(crate) struct WakeCompute { pub(crate) address: Box, + pub(crate) server_name: Option, pub(crate) aux: MetricsAuxInfo, } @@ -312,6 +314,9 @@ pub(crate) struct MetricsAuxInfo { pub(crate) endpoint_id: EndpointIdInt, pub(crate) project_id: ProjectIdInt, pub(crate) branch_id: BranchIdInt, + // note: we don't use interned strings for compute IDs. + // they churn too quickly and we have no way to clean up interned strings. 
+ pub(crate) compute_id: SmolStr, #[serde(default)] pub(crate) cold_start_info: ColdStartInfo, } @@ -378,6 +383,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "compute_id": "compute", "cold_start_info": "unknown", }) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index b8b39fa121..e013fbbe2e 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -81,7 +81,10 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty + ))] async fn connect_once( &self, ctx: &RequestContext, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 171f539b1e..e0b7539538 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -555,6 +555,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 72029102e0..b55661cec8 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,4 +1,5 @@ use std::io; +use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use std::time::Duration; @@ -6,11 +7,15 @@ use async_trait::async_trait; use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; +use postgres_client::config::SslMode; use rand::rngs::OsRng; +use rustls::pki_types::{DnsName, ServerName}; use tokio::net::{TcpStream, lookup_host}; +use tokio_rustls::TlsConnector; use tracing::field::display; use tracing::{debug, info}; +use super::AsyncRW; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; @@ -190,7 +195,11 @@ impl PoolingBackend { // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_compute( &self, ctx: &RequestContext, @@ -229,7 +238,10 @@ impl PoolingBackend { } // Wake up the destination if needed - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_proxy( &self, ctx: &RequestContext, @@ -276,7 +288,10 @@ impl PoolingBackend { /// # Panics /// /// Panics if called with a non-local_proxy backend. 
- #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_postgres( &self, ctx: &RequestContext, @@ -552,6 +567,10 @@ impl ConnectMechanism for TokioMechanism { let (client, connection) = permit.release_result(res)?; tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); Ok(poll_client( self.pool.clone(), ctx, @@ -587,16 +606,28 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, config: &ComputeConfig, ) -> Result { + let host_addr = node_info.config.get_host_addr(); let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let tls = if node_info.config.get_ssl_mode() == SslMode::Disable { + None + } else { + Some(&config.tls) + }; + let port = node_info.config.get_port(); - let res = connect_http2(&host, port, config.timeout).await; + let res = connect_http2(host_addr, &host, port, config.timeout, tls).await; drop(pause); let (client, connection) = permit.release_result(res)?; + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); + Ok(poll_http2_client( self.pool.clone(), ctx, @@ -612,18 +643,22 @@ impl ConnectMechanism for HyperMechanism { } async fn connect_http2( + host_addr: Option, host: &str, port: u16, timeout: Duration, + tls: Option<&Arc>, ) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> { - // assumption: host is an ip address so this should not actually perform any requests. - // todo: add that assumption as a guarantee in the control-plane API. - let mut addrs = lookup_host((host, port)) - .await - .map_err(LocalProxyConnError::Io)?; - + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)) + .await + .map_err(LocalProxyConnError::Io)? + .collect(), + }; let mut last_err = None; + let mut addrs = addrs.into_iter(); let stream = loop { let Some(addr) = addrs.next() else { return Err(last_err.unwrap_or_else(|| { @@ -651,6 +686,20 @@ async fn connect_http2( } }; + let stream = if let Some(tls) = tls { + let host = DnsName::try_from(host) + .map_err(io::Error::other) + .map_err(LocalProxyConnError::Io)? 
+ .to_owned(); + let stream = TlsConnector::from(tls.clone()) + .connect(ServerName::DnsName(host), stream) + .await + .map_err(LocalProxyConnError::Io)?; + Box::pin(stream) as AsyncRW + } else { + Box::pin(stream) as AsyncRW + }; + let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) .timer(TokioTimer::new()) .keep_alive_interval(Duration::from_secs(20)) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 6a9089fc2a..516d474a11 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -221,6 +221,7 @@ mod tests { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, conn_id: uuid::Uuid::new_v4(), diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 338a79b4b3..bca2d4c165 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -6,9 +6,9 @@ use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use smol_str::ToSmolStr; -use tokio::net::TcpStream; use tracing::{Instrument, debug, error, info, info_span}; +use super::AsyncRW; use super::backend::HttpConnError; use super::conn_pool_lib::{ ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry, @@ -22,8 +22,7 @@ use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; -pub(crate) type Connect = - http2::Connection, hyper::body::Incoming, TokioExecutor>; +pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] pub(crate) struct ClientDataHttp(); diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs index a2d695aae1..ce873e678e 100644 --- a/proxy/src/tls/client_config.rs +++ b/proxy/src/tls/client_config.rs @@ -1,17 +1,49 @@ +use std::env; +use std::io::Cursor; +use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; +use anyhow::{Context, bail}; use rustls::crypto::ring; -pub(crate) fn load_certs() -> anyhow::Result> { +/// We use an internal certificate authority when establishing a TLS connection with compute. +fn load_internal_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { + let Some(ca_file) = env::var_os("NEON_INTERNAL_CA_FILE") else { + return Ok(()); + }; + let ca_file = PathBuf::from(ca_file); + + let ca = std::fs::read(&ca_file) + .with_context(|| format!("could not read CA from {}", ca_file.display()))?; + + for cert in rustls_pemfile::certs(&mut Cursor::new(&*ca)) { + store + .add(cert.context("could not parse internal CA certificate")?) + .context("could not parse internal CA certificate")?; + } + + Ok(()) +} + +/// For console redirect proxy, we need to establish a connection to compute via pg-sni-router. +/// pg-sni-router needs TLS and uses a Let's Encrypt signed certificate, so we +/// load certificates from our native store. 
+fn load_native_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { let der_certs = rustls_native_certs::load_native_certs(); if !der_certs.errors.is_empty() { bail!("could not parse certificates: {:?}", der_certs.errors); } - let mut store = rustls::RootCertStore::empty(); store.add_parsable_certificates(der_certs.certs); + + Ok(()) +} + +fn load_compute_certs() -> anyhow::Result> { + let mut store = rustls::RootCertStore::empty(); + load_native_certs(&mut store)?; + load_internal_certs(&mut store)?; Ok(Arc::new(store)) } @@ -22,7 +54,7 @@ pub fn compute_client_config_with_root_certs() -> anyhow::Result Date: Fri, 28 Feb 2025 15:49:09 +0000 Subject: [PATCH 04/61] storcon: soft disable SK heartbeats (#11041) ## Problem JWT tokens aren't in place, so all SK heartbeats fail. This is equivalent to a wait before applying the PS heartbeats and makes things more flaky. ## Summary of Changes Add a flag that skips loading SKs from the db on start-up and at runtime. --- control_plane/src/local_env.rs | 3 +++ control_plane/src/storage_controller.rs | 4 ++++ storage_controller/src/main.rs | 5 +++++ storage_controller/src/service.rs | 26 ++++++++++++++++--------- test_runner/fixtures/neon_fixtures.py | 7 +++++++ 5 files changed, 36 insertions(+), 9 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index f4026efbbf..da7d7e5469 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -165,6 +165,8 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub long_reconcile_threshold: Option, + + pub load_safekeepers: bool, } impl NeonStorageControllerConf { @@ -188,6 +190,7 @@ impl Default for NeonStorageControllerConf { max_secondary_lag_bytes: None, heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, long_reconcile_threshold: None, + load_safekeepers: true, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 16e12f4e02..77a9075aa7 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -537,6 +537,10 @@ impl StorageController { args.push("--start-as-candidate".to_string()); } + if self.config.load_safekeepers { + args.push("--load-safekeepers".to_string()); + } + if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 04dd3bb3f6..380ffeb9b7 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -138,6 +138,10 @@ struct Cli { // Flag to use https for requests to pageserver API. 
#[arg(long, default_value = "false")] use_https_pageserver_api: bool, + + /// Whether to load safekeeprs from the database and heartbeat them + #[arg(long, default_value = "false")] + load_safekeepers: bool, } enum StrictMode { @@ -350,6 +354,7 @@ async fn async_main() -> anyhow::Result<()> { start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, use_https_pageserver_api: args.use_https_pageserver_api, + load_safekeepers: args.load_safekeepers, }; // Validate that we can connect to the database diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9ba9504718..26ccfd5445 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -389,6 +389,8 @@ pub struct Config { pub long_reconcile_threshold: Duration, pub use_https_pageserver_api: bool, + + pub load_safekeepers: bool, } impl From for ApiError { @@ -1405,15 +1407,20 @@ impl Service { .set(nodes.len() as i64); tracing::info!("Loading safekeepers from database..."); - let safekeepers = persistence - .list_safekeepers() - .await? - .into_iter() - .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) - .collect::>(); + let safekeepers = if config.load_safekeepers { + persistence + .list_safekeepers() + .await? + .into_iter() + .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) + .collect::>() + } else { + tracing::info!("Skipping safekeeper loading"); + Default::default() + }; + let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); - tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; @@ -8054,7 +8061,8 @@ impl Service { ) -> Result<(), DatabaseError> { let node_id = NodeId(record.id as u64); self.persistence.safekeeper_upsert(record.clone()).await?; - { + + if self.config.load_safekeepers { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); match safekeepers.entry(node_id) { @@ -8086,7 +8094,7 @@ impl Service { .await?; let node_id = NodeId(id as u64); // After the change has been persisted successfully, update the in-memory state - { + if self.config.load_safekeepers { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); let sk = safekeepers diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 11fbe30767..6001003e53 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1128,6 +1128,13 @@ class NeonEnv: if self.storage_controller_config is not None: cfg["storage_controller"] = self.storage_controller_config + # Disable new storcon flag in compat tests + if config.test_may_use_compatibility_snapshot_binaries: + if "storage_controller" in cfg: + cfg["storage_controller"]["load_safekeepers"] = False + else: + cfg["storage_controller"] = {"load_safekeepers": False} + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" From f79ee0bb8840be01693c1344b26cc1134e5d82a1 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Fri, 28 Feb 2025 10:49:15 -0500 Subject: [PATCH 05/61] fix(storcon): loop in chaos injection (#11004) ## Problem Somehow the previous patch loses the loop in the chaos injector function so everything will only run once. https://github.com/neondatabase/neon/pull/10934 ## Summary of changes Add back the loop. Signed-off-by: Alex Chi Z --- .../src/service/chaos_injector.rs | 71 ++++++++++--------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 2ff68d7037..a0419e0205 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -46,48 +46,51 @@ impl ChaosInjector { } } + fn get_cron_interval_sleep_future(&self) -> Option { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } + } + } else { + None + } + } + pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - let cron_interval = { - if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { - match cron_to_next_duration(chaos_exit_crontab) { - Ok(interval_exit) => Some(interval_exit), - Err(e) => { - tracing::error!("Error processing the cron schedule: {e}"); - None - } - } - } else { - None - } - }; + #[derive(Debug)] enum ChaosEvent { ShuffleTenant, ForceKill, } - let chaos_type = tokio::select! { - _ = interval.tick() => { - ChaosEvent::ShuffleTenant - } - Some(_) = maybe_sleep(cron_interval) => { - ChaosEvent::ForceKill - } - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; - } - }; - - match chaos_type { - ChaosEvent::ShuffleTenant => { - self.inject_chaos().await; - } - ChaosEvent::ForceKill => { - self.force_kill().await; + loop { + let cron_interval = self.get_cron_interval_sleep_future(); + let chaos_type = tokio::select! { + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + tracing::info!("Chaos iteration: {chaos_type:?}..."); + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } } } - - tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be From d857f63e3b1be9b1b70ef3f8c64cb088d126743c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 28 Feb 2025 18:00:22 +0100 Subject: [PATCH 06/61] pageserver: fix race that can wedge background tasks (#11047) ## Problem `wait_for_active_tenant()`, used when starting background tasks, has a race condition that can cause it to wait forever (until cancelled). It first checks the current tenant state, and then subscribes for state updates, but if the state changes between these then it won't be notified about it. We've seen this wedge compaction tasks, which can cause unbounded layer file buildup and read amplification. ## Summary of changes Use `watch::Receiver::wait_for()` to check both the current and new tenant states. 
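The bug is the classic check-then-subscribe gap: reading the current state and only then calling `subscribe_for_state_updates()` leaves a window in which a transition to `Active` is never observed. `tokio::sync::watch::Receiver::wait_for` closes that gap because it evaluates the predicate against the current value and every later update under a single subscription. A minimal, self-contained sketch of the pattern (toy `TenantState` enum; tokio with the `sync`, `rt`, and `macros` features assumed):

```rust
use tokio::sync::watch;

#[derive(Debug, PartialEq)]
enum TenantState {
    Activating,
    Active,
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(TenantState::Activating);

    // The state may flip at any point, including between a manual check of
    // the current value and a later subscription; that window is the race
    // the patch removes.
    tokio::spawn(async move {
        tx.send(TenantState::Active).unwrap();
    });

    // `wait_for` inspects the current value and every subsequent update
    // under one subscription, so the transition cannot be missed.
    let observed = rx.wait_for(|s| *s == TenantState::Active).await.unwrap();
    println!("tenant is {:?}", *observed);
}
```

The fix itself has the same shape: `update_rx.wait_for(|s| s == &TenantState::Active)` raced against the cancellation token in a `tokio::select!`.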
--- pageserver/src/tenant/tasks.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index c90f81889b..589ac5ae88 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -473,21 +473,15 @@ async fn wait_for_active_tenant( } let mut update_rx = tenant.subscribe_for_state_updates(); - loop { - tokio::select! { - _ = cancel.cancelled() => return ControlFlow::Break(()), - result = update_rx.changed() => if result.is_err() { + tokio::select! { + result = update_rx.wait_for(|s| s == &TenantState::Active) => { + if result.is_err() { return ControlFlow::Break(()); } - } - - match &*update_rx.borrow() { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => debug!("Not running the task loop, tenant is not active: {state:?}"), - } + debug!("Tenant state changed to active, continuing the task loop"); + ControlFlow::Continue(()) + }, + _ = cancel.cancelled() => ControlFlow::Break(()), } } From 56033189c10dc93fa0098f5ae77a951b481edc15 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Fri, 28 Feb 2025 19:58:42 +0200 Subject: [PATCH 07/61] feat(proxy): Log latency after connect to compute (#11048) ## Problem To measure latency accurate we should associate the testodrome role within a latency data ## Summary of changes Add latency logging to associate different roles within a latency. Relates to the #22486 --- proxy/src/compute.rs | 5 +++-- proxy/src/context/mod.rs | 11 ++++++++++- proxy/src/metrics.rs | 28 +++++++++++++++++++++------- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 2560187608..dfa6015b10 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -287,8 +287,9 @@ impl ConnCfg { // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( cold_start_info = ctx.cold_start_info().as_str(), - "connected to compute node at {host} ({socket_addr}) sslmode={:?}", - self.0.get_ssl_mode() + "connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}", + self.0.get_ssl_mode(), + ctx.get_proxy_latency(), ); // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. 
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index f87f4e9ef8..e10a04b4f1 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -17,7 +17,8 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::error::ErrorKind; use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ - ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, + ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol, + Waiting, }; use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; @@ -346,6 +347,14 @@ impl RequestContext { } } + pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .accumulated() + } + pub(crate) fn success(&self) { self.0 .try_lock() diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db1f096de1..b6a2a059ea 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -394,21 +394,31 @@ pub enum RedisMsgKind { HDel, } -#[derive(Default)] -struct Accumulated { +#[derive(Default, Clone)] +pub struct LatencyAccumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, retry: time::Duration, } +impl std::fmt::Display for LatencyAccumulated { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "client: {:?}, cplane: {:?}, compute: {:?}, retry: {:?}", + self.client, self.cplane, self.compute, self.retry + ) + } +} + pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, // time since the stopwatch was stopped stop: Option, // accumulated time on the stopwatch - accumulated: Accumulated, + accumulated: LatencyAccumulated, // label data protocol: Protocol, cold_start_info: ColdStartInfo, @@ -422,7 +432,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -435,7 +445,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -465,6 +475,10 @@ impl LatencyTimer { // success self.outcome = ConnectOutcome::Success; } + + pub fn accumulated(&self) -> LatencyAccumulated { + self.accumulated.clone() + } } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] @@ -511,7 +525,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue communication from the accumulated time. + // Exclude client, cplane, compute communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; metric.observe( @@ -524,7 +538,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue, retry communication from the accumulated time. + // Exclude client, cplane, compute, retry communication from the accumulated time. 
let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute From ee0c8ca8fd6a9abd5ad7bd18a8a240286f47e2f6 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Fri, 28 Feb 2025 16:07:21 -0500 Subject: [PATCH 08/61] Add -fsigned-char for cross platform signed chars (#10852) ## Problem For multi-character keys, the GIN index creates a CRC hash of the first 3 bytes of the key. The first bit of that hash can be set or unset, so a consistent representation of `char` across architectures is needed for consistent results. GIN stores these keys by their hashes, which determines the order in which the keys are obtained from the GIN index. By default, chars are signed on x86 and unsigned on ARM, leading to inconsistent behavior across platform architectures. Adding the `-fsigned-char` flag to the GCC compiler forces chars to be treated as signed on all platforms, ensuring that the order in which the keys are obtained is consistent. ## Summary of changes Added `-fsigned-char` to `CFLAGS` to force GCC to use signed chars on all platforms, and added a test to check the resulting ordering across platforms. Fixes: https://github.com/neondatabase/cloud/issues/23199 --- Makefile | 7 +- compute/compute-node.Dockerfile | 2 +- test_runner/regress/data/test_signed_char.out | 1 + test_runner/regress/test_signed_char.py | 64 +++++++++++++++++++ 4 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 test_runner/regress/data/test_signed_char.out create mode 100644 test_runner/regress/test_signed_char.py diff --git a/Makefile b/Makefile index 42ee643bb5..0911465fb8 100644 --- a/Makefile +++ b/Makefile @@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu # BUILD_TYPE ?= debug WITH_SANITIZERS ?= no +PG_CFLAGS = -fsigned-char ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl - PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend - PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) @@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + +@echo "Compiling pg_trgm $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install +@echo "Compiling amcheck $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install +@echo "Compiling test_decoding $*" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0cdb44853f..c3aecfbdc5 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -162,7 +162,7 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd postgres && \ - export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ + export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 diff --git a/test_runner/regress/data/test_signed_char.out
b/test_runner/regress/data/test_signed_char.out new file mode 100644 index 0000000000..a68876e383 --- /dev/null +++ b/test_runner/regress/data/test_signed_char.out @@ -0,0 +1 @@ +0000000094010815f81f042000000000b89f8000909f5000689f5000489f4000309f3000189f3000009f3000e89e3000d09e3000b89e3000a09e3000889e3000709e3000309e8000189e3000009e3000e89d3000d09d3000b89d3000a09d3000889d3000709d3000589d3000409d3000289d3000109d3000f89c3000e09c3000c89c3000b09c3000989c3000809c3000689c3000509c3000389c3000209c3000089c3000f09b3000d89b3000c09b3000a89b3000909b3000789b3000609b3000489b3000309b3000189b3000009b3000e89a3000d09a3000b89a3000a09a3000889a3000489a8000309a3000189a3000009a3000e8993000d0993000b8993000a09930008899300070993000589930004099300000998000e8983000d0983000b8983000a0983000889830007098300058983000409830002898300010983000f8973000b8978000a09730008897300070973000589730004097300028973000e8968000a89680006896800028968000e8958000a895800090953000509580003895300020953000089530000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000801000010018004c198900000000000000000029000000008010000100180049787f000000000000000000290000000080100001001800727c7000000000000000000029000000008010002800400020766200000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100001001800207262000000000000000000290000000080100028004000766239000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040006239380000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010002800400039383700000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000383736000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003736350000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010002800400036353400000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100001001800203034000000000000000000280000000080100001001800203933000000000000000000270000000080100001001800203833000000000000000000260000000080100001001800203733000000000000000000250000000080100001001800203633000000000000000000240000000080100001001800203533000000000000000000230000000080100028004000353433000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002034330000000000000000002200000000801000010018002033330000000000000000002100000000801000010018002032330000000000000000002000000000801000010018002031330000000000000000001f00000000801000010018002030330000000000000000001e00000000801000010018002039320000000000000000001d00000000801000010018002038320000000000000000001c00000000801000010018002037320000000000000000001b00000000801000010018002036320000000000000000001a0000000080100001001800203532000000000000000000190000000080100001001800203432000000000000000000180000000080100028004000343332000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002033320000000000000000001700000000801000010018002032320000000000000000001600000000801000010018002031320000000000000000001500000000801000010018002030320000000000000000001400000000801000010018002039310000000000000000001300000000801000010018002038310000000000000000001200000000801000010018002037310000000000000000001100000000801000010018002036310000000000000000001000000000801000010018002035310000000000000000000f00000000801000010018002034310000000000000000000e00000000801000010018002033310000000000000000000d0000000080100028004000333231000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002032310000000000000000000c00000000801000010018002031310000000000000000000b00000000801000010018002030310000000000000000000a00000000801000010018002039200000000000000000000900000000801000010018002038200000000000000000000800000000801000010018002037200000000000000000000700000000801000010018002036200000000000000000000600000000801000010018002035200000000000000000000500000000801000010018003034200000000000000000002800000000801000010018002034200000000000000000000
400000000801000010018003933200000000000000000002700000000801000010018003833200000000000000000002600000000801000010018003733200000000000000000002500000000801000010018003633200000000000000000002400000000801000010018003533200000000000000000002300000000801000010018003433200000000000000000002200000000801000010018003333200000000000000000002100000000801000010018003233200000000000000000002000000000801000010018003133200000000000000000001f00000000801000010018003033200000000000000000001e00000000801000010018002033200000000000000000000300000000801000010018003932200000000000000000001d00000000801000010018003832200000000000000000001c00000000801000010018003732200000000000000000001b00000000801000010018003632200000000000000000001a00000000801000010018003532200000000000000000001900000000801000010018003432200000000000000000001800000000801000010018003332200000000000000000001700000000801000010018003232200000000000000000001600000000801000010018003132200000000000000000001500000000801000010018003032200000000000000000001400000000801000010018002032200000000000000000000200000000801000010018003931200000000000000000001300000000801000010018003831200000000000000000001200000000801000010018003731200000000000000000001100000000801000010018003631200000000000000000001000000000801000010018003531200000000000000000000f00000000801000010018003431200000000000000000000e00000000801000010018003331200000000000000000000d0000000080100028004000323120000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018003131200000000000000000000b00000000801000010018003031200000000000000000000a0000000080100001001800203120000000000000000000010000000080100001001800622020000000000000000000290000000080100001001800392020000000000000000000090000000080100001001800382020000000000000000000080000000080100001001800372020000000000000000000070000000080100001001800362020000000000000000000060000000080100001001800352020000000000000000000050000000080100002002000342020000000000000000000040001002400000000000000008010000b00280033202000000000000000000003000a001b010101010101010101000000000000008010000b00280032202000000000000000000002000a001201010101010101010100000000000000801000280040003120200000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100ffffffff00000200 \ No newline at end of file diff --git a/test_runner/regress/test_signed_char.py b/test_runner/regress/test_signed_char.py new file mode 100644 index 0000000000..8752a1ff3f --- /dev/null +++ b/test_runner/regress/test_signed_char.py @@ -0,0 +1,64 @@ +from pathlib import Path + +from fixtures.neon_fixtures import NeonEnv + +SIGNED_CHAR_EXTRACT = """ + WITH + -- Generates an intermediate table with block numbers of the index + pagenumbers AS ( + SELECT num FROM generate_series(0, (pg_relation_size('test_payload_idx') / 8192) - 1) it(num) + ) + SELECT num, + -- Gets the data of the page, skipping the first 8 bytes which is the LSN + substr(page, 9, 8192-8), + -- Returns information about the GIN index opaque area + (gin_page_opaque_info(page)).* + FROM pagenumbers, + -- Gets a page from the respective blocks of the table + LATERAL (SELECT get_raw_page('test_payload_idx', num)) AS p(page) + -- Filters to only return leaf pages from the GIN Index + WHERE ARRAY['leaf'] = ((gin_page_opaque_info(page)).flags); + """ + + +def test_signed_char(neon_simple_env: NeonEnv): + """ + Test that postgres was compiled with -fsigned-char. 
+ --- + In multi-character keys, the GIN index creates a CRC Hash of the first 3 bytes of the key. + The hash can have the first bit to be set or unset, needing to have a consistent representation + of char across architectures for consistent results. GIN stores these keys by their hashes + which determines the order in which the keys are obtained from the GIN index. + Using -fsigned-char enforces this order across platforms making this consistent. + The following query gets all the data present in the leaf page of a GIN index, + which is ordered by the CRC hash and is consistent across platforms. + """ + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + with endpoint.connect().cursor() as ses1: + # Add the required extensions + ses1.execute("CREATE EXTENSION pg_trgm;") + ses1.execute("CREATE EXTENSION pageinspect;") + # Create a test table + ses1.execute("CREATE TABLE test (payload text);") + # Create a GIN based index + ses1.execute( + "CREATE INDEX test_payload_idx ON test USING gin (payload gin_trgm_ops) WITH (gin_pending_list_limit = 64);" + ) + # insert a multibyte character to trigger order-dependent hashing + ses1.execute( + "INSERT INTO test SELECT '123456789BV' || CHR(127153) /* ace of spades, a multibyte character */ || i::text from generate_series(1, 40) as i(i);" + ) + ses1.execute("INSERT INTO test SELECT 'Bóbr';") + # Clean pending list to flush data to pages + ses1.execute("select gin_clean_pending_list('test_payload_idx'::regclass);") + ses1.execute(SIGNED_CHAR_EXTRACT) + pages = ses1.fetchall() + # Compare expected output + page1 = pages[0] + data = bytes(page1[1]).hex() + with open(Path(__file__).parent / "data" / "test_signed_char.out", encoding="utf-8") as f: + expected = f.read().rstrip() + + assert data == expected From 066324d6ec07e499256a431abcd8a87078d7f7cb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 1 Mar 2025 00:48:05 +0200 Subject: [PATCH 09/61] compute_ctl: Rearrange startup code (#11007) Move most of the code to compute.rs, so that all the major startup steps are visible in one place. You can now get a pretty good picture of what happens in the latency-critical path at compute startup by reading ComputeNode::start_compute(). This also clarifies the error handling in start_compute. Previously, the start_postgres function sometimes returned an Err, and sometimes Ok but with the compute status already set to Failed. Now the start_compute function always returns Err on failure, and it's the caller's responsibility to change the compute status to Failed. Separately from that, it returns a handle to the Postgres process via a `&mut` reference if it had already started Postgres (i.e. on success, or if the failure happens after launching the Postgres process). --------- Co-authored-by: Alexey Kondratov --- compute_tools/src/bin/compute_ctl.rs | 465 +---------- compute_tools/src/catalog.rs | 6 +- compute_tools/src/compute.rs | 770 +++++++++++++----- compute_tools/src/http/routes/configure.rs | 2 +- .../src/http/routes/extension_server.rs | 10 +- compute_tools/src/logger.rs | 49 ++ compute_tools/src/monitor.rs | 2 +- 7 files changed, 667 insertions(+), 637 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 6dae1a2753..08966a6efb 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -33,39 +33,27 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! 
``` -use std::collections::HashMap; use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; -use std::str::FromStr; -use std::sync::atomic::Ordering; -use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc}; +use std::sync::mpsc; use std::thread; use std::time::Duration; use anyhow::{Context, Result}; -use chrono::Utc; use clap::Parser; -use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::ComputeSpec; -use compute_tools::compute::{ - ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal, -}; -use compute_tools::configurator::launch_configurator; -use compute_tools::disk_quota::set_disk_quota; +use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal}; use compute_tools::extension_server::get_pg_version_string; -use compute_tools::http::server::Server; use compute_tools::logger::*; -use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; -use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; -use compute_tools::swap::resize_swap; use rlimit::{Resource, setrlimit}; use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM}; use signal_hook::iterator::Signals; -use tracing::{error, info, warn}; +use tracing::{error, info}; use url::Url; use utils::failpoint_support; @@ -164,29 +152,40 @@ fn main() -> Result<()> { // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; - let (pg_handle, start_pg_result) = { - // Enter startup tracing context - let _startup_context_guard = startup_context_from_env(); + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let cli_spec = try_spec_from_cli(&cli)?; + let cli_spec = try_spec_from_cli(&cli)?; - let compute = wait_spec(build_tag, &cli, cli_spec)?; + let compute_node = ComputeNode::new( + ComputeNodeParams { + compute_id: cli.compute_id, + connstr, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port, + ext_remote_storage: cli.remote_ext_config.clone(), + resize_swap_on_bind: cli.resize_swap_on_bind, + set_disk_quota_for_fs: cli.set_disk_quota_for_fs, + #[cfg(target_os = "linux")] + filecache_connstr: cli.filecache_connstr, + #[cfg(target_os = "linux")] + cgroup: cli.cgroup, + #[cfg(target_os = "linux")] + vm_monitor_addr: cli.vm_monitor_addr, + build_tag, - start_postgres(&cli, compute)? + live_config_allowed: cli_spec.live_config_allowed, + }, + cli_spec.spec, + )?; - // Startup is finished, exit the startup tracing span - }; - - // PostgreSQL is now running, if startup was successful. Wait until it exits. - let wait_pg_result = wait_postgres(pg_handle)?; - - let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; - - maybe_delay_exit(delay_exit); + let exit_code = compute_node.run()?; scenario.teardown(); - deinit_and_exit(wait_pg_result); + deinit_and_exit(exit_code); } async fn init() -> Result { @@ -207,56 +206,6 @@ async fn init() -> Result { Ok(build_tag) } -fn startup_context_from_env() -> Option { - // Extract OpenTelemetry context for the startup actions from the - // TRACEPARENT and TRACESTATE env variables, and attach it to the current - // tracing context. - // - // This is used to propagate the context for the 'start_compute' operation - // from the neon control plane. 
This allows linking together the wider - // 'start_compute' operation that creates the compute container, with the - // startup actions here within the container. - // - // There is no standard for passing context in env variables, but a lot of - // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See - // https://github.com/open-telemetry/opentelemetry-specification/issues/740 - // - // Switch to the startup context here, and exit it once the startup has - // completed and Postgres is up and running. - // - // If this pod is pre-created without binding it to any particular endpoint - // yet, this isn't the right place to enter the startup context. In that - // case, the control plane should pass the tracing context as part of the - // /configure API call. - // - // NOTE: This is supposed to only cover the *startup* actions. Once - // postgres is configured and up-and-running, we exit this span. Any other - // actions that are performed on incoming HTTP requests, for example, are - // performed in separate spans. - // - // XXX: If the pod is restarted, we perform the startup actions in the same - // context as the original startup actions, which probably doesn't make - // sense. - let mut startup_tracing_carrier: HashMap = HashMap::new(); - if let Ok(val) = std::env::var("TRACEPARENT") { - startup_tracing_carrier.insert("traceparent".to_string(), val); - } - if let Ok(val) = std::env::var("TRACESTATE") { - startup_tracing_carrier.insert("tracestate".to_string(), val); - } - if !startup_tracing_carrier.is_empty() { - use opentelemetry::propagation::TextMapPropagator; - use opentelemetry_sdk::propagation::TraceContextPropagator; - let guard = TraceContextPropagator::new() - .extract(&startup_tracing_carrier) - .attach(); - info!("startup tracing context attached"); - Some(guard) - } else { - None - } -} - fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument if let Some(ref spec_json) = cli.spec_json { @@ -307,357 +256,7 @@ struct CliSpecParams { live_config_allowed: bool, } -fn wait_spec( - build_tag: String, - cli: &Cli, - CliSpecParams { - spec, - live_config_allowed, - compute_ctl_config: _, - }: CliSpecParams, -) -> Result> { - let mut new_state = ComputeState::new(); - let spec_set; - - if let Some(spec) = spec { - let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; - info!("new pspec.spec: {:?}", pspec.spec); - new_state.pspec = Some(pspec); - spec_set = true; - } else { - spec_set = false; - } - let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let conn_conf = postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build tokio postgres config from connstr")?; - let compute_node = ComputeNode { - compute_id: cli.compute_id.clone(), - connstr, - conn_conf, - tokio_conn_conf, - pgdata: cli.pgdata.clone(), - pgbin: cli.pgbin.clone(), - pgversion: get_pg_version_string(&cli.pgbin), - external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port, - live_config_allowed, - state: Mutex::new(new_state), - state_changed: Condvar::new(), - ext_remote_storage: cli.remote_ext_config.clone(), - ext_download_progress: RwLock::new(HashMap::new()), - build_tag, - }; - let compute = Arc::new(compute_node); - - // If this is a pooled VM, prewarm before starting HTTP server and becoming - // available 
for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have its memory allocated from the host, and - // the necessary binaries will already be cached. - if !spec_set { - compute.prewarm_postgres()?; - } - - // Launch the external HTTP server first, so that we can serve control plane - // requests while configuration is still in progress. - Server::External(cli.external_http_port).launch(&compute); - - // The internal HTTP server could be launched later, but there isn't much - // sense in waiting. - Server::Internal(cli.internal_http_port).launch(&compute); - - if !spec_set { - // No spec provided, hang waiting for it. - info!("no compute spec provided, waiting"); - - let mut state = compute.state.lock().unwrap(); - while state.status != ComputeStatus::ConfigurationPending { - state = compute.state_changed.wait(state).unwrap(); - - if state.status == ComputeStatus::ConfigurationPending { - info!("got spec, continue configuration"); - // Spec is already set by the http server handler. - break; - } - } - - // Record for how long we slept waiting for the spec. - let now = Utc::now(); - state.metrics.wait_for_spec_ms = now - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - - // Reset start time, so that the total startup time that is calculated later will - // not include the time that we waited for the spec. - state.start_time = now; - } - - launch_lsn_lease_bg_task_for_static(&compute); - - Ok(compute) -} - -fn start_postgres( - cli: &Cli, - compute: Arc, -) -> Result<(Option, StartPostgresResult)> { - // We got all we need, update the state. - let mut state = compute.state.lock().unwrap(); - - // Create a tracing span for the startup operation. - // - // We could otherwise just annotate the function with #[instrument], but if - // we're being configured from a /configure HTTP request, we want the - // startup to be considered part of the /configure request. - let _this_entered = { - // Temporarily enter the /configure request's span, so that the new span - // becomes its child. - let _parent_entered = state.startup_span.take().map(|p| p.entered()); - - tracing::info_span!("start_postgres") - } - .entered(); - - state.set_status(ComputeStatus::Init, &compute.state_changed); - - info!( - "running compute with features: {:?}", - state.pspec.as_ref().unwrap().spec.features - ); - // before we release the mutex, fetch some parameters for later. - let &ComputeSpec { - swap_size_bytes, - disk_quota_bytes, - #[cfg(target_os = "linux")] - disable_lfc_resizing, - .. - } = &state.pspec.as_ref().unwrap().spec; - drop(state); - - // Launch remaining service threads - let _monitor_handle = launch_monitor(&compute); - let _configurator_handle = launch_configurator(&compute); - - let mut prestartup_failed = false; - let mut delay_exit = false; - - // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { - // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion - // *before* starting postgres. - // - // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this - // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets - // OOM-killed during startup because swap wasn't available yet. - match resize_swap(size_bytes) { - Ok(()) => { - let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. 
- info!(%size_bytes, %size_mib, "resized swap"); - } - Err(err) => { - let err = err.context("failed to resize swap"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. - prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Set disk quota if the compute spec says so - if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) - { - match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { - Ok(()) => { - let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%disk_quota_bytes, %size_mib, "set disk quota"); - } - Err(err) => { - let err = err.context("failed to set disk quota"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. - prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Start Postgres - let mut pg = None; - if !prestartup_failed { - pg = match compute.start_compute() { - Ok(pg) => { - info!(postmaster_pid = %pg.0.id(), "Postgres was started"); - Some(pg) - } - Err(err) => { - error!("could not start the compute node: {:#}", err); - compute.set_failed_status(err); - delay_exit = true; - None - } - }; - } else { - warn!("skipping postgres startup because pre-startup step failed"); - } - - // Start the vm-monitor if directed to. The vm-monitor only runs on linux - // because it requires cgroups. - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - use std::env; - use tokio_util::sync::CancellationToken; - - // This token is used internally by the monitor to clean up all threads - let token = CancellationToken::new(); - - // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC - let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { - None - } else { - Some(cli.filecache_connstr.clone()) - }; - - let vm_monitor = if env::var_os("AUTOSCALING").is_some() { - let vm_monitor = tokio::spawn(vm_monitor::start( - Box::leak(Box::new(vm_monitor::Args { - cgroup: Some(cli.cgroup.clone()), - pgconnstr, - addr: cli.vm_monitor_addr.clone(), - })), - token.clone(), - )); - Some(vm_monitor) - } else { - None - }; - } - } - - Ok(( - pg, - StartPostgresResult { - delay_exit, - compute, - #[cfg(target_os = "linux")] - token, - #[cfg(target_os = "linux")] - vm_monitor, - }, - )) -} - -type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); - -struct StartPostgresResult { - delay_exit: bool, - // passed through from WaitSpecResult - compute: Arc, - - #[cfg(target_os = "linux")] - token: tokio_util::sync::CancellationToken, - #[cfg(target_os = "linux")] - vm_monitor: Option>>, -} - -fn wait_postgres(pg: Option) -> Result { - // Wait for the child Postgres process forever. In this state Ctrl+C will - // propagate to Postgres and it will be shut down as well. - let mut exit_code = None; - if let Some((mut pg, logs_handle)) = pg { - info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit"); - - let ecode = pg - .wait() - .expect("failed to start waiting on Postgres process"); - PG_PID.store(0, Ordering::SeqCst); - - // Process has exited. Wait for the log collecting task to finish. 
- let _ = tokio::runtime::Handle::current() - .block_on(logs_handle) - .map_err(|e| tracing::error!("log task panicked: {:?}", e)); - - info!("Postgres exited with code {}, shutting down", ecode); - exit_code = ecode.code() - } - - Ok(WaitPostgresResult { exit_code }) -} - -struct WaitPostgresResult { - exit_code: Option, -} - -fn cleanup_after_postgres_exit( - StartPostgresResult { - mut delay_exit, - compute, - #[cfg(target_os = "linux")] - vm_monitor, - #[cfg(target_os = "linux")] - token, - }: StartPostgresResult, -) -> Result { - // Terminate the vm_monitor so it releases the file watcher on - // /sys/fs/cgroup/neon-postgres. - // Note: the vm-monitor only runs on linux because it requires cgroups. - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - if let Some(handle) = vm_monitor { - // Kills all threads spawned by the monitor - token.cancel(); - // Kills the actual task running the monitor - handle.abort(); - } - } - } - - // Maybe sync safekeepers again, to speed up next startup - let compute_state = compute.state.lock().unwrap().clone(); - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { - info!("syncing safekeepers on shutdown"); - let storage_auth_token = pspec.storage_auth_token.clone(); - let lsn = compute.sync_safekeepers(storage_auth_token)?; - info!("synced safekeepers at lsn {lsn}"); - } - - let mut state = compute.state.lock().unwrap(); - if state.status == ComputeStatus::TerminationPending { - state.status = ComputeStatus::Terminated; - compute.state_changed.notify_all(); - // we were asked to terminate gracefully, don't exit to avoid restart - delay_exit = true - } - drop(state); - - if let Err(err) = compute.check_for_core_dumps() { - error!("error while checking for core dumps: {err:?}"); - } - - Ok(delay_exit) -} - -fn maybe_delay_exit(delay_exit: bool) { - // If launch failed, keep serving HTTP requests for a while, so the cloud - // control plane can get the actual error. - if delay_exit { - info!("giving control plane 30s to collect the error before shutdown"); - thread::sleep(Duration::from_secs(30)); - } -} - -fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { +fn deinit_and_exit(exit_code: Option) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may // hang for quite some time, see, for example: diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 2a7f56e6fc..db3e07e086 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -58,14 +58,14 @@ pub async fn get_database_schema( compute: &Arc, dbname: &str, ) -> Result> + use<>, SchemaDumpError> { - let pgbin = &compute.pgbin; + let pgbin = &compute.params.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); // Replace the DB in the connection string and disable it to parts. // This is the only option to handle DBs with special characters. 
- let conf = - postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?; + let conf = postgres_conf_for_db(&compute.params.connstr, dbname) + .map_err(|_| SchemaDumpError::Unexpected)?; let host = conf .get_hosts() .first() diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c0e28790d6..9e065e84a4 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -28,28 +28,53 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; +use crate::configurator::launch_configurator; +use crate::disk_quota::set_disk_quota; use crate::installed_extensions::get_installed_extensions; +use crate::logger::startup_context_from_env; +use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; +use crate::monitor::launch_monitor; use crate::pg_helpers::*; use crate::spec::*; +use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); -/// Compute node info shared across several `compute_ctl` threads. -pub struct ComputeNode { +/// Static configuration params that don't change after startup. These mostly +/// come from the CLI args, or are derived from them. +pub struct ComputeNodeParams { /// The ID of the compute pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, - // We connect to Postgres from many different places, so build configs once - // and reuse them where needed. - pub conn_conf: postgres::config::Config, - pub tokio_conn_conf: tokio_postgres::config::Config, + + pub resize_swap_on_bind: bool, + pub set_disk_quota_for_fs: Option, + + // VM monitor parameters + #[cfg(target_os = "linux")] + pub filecache_connstr: String, + #[cfg(target_os = "linux")] + pub cgroup: String, + #[cfg(target_os = "linux")] + pub vm_monitor_addr: String, + pub pgdata: String, pub pgbin: String, pub pgversion: String, + pub build_tag: String, + + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, + + /// the address of extension storage proxy gateway + pub ext_remote_storage: Option, + /// We should only allow live re- / configuration of the compute node if /// it uses 'pull model', i.e. it can go to control-plane and fetch /// the latest configuration. Otherwise, there could be a case: @@ -63,10 +88,17 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's external HTTP server listens on - pub external_http_port: u16, - /// The port that the compute's internal HTTP server listens on - pub internal_http_port: u16, +} + +/// Compute node info shared across several `compute_ctl` threads. +pub struct ComputeNode { + pub params: ComputeNodeParams, + + // We connect to Postgres from many different places, so build configs once + // and reuse them where needed. These are derived from 'params.connstr' + pub conn_conf: postgres::config::Config, + pub tokio_conn_conf: tokio_postgres::config::Config, + /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. 
/// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -74,11 +106,9 @@ pub struct ComputeNode { pub state: Mutex, /// `Condvar` to allow notifying waiters about state changes. pub state_changed: Condvar, - /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, - pub build_tag: String, } // store some metrics about download size that might impact startup time @@ -242,6 +272,25 @@ fn maybe_cgexec(cmd: &str) -> Command { } } +struct PostgresHandle { + postgres: std::process::Child, + log_collector: tokio::task::JoinHandle>, +} + +impl PostgresHandle { + /// Return PID of the postgres (postmaster) process + fn pid(&self) -> Pid { + Pid::from_raw(self.postgres.id() as i32) + } +} + +struct StartVmMonitorResult { + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, +} + pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { let roles = spec .cluster @@ -316,6 +365,421 @@ pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { } impl ComputeNode { + pub fn new(params: ComputeNodeParams, cli_spec: Option) -> Result { + let connstr = params.connstr.as_str(); + let conn_conf = postgres::config::Config::from_str(connstr) + .context("cannot build postgres config from connstr")?; + let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + .context("cannot build tokio postgres config from connstr")?; + + let mut new_state = ComputeState::new(); + if let Some(cli_spec) = cli_spec { + let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; + new_state.pspec = Some(pspec); + } + + Ok(ComputeNode { + params, + conn_conf, + tokio_conn_conf, + state: Mutex::new(new_state), + state_changed: Condvar::new(), + ext_download_progress: RwLock::new(HashMap::new()), + }) + } + + /// Top-level control flow of compute_ctl. Returns a process exit code we should + /// exit with. + pub fn run(self) -> Result> { + let this = Arc::new(self); + + let cli_spec = this.state.lock().unwrap().pspec.clone(); + + // If this is a pooled VM, prewarm before starting HTTP server and becoming + // available for binding. Prewarming helps Postgres start quicker later, + // because QEMU will already have its memory allocated from the host, and + // the necessary binaries will already be cached. + if cli_spec.is_none() { + this.prewarm_postgres()?; + } + + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. + crate::http::server::Server::External(this.params.external_http_port).launch(&this); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. + crate::http::server::Server::Internal(this.params.internal_http_port).launch(&this); + + // If we got a spec from the CLI already, use that. Otherwise wait for the + // control plane to pass it to us with a /configure HTTP request + let pspec = if let Some(cli_spec) = cli_spec { + cli_spec + } else { + this.wait_spec()? 
+ }; + + launch_lsn_lease_bg_task_for_static(&this); + + // We have a spec, start the compute + let mut delay_exit = false; + let mut vm_monitor = None; + let mut pg_process: Option = None; + + match this.start_compute(&mut pg_process) { + Ok(()) => { + // Success! Launch remaining services (just vm-monitor currently) + vm_monitor = + Some(this.start_vm_monitor(pspec.spec.disable_lfc_resizing.unwrap_or(false))); + } + Err(err) => { + // Something went wrong with the startup. Log it and expose the error to + // HTTP status requests. + error!("could not start the compute node: {:#}", err); + this.set_failed_status(err); + delay_exit = true; + + // If the error happened after starting PostgreSQL, kill it + if let Some(ref pg_process) = pg_process { + kill(pg_process.pid(), Signal::SIGQUIT).ok(); + } + } + } + + // If startup was successful, or it failed in the late stages, + // PostgreSQL is now running. Wait until it exits. + let exit_code = if let Some(pg_handle) = pg_process { + let exit_status = this.wait_postgres(pg_handle); + info!("Postgres exited with code {}, shutting down", exit_status); + exit_status.code() + } else { + None + }; + + // Terminate the vm_monitor so it releases the file watcher on + // /sys/fs/cgroup/neon-postgres. + // Note: the vm-monitor only runs on linux because it requires cgroups. + if let Some(vm_monitor) = vm_monitor { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + // Kills all threads spawned by the monitor + vm_monitor.token.cancel(); + if let Some(handle) = vm_monitor.vm_monitor { + // Kills the actual task running the monitor + handle.abort(); + } + } + } + } + + // Reap the postgres process + delay_exit |= this.cleanup_after_postgres_exit()?; + + // If launch failed, keep serving HTTP requests for a while, so the cloud + // control plane can get the actual error. + if delay_exit { + info!("giving control plane 30s to collect the error before shutdown"); + std::thread::sleep(Duration::from_secs(30)); + } + Ok(exit_code) + } + + pub fn wait_spec(&self) -> Result { + info!("no compute spec provided, waiting"); + let mut state = self.state.lock().unwrap(); + while state.status != ComputeStatus::ConfigurationPending { + state = self.state_changed.wait(state).unwrap(); + } + + info!("got spec, continue configuration"); + let spec = state.pspec.as_ref().unwrap().clone(); + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; + + Ok(spec) + } + + /// Start compute. + /// + /// Prerequisites: + /// - the compute spec has been placed in self.state.pspec + /// + /// On success: + /// - status is set to ComputeStatus::Running + /// - self.running_postgres is set + /// + /// On error: + /// - status is left in ComputeStatus::Init. The caller is responsible for setting it to Failed + /// - if Postgres was started before the fatal error happened, self.running_postgres is + /// set. The caller is responsible for killing it. + fn start_compute(self: &Arc, pg_handle: &mut Option) -> Result<()> { + let compute_state: ComputeState; + + let _this_entered; + { + let mut state_guard = self.state.lock().unwrap(); + + // Create a tracing span for the startup operation. 
+ // + // We could otherwise just annotate the function with #[instrument], but if + // we're being configured from a /configure HTTP request, we want the + // startup to be considered part of the /configure request. + // + // Similarly, if a trace ID was passed in env variables, attach it to the span. + _this_entered = { + // Temporarily enter the parent span, so that the new span becomes its child. + if let Some(p) = state_guard.startup_span.take() { + let _parent_entered = p.entered(); + tracing::info_span!("start_compute") + } else if let Some(otel_context) = startup_context_from_env() { + use tracing_opentelemetry::OpenTelemetrySpanExt; + let span = tracing::info_span!("start_compute"); + span.set_parent(otel_context); + span + } else { + tracing::info_span!("start_compute") + } + } + .entered(); + + state_guard.set_status(ComputeStatus::Init, &self.state_changed); + compute_state = state_guard.clone() + } + + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, + pspec.spec.features, + pspec.spec.remote_extensions, + ); + + // Launch remaining service threads + let _monitor_handle = launch_monitor(self); + let _configurator_handle = launch_configurator(self); + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = + (pspec.spec.swap_size_bytes, self.params.resize_swap_on_bind) + { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + resize_swap(size_bytes).context("failed to resize swap")?; + let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_mib, "resized swap"); + } + + // Set disk quota if the compute spec says so + if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = ( + pspec.spec.disk_quota_bytes, + self.params.set_disk_quota_for_fs.as_ref(), + ) { + set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) + .context("failed to set disk quota")?; + let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%disk_quota_bytes, %size_mib, "set disk quota"); + } + + // tune pgbouncer + if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { + info!("tuning pgbouncer"); + + // Spawn a background task to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = pgbouncer_settings.clone(); + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // configure local_proxy + if let Some(local_proxy) = &pspec.spec.local_proxy_config { + info!("configuring local_proxy"); + + // Spawn a background task to do the configuration, + // so that we don't block the main thread that starts Postgres. 
+ let local_proxy = local_proxy.clone(); + let _handle = tokio::spawn(async move { + if let Err(err) = local_proxy::configure(&local_proxy) { + error!("error while configuring local_proxy: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // This part is sync, because we need to download + // remote shared_preload_libraries before postgres start (if any) + if let Some(remote_extensions) = &pspec.spec.remote_extensions { + // First, create control files for all availale extensions + extension_server::create_control_files(remote_extensions, &self.params.pgbin); + + let library_load_start_time = Utc::now(); + let rt = tokio::runtime::Handle::current(); + let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; + + let library_load_time = Utc::now() + .signed_duration_since(library_load_start_time) + .to_std() + .unwrap() + .as_millis() as u64; + let mut state = self.state.lock().unwrap(); + state.metrics.load_ext_ms = library_load_time; + state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; + state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; + state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; + info!( + "Loading shared_preload_libraries took {:?}ms", + library_load_time + ); + info!("{:?}", remote_ext_metrics); + } + + // Prepre pgdata directory. This downloads the basebackup, among other things. + self.prepare_pgdata(&compute_state)?; + + // Start Postgres + let start_time = Utc::now(); + let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; + let postmaster_pid = pg_process.pid(); + *pg_handle = Some(pg_process); + + // If this is a primary endpoint, perform some post-startup configuration before + // opening it up for the world. + let config_time = Utc::now(); + if pspec.spec.mode == ComputeMode::Primary { + self.configure_as_primary(&compute_state)?; + let conf = self.get_conn_conf(None); + tokio::task::spawn_blocking(|| { + let res = get_installed_extensions(conf); + match res { + Ok(extensions) => { + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") + ); + } + Err(err) => error!("could not get installed extensions: {err:?}"), + } + }); + } + + // All done! + let startup_end_time = Utc::now(); + let metrics = { + let mut state = self.state.lock().unwrap(); + state.metrics.start_postgres_ms = config_time + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.config_ms = startup_end_time + .signed_duration_since(config_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.total_startup_ms = startup_end_time + .signed_duration_since(compute_state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.clone() + }; + self.set_status(ComputeStatus::Running); + + // Log metrics so that we can search for slow operations in logs + info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + + Ok(()) + } + + /// Start the vm-monitor if directed to. The vm-monitor only runs on linux + /// because it requires cgroups. + fn start_vm_monitor(&self, disable_lfc_resizing: bool) -> StartVmMonitorResult { + cfg_if::cfg_if! 
{ + if #[cfg(target_os = "linux")] { + use std::env; + use tokio_util::sync::CancellationToken; + + // This token is used internally by the monitor to clean up all threads + let token = CancellationToken::new(); + + // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC + let pgconnstr = if disable_lfc_resizing { + None + } else { + Some(self.params.filecache_connstr.clone()) + }; + + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( + Box::leak(Box::new(vm_monitor::Args { + cgroup: Some(self.params.cgroup.clone()), + pgconnstr, + addr: self.params.vm_monitor_addr.clone(), + })), + token.clone(), + )); + Some(vm_monitor) + } else { + None + }; + StartVmMonitorResult { token, vm_monitor } + } else { + StartVmMonitorResult { } + } + } + } + + fn cleanup_after_postgres_exit(&self) -> Result { + // Maybe sync safekeepers again, to speed up next startup + let compute_state = self.state.lock().unwrap().clone(); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { + info!("syncing safekeepers on shutdown"); + let storage_auth_token = pspec.storage_auth_token.clone(); + let lsn = self.sync_safekeepers(storage_auth_token)?; + info!("synced safekeepers at lsn {lsn}"); + } + + let mut delay_exit = false; + let mut state = self.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + self.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + + if let Err(err) = self.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } + + Ok(delay_exit) + } + /// Check that compute node has corresponding feature enabled. pub fn has_feature(&self, feature: ComputeFeature) -> bool { let state = self.state.lock().unwrap(); @@ -354,9 +818,10 @@ impl ComputeNode { fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. // If it is something different then create_dir() will error out anyway. - let _ok = fs::remove_dir_all(&self.pgdata); - fs::create_dir(&self.pgdata)?; - fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?; + let pgdata = &self.params.pgdata; + let _ok = fs::remove_dir_all(pgdata); + fs::create_dir(pgdata)?; + fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?; Ok(()) } @@ -421,7 +886,7 @@ impl ComputeNode { // sends an Error after finishing the tarball, we will not notice it. 
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; + ar.unpack(&self.params.pgdata)?; // Report metrics let mut state = self.state.lock().unwrap(); @@ -566,9 +1031,9 @@ impl ComputeNode { pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); - let mut sync_handle = maybe_cgexec(&self.pgbin) + let mut sync_handle = maybe_cgexec(&self.params.pgbin) .args(["--sync-safekeepers"]) - .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .env("PGDATA", &self.params.pgdata) // we cannot use -D in this mode .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -625,14 +1090,14 @@ impl ComputeNode { pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; - let pgdata_path = Path::new(&self.pgdata); + let pgdata_path = Path::new(&self.params.pgdata); // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - self.internal_http_port, + self.params.internal_http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -732,12 +1197,15 @@ impl ComputeNode { info!("prewarming"); // Create pgdata - let pgdata = &format!("{}.warmup", self.pgdata); + let pgdata = &format!("{}.warmup", self.params.pgdata); create_pgdata(pgdata)?; // Run initdb to completion info!("running initdb"); - let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); + let initdb_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("initdb"); Command::new(initdb_bin) .args(["--pgdata", pgdata]) .output() @@ -753,7 +1221,7 @@ impl ComputeNode { // Start postgres info!("starting postgres"); - let mut pg = maybe_cgexec(&self.pgbin) + let mut pg = maybe_cgexec(&self.params.pgbin) .args(["-D", pgdata]) .spawn() .expect("cannot start postgres process"); @@ -780,15 +1248,12 @@ impl ComputeNode { /// /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] - pub fn start_postgres( - &self, - storage_auth_token: Option, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let pgdata_path = Path::new(&self.pgdata); + pub fn start_postgres(&self, storage_auth_token: Option) -> Result { + let pgdata_path = Path::new(&self.params.pgdata); // Run postgres as a child process. - let mut pg = maybe_cgexec(&self.pgbin) - .args(["-D", &self.pgdata]) + let mut pg = maybe_cgexec(&self.params.pgbin) + .args(["-D", &self.params.pgdata]) .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -805,7 +1270,29 @@ impl ComputeNode { wait_for_postgres(&mut pg, pgdata_path)?; - Ok((pg, logs_handle)) + Ok(PostgresHandle { + postgres: pg, + log_collector: logs_handle, + }) + } + + /// Wait for the child Postgres process forever. In this state Ctrl+C will + /// propagate to Postgres and it will be shut down as well. + fn wait_postgres(&self, mut pg_handle: PostgresHandle) -> std::process::ExitStatus { + info!(postmaster_pid = %pg_handle.postgres.id(), "Waiting for Postgres to exit"); + + let ecode = pg_handle + .postgres + .wait() + .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); + + // Process has exited. 
Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(pg_handle.log_collector) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); + + ecode } /// Do post configuration of the already started Postgres. This function spawns a background task to @@ -972,9 +1459,12 @@ impl ComputeNode { // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { - let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); + let pgctl_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("pg_ctl"); Command::new(pgctl_bin) - .args(["reload", "-D", &self.pgdata]) + .args(["reload", "-D", &self.params.pgdata]) .output() .expect("cannot run pg_ctl process"); Ok(()) @@ -1014,9 +1504,9 @@ impl ComputeNode { } // Write new config - let pgdata_path = Path::new(&self.pgdata); + let pgdata_path = Path::new(&self.params.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?; if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; @@ -1027,7 +1517,8 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = + tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); conf.application_name("apply_config"); let conf = Arc::new(conf); @@ -1053,166 +1544,52 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let compute_state = self.state.lock().unwrap().clone(); + pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - pspec.spec.operation_uuid.as_deref().unwrap_or("None"), - pspec.tenant_id, - pspec.timeline_id, - ); - // tune pgbouncer - if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { - info!("tuning pgbouncer"); + assert!(pspec.spec.mode == ComputeMode::Primary); + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.params.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - // Spawn a background task to do the tuning, - // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; - if let Err(err) = res { - error!("error while tuning pgbouncer: {err:?}"); - } - }); + self.apply_config(compute_state)?; + + Ok(()) + })?; + + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + if config::line_in_file( + &postgresql_conf_path, + "neon.disable_logical_replication_subscribers=false", + )? 
{ + info!( + "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" + ); + } + self.pg_reload_conf()?; } + self.post_apply_config()?; - if let Some(local_proxy) = &pspec.spec.local_proxy_config { - info!("configuring local_proxy"); - - // Spawn a background task to do the configuration, - // so that we don't block the main thread that starts Postgres. - let local_proxy = local_proxy.clone(); - let _handle = tokio::spawn(async move { - if let Err(err) = local_proxy::configure(&local_proxy) { - error!("error while configuring local_proxy: {err:?}"); - } - }); - } - - info!( - "start_compute spec.remote_extensions {:?}", - pspec.spec.remote_extensions - ); - - // This part is sync, because we need to download - // remote shared_preload_libraries before postgres start (if any) - if let Some(remote_extensions) = &pspec.spec.remote_extensions { - // First, create control files for all availale extensions - extension_server::create_control_files(remote_extensions, &self.pgbin); - - let library_load_start_time = Utc::now(); - let rt = tokio::runtime::Handle::current(); - let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; - - let library_load_time = Utc::now() - .signed_duration_since(library_load_start_time) - .to_std() - .unwrap() - .as_millis() as u64; - let mut state = self.state.lock().unwrap(); - state.metrics.load_ext_ms = library_load_time; - state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; - state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; - state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; - info!( - "Loading shared_preload_libraries took {:?}ms", - library_load_time - ); - info!("{:?}", remote_ext_metrics); - } - - self.prepare_pgdata(&compute_state)?; - - let start_time = Utc::now(); - let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; - - let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary { - if !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::with_compute_ctl_tmp_override( - pgdata_path, - "neon.max_cluster_size=-1", - || { - self.pg_reload_conf()?; - - self.apply_config(&compute_state)?; - - Ok(()) - }, - )?; - - let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - if config::line_in_file( - &postgresql_conf_path, - "neon.disable_logical_replication_subscribers=false", - )? 
{ + let conf = self.get_conn_conf(None); + tokio::task::spawn_blocking(|| { + let res = get_installed_extensions(conf); + match res { + Ok(extensions) => { info!( - "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") ); } - self.pg_reload_conf()?; + Err(err) => error!("could not get installed extensions: {err:?}"), } - self.post_apply_config()?; + }); - let conf = self.get_conn_conf(None); - tokio::task::spawn_blocking(|| { - let res = get_installed_extensions(conf); - match res { - Ok(extensions) => { - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&extensions) - .expect("failed to serialize extensions list") - ); - } - Err(err) => error!("could not get installed extensions: {err:?}"), - } - }); - } - - let startup_end_time = Utc::now(); - { - let mut state = self.state.lock().unwrap(); - state.metrics.start_postgres_ms = config_time - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.config_ms = startup_end_time - .signed_duration_since(config_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.total_startup_ms = startup_end_time - .signed_duration_since(compute_state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - } - self.set_status(ComputeStatus::Running); - - info!( - "finished configuration of compute for project {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - - // Log metrics so that we can search for slow operations in logs - let metrics = { - let state = self.state.lock().unwrap(); - state.metrics.clone() - }; - info!(?metrics, "compute start finished"); - - Ok(pg_process) + Ok(()) } /// Update the `last_active` in the shared state, but ensure that it's a more recent one. 
@@ -1241,7 +1618,7 @@ impl ComputeNode { pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), - _ => Path::new(&self.pgdata), + _ => Path::new(&self.params.pgdata), }; // Collect core dump paths if any @@ -1271,7 +1648,7 @@ impl ComputeNode { // Try first with gdb let backtrace = Command::new("gdb") - .args(["--batch", "-q", "-ex", "bt", &self.pgbin]) + .args(["--batch", "-q", "-ex", "bt", &self.params.pgbin]) .arg(&core_path) .output(); @@ -1348,7 +1725,8 @@ LIMIT 100", ext_path: RemotePath, ) -> Result { let ext_remote_storage = - self.ext_remote_storage + self.params + .ext_remote_storage .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1411,7 +1789,7 @@ LIMIT 100", &real_ext_name, &ext_path, ext_remote_storage, - &self.pgbin, + &self.params.pgbin, ) .await .map_err(DownloadError::Other); @@ -1519,7 +1897,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.ext_remote_storage.is_none() { + if self.params.ext_remote_storage.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, @@ -1570,8 +1948,12 @@ LIMIT 100", let mut download_tasks = Vec::new(); for library in &libs_vec { - let (ext_name, ext_path) = - remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?; + let (ext_name, ext_path) = remote_extensions.get_ext( + library, + true, + &self.params.build_tag, + &self.params.pgversion, + )?; download_tasks.push(self.download_extension(ext_name, ext_path)); } let results = join_all(download_tasks).await; diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 63d428fff4..3c5a6a6d41 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - if !compute.live_config_allowed { + if !compute.params.live_config_allowed { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "live configuration is not allowed for this compute node".to_string(), diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index b0265d1e99..563b73ae65 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams { /// Download a remote extension. 
pub(in crate::http) async fn download_extension( Path(filename): Path, - params: Query, + ext_server_params: Query, State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.ext_remote_storage.is_none() { + if compute.params.ext_remote_storage.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", @@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, - params.is_library, - &compute.build_tag, - &compute.pgversion, + ext_server_params.is_library, + &compute.params.build_tag, + &compute.params.pgversion, ) }; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 3749dfc844..a65614e94e 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; +use tracing::info; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; @@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result pub fn inlinify(s: &str) -> String { s.replace('\n', "\u{200B}") } + +pub fn startup_context_from_env() -> Option { + // Extract OpenTelemetry context for the startup actions from the + // TRACEPARENT and TRACESTATE env variables, and attach it to the current + // tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // There is no standard for passing context in env variables, but a lot of + // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See + // https://github.com/open-telemetry/opentelemetry-specification/issues/740 + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // If this pod is pre-created without binding it to any particular endpoint + // yet, this isn't the right place to enter the startup context. In that + // case, the control plane should pass the tracing context as part of the + // /configure API call. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + // + // XXX: If the pod is restarted, we perform the startup actions in the same + // context as the original startup actions, which probably doesn't make + // sense. 
+ let mut startup_tracing_carrier: HashMap = HashMap::new(); + if let Ok(val) = std::env::var("TRACEPARENT") { + startup_tracing_carrier.insert("traceparent".to_string(), val); + } + if let Ok(val) = std::env::var("TRACESTATE") { + startup_tracing_carrier.insert("tracestate".to_string(), val); + } + if !startup_tracing_carrier.is_empty() { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry_sdk::propagation::TraceContextPropagator; + info!("got startup tracing context from env variables"); + Some(TraceContextPropagator::new().extract(&startup_tracing_carrier)) + } else { + None + } +} diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 248505e473..83318538cd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.clone(); + let connstr = compute.params.connstr.clone(); let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor")); // During startup and configuration we connect to every Postgres database, From 38ddfab6433e80ede4947c83a1ee2a53a1f073d9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 3 Mar 2025 02:29:37 +0200 Subject: [PATCH 10/61] compute_ctl: Perform more startup actions in parallel (#11008) To speed up compute startup. Resizing swap in particular takes about 100 ms on my laptop. By performing it in parallel with downloading the basebackup, that latency is effectively hidden. I would imagine that downloading remote extensions can also take a non-trivial amount of time, although I didn't try to measure that. In any case that's now also performed in parallel with downloading the basebackup. --- compute_tools/src/compute.rs | 178 ++++++++++++++++++++++------------- 1 file changed, 115 insertions(+), 63 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 9e065e84a4..a89d3345c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -23,7 +23,7 @@ use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; -use tracing::{debug, error, info, instrument, warn}; +use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; @@ -522,9 +522,13 @@ impl ComputeNode { /// - status is left in ComputeStatus::Init. The caller is responsible for setting it to Failed /// - if Postgres was started before the fatal error happened, self.running_postgres is /// set. The caller is responsible for killing it. + /// + /// Note that this is in the critical path of a compute cold start. Keep this fast. + /// Try to do things concurrently, to hide the latencies. fn start_compute(self: &Arc, pg_handle: &mut Option) -> Result<()> { let compute_state: ComputeState; + let start_compute_span; let _this_entered; { let mut state_guard = self.state.lock().unwrap(); @@ -536,7 +540,7 @@ impl ComputeNode { // startup to be considered part of the /configure request. // // Similarly, if a trace ID was passed in env variables, attach it to the span. - _this_entered = { + start_compute_span = { // Temporarily enter the parent span, so that the new span becomes its child. 
if let Some(p) = state_guard.startup_span.take() { let _parent_entered = p.entered(); @@ -549,8 +553,8 @@ impl ComputeNode { } else { tracing::info_span!("start_compute") } - } - .entered(); + }; + _this_entered = start_compute_span.enter(); state_guard.set_status(ComputeStatus::Init, &self.state_changed); compute_state = state_guard.clone() @@ -567,23 +571,44 @@ impl ComputeNode { pspec.spec.remote_extensions, ); - // Launch remaining service threads - let _monitor_handle = launch_monitor(self); - let _configurator_handle = launch_configurator(self); + ////// PRE-STARTUP PHASE: things that need to be finished before we start the Postgres process + + // Collect all the tasks that must finish here + let mut pre_tasks = tokio::task::JoinSet::new(); + + // If there are any remote extensions in shared_preload_libraries, start downloading them + if pspec.spec.remote_extensions.is_some() { + let (this, spec) = (self.clone(), pspec.spec.clone()); + pre_tasks.spawn(async move { + this.download_preload_extensions(&spec) + .in_current_span() + .await + }); + } + + // Prepare pgdata directory. This downloads the basebackup, among other things. + { + let (this, cs) = (self.clone(), compute_state.clone()); + pre_tasks.spawn_blocking_child(move || this.prepare_pgdata(&cs)); + } // Resize swap to the desired size if the compute spec says so if let (Some(size_bytes), true) = (pspec.spec.swap_size_bytes, self.params.resize_swap_on_bind) { - // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion - // *before* starting postgres. - // - // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this - // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets - // OOM-killed during startup because swap wasn't available yet. - resize_swap(size_bytes).context("failed to resize swap")?; - let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%size_bytes, %size_mib, "resized swap"); + pre_tasks.spawn_blocking_child(move || { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + resize_swap(size_bytes).context("failed to resize swap")?; + let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_mib, "resized swap"); + + Ok::<(), anyhow::Error>(()) + }); } // Set disk quota if the compute spec says so @@ -591,10 +616,15 @@ impl ComputeNode { pspec.spec.disk_quota_bytes, self.params.set_disk_quota_for_fs.as_ref(), ) { - set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) - .context("failed to set disk quota")?; - let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%disk_quota_bytes, %size_mib, "set disk quota"); + let disk_quota_fs_mountpoint = disk_quota_fs_mountpoint.clone(); + pre_tasks.spawn_blocking_child(move || { + set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) + .context("failed to set disk quota")?; + let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. 
+ info!(%disk_quota_bytes, %size_mib, "set disk quota"); + + Ok::<(), anyhow::Error>(()) + }); } // tune pgbouncer @@ -628,37 +658,17 @@ impl ComputeNode { }); } - // This part is sync, because we need to download - // remote shared_preload_libraries before postgres start (if any) - if let Some(remote_extensions) = &pspec.spec.remote_extensions { - // First, create control files for all availale extensions - extension_server::create_control_files(remote_extensions, &self.params.pgbin); + // Launch remaining service threads + let _monitor_handle = launch_monitor(self); + let _configurator_handle = launch_configurator(self); - let library_load_start_time = Utc::now(); - let rt = tokio::runtime::Handle::current(); - let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; - - let library_load_time = Utc::now() - .signed_duration_since(library_load_start_time) - .to_std() - .unwrap() - .as_millis() as u64; - let mut state = self.state.lock().unwrap(); - state.metrics.load_ext_ms = library_load_time; - state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; - state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; - state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; - info!( - "Loading shared_preload_libraries took {:?}ms", - library_load_time - ); - info!("{:?}", remote_ext_metrics); + // Wait for all the pre-tasks to finish before starting postgres + let rt = tokio::runtime::Handle::current(); + while let Some(res) = rt.block_on(pre_tasks.join_next()) { + res??; } - // Prepre pgdata directory. This downloads the basebackup, among other things. - self.prepare_pgdata(&compute_state)?; - - // Start Postgres + ////// START POSTGRES let start_time = Utc::now(); let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let postmaster_pid = pg_process.pid(); @@ -669,6 +679,7 @@ impl ComputeNode { let config_time = Utc::now(); if pspec.spec.mode == ComputeMode::Primary { self.configure_as_primary(&compute_state)?; + let conf = self.get_conn_conf(None); tokio::task::spawn_blocking(|| { let res = get_installed_extensions(conf); @@ -714,6 +725,39 @@ impl ComputeNode { Ok(()) } + #[instrument(skip_all)] + async fn download_preload_extensions(&self, spec: &ComputeSpec) -> Result<()> { + let remote_extensions = if let Some(remote_extensions) = &spec.remote_extensions { + remote_extensions + } else { + return Ok(()); + }; + + // First, create control files for all available extensions + extension_server::create_control_files(remote_extensions, &self.params.pgbin); + + let library_load_start_time = Utc::now(); + let remote_ext_metrics = self.prepare_preload_libraries(spec).await?; + + let library_load_time = Utc::now() + .signed_duration_since(library_load_start_time) + .to_std() + .unwrap() + .as_millis() as u64; + let mut state = self.state.lock().unwrap(); + state.metrics.load_ext_ms = library_load_time; + state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; + state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; + state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; + info!( + "Loading shared_preload_libraries took {:?}ms", + library_load_time + ); + info!("{:?}", remote_ext_metrics); + + Ok(()) + } + /// Start the vm-monitor if directed to. The vm-monitor only runs on linux /// because it requires cgroups. 
fn start_vm_monitor(&self, disable_lfc_resizing: bool) -> StartVmMonitorResult { @@ -1574,21 +1618,6 @@ impl ComputeNode { } self.post_apply_config()?; - let conf = self.get_conn_conf(None); - tokio::task::spawn_blocking(|| { - let res = get_installed_extensions(conf); - match res { - Ok(extensions) => { - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&extensions) - .expect("failed to serialize extensions list") - ); - } - Err(err) => error!("could not get installed extensions: {err:?}"), - } - }); - Ok(()) } @@ -2030,3 +2059,26 @@ pub fn forward_termination_signal() { kill(pg_pid, Signal::SIGINT).ok(); } } + +// helper trait to call JoinSet::spawn_blocking(f), but propagates the current +// tracing span to the thread. +trait JoinSetExt { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send; +} + +impl JoinSetExt for tokio::task::JoinSet { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send, + { + let sp = tracing::Span::current(); + self.spawn_blocking(move || { + let _e = sp.enter(); + f() + }) + } +} From df0767176ad2dee3bfaa0fef30f33959e08ca2e6 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Mon, 3 Mar 2025 10:40:49 +0100 Subject: [PATCH 11/61] Change the tags names according to the curent state (#11059) ## Problem We have not synced `force-test-extensions-upgrade.yml` with the last changes. The variable `TEST_EXTENSIONS_UPGRADE` was ignored in the script and actually set to `NEW_COMPUTE_TAG` while it should be set to `OLD_COMPUTE_TAG` as we are about to run compatibility tests. ## Summary of changes The tag names were synced, the logic was fixed. 
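For illustration, the script below now exits early unless all three tag variables are set (`PG_VERSION` defaults to 16). A hedged sketch of an equivalent local invocation, with a placeholder release tag — the workflow resolves the old tag from the latest compute release and also sets `FORCE_ALL_UPGRADE_TESTS`:

```bash
# Illustrative invocation only; the release tag below is a placeholder.
LAST_COMPUTE_RELEASE_TAG=release-compute-12345

OLD_COMPUTE_TAG=$LAST_COMPUTE_RELEASE_TAG \
NEW_COMPUTE_TAG=latest \
TEST_EXTENSIONS_TAG=$LAST_COMPUTE_RELEASE_TAG \
PG_VERSION=16 \
FORCE_ALL_UPGRADE_TESTS=true \
./docker-compose/test_extensions_upgrade.sh
```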
--- .github/workflows/force-test-extensions-upgrade.yml | 5 +++-- docker-compose/test_extensions_upgrade.sh | 11 +++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml index 71c5158ef6..f2376306dc 100644 --- a/.github/workflows/force-test-extensions-upgrade.yml +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -52,8 +52,9 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 env: - NEWTAG: latest - OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + NEW_COMPUTE_TAG: latest + OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} PG_VERSION: ${{ matrix.pg-version }} FORCE_ALL_UPGRADE_TESTS: true run: ./docker-compose/test_extensions_upgrade.sh diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 57c0182162..51d1e40802 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -6,8 +6,11 @@ generate_id() { local -n resvar=$1 printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM } -if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then - echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined +echo "${OLD_COMPUTE_TAG}" +echo "${NEW_COMPUTE_TAG}" +echo "${TEST_EXTENSIONS_TAG}" +if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then + echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set exit 1 fi export PG_VERSION=${PG_VERSION:-16} @@ -82,7 +85,7 @@ EXTENSIONS='[ {"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) -COMPUTE_TAG=${NEW_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d +COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" @@ -90,7 +93,7 @@ create_extensions "${EXTNAMES}" query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") docker compose --profile test-extensions down -COMPUTE_TAG=${OLD_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" From 625c526bdd271d845b9e42c2aae197c8b09a0d77 Mon Sep 17 00:00:00 2001 From: Misha Sakhnov Date: Mon, 3 Mar 2025 13:47:09 +0200 Subject: [PATCH 12/61] ci: create multiarch vm images (#11017) ## Problem We build compute-nodes as multi-arch images, but not the vm-compute-nodes. The PR adds multiarch vm images the same way as in autoscaling repo. 
## Summary of changes Add architecture to the matrix for vm compute build steps Add merge job --------- Co-authored-by: Alexander Bayandin --- .github/workflows/build_and_test.yml | 39 +++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index fb6da2f173..ac6e0634f0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -692,15 +692,15 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - vm-compute-node-image: + vm-compute-node-image-arch: needs: [ check-permissions, meta, compute-node-image ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} - runs-on: [ self-hosted, large ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} strategy: fail-fast: false matrix: + arch: [ amd64, arm64 ] version: - # see the comment for `compute-node-image-arch` job - pg: v14 debian: bullseye - pg: v15 @@ -717,7 +717,7 @@ jobs: - name: Downloading vm-builder run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder + curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 @@ -738,12 +738,37 @@ jobs: -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -target-arch=linux/amd64 + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ + -target-arch=linux/${{ matrix.arch }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} + + vm-compute-node-image: + needs: [ vm-compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + runs-on: ubuntu-22.04 + strategy: + matrix: + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + - pg: v15 + - pg: v16 + - pg: v17 + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 + test-images: needs: [ check-permissions, meta, neon-image, compute-node-image ] From 8669bfe4937dafc36f430cbc2401b84f186ca81c Mon Sep 17 00:00:00 2001 From: Konstantin 
Knizhnik Date: Mon, 3 Mar 2025 14:50:07 +0200 Subject: [PATCH 13/61] Do not store zero pages in inmem SMGR for walredo (#11043) ## Problem See https://neondb.slack.com/archives/C033RQ5SPDH/p1740157873114339 smgrextend for FSM fork is called during page reconstruction by walredo process causing overflow of inmem SMGR (64 pages). ## Summary of changes Do not store zero pages in inmem SMGR because `inmem_read` returns zero page if it is not able to locate specified block. Co-authored-by: Konstantin Knizhnik --- pgxn/neon_walredo/inmem_smgr.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index ff2846a9e7..75b9ab4464 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -32,8 +32,8 @@ #include "inmem_smgr.h" -/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, but we can update up to 3 forks for each block */ -#define MAX_PAGES 100 +/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, so assume that 64 will be enough */ +#define MAX_PAGES 64 /* If more than WARN_PAGES are used, print a warning in the log */ #define WARN_PAGES 32 @@ -174,10 +174,7 @@ static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync) { - char buffer[BLCKSZ] = {0}; - - for (int i = 0; i < nblocks; i++) - inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync); + /* Do nothing: inmem_read will return zero page in any case */ } #endif From ef2b50994cab9702a84aca490d70f11fb0d1036b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 3 Mar 2025 16:20:20 +0300 Subject: [PATCH 14/61] walproposer: basic infra to enable generations (#11002) ## Problem Preparation for https://github.com/neondatabase/neon/issues/10851 ## Summary of changes Add walproposer `safekeepers_generations` field which can be set by prefixing `neon.safekeepers` GUC with `g#n:`. Non zero value (n) forces walproposer to use generations. In particular, this also disables implicit timeline creation as timeline will be created by storcon. Add test checking this. Also add missing infra: `--safekeepers-generation` flag to neon_local endpoint start + fix `--start-timeout` flag: it existed but value wasn't used. 
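To make the wire-up concrete, here is a hedged sketch of how the generation flows from `neon_local` into the GUC; the endpoint name, safekeeper IDs and host:port values are placeholders, not taken from this patch:

```bash
# Hedged example; endpoint name, IDs and addresses are placeholders.
# Start an endpoint under safekeeper membership generation 1:
neon_local endpoint start main --safekeepers-generation 1 --safekeepers 1,2,3

# compute_ctl then renders the spec into postgresql.conf with the g#<n>: prefix,
# roughly like:
#   neon.safekeepers='g#1:127.0.0.1:5454,127.0.0.1:5455,127.0.0.1:5456'
# A non-zero generation makes walproposer use membership configurations and
# disables implicit timeline creation on the safekeepers.
```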
--- compute_tools/src/config.rs | 14 ++++- control_plane/src/bin/neon_local.rs | 20 +++++-- control_plane/src/endpoint.rs | 25 ++++++--- libs/compute_api/src/spec.rs | 11 ++++ pgxn/neon/walproposer.c | 69 ++++++++++++++++++++++-- pgxn/neon/walproposer.h | 19 ++++++- test_runner/fixtures/neon_cli.py | 6 +++ test_runner/fixtures/neon_fixtures.py | 8 ++- test_runner/regress/test_wal_acceptor.py | 48 +++++++++++++++++ 9 files changed, 200 insertions(+), 20 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e8056ec7eb..ca24ff76b3 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -1,5 +1,7 @@ +use std::fmt::Write as FmtWrite; use std::fs::{File, OpenOptions}; use std::io; +use std::io::Write; use std::io::prelude::*; use std::path::Path; @@ -55,10 +57,20 @@ pub fn write_postgres_conf( writeln!(file, "neon.stripe_size={stripe_size}")?; } if !spec.safekeeper_connstrings.is_empty() { + let mut neon_safekeepers_value = String::new(); + tracing::info!( + "safekeepers_connstrings is not zero, gen: {:?}", + spec.safekeepers_generation + ); + // If generation is given, prepend sk list with g#number: + if let Some(generation) = spec.safekeepers_generation { + write!(neon_safekeepers_value, "g#{}:", generation)?; + } + neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(",")); writeln!( file, "neon.safekeepers={}", - escape_conf_value(&spec.safekeeper_connstrings.join(",")) + escape_conf_value(&neon_safekeepers_value) )?; } if let Some(s) = &spec.tenant_id { diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f258025428..375b5d87d0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -40,6 +40,7 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; +use safekeeper_api::membership::SafekeeperGeneration; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, @@ -596,7 +597,15 @@ struct EndpointStartCmdArgs { #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap(long)] + #[clap( + long, + help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations." + )] + safekeepers_generation: Option, + #[clap( + long, + help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override." 
+ )] safekeepers: Option, #[clap( @@ -617,9 +626,9 @@ struct EndpointStartCmdArgs { )] allow_multiple: bool, - #[clap(short = 't', long, help = "timeout until we fail the command")] - #[arg(default_value = "10s")] - start_timeout: humantime::Duration, + #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] + #[arg(default_value = "90s")] + start_timeout: Duration, } #[derive(clap::Args)] @@ -1350,6 +1359,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let pageserver_id = args.endpoint_pageserver_id; let remote_ext_config = &args.remote_ext_config; + let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { @@ -1425,11 +1435,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint .start( &auth_token, + safekeepers_generation, safekeepers, pageservers, remote_ext_config.as_ref(), stripe_size.0 as usize, args.create_test_user, + args.start_timeout, ) .await?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 50ccca36fe..87bfbd7570 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -42,7 +42,7 @@ use std::path::PathBuf; use std::process::Command; use std::str::FromStr; use std::sync::Arc; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; @@ -53,6 +53,7 @@ use compute_api::spec::{ use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; +use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use tracing::debug; use url::Host; @@ -576,14 +577,17 @@ impl Endpoint { Ok(safekeeper_connstrings) } + #[allow(clippy::too_many_arguments)] pub async fn start( &self, auth_token: &Option, + safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, remote_ext_config: Option<&String>, shard_stripe_size: usize, create_test_user: bool, + start_timeout: Duration, ) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); @@ -655,6 +659,7 @@ impl Endpoint { timeline_id: Some(self.timeline_id), mode: self.mode, pageserver_connstring: Some(pageserver_connstring), + safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, @@ -770,17 +775,18 @@ impl Endpoint { std::fs::write(pidfile_path, pid.to_string())?; // Wait for it to start - let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min + let start_at = Instant::now(); loop { - attempt += 1; match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { - if attempt == MAX_ATTEMPTS { - bail!("compute startup timed out; still in Init state"); + if Instant::now().duration_since(start_at) > start_timeout { + bail!( + "compute startup timed out {:?}; still in Init state", + start_timeout + ); } // keep retrying } @@ -807,8 +813,11 @@ impl Endpoint { } } Err(e) => { - if attempt == MAX_ATTEMPTS { - return 
Err(e).context("timed out waiting to connect to compute_ctl HTTP"); + if Instant::now().duration_since(start_at) > start_timeout { + return Err(e).context(format!( + "timed out {:?} waiting to connect to compute_ctl HTTP", + start_timeout, + )); } } } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index d02bfd6814..df82d8b449 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -101,6 +101,17 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + /// Safekeeper membership config generation. It is put in + /// neon.safekeepers GUC and serves two purposes: + /// 1) Non zero value forces walproposer to use membership configurations. + /// 2) If walproposer wants to update list of safekeepers to connect to + /// taking them from some safekeeper mconf, it should check what value + /// is newer by comparing the generation. + /// + /// Note: it could be SafekeeperGeneration, but this needs linking + /// compute_ctl with postgres_ffi. + #[serde(default)] + pub safekeepers_generation: Option, #[serde(default)] pub safekeeper_connstrings: Vec, diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 356895aa82..7ec4ec99fc 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -83,6 +83,7 @@ static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst); static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * @@ -97,7 +98,32 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->config = config; wp->api = api; - for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep) + wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); + + /* + * If safekeepers list starts with g# parse generation number followed by + * : + */ + if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0) + { + char *endptr; + + errno = 0; + wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10); + if (errno != 0) + { + wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); + } + /* Skip past : to the first hostname. */ + host = endptr + 1; + } + else + { + host = wp->config->safekeepers_list; + } + wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); + + for (; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); if (port == NULL) @@ -183,6 +209,12 @@ WalProposerFree(WalProposer *wp) pfree(wp); } +static bool +WalProposerGenerationsEnabled(WalProposer *wp) +{ + return wp->safekeepers_generation != 0; +} + /* * Create new AppendRequest message and start sending it. This function is * called from walsender every time the new WAL is available. @@ -600,10 +632,14 @@ static void SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; + + /* Forbid implicit timeline creation if generations are enabled. */ + char *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? 
"false" : "true"; #define CMD_LEN 512 char cmd[CMD_LEN]; - snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version); + + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation); if (!wp->api.conn_send_query(sk, cmd)) { wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", @@ -705,6 +741,18 @@ RecvAcceptorGreeting(Safekeeper *sk) sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); pfree(mconf_toml); + /* + * Adopt mconf of safekeepers if it is higher. TODO: mconf change should + * restart wp if it started voting. + */ + if (sk->greetResponse.mconf.generation > wp->mconf.generation) + { + MembershipConfigurationFree(&wp->mconf); + MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + /* full conf was just logged above */ + wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); + } + /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -1896,7 +1944,8 @@ PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf pq_sendint64_le(buf, m->termHistory->entries[i].term); pq_sendint64_le(buf, m->termHistory->entries[i].lsn); } - /* + + /* * Removed timeline_start_lsn. Still send it as a valid * value until safekeepers taking it from term history are * deployed. @@ -2162,7 +2211,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) } } wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); - return false; /* keep the compiler quiet */ + return false; /* keep the compiler quiet */ } /* @@ -2570,6 +2619,18 @@ MembershipConfigurationToString(MembershipConfiguration *mconf) return s.data; } +static void +MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst) +{ + dst->generation = src->generation; + dst->members.len = src->members.len; + dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len); + memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len); + dst->new_members.len = src->new_members.len; + dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len); + memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len); +} + static void MembershipConfigurationFree(MembershipConfiguration *mconf) { diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index eee55f924f..8d1ae26cac 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -160,7 +160,10 @@ typedef struct MemberSet SafekeeperId *m; /* ids themselves */ } MemberSet; -/* Timeline safekeeper membership configuration. */ +/* + * Timeline safekeeper membership configuration as sent in the + * protocol. + */ typedef struct MembershipConfiguration { Generation generation; @@ -761,8 +764,22 @@ typedef struct WalProposer /* (n_safekeepers / 2) + 1 */ int quorum; + /* + * Generation of the membership conf of which safekeepers[] are presumably + * members. To make cplane life a bit easier and have more control in + * tests with which sks walproposer gets connected neon.safekeepers GUC + * doesn't provide full mconf, only the list of endpoints to connect to. 
+ * We still would like to know generation associated with it because 1) we + * need some handle to enforce using generations in walproposer, and + * non-zero value of this serves the purpose; 2) currently we don't do + * that, but in theory walproposer can update list of safekeepers to + * connect to upon receiving mconf from safekeepers, and generation number + * must be checked to see which list is newer. + */ + Generation safekeepers_generation; /* Number of occupied slots in safekeepers[] */ int n_safekeepers; + /* Safekeepers walproposer is connecting to. */ Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 97a5a36814..6e53987e7c 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -525,12 +525,14 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, + safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, remote_ext_config: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: args = [ @@ -543,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli): if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) + if safekeepers_generation is not None: + args.extend(["--safekeepers-generation", str(safekeepers_generation)]) if safekeepers is not None: args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) if endpoint_id is not None: @@ -553,6 +557,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--allow-multiple"]) if create_test_user: args.extend(["--create-test-user"]) + if timeout is not None: + args.extend(["--start-timeout", str(timeout)]) res = self.raw_cli(args, extra_env_vars) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6001003e53..53df10be49 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4005,10 +4005,12 @@ class Endpoint(PgProtocol, LogUtils): self, remote_ext_config: str | None = None, pageserver_id: int | None = None, + safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: dict[str, str] | None = None, ) -> Self: """ @@ -4018,19 +4020,21 @@ class Endpoint(PgProtocol, LogUtils): assert self.endpoint_id is not None - # If `safekeepers` is not None, they are remember them as active and use - # in the following commands. + # If `safekeepers` is not None, remember them as active and use in the + # following commands. 
if safekeepers is not None: self.active_safekeepers = safekeepers self.env.neon_cli.endpoint_start( self.endpoint_id, + safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, basebackup_request_tries=basebackup_request_tries, + timeout=timeout, env=env, ) self._running.release(1) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0a05189bfb..8f70b460c6 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2281,6 +2281,54 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): http_cli.timeline_status(tenant_id, timeline_id) +def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): + """ + Test that having neon.safekeepers starting with g#n: with non zero n enables + generations, which as a side effect disables automatic timeline creation. + + This is kind of bootstrapping test: here membership conf & timeline is + created manually, later storcon will do that. + """ + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps = env.pageservers[0] + ps_http_cli = ps.http_client() + + http_clis = [sk.http_client() for sk in env.safekeepers] + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create("main", config_lines=config_lines) + + # expected to fail because timeline is not created on safekeepers + with pytest.raises(Exception, match=r".*timed out.*"): + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s") + # figure out initial LSN. + ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) + init_lsn = ps_timeline_detail["last_record_lsn"] + log.info(f"initial LSN: {init_lsn}") + # sk timeline creation request expects minor version + pg_version = ps_timeline_detail["pg_version"] * 10000 + # create inital mconf + sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers] + mconf = Configuration(generation=1, members=sk_ids, new_members=None) + create_r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + ) + log.info(f"sending timeline create: {create_r.to_json()}") + + for sk_http_cli in http_clis: + sk_http_cli.timeline_create(create_r) + # Once timeline created endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 38277497fd400e2948293c8e29512be8dc231735 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 3 Mar 2025 13:46:50 +0000 Subject: [PATCH 15/61] pageserver: log shutdown at info level for basebackup (#11046) ## Problem Timeline shutdown during basebackup logs at error level because the the canecellation error is smushed into BasebackupError::Server. ## Summary of changes Introduce BasebackupError::Shutdown and use it. `log_query_error` will now see `QueryError::Shutdown` and log at info level. 
--- pageserver/src/basebackup.rs | 67 +++++++++++++++++++--------------- pageserver/src/page_service.rs | 1 + 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ce54bd9c1c..de527e307b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,8 +33,9 @@ use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; -use crate::tenant::Timeline; use crate::tenant::storage_layer::IoConcurrency; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { @@ -42,6 +43,26 @@ pub enum BasebackupError { Server(#[from] anyhow::Error), #[error("basebackup client error {0:#} when {1}")] Client(#[source] io::Error, &'static str), + #[error("basebackup during shutdown")] + Shutdown, +} + +impl From for BasebackupError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } +} + +impl From for BasebackupError { + fn from(value: GetVectoredError) -> Self { + match value { + GetVectoredError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } } /// Create basebackup with non-rel data in it. @@ -127,7 +148,7 @@ where timeline .gate .enter() - .map_err(|e| BasebackupError::Server(e.into()))?, + .map_err(|_| BasebackupError::Shutdown)?, ), }; basebackup @@ -323,8 +344,7 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? .partition( self.timeline.get_shard_identity(), Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, @@ -336,11 +356,10 @@ where let blocks = self .timeline .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for (key, block) in blocks { - let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + let block = block?; slru_builder.add_block(&key, block).await?; } } @@ -349,11 +368,8 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self - .timeline - .list_dbdirs(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -362,8 +378,7 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -391,8 +406,7 @@ where let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let aux_scan_time = start_time.elapsed(); let aux_estimated_size = aux_files .values() @@ -451,16 +465,14 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? 
{ self.add_twophase_file(xid).await?; } let repl_origins = self .timeline .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let n_origins = repl_origins.len(); if n_origins != 0 { // @@ -505,8 +517,7 @@ where let nblocks = self .timeline .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -532,8 +543,7 @@ where // TODO: investigate using get_vectored for the entire startblk..endblk range. // But this code path is not on the critical path for most basebackups (?). .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; segment_data.extend_from_slice(&img[..]); } @@ -567,8 +577,7 @@ where let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; if img.len() != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) @@ -622,8 +631,7 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? .is_empty() { return Ok(()); @@ -674,8 +682,7 @@ where let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 603a5f65aa..ba2ed9dc81 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -2113,6 +2113,7 @@ impl PageServerHandler { // TODO: passthrough the error site to the final error message? BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), + BasebackupError::Shutdown => QueryError::Shutdown, } } From a07599949fdcf7fdd1e396b9bb53b667a2f34948 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 3 Mar 2025 16:25:48 +0100 Subject: [PATCH 16/61] First version of a new benchmark to test larger OLTP workload (#11053) ## Problem We want to support larger tenants (regarding logical database size, number of transactions per second etc.) and should increase our test coverage of OLTP transactions at larger scale. ## Summary of changes Start a new benchmark that over time will add more OLTP tests at larger scale. This PR covers the first version and will be extended in further PRs. 
Also fix some infrastructure: - default for new connections and large tenants is to use connection pooler pgbouncer, however our fixture always added `statement_timeout=120` which is not compatible with pooler [see](https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter) - action to create branch timed out after 10 seconds and 10 retries but for large tenants it can take longer so use increasing back-off for retries ## Test run https://github.com/neondatabase/neon/actions/runs/13593446706 --- .github/actionlint.yml | 1 + .github/actions/neon-branch-create/action.yml | 12 +- .github/workflows/large_oltp_benchmark.yml | 147 ++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 7 +- .../large_synthetic_oltp/insert_webhooks.sql | 47 ++++++ .../select_any_webhook_with_skew.sql | 15 ++ .../select_recent_webhook.sql | 9 ++ .../test_perf_oltp_large_tenant.py | 90 +++++++++++ 8 files changed, 324 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/large_oltp_benchmark.yml create mode 100644 test_runner/performance/large_synthetic_oltp/insert_webhooks.sql create mode 100644 test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql create mode 100644 test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql create mode 100644 test_runner/performance/test_perf_oltp_large_tenant.py diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1e6c2d0aa2..667ff7f92e 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -32,3 +32,4 @@ config-variables: - NEON_DEV_AWS_ACCOUNT_ID - NEON_PROD_AWS_ACCOUNT_ID - AWS_ECR_REGION + - BENCHMARK_LARGE_OLTP_PROJECTID diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 9f752d5a89..71dd6f3af2 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -84,7 +84,13 @@ runs: --header "Authorization: Bearer ${API_KEY}" ) - role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name') + role_name=$(echo "$roles" | jq --raw-output ' + (.roles | map(select(.protected == false))) as $roles | + if any($roles[]; .name == "neondb_owner") + then "neondb_owner" + else $roles[0].name + end + ') echo "role_name=${role_name}" >> $GITHUB_OUTPUT env: API_HOST: ${{ inputs.api_host }} @@ -107,13 +113,13 @@ runs: ) if [ -z "${reset_password}" ]; then - sleep 1 + sleep $i continue fi password=$(echo $reset_password | jq --raw-output '.role.password') if [ "${password}" == "null" ]; then - sleep 1 + sleep $i # increasing backoff continue fi diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml new file mode 100644 index 0000000000..f33e11cd08 --- /dev/null +++ b/.github/workflows/large_oltp_benchmark.yml @@ -0,0 +1,147 @@ +name: large oltp benchmark + +on: + # uncomment to run on push for debugging your PR + push: + branches: [ bodobolero/synthetic_oltp_workload ] + + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only 
one workflow globally because we need dedicated resources which only exist once + group: large-oltp-bench-workflow + cancel-in-progress: true + +jobs: + oltp: + strategy: + fail-fast: false # allow other variants to continue even if one fails + matrix: + include: + - target: new_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + - target: reuse_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h + TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + PG_VERSION: 16 # pre-determined by pre-determined project + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }} + PLATFORM: ${{ matrix.target }} + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + # Increase timeout to 8h, default timeout is 6h + timeout-minutes: 480 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target == 'new_branch' }} + id: create-neon-branch-oltp-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${{ matrix.target }}" in + new_branch) + CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} + ;; + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Benchmark pgbench with custom-scripts + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target == 'new_branch' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + branch_id: ${{ 
steps.create-neon-branch-oltp-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Periodic large oltp perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 53df10be49..3aa018e99e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -253,10 +253,15 @@ class PgProtocol: # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. + # pooler does not support statement_timeout + # Check if the hostname contains the string 'pooler' + hostname = result.get("host", "") + log.info(f"Hostname: {hostname}") options = result.get("options", "") - if "statement_timeout" not in options: + if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" result["options"] = options + return result # autocommit=True here by default because that's what we need most of the time diff --git a/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql new file mode 100644 index 0000000000..69e6366a53 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql @@ -0,0 +1,47 @@ +\set event_type random(1,10) +\set service_key random(1, 3) + +INSERT INTO webhook.incoming_webhooks ( + created_at, + delivery_id, + upstream_emitted_at, + service_key, + event_id, + source, + body, + json, + additional_data, + is_body_encrypted, + event_type +) VALUES ( + now(), + gen_random_uuid(), + now() - interval '10 minutes', + CASE :service_key::int + WHEN 1 THEN 'shopify' + WHEN 2 THEN 'stripe' + WHEN 3 THEN 'github' + END, + 'evt_' || gen_random_uuid(), -- Ensures uniqueness + CASE :service_key::int + WHEN 1 THEN 'Shopify' + WHEN 2 THEN 'Stripe' + WHEN 3 THEN 'GitHub' + END, + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}', + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb, + '{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb, + false, + CASE :event_type::int + WHEN 1 THEN 'ORDER_PLACED' + WHEN 2 THEN 'ORDER_CANCELLED' + WHEN 3 THEN 'PAYMENT_SUCCESSFUL' + WHEN 4 THEN 'PAYMENT_FAILED' + WHEN 5 THEN 'CUSTOMER_CREATED' + WHEN 6 THEN 'CUSTOMER_UPDATED' + WHEN 7 THEN 'PRODUCT_UPDATED' + WHEN 8 THEN 'INVENTORY_LOW' + WHEN 9 THEN 'SHIPPING_DISPATCHED' + WHEN 10 THEN 'REFUND_ISSUED' + END +); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql 
b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql new file mode 100644 index 0000000000..b2f173f011 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql @@ -0,0 +1,15 @@ +-- Zipfian distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. +-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed. + +\set alpha 1.2 +\set min_id 1 +\set max_id 135000000 + +\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha) + +SELECT * +FROM webhook.incoming_webhooks +WHERE id = (:zipf_random_id)::bigint +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql new file mode 100644 index 0000000000..78a843bf0f --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql @@ -0,0 +1,9 @@ +-- select one of the most recent webhook records (created in the branch timeline during the bench run) +SELECT * +FROM webhook.incoming_webhooks +WHERE id = ( + SELECT (floor(random() * ( + (SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1 + ) + 1350000001))::bigint +) +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py new file mode 100644 index 0000000000..ae00dbb3b5 --- /dev/null +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import os +import timeit +from pathlib import Path + +import pytest +from fixtures.benchmark_fixture import PgBenchRunResult +from fixtures.compare_fixtures import PgCompare + +from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp + + +def get_custom_scripts( + default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4", +) -> list[str]: + # We parametrize each run with the custom scripts to run and their weights. + # The custom scripts and their weights are passed through TEST_PGBENCH_CUSTOM_SCRIPTS env variable. + # Delimit the custom scripts for one run by spaces and for different runs by commas, for example: + # "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2" + # Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable. 
+ scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=str(default)) + rv = [] + for s in scripts.split(","): + rv.append(s) + return rv + + +def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int): + password = env.pg.default_options.get("password", None) + options = env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + # if connstr does not contain pooler we can set statement_timeout to 0 + if "pooler" not in connstr: + options = "-cstatement_timeout=0 " + env.pg.default_options.get("options", "") + connstr = env.pg.connstr(password=None, options=options) + + script_args = [ + "pgbench", + "-n", # no explicit vacuum before the test - we want to rely on auto-vacuum + "-M", + "prepared", + "--client=500", + "--jobs=100", + f"-T{duration}", + "-P60", # progress every minute + "--progress-timestamp", + ] + for script in custom_scripts.split(): + script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"]) + script_args.append(connstr) + + run_pgbench( + env, + "custom-scripts", + script_args, + password=password, + ) + + +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = env.pg_bin.run_capture(cmdline, env=environ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + env.flush() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + env.zenbenchmark.record_pg_bench_result(prefix, res) + + +@pytest.mark.parametrize("custom_scripts", get_custom_scripts()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int): + run_test_pgbench(remote_compare, custom_scripts, duration) + # todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration From b953daa21fce8b41e4c4556da06279223f8f411a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Mar 2025 16:03:51 +0000 Subject: [PATCH 17/61] safekeeper: allow remote deletion to proceed after dropped requests (#11042) ## Problem If a caller times out on safekeeper timeline deletion on a large timeline, and waits a while before retrying, the deletion will not progress while the retry is waiting. The net effect is very very slow deletion as it only proceeds in 30 second bursts across 5 minute idle periods. 
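
The approach taken here is to run the remote deletion in a background task and let every caller join its outcome through a `tokio::sync::watch` channel. Below is a minimal, self-contained sketch of that join pattern; the function names are invented for the illustration, whereas the real task calls `wal_backup::delete_timeline` and stashes the receiver on the `Timeline`:

```rust
use tokio::sync::watch;

// Hypothetical stand-in for the actual remote deletion work.
async fn delete_from_remote_storage() -> Result<(), String> {
    Ok(())
}

// Spawn the deletion once and hand out a receiver; any number of callers can
// clone it and join the same wait, even if the request that kicked the work
// off is dropped in the meantime.
fn start_deletion() -> watch::Receiver<Option<Result<(), String>>> {
    let (tx, rx) = watch::channel(None);
    tokio::spawn(async move {
        let result = delete_from_remote_storage().await;
        // Ignore send errors: it is fine if nobody is listening anymore.
        let _ = tx.send(Some(result));
    });
    rx
}

#[tokio::main]
async fn main() {
    let mut rx = start_deletion();
    match rx.wait_for(|v| v.is_some()).await {
        Ok(value) => println!("remote deletion finished: {:?}", *value),
        Err(_) => println!("deletion task dropped its sender without reporting"),
    }
}
```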
Related: https://github.com/neondatabase/neon/issues/10265 ## Summary of changes - Run remote deletion in a background task - Carry a watch::Receiver on the Timeline for other callers to join the wait - Restart deletion if the API is called again and the previous attempt failed --- safekeeper/src/timeline.rs | 94 ++++- safekeeper/src/wal_backup.rs | 8 +- test_runner/fixtures/remote_storage.py | 11 + test_runner/fixtures/safekeeper/http.py | 3 +- test_runner/fixtures/safekeeper_utils.py | 92 +++++ .../regress/test_safekeeper_deletion.py | 331 ++++++++++++++++++ test_runner/regress/test_wal_acceptor.py | 307 +--------------- 7 files changed, 541 insertions(+), 305 deletions(-) create mode 100644 test_runner/fixtures/safekeeper_utils.py create mode 100644 test_runner/regress/test_safekeeper_deletion.py diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 930f66a207..d3c841ec09 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -415,6 +415,9 @@ impl From for ApiError { } } +/// We run remote deletion in a background task, this is how it sends its results back. +type RemoteDeletionReceiver = tokio::sync::watch::Receiver>>; + /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { @@ -446,6 +449,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + remote_deletion: std::sync::Mutex>, + /// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding /// this gate, you must respect [`Timeline::cancel`] pub(crate) gate: Gate, @@ -494,6 +499,7 @@ impl Timeline { walreceivers, gate: Default::default(), cancel: CancellationToken::default(), + remote_deletion: std::sync::Mutex::new(None), manager_ctl: ManagerCtl::new(), conf, broker_active: AtomicBool::new(false), @@ -598,15 +604,95 @@ impl Timeline { shared_state.sk.close_wal_store(); if !only_local && self.conf.is_wal_backup_enabled() { - // Note: we concurrently delete remote storage data from multiple - // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we - // do some retries anyway. - wal_backup::delete_timeline(&self.ttid).await?; + self.remote_delete().await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } + /// Delete timeline content from remote storage. If the returned future is dropped, + /// deletion will continue in the background. + /// + /// This function ordinarily spawns a task and stashes a result receiver into [`Self::remote_deletion`]. If + /// deletion is already happening, it may simply wait for an existing task's result. + /// + /// Note: we concurrently delete remote storage data from multiple + /// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we + /// do some retries anyway. + async fn remote_delete(&self) -> Result<()> { + // We will start a background task to do the deletion, so that it proceeds even if our + // API request is dropped. Future requests will see the existing deletion task and wait + // for it to complete. 
+ let mut result_rx = { + let mut remote_deletion_state = self.remote_deletion.lock().unwrap(); + let result_rx = if let Some(result_rx) = remote_deletion_state.as_ref() { + if let Some(result) = result_rx.borrow().as_ref() { + if let Err(e) = result { + // A previous remote deletion failed: we will start a new one + tracing::error!("remote deletion failed, will retry ({e})"); + None + } else { + // A previous remote deletion call already succeeded + return Ok(()); + } + } else { + // Remote deletion is still in flight + Some(result_rx.clone()) + } + } else { + // Remote deletion was not attempted yet, start it now. + None + }; + + match result_rx { + Some(result_rx) => result_rx, + None => self.start_remote_delete(&mut remote_deletion_state), + } + }; + + // Wait for a result + let Ok(result) = result_rx.wait_for(|v| v.is_some()).await else { + // Unexpected: sender should always send a result before dropping the channel, even if it has an error + return Err(anyhow::anyhow!( + "remote deletion task future was dropped without sending a result" + )); + }; + + result + .as_ref() + .expect("We did a wait_for on this being Some above") + .as_ref() + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("remote deletion failed: {e}")) + } + + /// Spawn background task to do remote deletion, return a receiver for its outcome + fn start_remote_delete( + &self, + guard: &mut std::sync::MutexGuard>, + ) -> RemoteDeletionReceiver { + tracing::info!("starting remote deletion"); + let (result_tx, result_rx) = tokio::sync::watch::channel(None); + let ttid = self.ttid; + tokio::task::spawn( + async move { + let r = wal_backup::delete_timeline(&ttid).await; + if let Err(e) = &r { + // Log error here in case nobody ever listens for our result (e.g. dropped API request) + tracing::error!("remote deletion failed: {e}"); + } + + // Ignore send results: it's legal for the Timeline to give up waiting for us. + let _ = result_tx.send(Some(r)); + } + .instrument(info_span!("remote_delete", timeline = %self.ttid)), + ); + + **guard = Some(result_rx.clone()); + + result_rx + } + /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { self.cancel.is_cancelled() diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6176e64698..56f4a2faf9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -21,9 +21,9 @@ use tokio::sync::{OnceCell, watch}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::backoff; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; +use utils::{backoff, pausable_failpoint}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; @@ -564,6 +564,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. 
+ pausable_failpoint!("sk-delete-timeline-remote-pause"); + + fail::fail_point!("sk-delete-timeline-remote", |_| { + Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote")) + }); + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4df2b2df2b..cac84c07e7 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,6 +282,17 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def safekeeper_tenants_path(self) -> str: + return f"{self.prefix_in_bucket}" + + def safekeeper_tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: + return f"{self.safekeeper_tenants_path()}/{tenant_id}" + + def safekeeper_timeline_path( + self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId + ) -> str: + return f"{self.safekeeper_tenant_path(tenant_id)}/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: """ Gets the latest generation key from a list of keys. diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 7038d87aba..e409151b76 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -229,13 +229,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): # only_local doesn't remove segments in the remote storage. def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False, **kwargs ) -> dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params={ "only_local": str(only_local).lower(), }, + **kwargs, ) res.raise_for_status() res_json = res.json() diff --git a/test_runner/fixtures/safekeeper_utils.py b/test_runner/fixtures/safekeeper_utils.py new file mode 100644 index 0000000000..158baf7bb6 --- /dev/null +++ b/test_runner/fixtures/safekeeper_utils.py @@ -0,0 +1,92 @@ +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonPageserver, Safekeeper +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import get_dir_size + + +def is_segment_offloaded( + sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn +): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.backup_lsn >= seg_end + + +def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.flush_lsn >= lsn + + +def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + return sk_wal_size_mb <= target_size_mb + + +def 
wait_lsn_force_checkpoint( + tenant_id: TenantId, + timeline_id: TimelineId, + endpoint: Endpoint, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. + """ + pageserver_conn_options = pageserver_conn_options or {} + + auth_token = None + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] + + # wait for the pageserver to catch up + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) + + # force checkpoint to advance remote_consistent_lsn + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) + + # ensure that remote_consistent_lsn is advanced + wait_for_upload( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py new file mode 100644 index 0000000000..b46095d583 --- /dev/null +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import threading +import time +from contextlib import closing +from enum import StrEnum + +import pytest +import requests +from fixtures.common_types import Lsn, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnvBuilder, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.safekeeper_utils import is_segment_offloaded +from fixtures.utils import wait_until + + +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + # FIXME: are these expected? + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + + # Create two tenants: one will be deleted, other should be preserved. 
+ tenant_id = env.initial_tenant + timeline_id_1 = env.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant + + tenant_id_other, timeline_id_other = env.create_tenant() + + # Populate branches + endpoint_1 = env.endpoints.create_start("br1") + endpoint_2 = env.endpoints.create_start("br2") + endpoint_3 = env.endpoints.create_start("br3") + endpoint_4 = env.endpoints.create_start("br4") + endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) + for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key)") + sk = env.safekeepers[0] + sk_data_dir = sk.data_dir + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) + sk_http_noauth = sk.http_client(gen_sk_wide_token=False) + assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. + endpoint_2.stop_and_destroy() + endpoint_4.stop_and_destroy() + sk.stop() + sk.start() + + # Ensure connections to Safekeeper are established + for endpoint in [endpoint_1, endpoint_3, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (1)") + + # Stop all computes gracefully before safekeepers stop responding to them + endpoint_1.stop_and_destroy() + endpoint_3.stop_and_destroy() + + # Remove initial tenant's br1 (active) + assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure repeated deletion succeeds + assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.tenant_delete_force(tenant_id_other) + assert 
(sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant's br2 (inactive) + assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove non-existing branch, should succeed + assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant fully (two branches are active) + response = sk_http.tenant_delete_force(tenant_id) + assert response[str(timeline_id_3)]["dir_existed"] + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant again. + response = sk_http.tenant_delete_force(tenant_id) + # assert response == {} + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure the other tenant still works + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) + with closing(endpoint_other.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (123)") + + +def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): + """ + Test deleting timelines on a safekeeper while they're under load. + + This should not happen under normal operation, but it can happen if + there is some rogue compute/pageserver that is writing/reading to a + safekeeper that we're migrating a timeline away from, or if the timeline + is being deleted while such a rogue client is running. 
+ """ + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + # Create two endpoints that will generate load + timeline_id_a = env.create_branch("deleteme_a") + timeline_id_b = env.create_branch("deleteme_b") + + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + endpoint_b = env.endpoints.create("deleteme_b") + endpoint_b.start() + + # Get tenant and timeline IDs + tenant_id = env.initial_tenant + + # Start generating load on both timelines + def generate_load(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + while True: + try: + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") + except: # noqa + # Ignore errors since timeline may be deleted + break + + t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) + t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) + try: + t_a.start() + t_b.start() + + # Let the load run for a bit + log.info("Warming up...") + time.sleep(2) + + # Safekeeper errors will propagate to the pageserver: it is correct that these are + # logged at error severity because they indicate the pageserver is trying to read + # a timeline that it shouldn't. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline.*was cancelled.*", + ".*Timeline.*was not found.*", + ] + ) + + # Try deleting timelines while under load + sk = env.safekeepers[0] + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + + # Delete first timeline + log.info(f"Deleting {timeline_id_a}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] + + # Delete second timeline + log.info(f"Deleting {timeline_id_b}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] + + # Verify timelines are gone from disk + sk_data_dir = sk.data_dir + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() + # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() + + finally: + log.info("Stopping endpoints...") + # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang + endpoint_a.stop(mode="immediate") + endpoint_b.stop(mode="immediate") + log.info("Joining threads...") + t_a.join() + t_b.join() + + +class RemoteDeleteFailpoint(StrEnum): + PAUSE = "sk-delete-timeline-remote-pause" + FAIL = "sk-delete-timeline-remote" + + +@pytest.mark.parametrize("failpoint", [RemoteDeleteFailpoint.PAUSE, RemoteDeleteFailpoint.FAIL]) +def test_safekeeper_delete_remote_errors( + neon_env_builder: NeonEnvBuilder, failpoint: RemoteDeleteFailpoint +): + """ + Test that errors and delays during remote deletion are handled correctly. 
+ """ + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--control-file-save-interval", + "1s", + ] + neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + env = neon_env_builder.init_start() + + # FIXME: pageserver is intermittently emitting this + env.pageserver.allowed_errors.extend( + [ + ".*unsupported command START_WAL_PUSH in START_WAL_PUSH.*", + ] + ) + + timeline_id_a = env.create_branch("deleteme_a") + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + with closing(endpoint_a.connect()) as conn: + with conn.cursor() as cur: + # roughly fills one segment + cur.execute("create table t(key int, value text)") + cur.execute("insert into t select generate_series(1,250000), 'payload'") + endpoint_a.stop() + + # Ensure something is uploaded to remote storage + def assert_is_uploaded(): + assert is_segment_offloaded( + env.safekeepers[0], env.initial_tenant, timeline_id_a, Lsn("0/2000000") + ) + + wait_until(assert_is_uploaded) + + def list_timeline_remote(): + assert isinstance(env.safekeepers_remote_storage, S3Storage) + prefix = f"{env.safekeepers_remote_storage.safekeeper_timeline_path(env.initial_tenant, timeline_id_a)}/" + + listing = env.safekeepers_remote_storage.client.list_objects_v2( + Bucket=env.safekeepers_remote_storage.bucket_name, + Prefix=prefix, + ) + return listing.get("Contents", []) + + assert list_timeline_remote() != [] + + sk_http = env.safekeepers[0].http_client() + env.pageserver.http_client().timeline_delete(env.initial_tenant, timeline_id_a) + + # Set up failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + sk_http.configure_failpoints((failpoint, "pause")) + elif failpoint == RemoteDeleteFailpoint.FAIL: + sk_http.configure_failpoints((failpoint, "return")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Delete the timeline - this should hit the configured failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + # Expect time out + with pytest.raises(requests.exceptions.ReadTimeout, match="timed out"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a, timeout=5) + + # Assert deletion didn't happy yet + assert list_timeline_remote() != [] + + # Unblock the background task that should still be running + sk_http.configure_failpoints((failpoint, "off")) + + # Expect that after unblocking, remote deletion proceeds + def assert_remote_deleted(): + assert list_timeline_remote() == [] + + wait_until(assert_remote_deleted) + + elif failpoint == RemoteDeleteFailpoint.FAIL: + # Expect immediate failure + with pytest.raises(sk_http.HTTPError, match="Internal Server Error"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + sk_http.configure_failpoints((failpoint, "off")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Retry should succeed + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + # Remote storage should be empty + assert list_timeline_remote() == [] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8f70b460c6..0366e88389 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -27,7 +27,6 @@ from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, NeonEnvBuilder, - NeonPageserver, PgBin, PgProtocol, Safekeeper, @@ -38,8 +37,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, 
assert_prefix_not_empty, timeline_delete_wait_completed, - wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -55,9 +52,16 @@ from fixtures.safekeeper.http import ( TimelineCreateRequest, ) from fixtures.safekeeper.utils import wait_walreceivers_absent +from fixtures.safekeeper_utils import ( + is_flush_lsn_caught_up, + is_segment_offloaded, + is_wal_trimmed, + wait_lsn_force_checkpoint, + wait_lsn_force_checkpoint_at, + wait_lsn_force_checkpoint_at_sk, +) from fixtures.utils import ( PropagatingThread, - get_dir_size, query_scalar, run_only_on_default_postgres, skip_in_debug_build, @@ -69,68 +73,6 @@ if TYPE_CHECKING: from typing import Any, Self -def wait_lsn_force_checkpoint( - tenant_id: TenantId, - timeline_id: TimelineId, - endpoint: Endpoint, - ps: NeonPageserver, - pageserver_conn_options=None, -): - pageserver_conn_options = pageserver_conn_options or {} - lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") - - wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at_sk( - safekeeper: Safekeeper, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) - wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at( - lsn: Lsn, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - """ - Wait until pageserver receives given lsn, force checkpoint and wait for - upload, i.e. remote_consistent_lsn advancement. 
- """ - pageserver_conn_options = pageserver_conn_options or {} - - auth_token = None - if "password" in pageserver_conn_options: - auth_token = pageserver_conn_options["password"] - - # wait for the pageserver to catch up - wait_for_last_record_lsn( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - # force checkpoint to advance remote_consistent_lsn - ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) - - # ensure that remote_consistent_lsn is advanced - wait_for_upload( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - @dataclass class TimelineMetrics: timeline_id: TimelineId @@ -475,31 +417,6 @@ def wait(f, desc, timeout=30, wait_f=None): wait_f() -def is_segment_offloaded( - sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn -): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.backup_lsn >= seg_end - - -def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.flush_lsn >= lsn - - -def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) - sk_wal_size_mb = sk_wal_size / 1024 / 1024 - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") - return sk_wal_size_mb <= target_size_mb - - def test_wal_backup(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 remote_storage_kind = s3_storage() @@ -1685,214 +1602,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -@pytest.mark.parametrize("auth_enabled", [False, True]) -def test_delete(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.auth_enabled = auth_enabled - env = neon_env_builder.init_start() - - # FIXME: are these expected? - env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was not found in global map.*", - ".*Timeline .* was cancelled and cannot be used anymore.*", - ] - ) - - # Create two tenants: one will be deleted, other should be preserved. 
- tenant_id = env.initial_tenant - timeline_id_1 = env.create_branch("br1") # Active, delete explicitly - timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly - timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant - timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant - - tenant_id_other, timeline_id_other = env.create_tenant() - - # Populate branches - endpoint_1 = env.endpoints.create_start("br1") - endpoint_2 = env.endpoints.create_start("br2") - endpoint_3 = env.endpoints.create_start("br3") - endpoint_4 = env.endpoints.create_start("br4") - endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) - for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key)") - sk = env.safekeepers[0] - sk_data_dir = sk.data_dir - if not auth_enabled: - sk_http = sk.http_client() - sk_http_other = sk_http - else: - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - sk_http_other = sk.http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) - ) - sk_http_noauth = sk.http_client(gen_sk_wide_token=False) - assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. - endpoint_2.stop_and_destroy() - endpoint_4.stop_and_destroy() - sk.stop() - sk.start() - - # Ensure connections to Safekeeper are established - for endpoint in [endpoint_1, endpoint_3, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (1)") - - # Stop all computes gracefully before safekeepers stop responding to them - endpoint_1.stop_and_destroy() - endpoint_3.stop_and_destroy() - - # Remove initial tenant's br1 (active) - assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure repeated deletion succeeds - assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - if auth_enabled: - # Ensure we cannot delete the other tenant - for sk_h in [sk_http, sk_http_noauth]: - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.tenant_delete_force(tenant_id_other) - assert 
(sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant's br2 (inactive) - assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove non-existing branch, should succeed - assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant fully (two branches are active) - response = sk_http.tenant_delete_force(tenant_id) - assert response[str(timeline_id_3)]["dir_existed"] - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant again. - response = sk_http.tenant_delete_force(tenant_id) - # assert response == {} - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure the other tenant still works - sk_http_other.timeline_status(tenant_id_other, timeline_id_other) - with closing(endpoint_other.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (123)") - - -def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): - """ - Test deleting timelines on a safekeeper while they're under load. - - This should not happen under normal operation, but it can happen if - there is some rogue compute/pageserver that is writing/reading to a - safekeeper that we're migrating a timeline away from, or if the timeline - is being deleted while such a rogue client is running. 
- """ - neon_env_builder.auth_enabled = True - env = neon_env_builder.init_start() - - # Create two endpoints that will generate load - timeline_id_a = env.create_branch("deleteme_a") - timeline_id_b = env.create_branch("deleteme_b") - - endpoint_a = env.endpoints.create("deleteme_a") - endpoint_a.start() - endpoint_b = env.endpoints.create("deleteme_b") - endpoint_b.start() - - # Get tenant and timeline IDs - tenant_id = env.initial_tenant - - # Start generating load on both timelines - def generate_load(endpoint: Endpoint): - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") - while True: - try: - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") - except: # noqa - # Ignore errors since timeline may be deleted - break - - t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) - t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) - try: - t_a.start() - t_b.start() - - # Let the load run for a bit - log.info("Warming up...") - time.sleep(2) - - # Safekeeper errors will propagate to the pageserver: it is correct that these are - # logged at error severity because they indicate the pageserver is trying to read - # a timeline that it shouldn't. - env.pageserver.allowed_errors.extend( - [ - ".*Timeline.*was cancelled.*", - ".*Timeline.*was not found.*", - ] - ) - - # Try deleting timelines while under load - sk = env.safekeepers[0] - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - - # Delete first timeline - log.info(f"Deleting {timeline_id_a}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] - - # Delete second timeline - log.info(f"Deleting {timeline_id_b}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] - - # Verify timelines are gone from disk - sk_data_dir = sk.data_dir - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() - # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() - - finally: - log.info("Stopping endpoints...") - # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang - endpoint_a.stop(mode="immediate") - endpoint_b.stop(mode="immediate") - log.info("Joining threads...") - t_a.join() - t_b.join() - - # Basic pull_timeline test. # When live_sk_change is False, compute is restarted to change set of # safekeepers; otherwise it is live reload. From 8298bc903c0148db187074374b85f6ae5c0f9347 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 3 Mar 2025 17:52:59 +0000 Subject: [PATCH 18/61] pageserver: handle in-memory layer overlaps with persistent layers (#11000) ## Problem Image layers may be nested inside in-memory layers as diagnosed [here](https://github.com/neondatabase/neon/issues/10720#issuecomment-2649419252). The read path doesn't support this and may skip over the image layer, resulting in a failure to reconstruct the page. ## Summary of changes We already support nesting of image layers inside delta layers. The logic lives in `LayerMap::select_layer`. The main goal of this PR is to propagate the candidate in-memory layer down to that point and update the selection logic. Important changes are: 1. Support partial reads for the in-memory layer. Previously, we could only specify the start LSN of the read. We need to control the end LSN too. 2. `LayerMap::ranged_search` considers in-memory layers too. 
Previously, the search for in-memory layers was done explicitly in `Timeline::get_reconstruct_data_timeline`. Note that `LayerMap::ranged_search` now returns a weak readable layer which the `LayerManager` can upgrade. This dance is such that we can unit test the layer selection logic. 3. Update `LayerMap::select_layer` to consider the candidate in-memory layer too Loosely related drive bys: 1. Remove the "keys not found" tracking in the ranged search. This wasn't used anywhere and it just complicates things. 2. Remove the difficulty map stuff from the layer map. Again, not used anywhere. Closes https://github.com/neondatabase/neon/issues/9185 Closes https://github.com/neondatabase/neon/issues/10720 --- pageserver/benches/bench_layer_map.rs | 76 -- pageserver/src/tenant.rs | 179 +++- pageserver/src/tenant/layer_map.rs | 809 +++++++++++++----- .../layer_map/historic_layer_coverage.rs | 6 + pageserver/src/tenant/storage_layer.rs | 9 +- .../tenant/storage_layer/inmemory_layer.rs | 4 +- .../src/tenant/storage_layer/layer/tests.rs | 1 + pageserver/src/tenant/timeline.rs | 143 +++- .../src/tenant/timeline/layer_manager.rs | 38 +- 9 files changed, 949 insertions(+), 316 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index e11af49449..e1444778b8 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -7,7 +7,6 @@ use std::time::Instant; use criterion::measurement::WallTime; use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; use pageserver_api::key::Key; @@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } -// Construct a partitioning for testing get_difficulty map when we -// don't have an exact result of `collect_keyspace` to work with. -fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { - let mut parts = Vec::new(); - - // We add a partition boundary at the start of each image layer, - // no matter what lsn range it covers. This is just the easiest - // thing to do. A better thing to do would be to get a real - // partitioning from some database. Even better, remove the need - // for key partitions by deciding where to create image layers - // directly based on a coverage-based difficulty map. - let mut keys: Vec<_> = layer_map - .iter_historic_layers() - .filter_map(|l| { - if l.is_incremental() { - None - } else { - let kr = l.get_key_range(); - Some(kr.start.next()) - } - }) - .collect(); - keys.sort(); - - let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); - for key in keys { - parts.push(KeySpace { - ranges: vec![current_key..key], - }); - current_key = key; - } - - KeyPartitioning { parts } -} - // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. 
@@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) { // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Choose inputs for get_difficulty_map - let latest_lsn = layer_map - .iter_historic_layers() - .map(|l| l.get_lsn_range().end) - .max() - .unwrap(); - let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); - - // Check correctness of get_difficulty_map - // TODO put this in a dedicated test outside of this mod - { - println!("running correctness check"); - - let now = Instant::now(); - let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); - assert!(result_bruteforce.len() == partitioning.parts.len()); - println!("Finished bruteforce in {:?}", now.elapsed()); - - let now = Instant::now(); - let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); - assert!(result_fast.len() == partitioning.parts.len()); - println!("Finished fast in {:?}", now.elapsed()); - - // Assert results are equal. Manually iterate for easier debugging. - let zip = std::iter::zip( - &partitioning.parts, - std::iter::zip(result_bruteforce, result_fast), - ); - for (_part, (bruteforce, fast)) in zip { - assert_eq!(bruteforce, fast); - } - - println!("No issues found"); - } - // Define and name the benchmark function let mut group = c.benchmark_group("real_map"); group.bench_function("uniform_queries", |b| { @@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) { } }); }); - group.bench_function("get_difficulty_map", |b| { - b.iter(|| { - layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); - }); - }); group.finish(); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 11d656eb25..776e523c2e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2501,6 +2501,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, + in_memory_layer_desc: Vec, delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, @@ -2522,6 +2523,11 @@ impl Tenant { .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) .await?; } + for in_memory in in_memory_layer_desc { + tline + .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx) + .await?; + } let layer_names = tline .layers .read() @@ -5913,6 +5919,8 @@ mod tests { #[cfg(feature = "testing")] use timeline::GcInfo; #[cfg(feature = "testing")] + use timeline::InMemoryLayerTestDesc; + #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; @@ -7925,6 +7933,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN @@ -8012,6 +8021,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![( Lsn(0x20), @@ -8227,6 +8237,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8307,6 +8318,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8380,6 +8392,7 
@@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8512,6 +8525,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -8705,6 +8719,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x40), delta1, @@ -8761,6 +8776,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), image_layers, end_lsn, @@ -8967,6 +8983,7 @@ mod tests { Lsn(0x08), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x08)..Lsn(0x10), @@ -8985,7 +9002,7 @@ mod tests { delta3, ), ], // delta layers - vec![], // image layers + vec![], // image layers Lsn(0x50), ) .await? @@ -8996,6 +9013,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x48), @@ -9546,6 +9564,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), @@ -9793,6 +9812,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ // delta1 and delta 2 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), @@ -10028,6 +10048,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![], // delta layers vec![(Lsn(0x18), img_layer)], // image layers Lsn(0x18), @@ -10274,6 +10295,7 @@ mod tests { baseline_image_layer_lsn, DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( delta_layer_start_lsn..delta_layer_end_lsn, delta_layer_spec, @@ -10305,6 +10327,158 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?; + let (tenant, ctx) = harness.load().await; + + let will_init_keys = [2, 6]; + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let mut expected_key_values = HashMap::new(); + + let baseline_image_layer_lsn = Lsn(0x10); + let mut baseline_img_layer = Vec::new(); + for i in 0..5 { + let key = get_key(i); + let value = format!("value {i}@{baseline_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + baseline_img_layer.push((key, Bytes::from(value))); + } + + let nested_image_layer_lsn = Lsn(0x50); + let mut nested_img_layer = Vec::new(); + for i in 5..10 { + let key = get_key(i); + let value = format!("value {i}@{nested_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + nested_img_layer.push((key, Bytes::from(value))); + } + + let frozen_layer = { + let lsn_range = Lsn(0x40)..Lsn(0x60); + let mut data = Vec::new(); + for i in 0..10 { 
+ let key = get_key(i); + let key_in_nested = nested_img_layer + .iter() + .any(|(key_with_img, _)| *key_with_img == key); + let lsn = { + if key_in_nested { + Lsn(nested_image_layer_lsn.0 + 5) + } else { + lsn_range.start + } + }; + + let will_init = will_init_keys.contains(&i); + if will_init { + data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init("")))); + + expected_key_values.insert(key, "".to_string()); + } else { + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + } + } + + InMemoryLayerTestDesc { + lsn_range, + is_open: false, + data, + } + }; + + let (open_layer, last_record_lsn) = { + let start_lsn = Lsn(0x70); + let mut data = Vec::new(); + let mut end_lsn = Lsn(0); + for i in 0..10 { + let key = get_key(i); + let lsn = Lsn(start_lsn.0 + i as u64); + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + + end_lsn = std::cmp::max(end_lsn, lsn); + } + + ( + InMemoryLayerTestDesc { + lsn_range: start_lsn..Lsn::MAX, + is_open: true, + data, + }, + end_lsn, + ) + }; + + assert!( + nested_image_layer_lsn > frozen_layer.lsn_range.start + && nested_image_layer_lsn < frozen_layer.lsn_range.end + ); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + baseline_image_layer_lsn, + DEFAULT_PG_VERSION, + &ctx, + vec![open_layer, frozen_layer], // in-memory layers + Vec::new(), // delta layers + vec![ + (baseline_image_layer_lsn, baseline_img_layer), + (nested_image_layer_lsn, nested_img_layer), + ], // image layers + last_record_lsn, + ) + .await?; + + let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let results = tline + .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx) + .await + .expect("No vectored errors"); + for (key, res) in results { + let value = res.expect("No key errors"); + let expected_value = expected_key_values.remove(&key).expect("No unknown keys"); + assert_eq!(value, Bytes::from(expected_value.clone())); + + tracing::info!("key={key} value={expected_value}"); + } + + Ok(()) + } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { ( k1.is_delta, @@ -10420,6 +10594,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -10804,6 +10979,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), @@ -11055,6 +11231,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 59f5a6bd90..2b04e53f10 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -62,8 +62,7 @@ use utils::lsn::Lsn; use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; use 
crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::{InMemoryLayer, ReadableLayerWeak}; /// /// LayerMap tracks what layers exist on a timeline. @@ -167,7 +166,7 @@ impl Drop for BatchedUpdates<'_> { /// Return value of LayerMap::search #[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { - pub layer: Arc, + pub layer: ReadableLayerWeak, pub lsn_floor: Lsn, } @@ -175,19 +174,37 @@ pub struct SearchResult { /// /// Contains a mapping from a layer description to a keyspace /// accumulator that contains all the keys which intersect the layer -/// from the original search space. Keys that were not found are accumulated -/// in a separate key space accumulator. +/// from the original search space. #[derive(Debug)] pub struct RangeSearchResult { pub found: HashMap, - pub not_found: KeySpaceAccum, } impl RangeSearchResult { fn new() -> Self { Self { found: HashMap::new(), - not_found: KeySpaceAccum::new(), + } + } + + fn map_to_in_memory_layer( + in_memory_layer: Option, + range: Range, + ) -> RangeSearchResult { + match in_memory_layer { + Some(inmem) => { + let search_result = SearchResult { + lsn_floor: inmem.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(inmem), + }; + + let mut accum = KeySpaceAccum::new(); + accum.add_range(range); + RangeSearchResult { + found: HashMap::from([(search_result, accum)]), + } + } + None => RangeSearchResult::new(), } } } @@ -199,6 +216,7 @@ struct RangeSearchCollector where Iter: Iterator>)>, { + in_memory_layer: Option, delta_coverage: Peekable, image_coverage: Peekable, key_range: Range, @@ -234,10 +252,12 @@ where fn new( key_range: Range, end_lsn: Lsn, + in_memory_layer: Option, delta_coverage: Iter, image_coverage: Iter, ) -> Self { Self { + in_memory_layer, delta_coverage: delta_coverage.peekable(), image_coverage: image_coverage.peekable(), key_range, @@ -266,8 +286,7 @@ where return self.result; } Some(layer_type) => { - // Changes for the range exist. Record anything before the first - // coverage change as not found. + // Changes for the range exist. let coverage_start = layer_type.next_change_at_key(); let range_before = self.key_range.start..coverage_start; self.pad_range(range_before); @@ -297,10 +316,22 @@ where self.result } - /// Mark a range as not found (i.e. no layers intersect it) + /// Map a range which does not intersect any persistent layers to + /// the in-memory layer candidate. 
fn pad_range(&mut self, key_range: Range) { if !key_range.is_empty() { - self.result.not_found.add_range(key_range); + if let Some(ref inmem) = self.in_memory_layer { + let search_result = SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem.clone()), + lsn_floor: inmem.get_lsn_range().start, + }; + + self.result + .found + .entry(search_result) + .or_default() + .add_range(key_range); + } } } @@ -310,6 +341,7 @@ where let selected = LayerMap::select_layer( self.current_delta.clone(), self.current_image.clone(), + self.in_memory_layer.clone(), self.end_lsn, ); @@ -365,6 +397,24 @@ where } } +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub struct InMemoryLayerDesc { + handle: InMemoryLayerHandle, + lsn_range: Range, +} + +impl InMemoryLayerDesc { + pub(crate) fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +enum InMemoryLayerHandle { + Open, + Frozen(usize), +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -394,69 +444,161 @@ impl LayerMap { /// layer result, or simplify the api to `get_latest_image` and /// `get_latest_delta`, and only call `get_latest_image` once. /// - /// NOTE: This only searches the 'historic' layers, *not* the - /// 'open' and 'frozen' layers! - /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let in_memory_layer = self.search_in_memory_layer(end_lsn); + + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + return in_memory_layer.map(|desc| SearchResult { + lsn_floor: desc.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(desc), + }); + } + }; + let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - Self::select_layer(latest_delta, latest_image, end_lsn) + Self::select_layer(latest_delta, latest_image, in_memory_layer, end_lsn) } + /// Select a layer from three potential candidates (in-memory, delta and image layer). + /// The candidates represent the first layer of each type which intersect a key range. + /// + /// Layer types have an in implicit priority (image > delta > in-memory). For instance, + /// if we have the option of reading an LSN range from both an image and a delta, we + /// should read from the image. 
fn select_layer( delta_layer: Option>, image_layer: Option>, + in_memory_layer: Option, end_lsn: Lsn, ) -> Option { assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); - match (delta_layer, image_layer) { - (None, None) => None, - (None, Some(image)) => { + match (delta_layer, image_layer, in_memory_layer) { + (None, None, None) => None, + (None, Some(image), None) => { let lsn_floor = image.get_lsn_range().start; Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor, }) } - (Some(delta), None) => { + (Some(delta), None, None) => { let lsn_floor = delta.get_lsn_range().start; Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::PersistentLayer(delta), lsn_floor, }) } - (Some(delta), Some(image)) => { + (Some(delta), Some(image), None) => { let img_lsn = image.get_lsn_range().start; let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), + lsn_floor: img_lsn, + }) + } else { + // If the delta overlaps with the image in the LSN dimension, do a partial + // up to the image layer. + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(delta), + lsn_floor, + }) + } + } + (None, None, Some(inmem)) => { + let lsn_floor = inmem.get_lsn_range().start; + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + (None, Some(image), Some(inmem)) => { + // If the in-memory layer overlaps with the image in the LSN dimension, do a partial + // up to the image layer. + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= inmem.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor: img_lsn, }) } else { let lsn_floor = - std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + std::cmp::max(inmem.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::InMemoryLayer(inmem), lsn_floor, }) } } + (Some(delta), None, Some(inmem)) => { + // Overlaps between delta and in-memory layers are not a valid + // state, but we handle them here for completeness. + let delta_end = delta.get_lsn_range().end; + let delta_is_newer = delta_end >= inmem.get_lsn_range().end; + let delta_exact_match = delta_end == end_lsn; + if delta_is_newer || delta_exact_match { + Some(SearchResult { + lsn_floor: delta.get_lsn_range().start, + layer: ReadableLayerWeak::PersistentLayer(delta), + }) + } else { + // If the in-memory layer overlaps with the delta in the LSN dimension, do a partial + // up to the delta layer. + let lsn_floor = + std::cmp::max(inmem.get_lsn_range().start, delta.get_lsn_range().end); + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + } + (Some(delta), Some(image), Some(inmem)) => { + // Determine the preferred persistent layer without taking the in-memory layer + // into consideration. 
+ let persistent_res = + Self::select_layer(Some(delta.clone()), Some(image.clone()), None, end_lsn) + .unwrap(); + let persistent_l = match persistent_res.layer { + ReadableLayerWeak::PersistentLayer(l) => l, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(), + }; + + // Now handle the in-memory layer overlaps. + let inmem_res = if persistent_l.is_delta() { + Self::select_layer(Some(persistent_l), None, Some(inmem.clone()), end_lsn) + .unwrap() + } else { + Self::select_layer(None, Some(persistent_l), Some(inmem.clone()), end_lsn) + .unwrap() + }; + + Some(SearchResult { + layer: inmem_res.layer, + // Use the more restrictive LSN floor + lsn_floor: std::cmp::max(persistent_res.lsn_floor, inmem_res.lsn_floor), + }) + } } } pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let in_memory_layer = self.search_in_memory_layer(end_lsn); + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { Some(version) => version, None => { - let mut result = RangeSearchResult::new(); - result.not_found.add_range(key_range); - return result; + return RangeSearchResult::map_to_in_memory_layer(in_memory_layer, key_range); } }; @@ -464,7 +606,13 @@ impl LayerMap { let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); - let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); + let collector = RangeSearchCollector::new( + key_range, + end_lsn, + in_memory_layer, + delta_changes, + image_changes, + ); collector.collect() } @@ -571,17 +719,36 @@ impl LayerMap { } /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. - pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> - where - Pred: FnMut(&Arc) -> bool, - { + pub(crate) fn search_in_memory_layer(&self, below: Lsn) -> Option { + let is_below = |l: &Arc| { + let start_lsn = l.get_lsn_range().start; + below > start_lsn + }; + if let Some(open) = &self.open_layer { - if pred(open) { - return Some(open.clone()); + if is_below(open) { + return Some(InMemoryLayerDesc { + handle: InMemoryLayerHandle::Open, + lsn_range: open.get_lsn_range(), + }); } } - self.frozen_layers.iter().rfind(|l| pred(l)).cloned() + self.frozen_layers + .iter() + .enumerate() + .rfind(|(_idx, l)| is_below(l)) + .map(|(idx, l)| InMemoryLayerDesc { + handle: InMemoryLayerHandle::Frozen(idx), + lsn_range: l.get_lsn_range(), + }) + } + + pub(crate) fn in_memory_layer(&self, desc: &InMemoryLayerDesc) -> Arc { + match desc.handle { + InMemoryLayerHandle::Open => self.open_layer.as_ref().unwrap().clone(), + InMemoryLayerHandle::Frozen(idx) => self.frozen_layers[idx].clone(), + } } /// @@ -737,136 +904,6 @@ impl LayerMap { max_stacked_deltas } - /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. - /// - /// The `partition_range` argument is used as context for the reimage-worthiness decision. - /// - /// Used as a helper for correctness checks only. Performance not critical. - pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { - match self.search(key, lsn) { - Some(search_result) => { - if search_result.layer.is_incremental() { - (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) - + self.get_difficulty(search_result.lsn_floor, key, partition_range) - } else { - 0 - } - } - None => 0, - } - } - - /// Used for correctness checking. 
Results are expected to be identical to - /// self.get_difficulty_map. Assumes self.search is correct. - pub fn get_difficulty_map_bruteforce( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - ) -> Vec { - // Looking at the difficulty as a function of key, it could only increase - // when a delta layer starts or an image layer ends. Therefore it's sufficient - // to check the difficulties at: - // - the key.start for each non-empty part range - // - the key.start for each delta - // - the key.end for each image - let keys_iter: Box> = { - let mut keys: Vec = self - .iter_historic_layers() - .map(|layer| { - if layer.is_incremental() { - layer.get_key_range().start - } else { - layer.get_key_range().end - } - }) - .collect(); - keys.sort(); - Box::new(keys.into_iter()) - }; - let mut keys_iter = keys_iter.peekable(); - - // Iter the partition and keys together and query all the necessary - // keys, computing the max difficulty for each part. - partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - // Partition ranges are assumed to be sorted and disjoint - // TODO assert it - for range in &part.ranges { - if !range.is_empty() { - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); - } - while let Some(key) = keys_iter.peek() { - if key >= &range.end { - break; - } - let key = keys_iter.next().unwrap(); - if key < range.start { - continue; - } - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); - } - } - difficulty - }) - .collect() - } - - /// For each part of a keyspace partitioning, return the maximum number of layers - /// that would be needed for page reconstruction in that part at the given LSN. - /// - /// If `limit` is provided we don't try to count above that number. - /// - /// This method is used to decide where to create new image layers. Computing the - /// result for the entire partitioning at once allows this function to be more - /// efficient, and further optimization is possible by using iterators instead, - /// to allow early return. - /// - /// TODO actually use this method instead of count_deltas. Currently we only use - /// it for benchmarks. - pub fn get_difficulty_map( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - limit: Option, - ) -> Vec { - // TODO This is a naive implementation. Perf improvements to do: - // 1. Instead of calling self.image_coverage and self.count_deltas, - // iterate the image and delta coverage only once. 
- partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - for range in &part.ranges { - if limit == Some(difficulty) { - break; - } - for (img_range, last_img) in self.image_coverage(range, lsn) { - if limit == Some(difficulty) { - break; - } - let img_lsn = if let Some(last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - - if img_lsn < lsn { - let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit); - difficulty = std::cmp::max(difficulty, num_deltas); - } - } - } - difficulty - }) - .collect() - } - /// Return all L0 delta layers pub fn level0_deltas(&self) -> &Vec> { &self.l0_delta_layers @@ -1069,6 +1106,10 @@ mod tests { use std::collections::HashMap; use std::path::PathBuf; + use crate::{ + DEFAULT_PG_VERSION, + tenant::{harness::TenantHarness, storage_layer::LayerName}, + }; use pageserver_api::key::DBDIR_KEY; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use utils::id::{TenantId, TimelineId}; @@ -1076,7 +1117,6 @@ mod tests { use super::*; use crate::tenant::IndexPart; - use crate::tenant::storage_layer::LayerName; #[derive(Clone)] struct LayerDesc { @@ -1101,7 +1141,6 @@ mod tests { } fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { - assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); let lhs: HashMap = lhs .found .into_iter() @@ -1127,17 +1166,12 @@ mod tests { let mut key = key_range.start; while key != key_range.end { let res = layer_map.search(key, end_lsn); - match res { - Some(res) => { - range_search_result - .found - .entry(res) - .or_default() - .add_key(key); - } - None => { - range_search_result.not_found.add_key(key); - } + if let Some(res) = res { + range_search_result + .found + .entry(res) + .or_default() + .add_key(key); } key = key.next(); @@ -1152,20 +1186,49 @@ mod tests { let range = Key::from_i128(100)..Key::from_i128(200); let res = layer_map.range_search(range.clone(), Lsn(100)); - assert_eq!( - res.not_found.to_keyspace(), - KeySpace { - ranges: vec![range] - } - ); + assert_range_search_result_eq(res, RangeSearchResult::new()); } - #[test] - fn ranged_search() { + #[tokio::test] + async fn ranged_search() { + let harness = TenantHarness::create("ranged_search").await.unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline_id = TimelineId::generate(); + // Create the timeline such that the in-memory layers can be written + // to the timeline directory. 
+ tenant + .create_test_timeline(timeline_id, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let gate = utils::sync::gate::Gate::default(); + let add_in_memory_layer = async |layer_map: &mut LayerMap, lsn_range: Range| { + let layer = InMemoryLayer::create( + harness.conf, + timeline_id, + harness.tenant_shard_id, + lsn_range.start, + &gate, + &ctx, + ) + .await + .unwrap(); + + layer.freeze(lsn_range.end).await; + + layer_map.frozen_layers.push_back(Arc::new(layer)); + }; + + let in_memory_layer_configurations = [ + vec![], + // Overlaps with the top-most image + vec![Lsn(35)..Lsn(50)], + ]; + let layers = vec![ LayerDesc { key_range: Key::from_i128(15)..Key::from_i128(50), - lsn_range: Lsn(0)..Lsn(5), + lsn_range: Lsn(5)..Lsn(6), is_delta: false, }, LayerDesc { @@ -1185,19 +1248,27 @@ mod tests { }, LayerDesc { key_range: Key::from_i128(35)..Key::from_i128(40), - lsn_range: Lsn(35)..Lsn(40), + lsn_range: Lsn(40)..Lsn(41), is_delta: false, }, ]; - let layer_map = create_layer_map(layers.clone()); - for start in 0..60 { - for end in (start + 1)..60 { - let range = Key::from_i128(start)..Key::from_i128(end); - let result = layer_map.range_search(range.clone(), Lsn(100)); - let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + let mut layer_map = create_layer_map(layers.clone()); + for in_memory_layers in in_memory_layer_configurations { + for in_mem_layer_range in in_memory_layers { + add_in_memory_layer(&mut layer_map, in_mem_layer_range).await; + } - assert_range_search_result_eq(result, expected); + for start in 0..60 { + for end in (start + 1)..60 { + let range = Key::from_i128(start)..Key::from_i128(end); + let result = layer_map.range_search(range.clone(), Lsn(100)); + let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + + eprintln!("{start}..{end}: {result:?}"); + + assert_range_search_result_eq(result, expected); + } } } } @@ -1490,12 +1561,348 @@ mod tests { // Sanity: the layer that holds latest data for the DBDIR key should always be visible // (just using this key as a key that will always exist for any layermap fixture) - let dbdir_layer = layer_map - .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) - .unwrap(); + let dbdir_layer = { + let readable_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + + match readable_layer.layer { + ReadableLayerWeak::PersistentLayer(desc) => desc, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(""), + } + }; assert!(matches!( - layer_visibilities.get(&dbdir_layer.layer).unwrap(), + layer_visibilities.get(&dbdir_layer).unwrap(), LayerVisibilityHint::Visible )); } } + +#[cfg(test)] +mod select_layer_tests { + use super::*; + + fn create_persistent_layer( + start_lsn: u64, + end_lsn: u64, + is_delta: bool, + ) -> Arc { + if !is_delta { + assert_eq!(end_lsn, start_lsn + 1); + } + + Arc::new(PersistentLayerDesc::new_test( + Key::MIN..Key::MAX, + Lsn(start_lsn)..Lsn(end_lsn), + is_delta, + )) + } + + fn create_inmem_layer(start_lsn: u64, end_lsn: u64) -> InMemoryLayerDesc { + InMemoryLayerDesc { + handle: InMemoryLayerHandle::Open, + lsn_range: Lsn(start_lsn)..Lsn(end_lsn), + } + } + + #[test] + fn test_select_layer_empty() { + assert!(LayerMap::select_layer(None, None, None, Lsn(100)).is_none()); + } + + #[test] + fn test_select_layer_only_delta() { + let delta = create_persistent_layer(10, 20, true); + let result = LayerMap::select_layer(Some(delta.clone()), None, None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( 
+ matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_only_image() { + let image = create_persistent_layer(10, 11, false); + let result = LayerMap::select_layer(None, Some(image.clone()), None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_only_inmem() { + let inmem = create_inmem_layer(10, 20); + let result = LayerMap::select_layer(None, None, Some(inmem.clone()), Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_image_inside_delta() { + let delta = create_persistent_layer(10, 20, true); + let image = create_persistent_layer(15, 16, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(100)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_newer_image() { + let delta = create_persistent_layer(10, 20, true); + let image = create_persistent_layer(25, 26, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + LayerMap::select_layer(Some(delta.clone()), None, None, result.lsn_floor).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_delta_with_older_image() { + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(10, 11, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = + LayerMap::select_layer(None, Some(image.clone()), None, result.lsn_floor).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_image_inside_inmem() { + let image = create_persistent_layer(15, 16, false); + let inmem = create_inmem_layer(10, 25); + + let result = + LayerMap::select_layer(None, Some(image.clone()), Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + None, + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + 
LayerMap::select_layer(None, None, Some(inmem.clone()), result.lsn_floor).unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_delta_inside_inmem() { + let delta_top = create_persistent_layer(15, 20, true); + let delta_bottom = create_persistent_layer(10, 15, true); + let inmem = create_inmem_layer(15, 25); + + let result = + LayerMap::select_layer(Some(delta_top.clone()), None, Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta_top.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_top)) + ); + + let result = LayerMap::select_layer( + Some(delta_bottom.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_bottom)) + ); + } + + #[test] + fn test_select_layer_all_overlap_1() { + let inmem = create_inmem_layer(10, 30); + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_2() { + let inmem = create_inmem_layer(20, 30); + let delta = create_persistent_layer(10, 40, true); + let image = create_persistent_layer(25, 26, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(26)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_3() { + let inmem = create_inmem_layer(30, 40); + let delta = create_persistent_layer(10, 30, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(30)); + assert!(matches!(result.layer, 
ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } +} diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index f8bec48886..b3dc8e56a3 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage { /// The latest state head: LayerCoverageTuple, + /// TODO: this could be an ordered vec using binary search. + /// We push into this map everytime we add a layer, so might see some benefit /// All previous states historic: BTreeMap>, } @@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage { buffer: BTreeMap>, /// All current layers. This is not used for search. Only to make rebuilds easier. + // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of + // [`Self::historic_coverage`] instead of doubling memory usage. + // [`Self::len`]: can require rebuild and serve from latest historic + // [`Self::iter`]: already requires rebuild => can serve from latest historic layers: BTreeMap, } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 7f313f46a2..ece163b24a 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; use super::PageReconstructError; +use super::layer_map::InMemoryLayerDesc; use super::timeline::{GetVectoredError, ReadPath}; use crate::config::PageServerConf; use crate::context::{AccessStatsBehavior, RequestContext}; @@ -721,6 +722,12 @@ struct LayerToVisitId { lsn_floor: Lsn, } +#[derive(Debug, PartialEq, Eq, Hash)] +pub enum ReadableLayerWeak { + PersistentLayer(Arc), + InMemoryLayer(InMemoryLayerDesc), +} + /// Layer wrapper for the read path. Note that it is valid /// to use these layers even after external operations have /// been performed on them (compaction, freeze, etc.). 
@@ -873,7 +880,7 @@ impl ReadableLayer { } ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index ffdfe1dc27..46135b5330 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -416,7 +416,7 @@ impl InMemoryLayer { pub(crate) async fn get_values_reconstruct_data( self: &Arc, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { @@ -433,8 +433,6 @@ impl InMemoryLayer { let mut reads: HashMap> = HashMap::new(); let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); - let lsn_range = self.start_lsn..end_lsn; - for range in keyspace.ranges.iter() { for (key, vec_map) in inner .index diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d43dfefdbc..a7f3c6b8c5 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -49,6 +49,7 @@ async fn smoke_test() { Lsn(0x10), 14, &ctx, + Default::default(), // in-memory layers Default::default(), image_layers, Lsn(0x100), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 851f84f603..17dbcee74e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3914,39 +3914,22 @@ impl Timeline { let guard = timeline.layers.read().await; let layers = guard.layer_map()?; - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); - - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); - } - } + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + guard.upgrade(layer), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } // It's safe to drop the layer map lock after planning the next round of reads. @@ -5555,6 +5538,14 @@ pub struct DeltaLayerTestDesc { pub data: Vec<(Key, Lsn, Value)>, } +#[cfg(test)] +#[derive(Clone)] +pub struct InMemoryLayerTestDesc { + pub lsn_range: Range, + pub data: Vec<(Key, Lsn, Value)>, + pub is_open: bool, +} + #[cfg(test)] impl DeltaLayerTestDesc { pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { @@ -6567,6 +6558,92 @@ impl Timeline { Ok(()) } + /// Force create an in-memory layer and place them into the layer map. 
+ #[cfg(test)] + pub(super) async fn force_create_in_memory_layer( + self: &Arc, + mut in_memory: InMemoryLayerTestDesc, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use utils::bin_ser::BeSer; + + // Validate LSNs + if let Some(check_start_lsn) = check_start_lsn { + assert!(in_memory.lsn_range.start >= check_start_lsn); + } + + let last_record_lsn = self.get_last_record_lsn(); + let layer_end_lsn = if in_memory.is_open { + in_memory + .data + .iter() + .map(|(_key, lsn, _value)| lsn) + .max() + .cloned() + } else { + Some(in_memory.lsn_range.end) + }; + + if let Some(end) = layer_end_lsn { + assert!( + end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + end, + last_record_lsn, + ); + } + + in_memory.data.iter().for_each(|(_key, lsn, _value)| { + assert!(*lsn >= in_memory.lsn_range.start); + assert!(*lsn < in_memory.lsn_range.end); + }); + + // Build the batch + in_memory + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + + let data = in_memory + .data + .into_iter() + .map(|(key, lsn, value)| { + let value_size = value.serialized_size().unwrap() as usize; + (key.to_compact(), lsn, value_size, value) + }) + .collect::>(); + + let batch = SerializedValueBatch::from_values(data); + + // Create the in-memory layer and write the batch into it + let layer = InMemoryLayer::create( + self.conf, + self.timeline_id, + self.tenant_shard_id, + in_memory.lsn_range.start, + &self.gate, + ctx, + ) + .await + .unwrap(); + + layer.put_batch(batch, ctx).await.unwrap(); + if !in_memory.is_open { + layer.freeze(in_memory.lsn_range.end).await; + } + + info!("force created in-memory layer {:?}", in_memory.lsn_range); + + // Link the layer to the layer map + { + let mut guard = self.layers.write().await; + let layer_map = guard.open_mut().unwrap(); + layer_map.force_insert_in_memory_layer(Arc::new(layer)); + } + + Ok(()) + } + /// Return all keys at the LSN in the image layers #[cfg(test)] pub(crate) async fn inspect_image_layers( @@ -6999,6 +7076,7 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), @@ -7132,6 +7210,7 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e552ea83de..1b489028dc 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -8,14 +8,14 @@ use tracing::trace; use utils::id::TimelineId; use utils::lsn::{AtomicLsn, Lsn}; -use super::TimelineWriterState; +use super::{ReadableLayer, TimelineWriterState}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::metrics::TimelineMetrics; use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; use crate::tenant::storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, - PersistentLayerKey, ResidentLayer, + PersistentLayerKey, ReadableLayerWeak, ResidentLayer, }; /// Provides semantic APIs to manipulate the layer map. 
@@ -37,6 +37,21 @@ impl Default for LayerManager { } impl LayerManager { + pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { + match weak { + ReadableLayerWeak::PersistentLayer(desc) => { + ReadableLayer::PersistentLayer(self.get_from_desc(&desc)) + } + ReadableLayerWeak::InMemoryLayer(desc) => { + let inmem = self + .layer_map() + .expect("no concurrent shutdown") + .in_memory_layer(&desc); + ReadableLayer::InMemoryLayer(inmem) + } + } + } + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. @@ -470,6 +485,25 @@ impl OpenLayerManager { mapping.remove(layer); layer.delete_on_drop(); } + + #[cfg(test)] + pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc) { + use pageserver_api::models::InMemoryLayerInfo; + + match layer.info() { + InMemoryLayerInfo::Open { .. } => { + assert!(self.layer_map.open_layer.is_none()); + self.layer_map.open_layer = Some(layer); + } + InMemoryLayerInfo::Frozen { lsn_start, .. } => { + if let Some(last) = self.layer_map.frozen_layers.back() { + assert!(last.get_lsn_range().end <= lsn_start); + } + + self.layer_map.frozen_layers.push_back(layer); + } + } + } } pub(crate) struct LayerFileManager(HashMap); From 9a4e2eab61844784b8323cddaae1ac3952b9f6f6 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Mon, 3 Mar 2025 20:00:53 +0200 Subject: [PATCH 19/61] Fix artifact name for build with sanitizers (#11066) ## Problem When a build is made with sanitizers, this is not reflected in the artifact name, which can lead to overriding normal builds with sanitized ones. ## Summary of changes Take this property of a build into account when constructing the artifact name. 
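To make the intended naming concrete, here is a minimal Python sketch of the scheme the workflow now uses (illustration only, not part of the patch; the helper name and example values are made up — the real logic lives in the GitHub Actions expression shown in the diff below):

```python
def artifact_name(os: str, arch: str, build_type: str, sanitizers: str) -> str:
    # Mirrors neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}
    # plus a "-sanitized" marker when sanitizers are enabled, then "-artifact".
    suffix = "-sanitized" if sanitizers == "enabled" else ""
    return f"neon-{os}-{arch}-{build_type}{suffix}-artifact"

# A sanitized build no longer overwrites the regular artifact of the same build type:
assert artifact_name("Linux", "X64", "release", "enabled") == "neon-Linux-X64-release-sanitized-artifact"
assert artifact_name("Linux", "X64", "release", "disabled") == "neon-Linux-X64-release-artifact"
```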
--- .github/actions/run-python-test-set/action.yml | 8 +++++++- .github/workflows/_build-and-test-locally.yml | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 122fe48b68..fa6f882161 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,11 @@ inputs: description: 'Postgres version to use for tests' required: false default: 'v16' + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string benchmark_durations: description: 'benchmark durations JSON' required: false @@ -59,7 +64,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} @@ -112,6 +117,7 @@ runs: ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} + SANITIZERS: ${{ inputs.sanitizers }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 30fde127b0..6a2070424a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -280,7 +280,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -347,6 +347,7 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} + sanitizers: ${{ inputs.sanitizers }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. # Attempt to stop tests gracefully to generate test reports @@ -359,7 +360,6 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} - SANITIZERS: ${{ inputs.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 From 5197e43396f65bab9f9bf54edf8e2b899b1c1b69 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 3 Mar 2025 19:04:01 +0000 Subject: [PATCH 20/61] pageserver: add recurse flag to layer download spec (#11068) I missed updating the open api spec in the original PR. We need this so that the cplane auto-generated client sees the flag. 
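For context, a caller opting into recursive heatmap downloads would pass the new flag as a boolean query parameter. A minimal sketch, assuming a `requests`-based client and an illustrative endpoint path (the exact path and any other query parameters should be taken from `openapi_spec.yml`, not from this sketch):

```python
import requests

def download_heatmap_layers(
    pageserver_api: str, tenant_shard_id: str, timeline_id: str, recurse: bool = False
) -> None:
    # "recurse" asks the pageserver to also download heatmap layers of ancestor timelines.
    # The path below is an assumption for illustration; see the OpenAPI spec for the real one.
    url = f"{pageserver_api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers"
    response = requests.post(url, params={"recurse": "true" if recurse else "false"})
    response.raise_for_status()
```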
--- pageserver/src/http/openapi_spec.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 12252739fd..0fb9a240d5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -842,6 +842,12 @@ paths: required: false schema: type: integer + - name: recurse + description: When set, will recurse with the downloads into ancestor timelines + in: query + required: false + schema: + type: boolean post: description: | Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter From 6ca49b4d0c90009bc0c9b9934fe3d3835ade65ea Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 3 Mar 2025 19:16:03 +0000 Subject: [PATCH 21/61] safekeeper: fix a gap tracking edge case (#11054) The interpreted reader tracks a record aligned current position in the WAL stream. Partial reads move the stream internally, but not from the pov of the interpreted WAL reader. Hence, where new shards subscribe with a start position that matches the reader's current position, but we've also done some partial reads. This confuses the gap tracking. To make it more robust, update the current batch start to the min between the new start position and its current value. Since no record has been decoded yet (position matches), we can't have lost it --- safekeeper/src/send_interpreted_wal.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 2c1c73c25c..bf03f27d48 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -184,6 +184,16 @@ impl InterpretedWalReaderState { to: *current_position, } } else { + // Edge case: The new shard is at the same current position as + // the reader. Note that the current position is WAL record aligned, + // so the reader might have done some partial reads and updated the + // batch start. If that's the case, adjust the batch start to match + // starting position of the new shard. It can lead to some shards + // seeing overlaps, but in that case the actual record LSNs are checked + // which should be fine based on the filtering logic. + if let Some(start) = current_batch_wal_start { + *start = std::cmp::min(*start, new_shard_start_pos); + } CurrentPositionUpdate::NotReset(*current_position) } } From dbf9a8026162f01f95c9e218180fa0885b37410b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Mar 2025 15:23:20 -0500 Subject: [PATCH 22/61] fix(pageserver): avoid flooding gc-compaction logs (#11024) ## Problem The "did not trigger" gets logged at 10k/minute in staging. ## Summary of changes Change it to debug level. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index c835980a7d..76c28e11ab 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -321,7 +321,7 @@ impl GcCompactionQueue { l1_size, l2_size, l2_lsn, gc_cutoff ); } else { - info!( + debug!( "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", l1_size, l2_size, l2_lsn, gc_cutoff ); From 6d0976dad5517531a2163ddd67e3d1e1b9cd9756 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Mar 2025 16:05:43 -0500 Subject: [PATCH 23/61] feat(pageserver): persist reldir v2 migration status (#10980) ## Problem part of https://github.com/neondatabase/neon/issues/9516 ## Summary of changes Similar to the aux v2 migration, we persist the relv2 migration status into index_part, so that even the config item is set to false, we will still read from the v2 storage to avoid loss of data. Note that only the two variants `None` and `Some(RelSizeMigration::Migrating)` are used for now. We don't have full migration implemented so it will never be set to `RelSizeMigration::Migrated`. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 19 ++++++++ pageserver/src/http/routes.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 47 +++++++++++++++++-- pageserver/src/tenant.rs | 6 ++- .../src/tenant/remote_timeline_client.rs | 19 +++++++- .../tenant/remote_timeline_client/index.rs | 16 +------ pageserver/src/tenant/timeline.rs | 28 ++++++++++- pageserver/src/tenant/timeline/delete.rs | 1 + .../performance/test_perf_many_relations.py | 7 +++ test_runner/regress/test_pg_regress.py | 15 ++++++ test_runner/regress/test_relations.py | 44 ++++++++++++++++- 11 files changed, 178 insertions(+), 25 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ea565e7769..fabfe28aa2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1165,6 +1165,21 @@ pub struct OffloadedTimelineInfo { pub archived_at: chrono::DateTime, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. + Migrated, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -1243,7 +1258,11 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. + /// Whether the timeline is archived. pub is_archived: Option, + + /// The status of the rel_size migration. 
+ pub rel_size_migration: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a3ee31d6e6..cd79aa6680 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -481,6 +481,7 @@ async fn build_timeline_info_common( state, is_archived: Some(is_archived), + rel_size_migration: Some(timeline.get_rel_size_v2_status()), walreceiver_status, }; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c10dfb4542..8aa96dd672 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -21,6 +21,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::RelSizeMigration; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -492,7 +493,9 @@ impl Timeline { // Otherwise, read the old reldir keyspace. // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. - if self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Migrated | RelSizeMigration::Migrating = + self.get_rel_size_v2_status() + { // fetch directory listing (new) let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) @@ -544,7 +547,7 @@ impl Timeline { forknum: *forknum, })); - if !self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() { return Ok(rels_v1); } @@ -1720,6 +1723,35 @@ impl DatadirModification<'_> { Ok(()) } + /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that + /// we enable it, we also need to persist it in `index_part.json`. 
+ pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result { + let status = self.tline.get_rel_size_v2_status(); + let config = self.tline.get_rel_size_v2_enabled(); + match (config, status) { + (false, RelSizeMigration::Legacy) => { + // tenant config didn't enable it and we didn't write any reldir_v2 key yet + Ok(false) + } + (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + Ok(true) + } + (true, RelSizeMigration::Legacy) => { + // The first time we enable it, we need to persist it in `index_part.json` + self.tline + .update_rel_size_v2_status(RelSizeMigration::Migrating)?; + tracing::info!("enabled rel_size_v2"); + Ok(true) + } + (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + // and we don't need to do anything + Ok(true) + } + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1728,6 +1760,8 @@ impl DatadirModification<'_> { img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; + // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; @@ -1748,7 +1782,7 @@ impl DatadirModification<'_> { })?; self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); } @@ -1905,7 +1939,9 @@ impl DatadirModification<'_> { return Err(RelationError::AlreadyExists); } - if self.tline.get_rel_size_v2_enabled() { + let v2_enabled = self.maybe_enable_rel_size_v2()?; + + if v2_enabled { let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 @@ -2031,6 +2067,7 @@ impl DatadirModification<'_> { drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2043,7 +2080,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; true - } else if self.tline.get_rel_size_v2_enabled() { + } else if v2_enabled { // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion // logic). 
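The gating above is easiest to read as a small decision table over the tenant config flag and the status persisted in `index_part.json`. The sketch below is self-contained and illustrative only: the enum mirrors `RelSizeMigration` from `pageserver_api::models`, while `write_v2`/`read_v2` are made-up names summarizing what `maybe_enable_rel_size_v2` and the read path decide.

```rust
// Standalone summary of the reldir v2 gating introduced in this patch.
// `RelSizeMigration` mirrors the pageserver_api enum; `write_v2` and `read_v2`
// are illustrative helpers, not real pageserver functions.
#[derive(Clone, Copy, PartialEq)]
enum RelSizeMigration {
    Legacy,
    Migrating,
    Migrated,
}

/// Write path: v2 is used if the tenant config enables it *or* index_part
/// already records that the migration started, so flipping the config back to
/// false cannot stop writes to keys that only exist in the v2 keyspace.
fn write_v2(config_enabled: bool, status: RelSizeMigration) -> bool {
    config_enabled || status != RelSizeMigration::Legacy
}

/// Read path: only the persisted status matters; the config is ignored.
fn read_v2(status: RelSizeMigration) -> bool {
    matches!(status, RelSizeMigration::Migrating | RelSizeMigration::Migrated)
}

fn main() {
    // Config turned off after the migration started: reads and writes still go
    // to v2, which is what test_pageserver_reldir_v2 exercises below by
    // disabling rel_size_v2_enabled and restarting the pageserver.
    assert!(write_v2(false, RelSizeMigration::Migrating));
    assert!(read_v2(RelSizeMigration::Migrating));
    // Fresh tenant with the config off stays on the legacy format.
    assert!(!write_v2(false, RelSizeMigration::Legacy));
    assert!(!read_v2(RelSizeMigration::Legacy));
}
```

The key property is that once `Migrating` has been persisted, disabling the tenant config no longer changes behaviour; only a full migration to `Migrated` would allow the old format to be dropped.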
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 776e523c2e..fee007b2d7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -31,8 +31,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use itertools::Itertools as _; use once_cell::sync::Lazy; -use pageserver_api::models; pub use pageserver_api::models::TenantState; +use pageserver_api::models::{self, RelSizeMigration}; use pageserver_api::models::{ CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, WalRedoManagerStatus, @@ -1123,6 +1123,7 @@ impl Tenant { CreateTimelineCause::Load, idempotency.clone(), index_part.gc_compaction.clone(), + index_part.rel_size_migration.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -4128,6 +4129,7 @@ impl Tenant { cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, gc_compaction_state: Option, + rel_size_v2_status: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -4160,6 +4162,7 @@ impl Tenant { self.attach_wal_lag_cooldown.clone(), create_idempotency, gc_compaction_state, + rel_size_v2_status, self.cancel.child_token(), ); @@ -5231,6 +5234,7 @@ impl Tenant { CreateTimelineCause::Load, create_guard.idempotency.clone(), None, + None, ) .context("Failed to create timeline data structure")?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4ba5844fea..2ca482ca43 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -194,7 +194,7 @@ pub(crate) use download::{ }; use index::GcCompactionState; pub(crate) use index::LayerFileMetadata; -use pageserver_api::models::TimelineArchivalState; +use pageserver_api::models::{RelSizeMigration, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use regex::Regex; use remote_storage::{ @@ -900,7 +900,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, gc_compaction_state: GcCompactionState, @@ -912,6 +912,21 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field. + pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update( + self: &Arc, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status); + // TODO: allow this operation to bypass the validation check because we might upload the index part + // with no layers but the flag updated. For now, we just modify the index part in memory and the next + // upload will include the flag. + // self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. 
/// diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index ceaed58bbd..16c38be907 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,6 +7,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::RelSizeMigration; use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; @@ -117,21 +118,6 @@ pub struct GcCompactionState { pub(crate) last_completed_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub enum RelSizeMigration { - /// The tenant is using the old rel_size format. - /// Note that this enum is persisted as `Option` in the index part, so - /// `None` is the same as `Some(RelSizeMigration::Legacy)`. - Legacy, - /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are - /// persisted in the index part. The read path will read both formats and merge them. - Migrating, - /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted - /// in the index part, and the read path will not read the old format. - Migrated, -} - impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be /// used to understand later versions. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 17dbcee74e..7ed7910732 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart use pageserver_api::models::{ CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState, }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; @@ -436,6 +436,8 @@ pub struct Timeline { /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. heatmap_layers_downloader: Mutex>, + + pub(crate) rel_size_v2_status: ArcSwapOption, } pub(crate) enum PreviousHeatmap { @@ -2368,6 +2370,9 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path + /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is + /// possible that the index part persists the state while the config doesn't get persisted. 
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2376,6 +2381,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) } + pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration { + self.rel_size_v2_status + .load() + .as_ref() + .map(|s| s.as_ref().clone()) + .unwrap_or(RelSizeMigration::Legacy) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2636,6 +2649,7 @@ impl Timeline { attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, gc_compaction_state: Option, + rel_size_v2_status: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2794,6 +2808,8 @@ impl Timeline { previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), heatmap_layers_downloader: Mutex::new(None), + + rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), }; result.repartition_threshold = @@ -2870,6 +2886,16 @@ impl Timeline { .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) } + pub(crate) fn update_rel_size_v2_status( + &self, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + self.rel_size_v2_status + .store(Some(Arc::new(rel_size_v2_status.clone()))); + self.remote_client + .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status) + } + pub(crate) fn get_gc_compaction_state(&self) -> Option { self.gc_compaction_state.load_full().as_ref().clone() } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 7cdc69e55f..c9666bb4e1 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -306,6 +306,7 @@ impl DeleteTimelineFlow { CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here None, // doesn't matter what we put here + None, // doesn't matter what we put here ) .context("create_timeline_struct")?; diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 2570c55f6c..e2f0a79018 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -83,6 +83,13 @@ def test_perf_simple_many_relations_reldir_v2( ], ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + n = 100000 step = 5000 # Create many relations diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6a76ad5ca8..df243c13f1 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -358,6 +358,21 @@ def test_tx_abort_with_many_relations( ], ) + if reldir_type == "v1": + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + else: + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. 
if reldir_type == "v1": diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py index 3e29c92a96..07eacfc775 100644 --- a/test_runner/regress/test_relations.py +++ b/test_runner/regress/test_relations.py @@ -19,6 +19,17 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + + # Ensure the pageserver accepts the table creation SQLs before the migration. In theory, we can also do + # a "wait_flush_lsn" here, but it's easier to just do a restart. + env.pageserver.restart() + # Switch to v2 env.pageserver.http_client().update_tenant_config( env.initial_tenant, @@ -27,6 +38,13 @@ def test_pageserver_reldir_v2( }, ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo1") endpoint.safe_psql("SELECT * FROM foo2") @@ -41,12 +59,14 @@ def test_pageserver_reldir_v2( # Create a relation in v2 endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)") # Delete a relation in v1 endpoint.safe_psql("DROP TABLE foo1") # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") + endpoint.safe_psql("SELECT * FROM foo4") # Restart the endpoint endpoint.stop() @@ -57,7 +77,7 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") - + endpoint.safe_psql("SELECT * FROM foo4") endpoint.safe_psql("DROP TABLE foo3") endpoint.stop() endpoint.start() @@ -66,3 +86,25 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("DROP TABLE IF EXISTS foo3") + endpoint.safe_psql("SELECT * FROM foo4") + + # Set the config to false to emulate the case where the config is not persisted when the tenant gets detached/attached. + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": False, + }, + ) + + # Check if the relation is still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo4") + + env.pageserver.restart() + + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "migrating" + ) From 65addfc5246ad9ef633874dddb3ab0d66c028fe3 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 3 Mar 2025 23:04:59 +0100 Subject: [PATCH 24/61] storcon: add per-tenant rate limiting for API requests (#10924) ## Problem Incoming requests often take the service lock, and sometimes even do database transactions. That creates a risk that a rogue client can starve the controller of the ability to do its primary job of reconciling tenants to an available state. ## Summary of changes * Use the `governor` crate to rate limit tenant requests at 10 requests per second. This is ~10-100x lower than the worst "attack" we've seen from a client bug. Admin APIs are not rate limited. 
* Add a `storage_controller_http_request_rate_limited` histogram for rate limited requests. * Log a warning every 10 seconds for rate limited tenants. The rate limiter is parametrized on TenantId, because the kinds of client bug we're protecting against generally happen within tenant scope, and the rates should be somewhat stable: we expect the global rate of requests to increase as we do more work, but we do not expect the rate of requests to one tenant to increase. --------- Co-authored-by: John Spray --- Cargo.lock | 77 ++++++++++++++++++- Cargo.toml | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/http.rs | 69 +++++++++++++++-- storage_controller/src/main.rs | 6 ++ storage_controller/src/metrics.rs | 4 + storage_controller/src/service.rs | 5 ++ .../fixtures/pageserver/allowed_errors.py | 2 + 8 files changed, 156 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 293ed465ff..a978e4d744 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2410,9 +2410,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" @@ -2515,6 +2515,27 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "governor" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0" +dependencies = [ + "cfg-if", + "dashmap 6.1.0", + "futures-sink", + "futures-timer", + "futures-util", + "no-std-compat", + "nonzero_ext", + "parking_lot 0.12.1", + "portable-atomic", + "quanta", + "rand 0.8.5", + "smallvec", + "spinning_top", +] + [[package]] name = "group" version = "0.12.1" @@ -3725,6 +3746,12 @@ dependencies = [ "memoffset 0.9.0", ] +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -3735,6 +3762,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "notify" version = "8.0.0" @@ -4591,6 +4624,12 @@ dependencies = [ "never-say-never", ] +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "postgres" version = "0.19.7" @@ -5052,6 +5091,21 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "quanta" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi 0.11.0+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quick-xml" version = "0.26.0" @@ -5182,6 +5236,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "raw-cpuid" 
+version = "11.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "rayon" version = "1.7.0" @@ -6395,6 +6458,15 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + [[package]] name = "spki" version = "0.6.0" @@ -6471,6 +6543,7 @@ dependencies = [ "diesel_migrations", "fail", "futures", + "governor", "hex", "http-utils", "humantime", diff --git a/Cargo.toml b/Cargo.toml index ff45d46a47..870b3412db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,7 @@ futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" +governor = "0.8" hashbrown = "0.14" hashlink = "0.9.1" hdrhistogram = "7.5.2" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index b63ba154da..6b657b5ea0 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -21,6 +21,7 @@ clap.workspace = true cron.workspace = true fail.workspace = true futures.workspace = true +governor.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 64f0be3c23..3e448d7013 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,5 +1,5 @@ use std::str::FromStr; -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use std::time::{Duration, Instant}; use anyhow::Context; @@ -33,6 +33,7 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; use pageserver_client::{BlockUnblock, mgmt_api}; use routerify::Middleware; use tokio_util::sync::CancellationToken; +use tracing::warn; use utils::auth::{Scope, SwappableJwtAuth}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -49,6 +50,7 @@ use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIME pub struct HttpState { service: Arc, auth: Option>, + rate_limiter: governor::DefaultKeyedRateLimiter, neon_metrics: NeonMetrics, allowlist_routes: &'static [&'static str], } @@ -59,9 +61,11 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { + let quota = governor::Quota::per_second(service.get_config().tenant_rate_limit); Self { service, auth, + rate_limiter: governor::RateLimiter::keyed(quota), neon_metrics: NeonMetrics::new(build_info), allowlist_routes: &[ "/status", @@ -82,6 +86,40 @@ fn get_state(request: &Request) -> &HttpState { .as_ref() } +/// Rate limits tenant requests. +/// +/// TODO: this should be a request middleware, but requires us to extract the tenant ID from +/// different URLs in a systematic way. +/// +/// TODO: consider returning a 429 response if these start piling up. +async fn maybe_rate_limit(request: &Request, tenant_id: TenantId) { + // Check if the tenant should be rate-limited. + let rate_limiter = &get_state(request).rate_limiter; + if rate_limiter.check_key(&tenant_id).is_ok() { + return; + } + + // Measure the rate limiting delay. + let _timer = METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_rate_limited + .start_timer(); + + // Log rate limited tenants once every 10 seconds. 
+ static LOG_RATE_LIMITER: LazyLock> = + LazyLock::new(|| { + let quota = governor::Quota::with_period(Duration::from_secs(10)).unwrap(); + governor::RateLimiter::keyed(quota) + }); + + if LOG_RATE_LIMITER.check_key(&tenant_id).is_ok() { + warn!("tenant {tenant_id} is rate limited") + } + + // Wait for quota. + rate_limiter.until_key_ready(&tenant_id).await; +} + /// Pageserver calls into this on startup, to learn which tenants it should attach async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -247,6 +285,7 @@ async fn handle_tenant_config_get( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -264,6 +303,7 @@ async fn handle_tenant_time_travel_remote_storage( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -311,6 +351,7 @@ async fn handle_tenant_secondary_download( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -329,6 +370,7 @@ async fn handle_tenant_delete( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -356,6 +398,7 @@ async fn handle_tenant_timeline_create( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -381,6 +424,7 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -457,6 +501,7 @@ async fn handle_tenant_timeline_archival_config( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -482,6 +527,7 @@ async fn handle_tenant_timeline_detach_ancestor( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -504,6 +550,7 @@ async fn handle_tenant_timeline_block_unblock_gc( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; @@ -521,6 +568,7 @@ async fn handle_tenant_timeline_download_heatmap_layers( let tenant_shard_id: TenantShardId = parse_request_param(&req, 
"tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_shard_id.tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; let concurrency: Option = parse_query_param(&req, "concurrency")?; @@ -550,6 +598,7 @@ async fn handle_tenant_timeline_passthrough( ) -> Result, ApiError> { let tenant_or_shard_id: TenantShardId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_or_shard_id.tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -654,6 +703,7 @@ async fn handle_tenant_locate( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -669,9 +719,9 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Scrubber)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Scrubber)?; + // NB: don't rate limit: scrubber operation. match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1006,6 +1056,7 @@ async fn handle_tenant_shard_split( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1028,6 +1079,7 @@ async fn handle_tenant_shard_migrate( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1051,6 +1103,7 @@ async fn handle_tenant_shard_migrate_secondary( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1074,6 +1127,7 @@ async fn handle_tenant_shard_cancel_reconcile( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1093,6 +1147,7 @@ async fn handle_tenant_shard_cancel_reconcile( async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. 
let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1148,9 +1203,9 @@ async fn handle_step_down(req: Request) -> Result, ApiError } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1165,9 +1220,9 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 380ffeb9b7..6ef17c0007 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,3 +1,4 @@ +use std::num::NonZeroU32; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; @@ -98,6 +99,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Tenant API rate limit, as requests per second per tenant. + #[arg(long, default_value = "10")] + tenant_rate_limit: NonZeroU32, + /// How long to wait for the initial database connection to be available. #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -339,6 +344,7 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index f490edb68f..ea390df726 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -76,6 +76,10 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_http_request_latency: measured::HistogramVec, + /// HTTP rate limiting latency across all tenants and endpoints + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 10.0))] + pub(crate) storage_controller_http_request_rate_limited: measured::Histogram<10>, + /// Count of HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_pageserver_request_error: diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 26ccfd5445..8fc7f7a0c5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5,6 +5,7 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::error::Error; +use std::num::NonZeroU32; use std::ops::Deref; use std::path::PathBuf; use std::str::FromStr; @@ -365,6 +366,10 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, + /// How many API requests per second to allow per tenant, across all + /// tenant-scoped API endpoints. Further API requests queue until ready. 
+ pub tenant_rate_limit: NonZeroU32, + /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. pub split_threshold: Option, diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 4fce558840..abddfa2768 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -124,6 +124,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # controller's attempts to notify the endpoint). ".*reconciler.*neon_local notification hook failed.*", ".*reconciler.*neon_local error.*", + # Tenant rate limits may fire in tests that submit lots of API requests. + ".*tenant \\S+ is rate limited.*", ] From 435bf452e6ec4b9a5e10388911ccd80140cb3311 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 4 Mar 2025 08:18:19 +0000 Subject: [PATCH 25/61] tests: remove obsolete err log whitelisting (#11069) The pageserver read path now supports overlapped in-memory and image layers via https://github.com/neondatabase/neon/pull/11000. These allowed errors are now obsolete. --- test_runner/regress/test_sharding.py | 11 ----------- test_runner/regress/test_storage_scrubber.py | 11 ----------- 2 files changed, 22 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index f58bbcd3c0..cb28f5b12d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1814,14 +1814,3 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn - - for ps in env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. - # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index d44c176b35..0f4e5688a9 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -312,17 +312,6 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() - for ps in env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. 
- # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) - def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ From a2902e774aaebbb3e424ad23be30a86e413ab431 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 4 Mar 2025 13:13:41 +0100 Subject: [PATCH 26/61] http-utils: generate heap profiles with jemalloc_pprof (#11075) ## Problem The code to generate symbolized pprof heap profiles and flamegraph SVGs has been upstreamed to the `jemalloc_pprof` crate: * https://github.com/polarsignals/rust-jemalloc-pprof/pull/22 * https://github.com/polarsignals/rust-jemalloc-pprof/pull/23 ## Summary of changes Use `jemalloc_pprof` to generate symbolized pprof heap profiles and flamegraph SVGs. This reintroduces a bunch of internal jemalloc stack frames that we'd previously strip, e.g. each stack now always ends with `prof_backtrace_impl` (where jemalloc takes a stack trace for heap profiling), but that seems ok. --- Cargo.lock | 18 ++- Cargo.toml | 4 +- libs/http-utils/Cargo.toml | 3 - libs/http-utils/src/endpoint.rs | 58 ++------ libs/http-utils/src/lib.rs | 1 - libs/http-utils/src/pprof.rs | 238 -------------------------------- libs/utils/Cargo.toml | 1 - 7 files changed, 21 insertions(+), 302 deletions(-) delete mode 100644 libs/http-utils/src/pprof.rs diff --git a/Cargo.lock b/Cargo.lock index a978e4d744..030753bca5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2798,12 +2798,9 @@ name = "http-utils" version = "0.1.0" dependencies = [ "anyhow", - "backtrace", "bytes", "fail", - "flate2", "hyper 0.14.30", - "inferno 0.12.0", "itertools 0.10.5", "jemalloc_pprof", "metrics", @@ -3302,9 +3299,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jemalloc_pprof" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992" dependencies = [ "anyhow", "libc", @@ -3503,9 +3500,9 @@ dependencies = [ [[package]] name = "mappings" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a" dependencies = [ "anyhow", "libc", @@ -4794,12 +4791,14 @@ dependencies = [ [[package]] name = "pprof_util" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416" dependencies = [ "anyhow", + "backtrace", "flate2", + "inferno 0.12.0", "num", "paste", "prost", @@ -7715,7 +7714,6 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", - "backtrace", "bincode", "byteorder", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 870b3412db..2303723e43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -backtrace = "0.3.74" flate2 = "1.0.26" assert-json-diff = "2" async-stream = "0.3" @@ -114,11 +113,10 @@ hyper-util = "0.1" 
tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" -inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" -jemalloc_pprof = "0.6" +jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] } jsonwebtoken = "9" lasso = "0.7" libc = "0.2" diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index d72e4bd012..d16dac7876 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -6,11 +6,8 @@ license.workspace = true [dependencies] anyhow.workspace = true -backtrace.workspace = true bytes.workspace = true -inferno.workspace = true fail.workspace = true -flate2.workspace = true hyper0.workspace = true itertools.workspace = true jemalloc_pprof.workspace = true diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index 6128113580..f4f93df62f 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -3,8 +3,6 @@ use std::io::Write as _; use std::str::FromStr; use std::time::Duration; -use ::pprof::ProfilerGuardBuilder; -use ::pprof::protos::Message as _; use anyhow::{Context, anyhow}; use bytes::{Bytes, BytesMut}; use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName}; @@ -12,7 +10,8 @@ use hyper::http::HeaderValue; use hyper::{Body, Method, Request, Response}; use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter}; use once_cell::sync::Lazy; -use regex::Regex; +use pprof::ProfilerGuardBuilder; +use pprof::protos::Message as _; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tokio::sync::{Mutex, Notify, mpsc}; @@ -22,7 +21,6 @@ use tracing::{Instrument, debug, info, info_span, warn}; use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::error::{ApiError, api_error_handler, route_error_handler}; -use crate::pprof; use crate::request::{get_query_param, parse_query_param}; static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { @@ -449,20 +447,6 @@ pub async fn profile_heap_handler(req: Request) -> Result, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; - // Functions and mappings to strip when symbolizing pprof profiles. If true, - // also remove child frames. - static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { - vec![ - (Regex::new("^__rust").unwrap(), false), - (Regex::new("^_start$").unwrap(), false), - (Regex::new("^irallocx_prof").unwrap(), true), - (Regex::new("^prof_alloc_prep").unwrap(), true), - (Regex::new("^std::rt::lang_start").unwrap(), false), - (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), - ] - }); - const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; - // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -495,45 +479,27 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - // Symbolize the profile. - // TODO: consider moving this upstream to jemalloc_pprof and avoiding the - // serialization roundtrip. - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - pprof::encode(&profile) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
- .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") - .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"") .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } Format::Svg => { - let body = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - let mut opts = inferno::flamegraph::Options::default(); - opts.title = "Heap inuse".to_string(); - opts.count_name = "bytes".to_string(); - pprof::flamegraph(profile, &mut opts) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "image/svg+xml") - .body(Body::from(body)) + .body(Body::from(svg)) .map_err(|err| ApiError::InternalServerError(err.into())) } } diff --git a/libs/http-utils/src/lib.rs b/libs/http-utils/src/lib.rs index c692a54257..1e9b3c761a 100644 --- a/libs/http-utils/src/lib.rs +++ b/libs/http-utils/src/lib.rs @@ -2,7 +2,6 @@ pub mod endpoint; pub mod error; pub mod failpoints; pub mod json; -pub mod pprof; pub mod request; extern crate hyper0 as hyper; diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs deleted file mode 100644 index 529017f350..0000000000 --- a/libs/http-utils/src/pprof.rs +++ /dev/null @@ -1,238 +0,0 @@ -use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; -use std::ffi::c_void; -use std::io::Write as _; - -use anyhow::bail; -use flate2::Compression; -use flate2::write::{GzDecoder, GzEncoder}; -use itertools::Itertools as _; -use pprof::protos::{Function, Line, Location, Message as _, Profile}; -use regex::Regex; - -/// Decodes a gzip-compressed Protobuf-encoded pprof profile. -pub fn decode(bytes: &[u8]) -> anyhow::Result { - let mut gz = GzDecoder::new(Vec::new()); - gz.write_all(bytes)?; - Ok(Profile::parse_from_bytes(&gz.finish()?)?) -} - -/// Encodes a pprof profile as gzip-compressed Protobuf. -pub fn encode(profile: &Profile) -> anyhow::Result> { - let mut gz = GzEncoder::new(Vec::new(), Compression::default()); - profile.write_to_writer(&mut gz)?; - Ok(gz.finish()?) -} - -/// Symbolizes a pprof profile using the current binary. -pub fn symbolize(mut profile: Profile) -> anyhow::Result { - if !profile.function.is_empty() { - return Ok(profile); // already symbolized - } - - // Collect function names. - let mut functions: HashMap = HashMap::new(); - let mut strings: HashMap = profile - .string_table - .into_iter() - .enumerate() - .map(|(i, s)| (s, i as i64)) - .collect(); - - // Helper to look up or register a string. - let mut string_id = |s: &str| -> i64 { - // Don't use .entry() to avoid unnecessary allocations. 
- if let Some(id) = strings.get(s) { - return *id; - } - let id = strings.len() as i64; - strings.insert(s.to_string(), id); - id - }; - - for loc in &mut profile.location { - if !loc.line.is_empty() { - continue; - } - - // Resolve the line and function for each location. - backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symbol_name) = symbol.name() else { - return; - }; - - let function_name = format!("{symbol_name:#}"); - let functions_len = functions.len(); - let function_id = functions - .entry(function_name) - .or_insert_with_key(|function_name| { - let function_id = functions_len as u64 + 1; - let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); - let filename = symbol - .filename() - .map(|path| path.to_string_lossy()) - .unwrap_or(Cow::Borrowed("")); - Function { - id: function_id, - name: string_id(function_name), - system_name: string_id(&system_name), - filename: string_id(&filename), - ..Default::default() - } - }) - .id; - loc.line.push(Line { - function_id, - line: symbol.lineno().unwrap_or(0) as i64, - ..Default::default() - }); - }); - } - - // Store the resolved functions, and mark the mapping as resolved. - profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); - profile.string_table = strings - .into_iter() - .sorted_by_key(|(_, i)| *i) - .map(|(s, _)| s) - .collect(); - - for mapping in &mut profile.mapping { - mapping.has_functions = true; - mapping.has_filenames = true; - } - - Ok(profile) -} - -/// Strips locations (stack frames) matching the given mappings (substring) or function names -/// (regex). The function bool specifies whether child frames should be stripped as well. -/// -/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all -/// string references. -pub fn strip_locations( - mut profile: Profile, - mappings: &[&str], - functions: &[(Regex, bool)], -) -> Profile { - // Strip mappings. - let mut strip_mappings: HashSet = HashSet::new(); - - profile.mapping.retain(|mapping| { - let Some(name) = profile.string_table.get(mapping.filename as usize) else { - return true; - }; - if mappings.iter().any(|substr| name.contains(substr)) { - strip_mappings.insert(mapping.id); - return false; - } - true - }); - - // Strip functions. - let mut strip_functions: HashMap = HashMap::new(); - - profile.function.retain(|function| { - let Some(name) = profile.string_table.get(function.name as usize) else { - return true; - }; - for (regex, strip_children) in functions { - if regex.is_match(name) { - strip_functions.insert(function.id, *strip_children); - return false; - } - } - true - }); - - // Strip locations. The bool specifies whether child frames should be stripped too. - let mut strip_locations: HashMap = HashMap::new(); - - profile.location.retain(|location| { - for line in &location.line { - if let Some(strip_children) = strip_functions.get(&line.function_id) { - strip_locations.insert(location.id, *strip_children); - return false; - } - } - if strip_mappings.contains(&location.mapping_id) { - strip_locations.insert(location.id, false); - return false; - } - true - }); - - // Strip sample locations. - for sample in &mut profile.sample { - // First, find the uppermost function with child removal and truncate the stack. - if let Some(truncate) = sample - .location_id - .iter() - .rposition(|id| strip_locations.get(id) == Some(&true)) - { - sample.location_id.drain(..=truncate); - } - // Next, strip any individual frames without child removal. 
- sample - .location_id - .retain(|id| !strip_locations.contains_key(id)); - } - - profile -} - -/// Generates an SVG flamegraph from a symbolized pprof profile. -pub fn flamegraph( - profile: Profile, - opts: &mut inferno::flamegraph::Options, -) -> anyhow::Result> { - if profile.mapping.iter().any(|m| !m.has_functions) { - bail!("profile not symbolized"); - } - - // Index locations, functions, and strings. - let locations: HashMap = - profile.location.into_iter().map(|l| (l.id, l)).collect(); - let functions: HashMap = - profile.function.into_iter().map(|f| (f.id, f)).collect(); - let strings = profile.string_table; - - // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, - // since inferno expects it bottom-up. - let mut stacks: HashMap, i64> = HashMap::new(); - for sample in profile.sample { - let mut stack = Vec::with_capacity(sample.location_id.len()); - for location in sample.location_id.into_iter().rev() { - let Some(location) = locations.get(&location) else { - bail!("missing location {location}"); - }; - for line in location.line.iter().rev() { - let Some(function) = functions.get(&line.function_id) else { - bail!("missing function {}", line.function_id); - }; - let Some(name) = strings.get(function.name as usize) else { - bail!("missing string {}", function.name); - }; - stack.push(name.as_str()); - } - } - let Some(&value) = sample.value.first() else { - bail!("missing value"); - }; - *stacks.entry(stack).or_default() += value; - } - - // Construct stack lines for inferno. - let lines = stacks - .into_iter() - .map(|(stack, value)| (stack.into_iter().join(";"), value)) - .map(|(stack, value)| format!("{stack} {value}")) - .sorted() - .collect_vec(); - - // Construct the flamegraph. - let mut bytes = Vec::new(); - let lines = lines.iter().map(|line| line.as_str()); - inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; - Ok(bytes) -} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5020d82adf..ac44300a51 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,7 +15,6 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true -backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true From 20af9cef17374a287850a866983943bcad579fa2 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 4 Mar 2025 09:55:50 -0500 Subject: [PATCH 27/61] fix(test): use the same value for reldir v1+v2 (#11070) ## Problem part of https://github.com/neondatabase/neon/issues/11067 My observation is that with the current value of settings, x86-v1 usually takes 30s, arm-v1 1m30s, x86-v2 1m, arm-v2 3m. But sometimes the system could run too slow and cause test to timeout on arm with reldir v2. While I investigate what's going on and further improve the performance, I'd like to set both of them to use the same test input, so that it doesn't timeout and we don't abuse this test case as a performance test. ## Summary of changes Use the same settings for both test cases. 
Signed-off-by: Alex Chi Z --- test_runner/regress/test_pg_regress.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index df243c13f1..d2a78b16e4 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -375,12 +375,8 @@ def test_tx_abort_with_many_relations( # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. - if reldir_type == "v1": - n = 4000 - step = 4000 - else: - n = 20000 - step = 5000 + n = 5000 + step = 2500 def create(): # Create many relations From 4bbdb758ec2f4f1552a76b8bce424fa7d0b2fdc7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 4 Mar 2025 17:39:32 +0100 Subject: [PATCH 28/61] compute_tools: appease unused lint on macOS (#11074) ## Problem On macOS, the `unused` lint complains about two variables not used in `!linux` builds. These were introduced in #11007. ## Summary of changes Appease the linter by explicitly using the variables in `!linux` branches. --- compute_tools/src/compute.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a89d3345c1..e4d5a6aaba 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -468,6 +468,8 @@ impl ComputeNode { // Kills the actual task running the monitor handle.abort(); } + } else { + _ = vm_monitor; // appease unused lint on macOS } } } @@ -791,6 +793,7 @@ impl ComputeNode { }; StartVmMonitorResult { token, vm_monitor } } else { + _ = disable_lfc_resizing; // appease unused lint on macOS StartVmMonitorResult { } } } From 7b7e4a9fd3b1909b282349939284649bbcb040b0 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 4 Mar 2025 12:08:00 -0600 Subject: [PATCH 29/61] Authorize compute_ctl requests from the control plane (#10530) The compute should only act if requests come from the control plane. 
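As a rough illustration for reviewers (not part of this patch), an acceptable bearer token is an EdDSA-signed JWT whose `compute_id` claim matches the target compute, verified against the JWKS that `compute_ctl` is configured with. The `mint_token` helper, the PEM-encoded signing key, and the fixed `exp` below are assumptions made up for the sketch:

```rust
use jsonwebtoken::{encode, Algorithm, EncodingKey, Header};
use serde::Serialize;

#[derive(Serialize)]
struct Claims {
    compute_id: String, // must equal the compute's own ID or the request is rejected
    exp: usize,         // enforced, since the middleware sets validate_exp = true
}

// Hypothetical helper: sign a token with an Ed25519 key whose public half is
// published in the compute's JWKS.
fn mint_token(ed25519_private_pem: &[u8], compute_id: &str) -> anyhow::Result<String> {
    let claims = Claims {
        compute_id: compute_id.to_owned(),
        exp: 2_000_000_000, // placeholder expiry (seconds since the Unix epoch)
    };
    let key = EncodingKey::from_ed_pem(ed25519_private_pem)?;
    Ok(encode(&Header::new(Algorithm::EdDSA), &claims, &key)?)
}
```

The resulting string is sent as `Authorization: Bearer <token>`. Requests arriving over the loopback interface, or deployments that ship an empty JWKS, bypass the check for now, as the new middleware notes with a warning.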
Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- Cargo.lock | 68 +++++++- Cargo.toml | 3 +- compute_tools/Cargo.toml | 2 + compute_tools/src/bin/compute_ctl.rs | 1 + compute_tools/src/compute.rs | 24 ++- compute_tools/src/http/extract/mod.rs | 2 + compute_tools/src/http/extract/request_id.rs | 86 ++++++++++ compute_tools/src/http/headers.rs | 2 + .../src/http/middleware/authorize.rs | 145 ++++++++++++++++ compute_tools/src/http/middleware/mod.rs | 1 + compute_tools/src/http/mod.rs | 2 + compute_tools/src/http/server.rs | 158 ++++++++++-------- libs/compute_api/src/responses.rs | 4 +- workspace_hack/Cargo.toml | 2 +- 14 files changed, 417 insertions(+), 83 deletions(-) create mode 100644 compute_tools/src/http/extract/request_id.rs create mode 100644 compute_tools/src/http/headers.rs create mode 100644 compute_tools/src/http/middleware/authorize.rs create mode 100644 compute_tools/src/http/middleware/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 030753bca5..772b1f50c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -783,6 +783,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum-extra" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b" +dependencies = [ + "axum", + "axum-core", + "bytes", + "futures-util", + "headers", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "serde", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + [[package]] name = "azure_core" version = "0.21.0" @@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "base64" -version = "0.21.1" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64" @@ -1305,6 +1327,7 @@ dependencies = [ "aws-sdk-s3", "aws-smithy-types", "axum", + "axum-extra", "base64 0.13.1", "bytes", "camino", @@ -1316,6 +1339,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -2297,7 +2321,7 @@ name = "framed-websockets" version = "0.1.0" source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "bytemuck", "bytes", "futures-core", @@ -2653,7 +2677,7 @@ version = "7.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "byteorder", "crossbeam-channel", "flate2", @@ -2661,6 +2685,30 @@ dependencies = [ "num-traits", ] +[[package]] +name = "headers" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9" +dependencies = [ + "base64 0.21.7", + "bytes", + "headers-core", + "http 1.1.0", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4" +dependencies = [ + "http 1.1.0", +] + [[package]] name = "heck" version = "0.5.0" @@ -3385,7 +3433,7 @@ version = "9.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "js-sys", "pem", "ring", @@ -4467,7 +4515,7 @@ version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "serde", ] @@ -5814,7 +5862,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", ] [[package]] @@ -5823,7 +5871,7 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "rustls-pki-types", ] @@ -7357,10 +7405,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ + "base64 0.22.1", "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", + "mime", "pin-project-lite", "tower-layer", "tower-service", @@ -8267,7 +8317,7 @@ dependencies = [ "ahash", "anyhow", "base64 0.13.1", - "base64 0.21.1", + "base64 0.21.7", "base64ct", "bytes", "camino", diff --git a/Cargo.toml b/Cargo.toml index 2303723e43..d11fe4f449 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,6 +67,7 @@ aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } +axum-extra = { version = "0.10.0", features = ["typed-header"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.71" @@ -191,7 +192,7 @@ toml = "0.8" toml_edit = "0.22" tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } -tower-http = { version = "0.6.2", features = ["request-id", "trace"] } +tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } # This revision uses opentelemetry 0.27. There's no tag for it. 
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8f3bcbeef8..dd2896714d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } +axum-extra.workspace = true camino.workspace = true chrono.workspace = true cfg-if.workspace = true @@ -25,6 +26,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 08966a6efb..fc7a3e2827 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -179,6 +179,7 @@ fn main() -> Result<()> { live_config_allowed: cli_spec.live_config_allowed, }, cli_spec.spec, + cli_spec.compute_ctl_config, )?; let exit_code = compute_node.run()?; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index e4d5a6aaba..d0b1bc2534 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -11,7 +11,7 @@ use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeMetrics, ComputeStatus}; +use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent}; use futures::StreamExt; use futures::future::join_all; @@ -132,6 +132,8 @@ pub struct ComputeState { /// passed by the control plane with a /configure HTTP request. pub pspec: Option, + pub compute_ctl_config: ComputeCtlConfig, + /// If the spec is passed by a /configure request, 'startup_span' is the /// /configure request's tracing span. The main thread enters it when it /// processes the compute startup, so that the compute startup is considered @@ -155,6 +157,7 @@ impl ComputeState { last_active: None, error: None, pspec: None, + compute_ctl_config: ComputeCtlConfig::default(), startup_span: None, metrics: ComputeMetrics::default(), } @@ -365,7 +368,11 @@ pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { } impl ComputeNode { - pub fn new(params: ComputeNodeParams, cli_spec: Option) -> Result { + pub fn new( + params: ComputeNodeParams, + cli_spec: Option, + compute_ctl_config: ComputeCtlConfig, + ) -> Result { let connstr = params.connstr.as_str(); let conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; @@ -377,6 +384,7 @@ impl ComputeNode { let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; new_state.pspec = Some(pspec); } + new_state.compute_ctl_config = compute_ctl_config; Ok(ComputeNode { params, @@ -405,11 +413,19 @@ impl ComputeNode { // Launch the external HTTP server first, so that we can serve control plane // requests while configuration is still in progress. 
- crate::http::server::Server::External(this.params.external_http_port).launch(&this); + crate::http::server::Server::External { + port: this.params.external_http_port, + jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(), + compute_id: this.params.compute_id.clone(), + } + .launch(&this); // The internal HTTP server could be launched later, but there isn't much // sense in waiting. - crate::http::server::Server::Internal(this.params.internal_http_port).launch(&this); + crate::http::server::Server::Internal { + port: this.params.internal_http_port, + } + .launch(&this); // If we got a spec from the CLI already, use that. Otherwise wait for the // control plane to pass it to us with a /configure HTTP request diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs index 1b690e444d..589681cfe2 100644 --- a/compute_tools/src/http/extract/mod.rs +++ b/compute_tools/src/http/extract/mod.rs @@ -1,7 +1,9 @@ pub(crate) mod json; pub(crate) mod path; pub(crate) mod query; +pub(crate) mod request_id; pub(crate) use json::Json; pub(crate) use path::Path; pub(crate) use query::Query; +pub(crate) use request_id::RequestId; diff --git a/compute_tools/src/http/extract/request_id.rs b/compute_tools/src/http/extract/request_id.rs new file mode 100644 index 0000000000..d911921a05 --- /dev/null +++ b/compute_tools/src/http/extract/request_id.rs @@ -0,0 +1,86 @@ +use std::{ + fmt::Display, + ops::{Deref, DerefMut}, +}; + +use axum::{extract::FromRequestParts, response::IntoResponse}; +use http::{StatusCode, request::Parts}; + +use crate::http::{JsonResponse, headers::X_REQUEST_ID}; + +/// Extract the request ID from the `X-Request-Id` header. +#[derive(Debug, Clone, Default)] +pub(crate) struct RequestId(pub String); + +#[derive(Debug)] +/// Rejection used for [`RequestId`]. +/// +/// Contains one variant for each way the [`RequestId`] extractor can +/// fail. +pub(crate) enum RequestIdRejection { + /// The request is missing the header. + MissingRequestId, + + /// The value of the header is invalid UTF-8. 
+ InvalidUtf8, +} + +impl RequestIdRejection { + pub fn status(&self) -> StatusCode { + match self { + RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR, + RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST, + } + } + + pub fn message(&self) -> String { + match self { + RequestIdRejection::MissingRequestId => "request ID is missing", + RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8", + } + .to_string() + } +} + +impl IntoResponse for RequestIdRejection { + fn into_response(self) -> axum::response::Response { + JsonResponse::error(self.status(), self.message()) + } +} + +impl FromRequestParts for RequestId +where + S: Send + Sync, +{ + type Rejection = RequestIdRejection; + + async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result { + match parts.headers.get(X_REQUEST_ID) { + Some(value) => match value.to_str() { + Ok(request_id) => Ok(Self(request_id.to_string())), + Err(_) => Err(RequestIdRejection::InvalidUtf8), + }, + None => Err(RequestIdRejection::MissingRequestId), + } + } +} + +impl Deref for RequestId { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RequestId { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Display for RequestId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.0) + } +} diff --git a/compute_tools/src/http/headers.rs b/compute_tools/src/http/headers.rs new file mode 100644 index 0000000000..a11638e203 --- /dev/null +++ b/compute_tools/src/http/headers.rs @@ -0,0 +1,2 @@ +/// Constant for `X-Request-Id` header. +pub const X_REQUEST_ID: &str = "x-request-id"; diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs new file mode 100644 index 0000000000..798dd1179b --- /dev/null +++ b/compute_tools/src/http/middleware/authorize.rs @@ -0,0 +1,145 @@ +use std::{collections::HashSet, net::SocketAddr}; + +use anyhow::{Result, anyhow}; +use axum::{RequestExt, body::Body, extract::ConnectInfo}; +use axum_extra::{ + TypedHeader, + headers::{Authorization, authorization::Bearer}, +}; +use futures::future::BoxFuture; +use http::{Request, Response, StatusCode}; +use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; +use serde::Deserialize; +use tower_http::auth::AsyncAuthorizeRequest; +use tracing::warn; + +use crate::http::{JsonResponse, extract::RequestId}; + +#[derive(Clone, Debug, Deserialize)] +pub(in crate::http) struct Claims { + compute_id: String, +} + +#[derive(Clone, Debug)] +pub(in crate::http) struct Authorize { + compute_id: String, + jwks: JwkSet, + validation: Validation, +} + +impl Authorize { + pub fn new(compute_id: String, jwks: JwkSet) -> Self { + let mut validation = Validation::new(Algorithm::EdDSA); + // Nothing is currently required + validation.required_spec_claims = HashSet::new(); + validation.validate_exp = true; + // Unused by the control plane + validation.validate_aud = false; + // Unused by the control plane + validation.validate_nbf = false; + + Self { + compute_id, + jwks, + validation, + } + } +} + +impl AsyncAuthorizeRequest for Authorize { + type RequestBody = Body; + type ResponseBody = Body; + type Future = BoxFuture<'static, Result, Response>>; + + fn authorize(&mut self, mut request: Request) -> Self::Future { + let compute_id = self.compute_id.clone(); + let jwks = self.jwks.clone(); + let validation = self.validation.clone(); + + Box::pin(async move { + let request_id = 
request.extract_parts::().await.unwrap(); + + // TODO: Remove this check after a successful rollout + if jwks.keys.is_empty() { + warn!(%request_id, "Authorization has not been configured"); + + return Ok(request); + } + + let connect_info = request + .extract_parts::>() + .await + .unwrap(); + + // In the event the request is coming from the loopback interface, + // allow all requests + if connect_info.ip().is_loopback() { + warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface"); + + return Ok(request); + } + + let TypedHeader(Authorization(bearer)) = request + .extract_parts::>>() + .await + .map_err(|_| { + JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token") + })?; + + let data = match Self::verify(&jwks, bearer.token(), &validation) { + Ok(claims) => claims, + Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), + }; + + if data.claims.compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid claims in authorization token", + )); + } + + // Make claims available to any subsequent middleware or request + // handlers + request.extensions_mut().insert(data.claims); + + Ok(request) + }) + } +} + +impl Authorize { + /// Verify the token using the JSON Web Key set and return the token data. + fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result> { + debug_assert!(!jwks.keys.is_empty()); + + for jwk in jwks.keys.iter() { + let decoding_key = match DecodingKey::from_jwk(jwk) { + Ok(key) => key, + Err(e) => { + warn!( + "Failed to construct decoding key from {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + }; + + match jsonwebtoken::decode::(token, &decoding_key, validation) { + Ok(data) => return Ok(data), + Err(e) => { + warn!( + "Failed to decode authorization token using {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + } + } + + Err(anyhow!("Failed to verify authorization token")) + } +} diff --git a/compute_tools/src/http/middleware/mod.rs b/compute_tools/src/http/middleware/mod.rs new file mode 100644 index 0000000000..caeeeedfe5 --- /dev/null +++ b/compute_tools/src/http/middleware/mod.rs @@ -0,0 +1 @@ +pub(in crate::http) mod authorize; diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index d182278174..9ecc1b0093 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -7,6 +7,8 @@ use serde::Serialize; use tracing::error; mod extract; +mod headers; +mod middleware; mod routes; pub mod server; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 7283401bb5..126fa86d1c 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -10,48 +10,58 @@ use axum::middleware::{self, Next}; use axum::response::{IntoResponse, Response}; use axum::routing::{get, post}; use http::StatusCode; +use jsonwebtoken::jwk::JwkSet; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::request_id::PropagateRequestIdLayer; -use tower_http::trace::TraceLayer; -use tracing::{Span, debug, error, info}; +use tower_http::{ + auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer, +}; +use tracing::{Span, error, info}; use uuid::Uuid; -use super::routes::{ - check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, +use super::{ + headers::X_REQUEST_ID, + 
middleware::authorize::Authorize, + routes::{ + check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, + grants, insights, metrics, metrics_json, status, terminate, + }, }; use crate::compute::ComputeNode; -const X_REQUEST_ID: &str = "x-request-id"; - /// `compute_ctl` has two servers: internal and external. The internal server /// binds to the loopback interface and handles communication from clients on /// the compute. The external server is what receives communication from the /// control plane, the metrics scraper, etc. We make the distinction because /// certain routes in `compute_ctl` only need to be exposed to local processes /// like Postgres via the neon extension and local_proxy. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Debug)] pub enum Server { - Internal(u16), - External(u16), + Internal { + port: u16, + }, + External { + port: u16, + jwks: JwkSet, + compute_id: String, + }, } impl Display for Server { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Server::Internal(_) => f.write_str("internal"), - Server::External(_) => f.write_str("external"), + Server::Internal { .. } => f.write_str("internal"), + Server::External { .. } => f.write_str("external"), } } } -impl From for Router> { - fn from(server: Server) -> Self { +impl From<&Server> for Router> { + fn from(server: &Server) -> Self { let mut router = Router::>::new(); router = match server { - Server::Internal(_) => { + Server::Internal { .. } => { router = router .route( "/extension_server/{*filename}", @@ -69,59 +79,71 @@ impl From for Router> { router } - Server::External(_) => router - .route("/check_writability", post(check_writability::is_writable)) - .route("/configure", post(configure::configure)) - .route("/database_schema", get(database_schema::get_schema_dump)) - .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) - .route("/insights", get(insights::get_insights)) - .route("/metrics", get(metrics::get_metrics)) - .route("/metrics.json", get(metrics_json::get_metrics)) - .route("/status", get(status::get_status)) - .route("/terminate", post(terminate::terminate)), + Server::External { + jwks, compute_id, .. 
+ } => { + let unauthenticated_router = + Router::>::new().route("/metrics", get(metrics::get_metrics)); + + let authenticated_router = Router::>::new() + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)) + .layer(AsyncRequireAuthorizationLayer::new(Authorize::new( + compute_id.clone(), + jwks.clone(), + ))); + + router + .merge(unauthenticated_router) + .merge(authenticated_router) + } }; - router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( - ServiceBuilder::new() - // Add this middleware since we assume the request ID exists - .layer(middleware::from_fn(maybe_add_request_id_header)) - .layer( - TraceLayer::new_for_http() - .on_request(|request: &http::Request<_>, _span: &Span| { - let request_id = request - .headers() - .get(X_REQUEST_ID) - .unwrap() - .to_str() - .unwrap(); - - match request.uri().path() { - "/metrics" => { - debug!(%request_id, "{} {}", request.method(), request.uri()) - } - _ => info!(%request_id, "{} {}", request.method(), request.uri()), - }; - }) - .on_response( - |response: &http::Response<_>, latency: Duration, _span: &Span| { - let request_id = response + router + .fallback(Server::handle_404) + .method_not_allowed_fallback(Server::handle_405) + .layer( + ServiceBuilder::new() + .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) + // Add this middleware since we assume the request ID exists + .layer(middleware::from_fn(maybe_add_request_id_header)) + .layer( + TraceLayer::new_for_http() + .on_request(|request: &http::Request<_>, _span: &Span| { + let request_id = request .headers() .get(X_REQUEST_ID) .unwrap() .to_str() .unwrap(); - info!( - %request_id, - code = response.status().as_u16(), - latency = latency.as_millis() - ) - }, - ), - ) - .layer(PropagateRequestIdLayer::x_request_id()), - ) - .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) + info!(%request_id, "{} {}", request.method(), request.uri()); + }) + .on_response( + |response: &http::Response<_>, latency: Duration, _span: &Span| { + let request_id = response + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + info!( + %request_id, + code = response.status().as_u16(), + latency = latency.as_millis() + ); + }, + ), + ) + .layer(PropagateRequestIdLayer::x_request_id()), + ) } } @@ -145,15 +167,15 @@ impl Server { match self { // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners // allow binding to localhost - Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), - Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), } } - fn port(self) -> u16 { + fn port(&self) -> u16 { match self { - Server::Internal(port) => port, - Server::External(port) => port, + Server::Internal { port, .. } => *port, + Server::External { port, .. 
} => *port, } } @@ -180,7 +202,9 @@ impl Server { ); } - let router = Router::from(self).with_state(compute); + let router = Router::from(&self) + .with_state(compute) + .into_make_service_with_connect_info::(); if let Err(e) = axum::serve(listener, router).await { error!("compute_ctl {} HTTP server error: {}", self, e); diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 35c580bd37..3300fbf7dd 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -134,8 +134,10 @@ pub struct CatalogObjects { pub databases: Vec, } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ComputeCtlConfig { + /// Set of JSON web keys that the compute can use to authenticate + /// communication from the control plane. pub jwks: JwkSet, } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 1b7c376560..183cc66ab9 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -18,7 +18,7 @@ license.workspace = true ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } -base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } +base64-647d43efb71741da = { package = "base64", version = "0.21" } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } camino = { version = "1", default-features = false, features = ["serde1"] } From f62ddb11ed8883842fecf44ddd85594c2562856b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 4 Mar 2025 19:11:43 +0100 Subject: [PATCH 30/61] Distinguish manually submitted runs for periodic pagebench in grafana dashboard (#11079) ## Problem Periodic pagebench workflow runs periodically from latest main commit and also allows to dispatch it manually for a given commit hash to bi-sect regressions. However in the dashboards we can not distinguish manual runs from periodic runs which makes it harder to follow the trend. ## Summary of changes Send an additional flag commit type to the benchmark runner instance to distinguish the run type. Note: this needs a follow-up PR on the receiving side. 
--- .github/workflows/periodic_pagebench.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index af877029e4..0622faba33 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -78,8 +78,10 @@ jobs: run: | if [ -z "$INPUT_COMMIT_HASH" ]; then echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi - name: Start Bench with run_id @@ -89,7 +91,7 @@ jobs: -H 'accept: application/json' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer $API_KEY" \ - -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}" - name: Poll Test Status id: poll_step From 438f7bb72697a7e373448f572f781d3d81dde960 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 5 Mar 2025 12:03:09 +0200 Subject: [PATCH 31/61] Check response status in prefetch_lookup (#11080) ## Problem New async prefetch introduces `prefetch+lookup[` function which is called before LFC lookup to check if prefetch request is already completed. This function is not containing now check that response is actually `T_NeonGetPageResponse` (and not error). ## Summary of changes Add checks for response tag. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index fe463fd4a6..0414661a5f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1026,6 +1026,19 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n if (!neon_prefetch_response_usable(&lsns[i], slot)) continue; + /* + * Ignore errors + */ + if (slot->response->tag != T_NeonGetPageResponse) + { + if (slot->response->tag != T_NeonErrorResponse) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); + } + continue; + } memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); prefetch_set_unused(ring_index); BITMAP_SET(mask, i); From 906d7468cc15248f3ebce3307370504bd07452e7 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 5 Mar 2025 11:14:51 +0100 Subject: [PATCH 32/61] exclude separate perf tests from bench step (#11084) ## Problem Our benchmarking workflow has a job step `bench`which runs all tests in test_runner/performance/* except those that we want to run separately. We recently added two test cases to that testcase directory that we want to run separately but forgot to ignore them during the bench step. This is now causing [failures](https://github.com/neondatabase/neon/actions/runs/13667689340/job/38212087331#step:7:392). ## Summary of changes Ignore the separately run tests in the bench step. 
--- .github/workflows/benchmarking.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ffb6c65af9..ff7db02e42 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -141,6 +141,8 @@ jobs: --ignore test_runner/performance/test_physical_replication.py --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py --ignore test_runner/performance/test_cumulative_statistics_persistence.py + --ignore test_runner/performance/test_perf_many_relations.py + --ignore test_runner/performance/test_perf_oltp_large_tenant.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" From 8e51bfc59767a25b7dce58516353dfb9635ea710 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 5 Mar 2025 10:27:46 +0000 Subject: [PATCH 33/61] proxy: JSON logging field refactor (#11078) ## Problem Grafana Loki's JSON handling is somewhat limited and the log message should be structured in a way that it's easy to sift through logs and filter. ## Summary of changes * Drop span_id. It's too short lived to be of value and only bloats the logs. * Use the span's name as the object key, but append a unique numeric value to prevent name collisions. * Extract interesting span fields into a separate object at the root. New format: ```json { "timestamp": "2025-03-04T18:54:44.134435Z", "level": "INFO", "message": "connected to compute node at 127.0.0.1 (127.0.0.1:5432) latency=client: 22.002292ms, cplane: 0ns, compute: 5.338875ms, retry: 0ns", "fields": { "cold_start_info": "unknown" }, "process_id": 56675, "thread_id": 9122892, "task_id": "24", "target": "proxy::compute", "src": "proxy/src/compute.rs:288", "trace_id": "5eb89b840ec63fee5fc56cebd633e197", "spans": { "connect_request#1": { "ep": "endpoint", "role": "proxy", "session_id": "b8a41818-12bd-4c3f-8ef0-9a942cc99514", "protocol": "tcp", "conn_info": "127.0.0.1" }, "connect_to_compute#6": {}, "connect_once#8": { "compute_id": "compute", "pid": "853" } }, "extract": { "session_id": "b8a41818-12bd-4c3f-8ef0-9a942cc99514" } } ``` --- Cargo.lock | 8 +- proxy/Cargo.toml | 2 +- proxy/src/logging.rs | 250 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 209 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 772b1f50c6..7aa9c53e7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4385,9 +4385,9 @@ dependencies = [ [[package]] name = "papaya" -version = "0.1.8" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd" dependencies = [ "equivalent", "seize", @@ -6110,9 +6110,9 @@ dependencies = [ [[package]] name = "seize" -version = "0.4.9" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7" dependencies = [ "libc", "windows-sys 0.52.0", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5964b76ecf..b6e3f03a81 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -53,7 +53,7 @@ measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry = { workspace = true, features = ["trace"] } -papaya = "0.1.8" +papaya = "0.2.0" 
parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3c34918d84..b2e95a109f 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,9 +1,11 @@ use std::cell::{Cell, RefCell}; use std::collections::HashMap; use std::hash::BuildHasher; -use std::{env, io}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::{array, env, fmt, io}; use chrono::{DateTime, Utc}; +use indexmap::IndexSet; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; @@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; use tracing_subscriber::registry::{LookupSpan, SpanRef}; +use try_lock::TryLock; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -46,13 +49,13 @@ pub async fn init() -> anyhow::Result { let otlp_layer = tracing_utils::init_tracing("proxy").await; let json_log_layer = if logfmt == LogFormat::Json { - Some(JsonLoggingLayer { - clock: RealClock, - skipped_field_indices: papaya::HashMap::default(), - writer: StderrWriter { + Some(JsonLoggingLayer::new( + RealClock, + StderrWriter { stderr: std::io::stderr(), }, - }) + ["request_id", "session_id", "conn_id"], + )) } else { None }; @@ -191,13 +194,39 @@ thread_local! { } /// Implements tracing layer to handle events specific to logging. -struct JsonLoggingLayer { +struct JsonLoggingLayer { clock: C, skipped_field_indices: papaya::HashMap, + callsite_ids: papaya::HashMap, writer: W, + // We use a const generic and arrays to bypass one heap allocation. + extract_fields: IndexSet<&'static str>, + _marker: std::marker::PhantomData<[&'static str; F]>, } -impl Layer for JsonLoggingLayer +impl JsonLoggingLayer { + fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self { + JsonLoggingLayer { + clock, + skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), + writer, + extract_fields: IndexSet::from_iter(extract_fields), + _marker: std::marker::PhantomData, + } + } + + #[inline] + fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId { + *self + .callsite_ids + .pin() + .get_or_insert_with(cs, CallsiteId::next) + } +} + +impl Layer + for JsonLoggingLayer where S: Subscriber + for<'a> LookupSpan<'a>, { @@ -211,7 +240,14 @@ where let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { if entered.get() { let mut formatter = EventFormatter::new(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) } else { entered.set(true); @@ -219,7 +255,14 @@ where EVENT_FORMATTER.with_borrow_mut(move |formatter| { formatter.reset(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) }) } @@ -243,13 +286,17 @@ where /// Registers a SpanFields instance as span extension. 
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { + let csid = self.callsite_id(attrs.metadata().callsite()); let span = ctx.span(id).expect("span must exist"); let fields = SpanFields::default(); fields.record_fields(attrs); // This could deadlock when there's a panic somewhere in the tracing // event handling and a read or write guard is still held. This includes // the OTel subscriber. - span.extensions_mut().insert(fields); + let mut exts = span.extensions_mut(); + + exts.insert(fields); + exts.insert(csid); } fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { @@ -265,6 +312,7 @@ where /// wins. fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { if !metadata.is_event() { + self.callsite_id(metadata.callsite()); // Must not be never because we wouldn't get trace and span data. return Interest::always(); } @@ -297,6 +345,26 @@ where } } +#[derive(Copy, Clone, Debug, Default)] +#[repr(transparent)] +struct CallsiteId(u32); + +impl CallsiteId { + #[inline] + fn next() -> Self { + // Start at 1 to reserve 0 for default. + static COUNTER: AtomicU32 = AtomicU32::new(1); + CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed)) + } +} + +impl fmt::Display for CallsiteId { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Stores span field values recorded during the spans lifetime. #[derive(Default)] struct SpanFields { @@ -448,12 +516,14 @@ impl EventFormatter { self.logline_buffer.clear(); } - fn format( + fn format( &mut self, now: DateTime, event: &Event<'_>, ctx: &Context<'_, S>, skipped_field_indices: &papaya::HashMap, + callsite_ids: &papaya::HashMap, + extract_fields: &IndexSet<&'static str>, ) -> io::Result<()> where S: Subscriber + for<'a> LookupSpan<'a>, @@ -485,6 +555,7 @@ impl EventFormatter { event.record(&mut message_extractor); let mut serializer = message_extractor.into_serializer()?; + // Direct message fields. let mut fields_present = FieldsPresent(false, skipped_field_indices); event.record(&mut fields_present); if fields_present.0 { @@ -494,7 +565,9 @@ impl EventFormatter { )?; } + // TODO: thread-local cache? let pid = std::process::id(); + // Skip adding pid 1 to reduce noise for services running in containers. if pid != 1 { serializer.serialize_entry("process_id", &pid)?; } @@ -514,6 +587,7 @@ impl EventFormatter { serializer.serialize_entry("target", meta.target())?; + // Skip adding module if it's the same as target. if let Some(module) = meta.module_path() { if module != meta.target() { serializer.serialize_entry("module", module)?; @@ -540,7 +614,16 @@ impl EventFormatter { } } - serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + let stack = SerializableSpans { + ctx, + callsite_ids, + fields: ExtractedSpanFields::<'_, F>::new(extract_fields), + }; + serializer.serialize_entry("spans", &stack)?; + + if stack.fields.has_values() { + serializer.serialize_entry("extract", &stack.fields)?; + } serializer.end() }; @@ -818,15 +901,20 @@ impl tracing::field::Visit for MessageFieldSkipper< } } -/// Serializes the span stack from root to leaf (parent of event) enumerated -/// inside an object where the keys are just the number padded with zeroes -/// to retain sorting order. -// The object is necessary because Loki cannot flatten arrays. 
-struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +/// Serializes the span stack from root to leaf (parent of event) as object +/// with the span names as keys. To prevent collision we append a numberic value +/// to the name. Also, collects any span fields we're interested in. Last one +/// wins. +struct SerializableSpans<'a, 'ctx, Span, const F: usize> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + ctx: &'a Context<'ctx, Span>, + callsite_ids: &'a papaya::HashMap, + fields: ExtractedSpanFields<'a, F>, +} -impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +impl serde::ser::Serialize for SerializableSpans<'_, '_, Span, F> where Span: Subscriber + for<'lookup> LookupSpan<'lookup>, { @@ -836,9 +924,24 @@ where { let mut serializer = serializer.serialize_map(None)?; - if let Some(leaf_span) = self.0.lookup_current() { - for (i, span) in leaf_span.scope().from_root().enumerate() { - serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + if let Some(leaf_span) = self.ctx.lookup_current() { + for span in leaf_span.scope().from_root() { + // Append a numeric callsite ID to the span name to keep the name unique + // in the JSON object. + let cid = self + .callsite_ids + .pin() + .get(&span.metadata().callsite()) + .copied() + .unwrap_or_default(); + + // Loki turns the # into an underscore during field name concatenation. + serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?; + + serializer.serialize_value(&SerializableSpanFields { + span: &span, + fields: &self.fields, + })?; } } @@ -846,28 +949,79 @@ where } } -/// Serializes a single span. Include the span ID, name and its fields as -/// recorded up to this point. -struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) -where - Span: for<'lookup> LookupSpan<'lookup>; - -impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +/// Serializes the span fields as object. +struct SerializableSpanFields<'a, 'span, Span, const F: usize> where Span: for<'lookup> LookupSpan<'lookup>, { - fn serialize(&self, serializer: Ser) -> Result + span: &'a SpanRef<'span, Span>, + fields: &'a ExtractedSpanFields<'a, F>, +} + +impl serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: S) -> Result where - Ser: serde::ser::Serializer, + S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - // TODO: the span ID is probably only useful for debugging tracing. - serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; - serializer.serialize_entry("span_name", self.0.metadata().name())?; - let ext = self.0.extensions(); + let ext = self.span.extensions(); if let Some(data) = ext.get::() { - for (key, value) in &data.fields.pin() { + for (name, value) in &data.fields.pin() { + serializer.serialize_entry(name, value)?; + // TODO: replace clone with reference, if possible. + self.fields.set(name, value.clone()); + } + } + + serializer.end() + } +} + +struct ExtractedSpanFields<'a, const F: usize> { + names: &'a IndexSet<&'static str>, + // TODO: replace TryLock with something local thread and interior mutability. + // serde API doesn't let us use `mut`. 
+ values: TryLock<([Option; F], bool)>, +} + +impl<'a, const F: usize> ExtractedSpanFields<'a, F> { + fn new(names: &'a IndexSet<&'static str>) -> Self { + ExtractedSpanFields { + names, + values: TryLock::new((array::from_fn(|_| Option::default()), false)), + } + } + + #[inline] + fn set(&self, name: &'static str, value: serde_json::Value) { + if let Some((index, _)) = self.names.get_full(name) { + let mut fields = self.values.try_lock().expect("thread-local use"); + fields.0[index] = Some(value); + fields.1 = true; + } + } + + #[inline] + fn has_values(&self) -> bool { + self.values.try_lock().expect("thread-local use").1 + } +} + +impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + let values = self.values.try_lock().expect("thread-local use"); + for (i, value) in values.0.iter().enumerate() { + if let Some(value) = value { + let key = self.names[i]; serializer.serialize_entry(key, value)?; } } @@ -879,6 +1033,7 @@ where #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { + use std::marker::PhantomData; use std::sync::{Arc, Mutex, MutexGuard}; use assert_json_diff::assert_json_eq; @@ -927,14 +1082,17 @@ mod tests { let log_layer = JsonLoggingLayer { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), writer: buffer.clone(), + extract_fields: IndexSet::from_iter(["x"]), + _marker: PhantomData::<[&'static str; 1]>, }; let registry = tracing_subscriber::Registry::default().with(log_layer); tracing::subscriber::with_default(registry, || { - info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { - info_span!("span2").in_scope(|| { + info_span!("some_span", x = 24).in_scope(|| { + info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| { tracing::error!( a = 1, a = 2, @@ -960,16 +1118,16 @@ mod tests { "a": 3, }, "spans": { - "00":{ - "span_id": "0000000000000001", - "span_name": "span1", - "x": 42, + "some_span#1":{ + "x": 24, }, - "01": { - "span_id": "0000000000000002", - "span_name": "span2", + "some_span#2": { + "x": 42, } }, + "extract": { + "x": 42, + }, "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), "target": "proxy::logging::tests", "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), From 40aa4d7151029fd0889ecb5f365c87a84d673d06 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 5 Mar 2025 12:23:07 +0100 Subject: [PATCH 34/61] utils: log Sentry initialization (#11077) ## Problem We don't have any logging for Sentry initialization. This makes it hard to verify that it has been configured correctly. ## Summary of changes Log some basic info when Sentry has been initialized, but omit the public key (which allows submitting events). Also log when `SENTRY_DSN` isn't specified at all, and when it fails to initialize (which is supposed to panic, but we may as well). 
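Since the returned guard is what keeps the Sentry client alive, a minimal call site looks roughly like the sketch below; the release name and the `component` tag are placeholders, and it assumes the `utils` crate from this repo plus `SENTRY_DSN` (and optionally `SENTRY_ENVIRONMENT`) set in the environment:

```rust
fn main() {
    // With this patch, initialization is logged whether or not SENTRY_DSN is
    // present, so misconfiguration is visible at startup.
    let _sentry_guard = utils::sentry_init::init_sentry(
        Some("example-service@0.1.0".into()),
        &[("component", "example-service")],
    );

    // Keep `_sentry_guard` in scope for the whole process lifetime so queued
    // events are delivered on shutdown.
}
```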
--- libs/utils/src/sentry_init.rs | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/libs/utils/src/sentry_init.rs b/libs/utils/src/sentry_init.rs index d77dbba087..72d192a591 100644 --- a/libs/utils/src/sentry_init.rs +++ b/libs/utils/src/sentry_init.rs @@ -3,20 +3,24 @@ use std::env; use sentry::ClientInitGuard; pub use sentry::release_name; +use tracing::{error, info}; #[must_use] pub fn init_sentry( release_name: Option>, extra_options: &[(&str, &str)], ) -> Option { - let dsn = env::var("SENTRY_DSN").ok()?; + let Ok(dsn) = env::var("SENTRY_DSN") else { + info!("not initializing Sentry, no SENTRY_DSN given"); + return None; + }; let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into()); let guard = sentry::init(( dsn, sentry::ClientOptions { - release: release_name, - environment: Some(environment.into()), + release: release_name.clone(), + environment: Some(environment.clone().into()), ..Default::default() }, )); @@ -25,5 +29,19 @@ pub fn init_sentry( scope.set_extra(key, value.into()); } }); + + if let Some(dsn) = guard.dsn() { + info!( + "initialized Sentry for project {}, environment {}, release {} (using API {})", + dsn.project_id(), + environment, + release_name.unwrap_or(Cow::Borrowed("None")), + dsn.envelope_api_url(), + ); + } else { + // This should panic during sentry::init(), but we may as well cover it. + error!("failed to initialize Sentry, invalid DSN"); + } + Some(guard) } From 38a883118a87214062fb401ba83308a9aeeebe40 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Mar 2025 11:29:46 +0000 Subject: [PATCH 35/61] Skip dropping tablesync replication slots on the publisher from branch (#11073) fixes https://github.com/neondatabase/cloud/issues/24292 Do not drop tablesync replication slots on the publisher, when we're in the process of dropping subscriptions inherited by a neon branch. Because these slots are still needed by the parent branch subscriptions. For regular slots we handle this by setting the slot_name to NONE before calling DROP SUBSCRIPTION, but tablesync slots are not exposed to SQL. rely on GUC disable_logical_replication_subscribers=true to know that we're in the Neon-specific process of dropping subscriptions. 
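To make the distinction concrete: for regular slots the detach-then-drop dance is available at the SQL level, which is what the existing drop path uses. A minimal sketch of that pattern, assuming a synchronous `postgres::Client` connected to the branch's compute and a trusted subscription name (the helper itself is illustrative, not code from this repo):

```rust
use postgres::Client;

// Detach the remote slot before dropping the subscription, so the
// publisher-side slot that the parent branch still needs is left intact.
fn drop_subscription_keep_slot(client: &mut Client, sub: &str) -> Result<(), postgres::Error> {
    client.batch_execute(&format!("ALTER SUBSCRIPTION {sub} DISABLE"))?;
    client.batch_execute(&format!("ALTER SUBSCRIPTION {sub} SET (slot_name = NONE)"))?;
    client.batch_execute(&format!("DROP SUBSCRIPTION {sub}"))?;
    Ok(())
}
```

Tablesync slots have no equivalent SQL-level switch, which is why the vendored Postgres changes key off `disable_logical_replication_subscribers=true` instead and skip dropping those slots on the publisher.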
--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 6254ab9b44..b1425505c6 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d +Subproject commit b1425505c6f9a622a5aadf3ee362740519993310 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9b118b1cff..533be42f7d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9b118b1cffa6e4ca0d63389b57b54d11e207e9a8 +Subproject commit 533be42f7da97e614ce1c494fafe3e49f53991b1 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 799e7a08dd..78050f965f 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 799e7a08dd171aa06a7395dd326f4243aaeb9f93 +Subproject commit 78050f965f2e550fd6e58f837394cb3d080d7d42 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 517b8dc244..780efda2ef 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 517b8dc244abf3e56f0089849e464af76f70b94e +Subproject commit 780efda2ef8d629495cc289624534ba8cde40779 diff --git a/vendor/revisions.json b/vendor/revisions.json index 8dde46a01e..1a811cfa3d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.4", - "517b8dc244abf3e56f0089849e464af76f70b94e" + "780efda2ef8d629495cc289624534ba8cde40779" ], "v16": [ "16.8", - "799e7a08dd171aa06a7395dd326f4243aaeb9f93" + "78050f965f2e550fd6e58f837394cb3d080d7d42" ], "v15": [ "15.12", - "9b118b1cffa6e4ca0d63389b57b54d11e207e9a8" + "533be42f7da97e614ce1c494fafe3e49f53991b1" ], "v14": [ "14.17", - "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" + "b1425505c6f9a622a5aadf3ee362740519993310" ] } From abae7637d6f4fd392c7ad2be15309222d1290d16 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 5 Mar 2025 11:55:55 +0000 Subject: [PATCH 36/61] pageserver: do big reads to fetch slru segment (#11029) ## Problem Each page of the slru segment is fetched individually when it's loaded on demand. ## Summary of Changes Use `Timeline::get_vectored` to fetch 16 at a time. --- pageserver/src/pgdatadir_mapping.rs | 92 +++++++++++++++++++---------- 1 file changed, 61 insertions(+), 31 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 8aa96dd672..e663060d17 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -602,28 +602,36 @@ impl Timeline { let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; - let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); - for blkno in 0..n_blocks { - let block = self - .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx) - .await?; - segment.extend_from_slice(&block[..BLCKSZ as usize]); - } - Ok(segment.freeze()) - } - /// Look up given SLRU page version. 
- pub(crate) async fn get_slru_page_at_lsn( - &self, - kind: SlruKind, - segno: u32, - blknum: BlockNumber, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result { - assert!(self.tenant_shard_id.is_shard_zero()); - let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn, ctx).await + let keyspace = KeySpace::single( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); + for batch in batches.parts { + let blocks = self + .get_vectored(batch, lsn, io_concurrency.clone(), ctx) + .await?; + + for (_key, block) in blocks { + let block = block?; + segment.extend_from_slice(&block[..BLCKSZ as usize]); + } + } + + Ok(segment.freeze()) } /// Get size of an SLRU segment @@ -832,19 +840,41 @@ impl Timeline { let nblocks = self .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; - for blknum in (0..nblocks).rev() { - let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) + + let keyspace = KeySpace::single( + slru_block_to_key(SlruKind::Clog, segno, 0) + ..slru_block_to_key(SlruKind::Clog, segno, nblocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + for batch in batches.parts.into_iter().rev() { + let blocks = self + .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx) .await?; - if clog_page.len() == BLCKSZ as usize + 8 { - let mut timestamp_bytes = [0u8; 8]; - timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); - let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + for (_key, clog_page) in blocks.into_iter().rev() { + let clog_page = clog_page?; - match f(timestamp) { - ControlFlow::Break(b) => return Ok(b), - ControlFlow::Continue(()) => (), + if clog_page.len() == BLCKSZ as usize + 8 { + let mut timestamp_bytes = [0u8; 8]; + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); + let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + + match f(timestamp) { + ControlFlow::Break(b) => return Ok(b), + ControlFlow::Continue(()) => (), + } } } } From 8c12ccf7291b435bd022bae39b3ea1cd5cced670 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 5 Mar 2025 12:20:18 +0000 Subject: [PATCH 37/61] pageserver: gate previous heatmap behind config flag (#11088) ## Problem On unarchival, we update the previous heatmap with all visible layers. When the primary generates a new heatmap it includes all those layers, so the secondary will download them. Since they're not actually resident on the primary (we didn't call the warm up API), they'll never be evicted, so they remain in the heatmap. This leads to oversized secondary locations like we saw in pre-prod. ## Summary of changes Gate the loading of the previous heatmaps and the heatmap generation on unarchival behind configuration flags. They are disabled by default, but enabled in tests. 
--- libs/pageserver_api/src/config.rs | 6 ++++++ pageserver/src/config.rs | 13 +++++++++++++ pageserver/src/tenant.rs | 6 +++++- test_runner/fixtures/neon_fixtures.py | 7 +++++-- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 039cc1319e..f387ff0579 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -123,6 +123,10 @@ pub struct ConfigToml { pub enable_read_path_debugging: Option, #[serde(skip_serializing_if = "Option::is_none")] pub validate_wal_contiguity: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub load_previous_heatmap: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub generate_unarchival_heatmap: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -523,6 +527,8 @@ impl Default for ConfigToml { None }, validate_wal_contiguity: None, + load_previous_heatmap: None, + generate_unarchival_heatmap: None, } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 64d00882b9..33ae8c4790 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -194,6 +194,13 @@ pub struct PageServerConf { /// Interpreted protocol feature: if enabled, validate that the logical WAL received from /// safekeepers does not have gaps. pub validate_wal_contiguity: bool, + + /// When set, the previously written to disk heatmap is loaded on tenant attach and used + /// to avoid clobbering the heatmap from new, cold, attached locations. + pub load_previous_heatmap: bool, + + /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline. + pub generate_unarchival_heatmap: bool, } /// Token for authentication to safekeepers @@ -358,6 +365,8 @@ impl PageServerConf { get_vectored_concurrent_io, enable_read_path_debugging, validate_wal_contiguity, + load_previous_heatmap, + generate_unarchival_heatmap, } = config_toml; let mut conf = PageServerConf { @@ -447,6 +456,8 @@ impl PageServerConf { no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), + load_previous_heatmap: load_previous_heatmap.unwrap_or(false), + generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false), }; // ------------------------------------------------------------ @@ -493,6 +504,8 @@ impl PageServerConf { metric_collection_interval: Duration::from_secs(60), synthetic_size_calculation_interval: Duration::from_secs(60), background_task_maximum_delay: Duration::ZERO, + load_previous_heatmap: Some(true), + generate_unarchival_heatmap: Some(true), ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fee007b2d7..3694381078 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1150,7 +1150,7 @@ impl Tenant { // a previous heatmap which contains all visible layers in the layer map. // This previous heatmap will be used whenever a fresh heatmap is generated // for the timeline. 
- if matches!(cause, LoadTimelineCause::Unoffload) { + if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) { let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); while let Some((tline, end_lsn)) = tline_ending_at { let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; @@ -1582,6 +1582,10 @@ impl Tenant { } async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> { + if !self.conf.load_previous_heatmap { + return None; + } + let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id); match tokio::fs::read_to_string(on_disk_heatmap_path).await { Ok(heatmap) => match serde_json::from_str::(&heatmap) { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3aa018e99e..6171da52a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1169,6 +1169,8 @@ class NeonEnv: # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, + # Look for gaps in WAL received from safekeepeers + "validate_wal_contiguity": True, } # Batching (https://github.com/neondatabase/neon/issues/9377): @@ -1181,11 +1183,12 @@ class NeonEnv: if config.test_may_use_compatibility_snapshot_binaries: log.info( - "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" + "Skipping prev heatmap settings to avoid forward-compatibility related test failures" ) else: # Look for gaps in WAL received from safekeepeers - ps_cfg["validate_wal_contiguity"] = True + ps_cfg["load_previous_heatmap"] = True + ps_cfg["generate_unarchival_heatmap"] = True get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: From 332aae1484ce87f62897f6fa610c565eb85378d6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 5 Mar 2025 14:50:35 +0100 Subject: [PATCH 38/61] test_runner/regress: speed up `test_check_visibility_map` (#11086) ## Problem `test_check_visibility_map` is the slowest test in CI, and can cause timeouts under particularly slow configurations (`debug` and `without-lfc`). ## Summary of changes * Reduce the `pgbench` scale factor from 10 to 8. * Omit a redundant vacuum during `pgbench` init. * Remove a final `vacuum freeze` + `pg_check_visible` pass, which has questionable value (we've already done a vacuum freeze previously, and we don't flush the compute cache before checking anyway). --- test_runner/regress/test_vm_bits.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 4865178ca8..b30c02e0e4 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -327,9 +327,9 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}") endpoint.safe_psql(f"create database {dbname}") connstr = endpoint.connstr(dbname=dbname) - # pgbench -i will automatically vacuum the tables. This creates the visibility map. - pg_bin.run(["pgbench", "-i", "-s", "10", connstr]) - # Freeze the tuples to set the initial frozen bit. + # Initialize the data set, but don't vacuum yet. 
+ pg_bin.run(["pgbench", "-i", "-s", "8", "-n", connstr]) + # Vacuum to create the visibility map, and freeze the tuples to set the frozen bit. endpoint.safe_psql("vacuum freeze", dbname=dbname) # Run pgbench. pg_bin.run(["pgbench", "-c", "32", "-j", "8", "-T", "10", connstr]) @@ -354,19 +354,3 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): row = cur.fetchone() assert row is not None assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" - - # Vacuum and freeze the tables, and check that the visibility map is still accurate. - for dbname in dbnames: - log.info(f"Vacuuming and checking visibility map for {dbname}") - with endpoint.cursor(dbname=dbname) as cur: - cur.execute("vacuum freeze") - - cur.execute("select count(*) from pg_check_visible('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)" - - cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" From 94e6897ead78cd1fca7781fa8e4e6c52a519415a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 5 Mar 2025 15:28:43 +0100 Subject: [PATCH 39/61] fix(ci): make deploy job depend on pushing images to dev registries (#11089) ## Problem If an image fails to push to dev registries, we shouldn't trigger the deploy job, because that depends on images existing in dev registries. To ensure this is the case, the deploy job needs to depend on pushing to dev registries. ## Summary of changes Make `deploy` depend on `push-neon-image-dev` and `push-compute-image-dev`. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ac6e0634f0..66758ca49f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1061,7 +1061,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: From 2d45522fa66e3265d08ab8cb317ee7f47eb31c3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 5 Mar 2025 16:45:43 +0100 Subject: [PATCH 40/61] storcon db: load safekeepers from DB again (#11087) Earlier PR #11041 soft-disabled the loading code for safekeepers from the storcon db. This PR makes us load the safekeepers from the database again, now that we have [JWTs available on staging](https://github.com/neondatabase/neon/pull/11087) and soon on prod. This reverts commit 23fb8053c5904d2ede67e09345de429ab56faefc. 
Part of https://github.com/neondatabase/cloud/issues/24727 --- control_plane/src/local_env.rs | 3 --- control_plane/src/storage_controller.rs | 4 ---- storage_controller/src/main.rs | 5 ----- storage_controller/src/service.rs | 26 +++++++++---------------- test_runner/fixtures/neon_fixtures.py | 7 ------- 5 files changed, 9 insertions(+), 36 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index da7d7e5469..f4026efbbf 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -165,8 +165,6 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub long_reconcile_threshold: Option, - - pub load_safekeepers: bool, } impl NeonStorageControllerConf { @@ -190,7 +188,6 @@ impl Default for NeonStorageControllerConf { max_secondary_lag_bytes: None, heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, long_reconcile_threshold: None, - load_safekeepers: true, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 77a9075aa7..16e12f4e02 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -537,10 +537,6 @@ impl StorageController { args.push("--start-as-candidate".to_string()); } - if self.config.load_safekeepers { - args.push("--load-safekeepers".to_string()); - } - if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 6ef17c0007..967fb2996f 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -143,10 +143,6 @@ struct Cli { // Flag to use https for requests to pageserver API. #[arg(long, default_value = "false")] use_https_pageserver_api: bool, - - /// Whether to load safekeeprs from the database and heartbeat them - #[arg(long, default_value = "false")] - load_safekeepers: bool, } enum StrictMode { @@ -360,7 +356,6 @@ async fn async_main() -> anyhow::Result<()> { start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, use_https_pageserver_api: args.use_https_pageserver_api, - load_safekeepers: args.load_safekeepers, }; // Validate that we can connect to the database diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8fc7f7a0c5..e12bd299ce 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -394,8 +394,6 @@ pub struct Config { pub long_reconcile_threshold: Duration, pub use_https_pageserver_api: bool, - - pub load_safekeepers: bool, } impl From for ApiError { @@ -1412,20 +1410,15 @@ impl Service { .set(nodes.len() as i64); tracing::info!("Loading safekeepers from database..."); - let safekeepers = if config.load_safekeepers { - persistence - .list_safekeepers() - .await? - .into_iter() - .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) - .collect::>() - } else { - tracing::info!("Skipping safekeeper loading"); - Default::default() - }; - + let safekeepers = persistence + .list_safekeepers() + .await? 
+ .into_iter() + .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) + .collect::>(); let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); + tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; @@ -8066,8 +8059,7 @@ impl Service { ) -> Result<(), DatabaseError> { let node_id = NodeId(record.id as u64); self.persistence.safekeeper_upsert(record.clone()).await?; - - if self.config.load_safekeepers { + { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); match safekeepers.entry(node_id) { @@ -8099,7 +8091,7 @@ impl Service { .await?; let node_id = NodeId(id as u64); // After the change has been persisted successfully, update the in-memory state - if self.config.load_safekeepers { + { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); let sk = safekeepers diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6171da52a0..ef9d8cb46f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1133,13 +1133,6 @@ class NeonEnv: if self.storage_controller_config is not None: cfg["storage_controller"] = self.storage_controller_config - # Disable new storcon flag in compat tests - if config.test_may_use_compatibility_snapshot_binaries: - if "storage_controller" in cfg: - cfg["storage_controller"]["load_safekeepers"] = False - else: - cfg["storage_controller"] = {"load_safekeepers": False} - # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" From 9cdc8c0e6c7adf9bf31ec3cff6f8a978833e528a Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Mar 2025 10:57:38 -0500 Subject: [PATCH 41/61] feat(pageserver): revisit error types for gc-compaction (#11082) ## Problem part of https://github.com/neondatabase/neon/issues/9114 We used anyhow::Error everywhere and it's time to fix. ## Summary of changes * Make sure that cancel errors are correctly propagated as CompactionError::ShuttingDown. * Skip all the trigger computation work if gc_cutoff is not generated yet. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 182 ++++++++++++++----- 1 file changed, 134 insertions(+), 48 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 76c28e11ab..17f7d96e5e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -15,7 +15,7 @@ use super::{ Timeline, }; -use anyhow::{Context, anyhow, bail}; +use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; @@ -234,6 +234,12 @@ impl GcCompactionQueue { // it enough in staging yet. return Ok(()); } + if timeline.get_gc_compaction_watermark() == Lsn::INVALID { + // If the gc watermark is not set, we don't need to trigger auto compaction. + // This check is the same as in `gc_compaction_split_jobs` but we don't log + // here and we can also skip the computation of the trigger condition earlier. + return Ok(()); + } let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { // Only allow one compaction run at a time. 
TODO: As we do `try_acquire_owned`, we cannot ensure @@ -357,8 +363,7 @@ impl GcCompactionQueue { GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, ) - .await - .map_err(CompactionError::Other)?; + .await?; if jobs.is_empty() { info!("no jobs to run, skipping scheduled compaction task"); self.notify_and_unblock(id); @@ -825,9 +830,7 @@ impl Timeline { .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, options, ctx) - .await - .map_err(CompactionError::Other)?; + self.compact_with_gc(cancel, options, ctx).await?; return Ok(CompactionOutcome::Done); } @@ -2345,12 +2348,19 @@ impl Timeline { async fn check_compaction_space( self: &Arc, layer_selection: &[Layer], - ) -> anyhow::Result<()> { - let available_space = self.check_available_space().await?; + ) -> Result<(), CompactionError> { + let available_space = self + .check_available_space() + .await + .map_err(CompactionError::Other)?; let mut remote_layer_size = 0; let mut all_layer_size = 0; for layer in layer_selection { - let needs_download = layer.needs_download().await?; + let needs_download = layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)?; if needs_download.is_some() { remote_layer_size += layer.layer_desc().file_size; } @@ -2359,14 +2369,14 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!( + return Err(CompactionError::Other(anyhow!( "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size - )); + ))); } Ok(()) } @@ -2397,7 +2407,7 @@ impl Timeline { self: &Arc, job: GcCompactJob, sub_compaction_max_job_size_mb: Option, - ) -> anyhow::Result> { + ) -> Result, CompactionError> { let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { job.compact_lsn_range.end } else { @@ -2548,7 +2558,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); if sub_compaction { @@ -2580,7 +2590,7 @@ impl Timeline { cancel: &CancellationToken, job: GcCompactJob, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. @@ -2588,8 +2598,7 @@ impl Timeline { let gc_lock = async { tokio::select! 
{ guard = self.gc_lock.lock() => Ok(guard), - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => Err(anyhow!("cancelled")), + _ = cancel.cancelled() => Err(CompactionError::ShuttingDown), } }; @@ -2810,10 +2819,10 @@ impl Timeline { .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!( + return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err - ); + ))); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc @@ -2828,11 +2837,24 @@ impl Timeline { let mut total_downloaded_size = 0; let mut total_layer_size = 0; for layer in &job_desc.selected_layers { - if layer.needs_download().await?.is_some() { + if layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)? + .is_some() + { total_downloaded_size += layer.layer_desc().file_size; } total_layer_size += layer.layer_desc().file_size; - let resident_layer = layer.download_and_keep_resident(ctx).await?; + if cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + let resident_layer = layer + .download_and_keep_resident(ctx) + .await + .context("failed to download and keep resident layer") + .map_err(CompactionError::Other)?; downloaded_layers.push(resident_layer); } info!( @@ -2843,19 +2865,33 @@ impl Timeline { ); for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { - let layer = resident_layer.get_as_delta(ctx).await?; + let layer = resident_layer + .get_as_delta(ctx) + .await + .context("failed to get delta layer") + .map_err(CompactionError::Other)?; delta_layers.push(layer); } else { - let layer = resident_layer.get_as_image(ctx).await?; + let layer = resident_layer + .get_as_image(ctx) + .await + .context("failed to get image layer") + .map_err(CompactionError::Other)?; image_layers.push(layer); } } - let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?; + let (dense_ks, sparse_ks) = self + .collect_gc_compaction_keyspace() + .await + .context("failed to collect gc compaction keyspace") + .map_err(CompactionError::Other)?; let mut merge_iter = FilterIterator::create( MergeIterator::create(&delta_layers, &image_layers, ctx), dense_ks, sparse_ks, - )?; + ) + .context("failed to create filter iterator") + .map_err(CompactionError::Other)?; // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); @@ -2874,7 +2910,9 @@ impl Timeline { self.get_compaction_target_size(), ctx, ) - .await?, + .await + .context("failed to create image layer writer") + .map_err(CompactionError::Other)?, ) } else { None @@ -2887,7 +2925,9 @@ impl Timeline { lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), ) - .await?; + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?; #[derive(Default)] struct RewritingLayers { @@ -2927,9 +2967,14 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? { + while let Some(((key, lsn, val), desc)) = merge_iter + .next_with_trace() + .await + .context("failed to get next key-value pair") + .map_err(CompactionError::Other)? 
+ { if cancel.is_cancelled() { - return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + return Err(CompactionError::ShuttingDown); } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. @@ -2960,7 +3005,9 @@ impl Timeline { desc.lsn_range.clone(), ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.before.as_mut().unwrap() @@ -2975,14 +3022,20 @@ impl Timeline { desc.lsn_range.clone(), ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.after.as_mut().unwrap() } else { unreachable!() }; - rewriter.put_value(key, lsn, val, ctx).await?; + rewriter + .put_value(key, lsn, val, ctx) + .await + .context("failed to put value") + .map_err(CompactionError::Other)?; continue; } match val { @@ -3005,9 +3058,13 @@ impl Timeline { &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn) - .await?, + .await + .context("failed to get ancestor image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( *last_key, @@ -3016,7 +3073,9 @@ impl Timeline { &mut stat, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -3034,9 +3093,14 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?, + get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn) + .await + .context("failed to get ancestor image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( last_key, @@ -3045,7 +3109,9 @@ impl Timeline { &mut stat, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; // end: move the above part to the loop body let mut rewrote_delta_layers = Vec::new(); @@ -3053,13 +3119,23 @@ impl Timeline { if let Some(delta_writer_before) = writers.before { let (desc, path) = delta_writer_before .finish(job_desc.compaction_key_range.start, ctx) - .await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } if let Some(delta_writer_after) = writers.after { - let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + let (desc, path) = delta_writer_after + .finish(key.key_range.end, ctx) + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } } @@ -3074,7 +3150,9 @@ impl Timeline { let end_key = 
job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) - .await? + .await + .context("failed to finish image layer writer") + .map_err(CompactionError::Other)? } else { drop(writer); Vec::new() @@ -3086,7 +3164,9 @@ impl Timeline { let produced_delta_layers = if !dry_run { delta_layer_writer .finish_with_discard_fn(self, ctx, discard) - .await? + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)? } else { drop(delta_layer_writer); Vec::new() @@ -3166,7 +3246,9 @@ impl Timeline { &layer.layer_desc().key_range, &job_desc.compaction_key_range, ) { - bail!("violated constraint: image layer outside of compaction key range"); + return Err(CompactionError::Other(anyhow!( + "violated constraint: image layer outside of compaction key range" + ))); } if !fully_contains( &job_desc.compaction_key_range, @@ -3181,7 +3263,9 @@ impl Timeline { info!( "gc-compaction statistics: {}", - serde_json::to_string(&stat)? + serde_json::to_string(&stat) + .context("failed to serialize gc-compaction statistics") + .map_err(CompactionError::Other)? ); if dry_run { @@ -3220,10 +3304,10 @@ impl Timeline { // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. if let Some(err) = check_valid_layermap(&final_layers) { - bail!( + return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err - ); + ))); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only @@ -3275,7 +3359,9 @@ impl Timeline { // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should // be batched into `schedule_compaction_update`. let disk_consistent_lsn = self.disk_consistent_lsn.load(); - self.schedule_uploads(disk_consistent_lsn, None)?; + self.schedule_uploads(disk_consistent_lsn, None) + .context("failed to schedule uploads") + .map_err(CompactionError::Other)?; // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead // of `compact_from`. let compact_from = { From d94fc75cfc154d3b5daafcd337cff497b223fe03 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Mar 2025 18:01:00 +0000 Subject: [PATCH 42/61] Setup compute_ctl pgaudit and rsyslog (#10615) Setup pgaudit and pgauditlogtofile extensions in compute_ctl when the ComputeAuditLogLevel is set to 'hipaa'. See cloud PR https://github.com/neondatabase/cloud/pull/24568 Add rsyslog setup for compute_ctl. Spin up a rsyslog server in the compute VM, and configure it to send logs to the endpoint specified in AUDIT_LOGGING_ENDPOINT env. 
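For illustration, the rendered `/etc/compute_rsyslog.conf` comes out roughly as below; the log path and forwarding target are placeholders here, the real values being the endpoint's pgdata `log/` directory and whatever `AUDIT_LOGGING_ENDPOINT` holds:

```
# read the pgaudit log files that pgauditlogtofile writes under the data directory
module(load="imfile")
input(type="imfile" File="/var/lib/postgresql/data/log/*.log" Tag="hipaa" Severity="info" Facility="local0")
global(workDirectory="/var/log")
# forward everything over TCP ("@@") to the remote collector
*.* @@logs.example.com:514
```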
--- compute/compute-node.Dockerfile | 10 +++ compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- compute_tools/src/compute.rs | 22 ++++- compute_tools/src/config.rs | 56 ++++++++++++- .../compute_rsyslog_template.conf | 10 +++ compute_tools/src/lib.rs | 1 + compute_tools/src/rsyslog.rs | 80 +++++++++++++++++++ compute_tools/src/spec_apply.rs | 45 +++++++++-- control_plane/src/endpoint.rs | 4 +- libs/compute_api/src/spec.rs | 21 +++++ 11 files changed, 241 insertions(+), 12 deletions(-) create mode 100644 compute_tools/src/config_template/compute_rsyslog_template.conf create mode 100644 compute_tools/src/rsyslog.rs diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index c3aecfbdc5..a7e8718ea9 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1933,6 +1933,7 @@ RUN apt update && \ locales \ procps \ ca-certificates \ + rsyslog \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 @@ -1978,6 +1979,15 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo # Make the libraries we built available RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +# rsyslog config permissions +RUN chown postgres:postgres /etc/rsyslog.conf && \ + touch /etc/compute_rsyslog.conf && \ + chown -R postgres:postgres /etc/compute_rsyslog.conf && \ + # directory for rsyslogd pid file + mkdir /var/run/rsyslogd && \ + chown -R postgres:postgres /var/run/rsyslogd + + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ff4c3387d9..74ff3a8b6d 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -54,7 +54,7 @@ files: # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index c001040bc9..c1787ab018 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -54,7 +54,7 @@ files: # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. 
- postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d0b1bc2534..ddcfe12330 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -12,7 +12,9 @@ use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent}; +use compute_api::spec::{ + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, +}; use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; @@ -35,6 +37,7 @@ use crate::logger::startup_context_from_env; use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::monitor::launch_monitor; use crate::pg_helpers::*; +use crate::rsyslog::configure_and_start_rsyslog; use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -676,6 +679,23 @@ impl ComputeNode { }); } + // Configure and start rsyslog if necessary + if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { + let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } + + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + // TODO: make this more robust + // now rsyslog starts once and there is no monitoring or restart if it fails + configure_and_start_rsyslog( + log_directory_path.to_str().unwrap(), + "hipaa", + &remote_endpoint, + )?; + } + // Launch remaining service threads let _monitor_handle = launch_monitor(self); let _configurator_handle = launch_configurator(self); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index ca24ff76b3..0760568ff8 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -1,3 +1,4 @@ +use anyhow::Result; use std::fmt::Write as FmtWrite; use std::fs::{File, OpenOptions}; use std::io; @@ -5,10 +6,11 @@ use std::io::Write; use std::io::prelude::*; use std::path::Path; -use anyhow::Result; -use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; +use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; -use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value}; +use crate::pg_helpers::{ + GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, +}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -138,6 +140,54 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } + // If audit logging is enabled, configure pgaudit. + // + // Note, that this is called after the settings from spec are written. + // This way we always override the settings from the spec + // and don't allow the user or the control plane admin to change them. + if let ComputeAudit::Hipaa = spec.audit_log_level { + writeln!(file, "# Managed by compute_ctl audit settings: begin")?; + // This log level is very verbose + // but this is necessary for HIPAA compliance. 
+ writeln!(file, "pgaudit.log='all'")?; + writeln!(file, "pgaudit.log_parameter=on")?; + // Disable logging of catalog queries + // The catalog doesn't contain sensitive data, so we don't need to audit it. + writeln!(file, "pgaudit.log_catalog=off")?; + // Set log rotation to 5 minutes + // TODO: tune this after performance testing + writeln!(file, "pgaudit.log_rotation_age=5")?; + + // Add audit shared_preload_libraries, if they are not present. + // + // The caller who sets the flag is responsible for ensuring that the necessary + // shared_preload_libraries are present in the compute image, + // otherwise the compute start will fail. + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + let mut extra_shared_preload_libraries = String::new(); + if !libs.contains("pgaudit") { + extra_shared_preload_libraries.push_str(",pgaudit"); + } + if !libs.contains("pgauditlogtofile") { + extra_shared_preload_libraries.push_str(",pgauditlogtofile"); + } + writeln!( + file, + "shared_preload_libraries='{}{}'", + libs, extra_shared_preload_libraries + )?; + } else { + // Typically, this should be unreacheable, + // because we always set at least some shared_preload_libraries in the spec + // but let's handle it explicitly anyway. + writeln!( + file, + "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'" + )?; + } + writeln!(file, "# Managed by compute_ctl audit settings: end")?; + } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; if spec.drop_subscriptions_before_start { diff --git a/compute_tools/src/config_template/compute_rsyslog_template.conf b/compute_tools/src/config_template/compute_rsyslog_template.conf new file mode 100644 index 0000000000..bef3c36446 --- /dev/null +++ b/compute_tools/src/config_template/compute_rsyslog_template.conf @@ -0,0 +1,10 @@ +# Load imfile module to read log files +module(load="imfile") + +# Input configuration for log files in the specified directory +# Replace {log_directory} with the directory containing the log files +input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0") +global(workDirectory="/var/log") + +# Forward logs to remote syslog server +*.* @@{remote_endpoint} \ No newline at end of file diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index b08df22134..5c78bbcd02 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -21,6 +21,7 @@ mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; +pub mod rsyslog; pub mod spec; mod spec_apply; pub mod swap; diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs new file mode 100644 index 0000000000..776ff14644 --- /dev/null +++ b/compute_tools/src/rsyslog.rs @@ -0,0 +1,80 @@ +use std::process::Command; +use std::{fs::OpenOptions, io::Write}; + +use anyhow::{Context, Result}; +use tracing::info; + +fn get_rsyslog_pid() -> Option { + let output = Command::new("pgrep") + .arg("rsyslogd") + .output() + .expect("Failed to execute pgrep"); + + if !output.stdout.is_empty() { + let pid = std::str::from_utf8(&output.stdout) + .expect("Invalid UTF-8 in process output") + .trim() + .to_string(); + Some(pid) + } else { + None + } +} + +// Start rsyslogd with the specified configuration file +// If it is already running, do nothing. 
+fn start_rsyslog(rsyslog_conf_path: &str) -> Result<()> { + let pid = get_rsyslog_pid(); + if let Some(pid) = pid { + info!("rsyslogd is already running with pid: {}", pid); + return Ok(()); + } + + let _ = Command::new("/usr/sbin/rsyslogd") + .arg("-f") + .arg(rsyslog_conf_path) + .arg("-i") + .arg("/var/run/rsyslogd/rsyslogd.pid") + .output() + .context("Failed to start rsyslogd")?; + + // Check that rsyslogd is running + if let Some(pid) = get_rsyslog_pid() { + info!("rsyslogd started successfully with pid: {}", pid); + } else { + return Err(anyhow::anyhow!("Failed to start rsyslogd")); + } + + Ok(()) +} + +pub fn configure_and_start_rsyslog( + log_directory: &str, + tag: &str, + remote_endpoint: &str, +) -> Result<()> { + let config_content: String = format!( + include_str!("config_template/compute_rsyslog_template.conf"), + log_directory = log_directory, + tag = tag, + remote_endpoint = remote_endpoint + ); + + info!("rsyslog config_content: {}", config_content); + + let rsyslog_conf_path = "/etc/compute_rsyslog.conf"; + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(rsyslog_conf_path)?; + + file.write_all(config_content.as_bytes())?; + + info!("rsyslog configuration added successfully. Starting rsyslogd"); + + // start the service, using the configuration + start_rsyslog(rsyslog_conf_path)?; + + Ok(()) +} diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index f9a37c5c98..dbc02c8d02 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use anyhow::{Context, Result}; use compute_api::responses::ComputeStatus; -use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; +use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; @@ -19,10 +19,10 @@ use crate::pg_helpers::{ get_existing_roles_async, }; use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, - CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, - HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, - RunInEachDatabase, + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension, + CreatePgauditlogtofileExtension, CreateSchemaNeon, CreateSuperUser, DisablePostgresDBPgAudit, + DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension, + HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, }; use crate::spec_apply::PerDatabasePhase::{ ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, @@ -277,6 +277,19 @@ impl ComputeNode { phases.push(FinalizeDropLogicalSubscriptions); } + // Keep DisablePostgresDBPgAudit phase at the end, + // so that all config operations are audit logged. 
+ match spec.audit_log_level + { + ComputeAudit::Hipaa => { + phases.push(CreatePgauditExtension); + phases.push(CreatePgauditlogtofileExtension); + phases.push(DisablePostgresDBPgAudit); + } + ComputeAudit::Log => { /* not implemented yet */ } + ComputeAudit::Disabled => {} + } + for phase in phases { debug!("Applying phase {:?}", &phase); apply_operations( @@ -463,6 +476,9 @@ pub enum ApplySpecPhase { CreateAndAlterDatabases, CreateSchemaNeon, RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, + CreatePgauditExtension, + CreatePgauditlogtofileExtension, + DisablePostgresDBPgAudit, HandleOtherExtensions, HandleNeonExtension, CreateAvailabilityCheck, @@ -1098,6 +1114,25 @@ async fn get_operations<'a>( } Ok(Box::new(empty())) } + ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + comment: Some(String::from("create pgaudit extensions")), + }))), + ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + comment: Some(String::from("create pgauditlogtofile extensions")), + }))), + // Disable pgaudit logging for postgres database. + // Postgres is neon system database used by monitors + // and compute_ctl tuning functions and thus generates a lot of noise. + // We do not consider data stored in this database as sensitive. + ApplySpecPhase::DisablePostgresDBPgAudit => { + let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'"; + Ok(Box::new(once(Operation { + query: query.to_string(), + comment: Some(query.to_string()), + }))) + } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ Operation { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 87bfbd7570..b46d616827 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -48,7 +48,8 @@ use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; use compute_api::spec::{ - Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role, + Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, + RemoteExtSpec, Role, }; use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; @@ -668,6 +669,7 @@ impl Endpoint { local_proxy_config: None, reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, + audit_log_level: ComputeAudit::Disabled, }; // this strange code is needed to support respec() in tests diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index df82d8b449..77f2e1e631 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -155,6 +155,16 @@ pub struct ComputeSpec { /// over the same replication content from publisher. #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, + + /// Log level for audit logging: + /// + /// Disabled - no audit logging. This is the default. 
+ /// log - log masked statements to the postgres log using pgaudit extension + /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension + /// + /// Extensions should be present in shared_preload_libraries + #[serde(default)] + pub audit_log_level: ComputeAudit, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -262,6 +272,17 @@ pub enum ComputeMode { Replica, } +/// Log level for audit logging +/// Disabled, log, hipaa +/// Default is Disabled +#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub enum ComputeAudit { + #[default] + Disabled, + Log, + Hipaa, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, From 8263107f6c67c86a4e1a641129bad42cb88b2557 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 5 Mar 2025 19:17:57 +0100 Subject: [PATCH 43/61] feat(compute): Add filename label to remote ext requests metric (#11091) ## Problem We realized that we may use this metric for more 'live' info about extension installations vs. what we have with installed extensions metric, which is only updated at start, atm. ## Summary of changes Add `filename` label to `compute_ctl_remote_ext_requests_total`. Note that it contains the raw archive name with `.tar.zst` at the end, so the consumer may need to strip this suffix. Closes https://github.com/neondatabase/cloud/issues/24694 --- compute_tools/src/extension_server.rs | 18 +++++++++++------- compute_tools/src/metrics.rs | 4 +--- .../regress/test_download_extensions.py | 2 ++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 77e98359ab..b4de786b00 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -253,27 +253,31 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { } } -// Do request to extension storage proxy, i.e. +// Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst -// using HHTP GET -// and return the response body as bytes -// +// using HTTP GET and return the response body as bytes. 
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); + let filename = Path::new(ext_path) + .file_name() + .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) + .to_str() + .unwrap_or("unknown") + .to_string(); - info!("Download extension {} from uri {}", ext_path, uri); + info!("Downloading extension file '{}' from uri {}", filename, uri); match do_extension_server_request(&uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&StatusCode::OK.to_string()]) + .with_label_values(&[&StatusCode::OK.to_string(), &filename]) .inc(); Ok(resp) } Err((msg, status)) => { REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&status]) + .with_label_values(&[&status, &filename]) .inc(); bail!(msg); } diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index bc96e5074c..dab32d5dc1 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -54,9 +54,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| register_int_counter_vec!( "compute_ctl_remote_ext_requests_total", "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", - // Do not use any labels like extension name yet. - // We can add them later if needed. - &["http_status"] + &["http_status", "filename"] ) .expect("failed to define a metric") }); diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 7f12c14073..2ff525464d 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -137,6 +137,8 @@ def test_remote_extensions( metrics = parse_metrics(raw_metrics) remote_ext_requests = metrics.query_all( "compute_ctl_remote_ext_requests_total", + # Check that we properly report the filename in the metrics + {"filename": "anon.tar.zst"}, ) assert len(remote_ext_requests) == 1 for sample in remote_ext_requests: From d599d2df8065bbb2d9090d259baf0ed0504b4c50 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 5 Mar 2025 12:32:45 -0600 Subject: [PATCH 44/61] Update postgres_exporter to 0.17.1 (#11094) Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a7e8718ea9..61d9d59f79 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1484,7 +1484,7 @@ WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . COPY compute/patches/duckdb_v120.patch . 
# pg_duckdb build requires source dir to be a git repo to get submodules -# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() # - access to duckdb.extensions table and its sequence RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ @@ -1499,8 +1499,8 @@ ARG PG_VERSION COPY --from=pg_duckdb-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_duckdb-src RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1758,15 +1758,15 @@ ARG TARGETARCH # test_runner/regress/test_compute_metrics.py # See comment on the top of the file regading `echo`, `-e` and `\n` RUN if [ "$TARGETARCH" = "amd64" ]; then\ - postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ else\ - postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\ pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ fi\ - && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ From 604eb5e8d454104705eeeb3e60c68b9a12d221ef Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 5 Mar 2025 21:01:17 +0100 Subject: [PATCH 45/61] fix grafana dashboard link for pooler endoints (#11099) ## Problem Our benchmarking workflows contain links to grafana dashboards to troubleshoot problems. This works fine for non-pooled endpoints. For pooled endpoints we need to remove the `-pooler` suffix from the endpoint's hostname to get a valid endpoint ID. Example link that doesn't work in this run https://github.com/neondatabase/neon/actions/runs/13678933253/job/38246028316#step:8:311 ## Summary of changes Check if connection string is a -pooler connection string and if so remove this suffix from the endpoint ID. 
--------- Co-authored-by: Alexander Bayandin --- test_runner/fixtures/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 84d62fb877..d1b2a5a400 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -337,6 +337,8 @@ def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, e """ # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) + # Remove "-pooler" suffix if present + endpoint_id = endpoint_id.removesuffix("-pooler") params = { "orgId": 1, From 1fe23fe8d25d01ea256e2f4a8b2294815f8b0671 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 5 Mar 2025 12:35:08 -0800 Subject: [PATCH 46/61] compute/lfc: Add chunk size to neon_lfc_stats (#11100) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds a new key to neon.neon_lfc_stats — 'file_cache_chunk_size_pages'. It just returns the value of BLOCKS_PER_CHUNK from the LFC implementation. The new value should (eventually) allow changing the chunk size without breaking any places that rely on LFC stats values measured in number of chunks. See neondatabase/cloud#25170 for more. --- compute/etc/neon_collector.jsonnet | 1 + compute/etc/sql_exporter/lfc_chunk_size.libsonnet | 10 ++++++++++ compute/etc/sql_exporter/lfc_chunk_size.sql | 1 + pgxn/neon/file_cache.c | 4 ++++ 4 files changed, 16 insertions(+) create mode 100644 compute/etc/sql_exporter/lfc_chunk_size.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_chunk_size.sql diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index f8f4cab63b..da2b86d542 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -29,6 +29,7 @@ import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_chunk_size.libsonnet', import 'sql_exporter/lfc_hits.libsonnet', import 'sql_exporter/lfc_misses.libsonnet', import 'sql_exporter/lfc_used.libsonnet', diff --git a/compute/etc/sql_exporter/lfc_chunk_size.libsonnet b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet new file mode 100644 index 0000000000..bbe56f869f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_chunk_size', + type: 'gauge', + help: 'LFC chunk size, measured in 8KiB pages', + key_labels: null, + values: [ + 'lfc_chunk_size_pages', + ], + query: importstr 'sql_exporter/lfc_chunk_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_chunk_size.sql b/compute/etc/sql_exporter/lfc_chunk_size.sql new file mode 100644 index 0000000000..0905870064 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages'; diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index f6a577abfc..9f0a877b07 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1369,6 +1369,10 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->limit; break; + case 8: + key = "file_cache_chunk_size_pages"; + value = BLOCKS_PER_CHUNK; + break; default: SRF_RETURN_DONE(funcctx); } From 2de3629b88ac8e68bdb5d236796c866e9b1bba25 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Mar 2025 16:02:44 -0500 Subject: [PATCH 47/61] test(pageserver): use reldirv2 by default in regress tests (#11081) ## Problem For pg_regress test, we do both v1 and v2; for all the rest, we default to v2. part of https://github.com/neondatabase/neon/issues/9516 ## Summary of changes Use reldir v2 across test cases by default. --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 15 ++++++---- pageserver/src/tenant.rs | 9 ++++-- .../src/tenant/remote_timeline_client.rs | 8 ++++- test_runner/fixtures/neon_fixtures.py | 3 ++ test_runner/regress/test_pg_regress.py | 29 ++++++++++++++----- 5 files changed, 48 insertions(+), 16 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index e663060d17..8bcc6d58ec 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1964,14 +1964,12 @@ impl DatadirModification<'_> { .context("deserialize db")? }; - // Add the new relation to the rel directory entry, and write it back - if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); - } - let v2_enabled = self.maybe_enable_rel_size_v2()?; if v2_enabled { + if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 @@ -2006,6 +2004,10 @@ impl DatadirModification<'_> { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); } else { + // Add the new relation to the rel directory entry, and write it back + if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } if !dbdir_exists { self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) @@ -2019,6 +2021,7 @@ impl DatadirModification<'_> { )), ); } + // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -2141,7 +2144,7 @@ impl DatadirModification<'_> { // Remove entry from relation size cache self.tline.remove_cached_rel_size(&rel_tag); - // Delete size entry, as well as all blocks + // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage. 
self.delete(rel_key_range(rel_tag)); } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3694381078..c78d15c9b5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2451,6 +2451,7 @@ impl Tenant { create_guard, initdb_lsn, None, + None, ) .await } @@ -2782,6 +2783,7 @@ impl Tenant { timeline_create_guard, initdb_lsn, None, + None, ) .await } @@ -4869,6 +4871,7 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + Some(src_timeline.get_rel_size_v2_status()), ) .await?; @@ -5142,6 +5145,7 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, + None, ) .await?; @@ -5220,13 +5224,14 @@ impl Tenant { create_guard: TimelineCreateGuard, start_lsn: Lsn, ancestor: Option>, + rel_size_v2_status: Option, ) -> anyhow::Result> { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); resources .remote_client - .init_upload_queue_for_empty_remote(new_metadata)?; + .init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?; let timeline_struct = self .create_timeline_struct( @@ -5238,7 +5243,7 @@ impl Tenant { CreateTimelineCause::Load, create_guard.idempotency.clone(), None, - None, + rel_size_v2_status, ) .context("Failed to create timeline data structure")?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 2ca482ca43..a784a05972 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -437,9 +437,13 @@ impl RemoteTimelineClient { /// Initialize the upload queue for the case where the remote storage is empty, /// i.e., it doesn't have an `IndexPart`. + /// + /// `rel_size_v2_status` needs to be carried over during branching, and that's why + /// it's passed in here. pub fn init_upload_queue_for_empty_remote( &self, local_metadata: &TimelineMetadata, + rel_size_v2_status: Option, ) -> anyhow::Result<()> { // Set the maximum number of inprogress tasks to the remote storage concurrency. There's // certainly no point in starting more upload tasks than this. 
@@ -449,7 +453,9 @@ impl RemoteTimelineClient { .as_ref() .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + let initialized_queue = + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + initialized_queue.dirty.rel_size_migration = rel_size_v2_status; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ef9d8cb46f..8e3277a34a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1197,6 +1197,9 @@ class NeonEnv: config.pageserver_default_tenant_config_compaction_algorithm ) + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( self.pageserver_remote_storage diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index d2a78b16e4..1d9f385358 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,7 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any, cast import pytest from fixtures.log_helper import log @@ -118,10 +118,20 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End pageserver.http_client().timeline_gc(shard, env.initial_timeline, None) +def patch_tenant_conf(tenant_conf: dict[str, Any], reldir_type: str) -> dict[str, Any]: + tenant_conf = tenant_conf.copy() + if reldir_type == "v2": + tenant_conf["rel_size_v2_enabled"] = "true" + else: + tenant_conf["rel_size_v2_enabled"] = "false" + return tenant_conf + + # Run the main PostgreSQL regression tests, in src/test/regress. 
# @pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -130,6 +140,7 @@ def test_pg_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -142,7 +153,7 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), initial_tenant_shard_count=shard_count, ) @@ -196,6 +207,7 @@ def test_pg_regress( # @pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_isolation( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -204,6 +216,7 @@ def test_isolation( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "isolation_regression" @@ -211,7 +224,8 @@ def test_isolation( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -267,6 +281,7 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_sql_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -275,6 +290,7 @@ def test_sql_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -282,7 +298,8 @@ def test_sql_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -345,9 +362,7 @@ def test_tx_abort_with_many_relations( """ env = neon_env_builder.init_start( - initial_tenant_conf={ - "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false", - } + initial_tenant_conf=patch_tenant_conf({}, reldir_type), ) ep = env.endpoints.create_start( "main", From 78b322f616a711e40ae8babc4b013782fb12a99a Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Mar 2025 16:43:16 -0500 Subject: [PATCH 48/61] rfc: add 041-rel-sparse-keyspace (#10412) Based on the PoC patch I've done in #10316, I'd like to put an RFC in advance to ensure everyone is on the same page, and start incrementally port the code to the main branch. 
https://github.com/neondatabase/neon/issues/9516 [Rendered](https://github.com/neondatabase/neon/blob/skyzh/rfc-041-rel-sparse-keyspace/docs/rfcs/041-rel-sparse-keyspace.md) --------- Signed-off-by: Alex Chi Z Co-authored-by: Erik Grinaker --- docs/rfcs/041-rel-sparse-keyspace.md | 201 +++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 docs/rfcs/041-rel-sparse-keyspace.md diff --git a/docs/rfcs/041-rel-sparse-keyspace.md b/docs/rfcs/041-rel-sparse-keyspace.md new file mode 100644 index 0000000000..03e68bd5c1 --- /dev/null +++ b/docs/rfcs/041-rel-sparse-keyspace.md @@ -0,0 +1,201 @@ +# Sparse Keyspace for Relation Directories + +## Summary + +This is an RFC describing a new storage strategy for storing relation directories. + +## Motivation + +Postgres maintains a directory structure for databases and relations. In Neon, we store these information +by serializing the directory data in a single key (see `pgdatadir_mapping.rs`). + +```rust +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 + +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +``` + +We have a dedicated structure on the ingestion path to serialize the relation directory into this single key. + +```rust +#[derive(Debug, Serialize, Deserialize, Default)] +pub(crate) struct RelDirectory { + // Set of relations that exist. (relfilenode, forknum) + // + // TODO: Store it as a btree or radix tree or something else that spans multiple + // key-value pairs, if you have a lot of relations + pub(crate) rels: HashSet<(Oid, u8)>, +} +``` + +The current codebase has the following three access patterns for the relation directory. + +1. Check if a relation exists. +2. List all relations. +3. Create/drop a relation. + +For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the +hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get +and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back. + +If we have 100k relations in a database, we would have a 100k-large hash set. Then, every +relation created and dropped would have deserialized and serialized this 100k-large hash set. This makes the +relation create/drop process to be quadratic. When we check if a relation exists in the ingestion path, +we would have to deserialize this super big 100k-large key before checking if a single relation exists. + +In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how +to seamlessly migrate users to use the new keyspace. + +The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316). + +## Key Mapping + +We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in +[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>` +for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`), +into the key. + +```plain +(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted +(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists +``` + +Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be +implemented as follows. + +1. Check if a relation exists: check if the key maps to "exists". +2. List all relations: scan the sprase keyspace over the `rel_dir_key_prefix`. 
Extract relnode and forknum from the key. +3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will + be removed during image layer generation upon compaction. + +Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum. +The mapping is implemented as `rel_tag_sparse_key` in the PoC patch. + +## Changes to Sparse Keyspace + +Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir +information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs +to be updated accordingly to accommodate such "inherited sparse keys". This is done in +[PR#10313](https://github.com/neondatabase/neon/pull/10313). + +## Coexistence of the Old and New Keyspaces + +Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the +ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read +path needs to combine the data from both keyspaces. + +Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the +new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration +process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the +migration can happen seamlessly and imposes no potential downtime for the user. + +With the coexistence assumption, the 3 reldir operations will be implemented as follows: + +1. Check if a relation exists + - Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly + return it to the user. + - Otherwise, deserialize the old reldir key and get the result. +2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key. + Combine them to obtain the final result. +3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace. + - We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check. + - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace. + - For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key, + remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace. + - The delete tombstone will be removed during image layer generation upon compaction. + +This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total +amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction. +There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal +with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives +us `O(1)` complexity after fully opt-in the sparse keyspace. + +The process also implies that a relation will only exists either in the old reldir key or in the new sparse keyspace. 
It is not possible +to have a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN. + +We will introduce a config item and an index_part record to record the current status of the migration process. + +- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace. +- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace. + +If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update +`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to +`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still +read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only: +once v2 is enabled, the user cannot go back to v1. + +## Next Steps + +### Full Migration + +This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and +v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this +code path, we must ensure the timeline has no old reldir data. + +We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces: +the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while +copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in +the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers` +process discovers the following keys at this LSN. + +```plain +db1/reldir_key -> (table 1, table 2, table 3) +...db1 rel keys +db2/reldir_key -> (table 4, table 5, table 6) +...db2 rel keys +sparse_reldir_db2_table7 -> exists +sparse_reldir_db1_table8 -> deleted +``` + +It will generate the following keys: + +```plain +db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`. +...db1 rel keys +db2/reldir_key -> () +...db2 rel keys + +-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180 +sparse_reldir_db1_table1 -> exists +sparse_reldir_db1_table2 -> exists +sparse_reldir_db1_table3 -> exists +sparse_reldir_db2_table4 -> exists +sparse_reldir_db2_table5 -> exists +sparse_reldir_db2_table6 -> exists +sparse_reldir_db2_table7 -> exists +-- end image layer for the sparse keyspace at sparse_reldir_prefix+1 + +# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace. +# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there +# are no correctness issue. +``` + +We must verify that no pending modifications to the old reldir exists in the delta/image layers above the gc-horizon before +we start this process (We can do a vectored read to get the full key history of the old reldir key and ensure there are no more images +above the gc-horizon). Otherwise, it will violate the property that "a relation will only exists either in the old reldir key or +in the new sparse keyspace". 
After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to +`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we +don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers. + +The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code. + +### Consolidate Relation Size Keys + +We have relsize at the end of all relation nodes. + +```plain +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +``` + +This means that computing logical size requires us to do several single-key gets across the keyspace, +potentially requiring downloading many layer files. We could consolidate them into a single +keyspace, improving logical size calculation performance. + +### Migrate DBDir Keys + +We assume the number of databases created by the users will be small, and therefore, the current way +of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into +the sparse keyspace to support large amount of databases. From f343537e4dba622114011154a121df7fb8d57afc Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 6 Mar 2025 09:18:28 +0000 Subject: [PATCH 49/61] proxy: Small adjustments to json logging (#11107) * Remove callsite identifier registration on span creation. Forgot to remove from last PR. Was part of alternative idea. * Move "spans" object to right after "fields", so event and span fields are listed together. --- proxy/src/logging.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index b2e95a109f..6f9845fd6e 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -286,17 +286,16 @@ where /// Registers a SpanFields instance as span extension. fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { - let csid = self.callsite_id(attrs.metadata().callsite()); let span = ctx.span(id).expect("span must exist"); let fields = SpanFields::default(); fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing // event handling and a read or write guard is still held. This includes // the OTel subscriber. let mut exts = span.extensions_mut(); exts.insert(fields); - exts.insert(csid); } fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { @@ -565,6 +564,13 @@ impl EventFormatter { )?; } + let spans = SerializableSpans { + ctx, + callsite_ids, + extract: ExtractedSpanFields::<'_, F>::new(extract_fields), + }; + serializer.serialize_entry("spans", &spans)?; + // TODO: thread-local cache? let pid = std::process::id(); // Skip adding pid 1 to reduce noise for services running in containers. @@ -614,15 +620,9 @@ impl EventFormatter { } } - let stack = SerializableSpans { - ctx, - callsite_ids, - fields: ExtractedSpanFields::<'_, F>::new(extract_fields), - }; - serializer.serialize_entry("spans", &stack)?; - - if stack.fields.has_values() { - serializer.serialize_entry("extract", &stack.fields)?; + if spans.extract.has_values() { + // TODO: add fields from event, too? 
+ serializer.serialize_entry("extract", &spans.extract)?; } serializer.end() @@ -911,7 +911,7 @@ where { ctx: &'a Context<'ctx, Span>, callsite_ids: &'a papaya::HashMap, - fields: ExtractedSpanFields<'a, F>, + extract: ExtractedSpanFields<'a, F>, } impl serde::ser::Serialize for SerializableSpans<'_, '_, Span, F> @@ -940,7 +940,7 @@ where serializer.serialize_value(&SerializableSpanFields { span: &span, - fields: &self.fields, + extract: &self.extract, })?; } } @@ -955,7 +955,7 @@ where Span: for<'lookup> LookupSpan<'lookup>, { span: &'a SpanRef<'span, Span>, - fields: &'a ExtractedSpanFields<'a, F>, + extract: &'a ExtractedSpanFields<'a, F>, } impl serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F> @@ -973,7 +973,7 @@ where for (name, value) in &data.fields.pin() { serializer.serialize_entry(name, value)?; // TODO: replace clone with reference, if possible. - self.fields.set(name, value.clone()); + self.extract.set(name, value.clone()); } } From 16b8a3f598ff3b7b22f4411fdf55c088ecf6c84e Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 6 Mar 2025 09:55:41 +0000 Subject: [PATCH 50/61] Update Jinja2 to 3.1.6 (#11109) https://github.com/neondatabase/neon/security/dependabot/89 --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index ba3b0535e4..03aa543b06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1414,14 +1414,14 @@ files = [ [[package]] name = "jinja2" -version = "3.1.5" +version = "3.1.6" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, - {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, + {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, + {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, ] [package.dependencies] @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" +content-hash = "010ffce959bb256880ab5a267048c182e4612b3151f9a94e3bf5d3a7807962fe" diff --git a/pyproject.toml b/pyproject.toml index c6e5073bcd..e7f5c62bd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ requests = "^2.32.3" pytest-xdist = "^3.3.1" asyncpg = "^0.30.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.5" +Jinja2 = "^3.1.6" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" From ab7efe9e47952292f463027a007f4386c605348e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Mar 2025 11:23:48 +0100 Subject: [PATCH 51/61] pageserver: add amortized read amp metrics (#11093) ## Problem In a batch, `pageserver_layers_per_read_global` counts all layer visits towards every read in the batch, since this directly affects the observed latency of the read. However, this doesn't give a good picture of the amortized read amplification due to batching. ## Summary of changes Add two more global read amp metrics: * `pageserver_layers_per_read_batch_global`: number of layers visited per batch. * `pageserver_layers_per_read_amortized_global`: number of layers divided by reads in a batch. 
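To make the relationship between the three histograms concrete, a minimal sketch of the amortization (simplified; the actual change operates on the batch results directly):

```rust
/// Amortized read amplification for one batch: layers visited divided by the
/// number of reads served from that batch. Only recorded for non-empty batches.
fn amortized_read_amp(layers_visited: usize, reads_in_batch: usize) -> f64 {
    assert!(reads_in_batch > 0);
    layers_visited as f64 / reads_in_batch as f64
}

// Example: a batch of 8 reads that visited 16 layers records 16 in the
// per-read histogram for each of the 8 reads, a single 16 in the per-batch
// histogram, and 2.0 in the amortized histogram for each read.
```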
--- pageserver/src/metrics.rs | 25 +++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 25 ++++++++++++++++++------- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index eb8a9b8e24..b5b4e5c91f 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_batch_global", + "Layers visited to serve a single read batch (read amplification), regardless of number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + +pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_amortized_global", + "Layers visited to serve a single read (read amplification). Amortized across a batch: \ + all visited layers are divided by number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { // We expect this to be low because of Postgres checkpoints. Let's see if that holds. register_histogram!( @@ -4074,6 +4097,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ &LAYERS_PER_READ_GLOBAL, + &LAYERS_PER_READ_BATCH_GLOBAL, + &LAYERS_PER_READ_AMORTIZED_GLOBAL, &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7ed7910732..f646e621d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::metrics::{ - DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL, + LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; use crate::page_service::TenantManagerTypes; use crate::pgdatadir_mapping::{ @@ -1330,10 +1331,6 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - // Record the total number of layers visited towards each key in the batch. While some - // layers may not intersect with a given read, and the cost of layer visits are - // amortized across the batch, each visited layer contributes directly to the observed - // latency for every read in the batch, which is what we care about. if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); @@ -1348,9 +1345,23 @@ impl Timeline { }); } + // Records the number of layers visited in a few different ways: + // + // * LAYERS_PER_READ: all layers count towards every read in the batch, because each + // layer directly affects its observed latency. + // + // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch + // layer visits and access cost. 
+ // + // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized + // read amplification after batching. + let layers_visited = layers_visited as f64; + let avg_layers_visited = layers_visited / results.len() as f64; + LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited); for _ in &results { - self.metrics.layers_per_read.observe(layers_visited as f64); - LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + self.metrics.layers_per_read.observe(layers_visited); + LAYERS_PER_READ_GLOBAL.observe(layers_visited); + LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited); } } From 43cea0df91f29509f69e1083cf1b68645e18c8f9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 6 Mar 2025 11:23:25 +0000 Subject: [PATCH 52/61] pageserver: allow for unit test stress test (#11112) ## Problem I like using `cargo stress` to hammer on a test, but it doesn't work out of the box because it does parallel runs by default and tests always use the same repo dir. ## Summary of changes Add an uuid to the test repo dir when generating it. --- Cargo.lock | 1 + pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 7aa9c53e7e..67f0fa4b77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4303,6 +4303,7 @@ dependencies = [ "tracing", "url", "utils", + "uuid", "wal_decoder", "walkdir", "workspace_hack", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7330856be4..fa16090170 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -98,6 +98,7 @@ criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } indoc.workspace = true +uuid.workspace = true [[bench]] name = "bench_layer_map" diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 33ae8c4790..582019d96f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -491,7 +491,9 @@ impl PageServerConf { #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf { let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into()); - Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}")) + + let test_id = uuid::Uuid::new_v4(); + Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}")) } pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { From 5ceb8c994d5c22737ef6c2c70349525f13cc225c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 6 Mar 2025 11:25:02 +0000 Subject: [PATCH 53/61] pageserver: mark unarchival heatmap layers as cold (#11098) ## Problem On unarchival, we update the previous heatmap with all visible layers. When the primary generates a new heatmap it includes all those layers, so the secondary will download them. Since they're not actually resident on the primary (we didn't call the warm up API), they'll never be evicted, so they remain in the heatmap. We want these layers in the heatmap, since we might wish to warm-up an unarchived timeline after a shard migration. However, we don't want them to be downloaded on the secondary until we've warmed up the primary. ## Summary of Changes Include these layers in the heatmap and mark them as cold. All heatmap operations act on non-cold layers apart from the attached location warming up API, which will download the cold layers. Once the cold layers are downloaded on the primary, they'll be included in the next heatmap as hot and the secondary starts fetching them too. 
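Conceptually, the split introduced here looks like the following sketch (not the literal code; `HeatMapLayer` gains a serde-defaulted `cold` flag in this patch):

```rust
// Minimal stand-in for the real struct; the other fields (name, metadata,
// access_time) are elided here.
struct HeatMapLayer {
    cold: bool,
}

// Hot layers: what secondary downloads and heatmap statistics operate on.
fn hot_layers(layers: &[HeatMapLayer]) -> impl Iterator<Item = &HeatMapLayer> {
    layers.iter().filter(|l| !l.cold)
}

// Cold layers are still listed in the heatmap (written on unarchival), but only
// the explicit warm-up of the attached location downloads them; once resident
// on the primary, the next heatmap upload publishes them as hot.
```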
--- pageserver/src/tenant/secondary/downloader.rs | 14 ++--- pageserver/src/tenant/secondary/heatmap.rs | 24 ++++++-- pageserver/src/tenant/storage_layer/layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 28 ++++++---- .../timeline/heatmap_layers_downloader.rs | 4 +- .../regress/test_pageserver_secondary.py | 55 ++++++++++++++++--- 6 files changed, 96 insertions(+), 33 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index a13b9323ac..5f3a0932c4 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -869,8 +869,7 @@ impl<'a> TenantDownloader<'a> { let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); let layers_in_heatmap = heatmap_timeline - .layers - .iter() + .hot_layers() .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state @@ -1015,7 +1014,8 @@ impl<'a> TenantDownloader<'a> { // Accumulate updates to the state let mut touched = Vec::new(); - for layer in timeline.layers { + let timeline_id = timeline.timeline_id; + for layer in timeline.into_hot_layers() { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); return (Err(UpdateError::Cancelled), touched); @@ -1040,7 +1040,7 @@ impl<'a> TenantDownloader<'a> { } match self - .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .download_layer(tenant_shard_id, &timeline_id, layer, ctx) .await { Ok(Some(layer)) => touched.push(layer), @@ -1148,7 +1148,7 @@ impl<'a> TenantDownloader<'a> { let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_id = timeline.timeline_id; - tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count()); let (result, touched) = self .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) @@ -1316,11 +1316,11 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = - heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + heatmap.hot_layers().map(|l| (&l.name, l)).collect(); let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = if let Some(last_heatmap) = last_heatmap { - last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + last_heatmap.hot_layers().map(|l| (&l.name, l)).collect() } else { HashMap::new() }; diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 4a938e9095..6dbb3f091f 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] pub(crate) timeline_id: TimelineId, - pub(crate) layers: Vec, + layers: Vec, } #[serde_as] @@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer { #[serde_as(as = "TimestampSeconds")] pub(crate) access_time: SystemTime, - // TODO: an actual 'heat' score that would let secondary locations prioritize downloading - // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. 
+ + #[serde(default)] + pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading + // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. } impl HeatMapLayer { @@ -62,11 +64,13 @@ impl HeatMapLayer { name: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + cold: bool, ) -> Self { Self { name, metadata, access_time, + cold, } } } @@ -78,6 +82,18 @@ impl HeatMapTimeline { layers, } } + + pub(crate) fn into_hot_layers(self) -> impl Iterator { + self.layers.into_iter().filter(|l| !l.cold) + } + + pub(crate) fn hot_layers(&self) -> impl Iterator { + self.layers.iter().filter(|l| !l.cold) + } + + pub(crate) fn all_layers(&self) -> impl Iterator { + self.layers.iter() + } } pub(crate) struct HeatMapStats { @@ -92,7 +108,7 @@ impl HeatMapTenant { layers: 0, }; for timeline in &self.timelines { - for layer in &timeline.layers { + for layer in timeline.hot_layers() { stats.layers += 1; stats.bytes += layer.metadata.file_size; } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bde7fbc1f9..247092bf45 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1563,10 +1563,10 @@ impl LayerInner { self.access_stats.record_residence_event(); - self.status.as_ref().unwrap().send_replace(Status::Evicted); - *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); + self.status.as_ref().unwrap().send_replace(Status::Evicted); + Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f646e621d3..4483ecfe94 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3648,7 +3648,7 @@ impl Timeline { let visible_non_resident = match previous_heatmap.as_deref() { Some(PreviousHeatmap::Active { heatmap, read_at, .. - }) => Some(heatmap.layers.iter().filter_map(|hl| { + }) => Some(heatmap.all_layers().filter_map(|hl| { let desc: PersistentLayerDesc = hl.name.clone().into(); let layer = guard.try_get_from_key(&desc.key())?; @@ -3664,7 +3664,7 @@ impl Timeline { return None; } - Some((desc, hl.metadata.clone(), hl.access_time)) + Some((desc, hl.metadata.clone(), hl.access_time, hl.cold)) })), Some(PreviousHeatmap::Obsolete) => None, None => None, @@ -3680,6 +3680,7 @@ impl Timeline { layer.layer_desc().clone(), layer.metadata(), last_activity_ts, + false, // these layers are not cold )) } LayerVisibilityHint::Covered => { @@ -3706,12 +3707,14 @@ impl Timeline { // Sort layers in order of which to download first. For a large set of layers to download, we // want to prioritize those layers which are most likely to still be in the resident many minutes // or hours later: + // - Cold layers go last for convenience when a human inspects the heatmap. // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might // only exist for a few minutes before being compacted into L1s. // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner // the layer is likely to be covered by an image layer during compaction. 
- layers.sort_by_key(|(desc, _meta, _atime)| { + layers.sort_by_key(|(desc, _meta, _atime, cold)| { std::cmp::Reverse(( + *cold, !LayerMap::is_l0(&desc.key_range, desc.is_delta), desc.lsn_range.end, )) @@ -3719,7 +3722,9 @@ impl Timeline { let layers = layers .into_iter() - .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .map(|(desc, meta, atime, cold)| { + HeatMapLayer::new(desc.layer_name(), meta, atime, cold) + }) .collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) @@ -3739,6 +3744,7 @@ impl Timeline { name: vl.layer_desc().layer_name(), metadata: vl.metadata(), access_time: now, + cold: true, }; heatmap_layers.push(hl); } @@ -7040,6 +7046,7 @@ mod tests { use pageserver_api::key::Key; use pageserver_api::value::Value; + use std::iter::Iterator; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -7053,8 +7060,8 @@ mod tests { use crate::tenant::{PreviousHeatmap, Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { - assert_eq!(lhs.layers.len(), rhs.layers.len()); - let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + assert_eq!(lhs.all_layers().count(), rhs.all_layers().count()); + let lhs_rhs = lhs.all_layers().zip(rhs.all_layers()); for (l, r) in lhs_rhs { assert_eq!(l.name, r.name); assert_eq!(l.metadata, r.metadata); @@ -7132,10 +7139,11 @@ mod tests { assert_eq!(heatmap.timeline_id, timeline.timeline_id); // L0 should come last - assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); + let heatmap_layers = heatmap.all_layers().collect::>(); + assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in &heatmap.layers { + for layer in heatmap_layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -7264,7 +7272,7 @@ mod tests { .expect("Infallible while timeline is not shut down"); // Both layers should be in the heatmap - assert!(!heatmap.layers.is_empty()); + assert!(heatmap.all_layers().count() > 0); // Now simulate a migration. 
timeline @@ -7290,7 +7298,7 @@ mod tests { .await .expect("Infallible while timeline is not shut down"); - assert!(post_eviction_heatmap.layers.is_empty()); + assert_eq!(post_eviction_heatmap.all_layers().count(), 0); assert!(matches!( timeline.previous_heatmap.load().as_deref(), Some(PreviousHeatmap::Obsolete) diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs index 6209b63de4..11df232a10 100644 --- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -61,11 +61,11 @@ impl HeatmapLayersDownloader { tracing::info!( resident_size=%timeline.resident_physical_size(), - heatmap_layers=%heatmap.layers.len(), + heatmap_layers=%heatmap.all_layers().count(), "Starting heatmap layers download" ); - let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map( |layer| { let ctx = ctx.attached_child(); let tl = timeline.clone(); diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index ab0f00db1c..b9e2934505 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -955,6 +955,17 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): raise RuntimeError(f"No heatmap for timeline: {tlid}") + def count_timeline_heatmap_layers(tlid) -> tuple[int, int]: + cold, hot = 0, 0 + layers = timeline_heatmap(tlid)["layers"] + for layer in layers: + if layer["cold"]: + cold += 1 + else: + hot += 1 + + return cold, hot + env.storage_controller.allowed_errors.extend( [ ".*Timed out.*downloading layers.*", @@ -988,13 +999,19 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - def all_layers_downloaded(expected_layer_count: int): - local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + def all_layers_downloaded(node, expected_layer_count: int): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") assert local_layers_count >= expected_layer_count - wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) + def no_layers_downloaded(node): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) + + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == 0 + + wait_until(lambda: all_layers_downloaded(ps_secondary, after_migration_heatmap_layers_count)) # Read everything and make sure that we're not downloading anything extra. # All hot layers should be available locally now. 
@@ -1047,13 +1064,35 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) ps_secondary.http_client().tenant_heatmap_upload(tenant_id) - log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") - log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") - expected_locally = len(timeline_heatmap(timeline_id)["layers"]) - assert expected_locally > 0 + parent_cold, parent_hot = count_timeline_heatmap_layers(timeline_id) + child_cold, child_hot = count_timeline_heatmap_layers(child_timeline_id) + + log.info(f"Parent timeline heatmap size: cold={parent_cold}, hot={parent_hot}") + log.info(f"Child timeline heatmap size: cold={child_cold}, hot={child_hot}") + + # All layers in the heatmap should come from the generation on unarchival. + # Hence, they should be cold. + assert parent_cold > 0 + assert parent_hot == 0 + + expected_locally = parent_cold env.storage_controller.download_heatmap_layers( TenantShardId(tenant_id, shard_number=0, shard_count=0), child_timeline_id, recurse=True ) - wait_until(lambda: all_layers_downloaded(expected_locally)) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) + + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")]) + + # The uploaded heatmap is still empty. Clean up all layers on the secondary. + ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: no_layers_downloaded(ps_attached)) + + # Upload a new heatmap. The previously cold layers become hot since they're now resident. + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + # Warm up the current secondary. + ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) From 4b77807de9bc2ea550aff812303f44b71e64aefd Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 6 Mar 2025 16:32:17 +0100 Subject: [PATCH 54/61] fix(compute/sql_exporter): Ignore invalid DBs when collecting size (#11097) ## Problem Original Slack discussion: https://neondb.slack.com/archives/C04DGM6SMTM/p1739915430147169 TL;DR in Postgres, it's totally normal to have 'invalid' DBs (state after the interrupted `DROP DATABASE`). Yet, some of our metrics collected with `sql_exporter` try to get the size of such invalid DBs. 
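In the catalog, such databases are marked via `pg_database.datconnlimit`, which is what the fix below filters on:

```sql
-- Invalid databases (left over from an interrupted DROP DATABASE) carry
-- datconnlimit = -2 and should be skipped when computing sizes.
SELECT datname
FROM pg_database
WHERE datconnlimit = -2;
```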
Typical log lines: ``` time=2025-03-05T16:30:32.368Z level=ERROR source=promhttp.go:52 msg="Error gathering metrics" error="[from Gatherer #1] [collector=neon_collector,query=pg_stats_userdb] pq: [NEON_SMGR] [reqid 0] could not read db size of db 173228 from page server at lsn 0/44A0E8C0" time=2025-03-05T16:30:32.369Z level=ERROR source=promhttp.go:52 msg="Error gathering metrics" error="[from Gatherer #1] [collector=neon_collector,query=db_total_size] pq: [NEON_SMGR] [reqid 0] could not read db size of db 173228 from page server at lsn 0/44A0E8C0" ``` ## Summary of changes Ignore invalid DBs in these two metrics -- `pg_stats_userdb` and `db_total_size` --- compute/etc/sql_exporter/db_total_size.sql | 6 +++++- compute/etc/sql_exporter/pg_stats_userdb.sql | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index 9cbbdfd8a3..fe0360ab5c 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total FROM pg_database; +SELECT sum(pg_database_size(datname)) AS total +FROM pg_database +-- Ignore invalid databases, as we will likely have problems with +-- getting their size from the Pageserver. +WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 00ada87370..12e6c4ae59 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,10 +1,20 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. -SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, - tup_updated AS updated, tup_deleted AS deleted, datname +SELECT pg_database_size(datname) AS db_size, + deadlocks, + tup_inserted AS inserted, + tup_updated AS updated, + tup_deleted AS deleted, + datname FROM pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database - WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 + -- Ignore invalid databases, as we will likely have problems with + -- getting their size from the Pageserver. + WHERE datconnlimit != -2 + AND datname <> 'postgres' + AND NOT datistemplate + ORDER BY oid + LIMIT 10 ); From 11334a2cdb8cf7e5c1c0245bbd61b45c54ac7d69 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 6 Mar 2025 11:44:00 -0500 Subject: [PATCH 55/61] feat(pageserver): more statistics for gc-compaction (#11103) ## Problem part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes * Add timers for each phase of the gc-compaction. * Add a final ratio computation to directly show the garbage collection ratio in the logs. 
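For reference, the reported ratios boil down to the following (a sketch mirroring the formula added below; the `+ 1.0` guards against division by zero):

```rust
/// Size ratio before/after gc-compaction, used both for the uncompressed
/// key-value payload and for the physical layer sizes.
fn size_ratio(original_size: u64, produced_size: u64) -> f64 {
    original_size as f64 / (produced_size as f64 + 1.0)
}

// Example: 10 GiB of visited key/value data compacted down to ~2 GiB yields a
// ratio of roughly 5.0 in the "gc-compaction statistics" log line.
```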
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 107 +++++++++++++++++-- 1 file changed, 99 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 17f7d96e5e..8fa79ddb22 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,6 +7,7 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; +use std::time::Instant; use super::layer_manager::LayerManager; use super::{ @@ -724,17 +725,41 @@ struct CompactionStatisticsNumSize { #[derive(Debug, Serialize, Default)] pub struct CompactionStatistics { + /// Delta layer visited (maybe compressed, physical size) delta_layer_visited: CompactionStatisticsNumSize, + /// Image layer visited (maybe compressed, physical size) image_layer_visited: CompactionStatisticsNumSize, + /// Delta layer produced (maybe compressed, physical size) delta_layer_produced: CompactionStatisticsNumSize, + /// Image layer produced (maybe compressed, physical size) image_layer_produced: CompactionStatisticsNumSize, - num_delta_layer_discarded: usize, - num_image_layer_discarded: usize, + /// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + delta_layer_discarded: CompactionStatisticsNumSize, + /// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + image_layer_discarded: CompactionStatisticsNumSize, num_unique_keys_visited: usize, + /// Delta visited (uncompressed, original size) wal_keys_visited: CompactionStatisticsNumSize, + /// Image visited (uncompressed, original size) image_keys_visited: CompactionStatisticsNumSize, + /// Delta produced (uncompressed, original size) wal_produced: CompactionStatisticsNumSize, + /// Image produced (uncompressed, original size) image_produced: CompactionStatisticsNumSize, + + // Time spent in each phase + time_acquire_lock_secs: f64, + time_analyze_secs: f64, + time_download_layer_secs: f64, + time_main_loop_secs: f64, + time_final_phase_secs: f64, + time_total_secs: f64, + + // Summary + /// Ratio of the key-value size before/after gc-compaction. + uncompressed_size_ratio: f64, + /// Ratio of the physical size before/after gc-compaction. 
+ physical_size_ratio: f64, } impl CompactionStatistics { @@ -784,11 +809,13 @@ impl CompactionStatistics { self.image_produced.num += 1; self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; } - fn discard_delta_layer(&mut self) { - self.num_delta_layer_discarded += 1; + fn discard_delta_layer(&mut self, original_size: u64) { + self.delta_layer_discarded.num += 1; + self.delta_layer_discarded.size += original_size; } - fn discard_image_layer(&mut self) { - self.num_image_layer_discarded += 1; + fn discard_image_layer(&mut self, original_size: u64) { + self.image_layer_discarded.num += 1; + self.image_layer_discarded.size += original_size; } fn produce_delta_layer(&mut self, size: u64) { self.delta_layer_produced.num += 1; @@ -798,6 +825,19 @@ impl CompactionStatistics { self.image_layer_produced.num += 1; self.image_layer_produced.size += size; } + fn finalize(&mut self) { + let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; + let produced_key_value_size = self.image_produced.size + self.wal_produced.size; + self.uncompressed_size_ratio = + original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0 + let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; + let produced_physical_size = self.image_layer_produced.size + + self.delta_layer_produced.size + + self.image_layer_discarded.size + + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate + self.physical_size_ratio = + original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0 + } } #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] @@ -2595,6 +2635,9 @@ impl Timeline { // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. + let timer = Instant::now(); + let begin_timer = timer; + let gc_lock = async { tokio::select! { guard = self.gc_lock.lock() => Ok(guard), @@ -2602,6 +2645,9 @@ impl Timeline { } }; + let time_acquire_lock = timer.elapsed(); + let timer = Instant::now(); + let gc_lock = crate::timed( gc_lock, "acquires gc lock", @@ -2791,6 +2837,9 @@ impl Timeline { has_data_below, ); + let time_analyze = timer.elapsed(); + let timer = Instant::now(); + for layer in &job_desc.selected_layers { debug!("read layer: {}", layer.layer_desc().key()); } @@ -2893,6 +2942,9 @@ impl Timeline { .context("failed to create filter iterator") .map_err(CompactionError::Other)?; + let time_download_layer = timer.elapsed(); + let timer = Instant::now(); + // Step 2: Produce images+deltas. 
let mut accumulated_values = Vec::new(); let mut last_key: Option = None; @@ -3114,6 +3166,9 @@ impl Timeline { .map_err(CompactionError::Other)?; // end: move the above part to the loop body + let time_main_loop = timer.elapsed(); + let timer = Instant::now(); + let mut rewrote_delta_layers = Vec::new(); for (key, writers) in delta_layer_rewriters { if let Some(delta_writer_before) = writers.before { @@ -3178,6 +3233,13 @@ impl Timeline { let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); let produced_image_layers_len = produced_image_layers.len(); + + let layer_selection_by_key = job_desc + .selected_layers + .iter() + .map(|l| (l.layer_desc().key(), l.layer_desc().clone())) + .collect::>(); + for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { @@ -3191,8 +3253,16 @@ impl Timeline { if cfg!(debug_assertions) { info!("discarded delta layer: {}", l); } + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_delta_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_delta_layer(0); + } keep_layers.insert(l); - stat.discard_delta_layer(); } } } @@ -3201,6 +3271,9 @@ impl Timeline { "produced rewritten delta layer: {}", layer.layer_desc().key() ); + // For now, we include rewritten delta layer size in the "produce_delta_layer". We could + // make it a separate statistics in the future. + stat.produce_delta_layer(layer.layer_desc().file_size()); } compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { @@ -3212,8 +3285,16 @@ impl Timeline { } BatchWriterResult::Discarded(l) => { debug!("discarded image layer: {}", l); + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_image_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_image_layer(0); + } keep_layers.insert(l); - stat.discard_image_layer(); } } } @@ -3261,6 +3342,16 @@ impl Timeline { layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + let time_final_phase = timer.elapsed(); + + stat.time_final_phase_secs = time_final_phase.as_secs_f64(); + stat.time_main_loop_secs = time_main_loop.as_secs_f64(); + stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); + stat.time_download_layer_secs = time_download_layer.as_secs_f64(); + stat.time_analyze_secs = time_analyze.as_secs_f64(); + stat.time_total_secs = begin_timer.elapsed().as_secs_f64(); + stat.finalize(); + info!( "gc-compaction statistics: {}", serde_json::to_string(&stat) From 3bb318a2959b38cdf6fb6b074432144926312ada Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 6 Mar 2025 18:47:54 +0100 Subject: [PATCH 56/61] run periodic page bench more frequently to simplify bi-secting regressions (#11121) ## Problem When periodic pagebench runs only once a day a lot of commits can be in between a good run and a regression. 
## Summary of changes Run the workflow every 3 hours --- .github/workflows/periodic_pagebench.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 0622faba33..f854bf3212 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central on: schedule: # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 18 * * *' # Runs at 6 PM UTC every day + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 */3 * * *' # Runs every 3 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: commit_hash: From 3dee29eb0069b2d31a5fb3a810d7ec4c8966f6af Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 6 Mar 2025 19:14:19 +0000 Subject: [PATCH 57/61] Spawn rsyslog from neonvm (#11111) then configure it from compute_ctl. to make it more robust in case of restarts and rsyslogd crashes. --- compute/compute-node.Dockerfile | 10 ++-- compute/vm-image-spec-bookworm.yaml | 16 ++++++ compute/vm-image-spec-bullseye.yaml | 15 ++++++ compute_tools/src/compute.rs | 4 +- ...nf => compute_audit_rsyslog_template.conf} | 0 compute_tools/src/rsyslog.rs | 49 +++++++++---------- 6 files changed, 60 insertions(+), 34 deletions(-) rename compute_tools/src/config_template/{compute_rsyslog_template.conf => compute_audit_rsyslog_template.conf} (100%) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 61d9d59f79..6e46185e36 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1980,12 +1980,10 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig # rsyslog config permissions -RUN chown postgres:postgres /etc/rsyslog.conf && \ - touch /etc/compute_rsyslog.conf && \ - chown -R postgres:postgres /etc/compute_rsyslog.conf && \ - # directory for rsyslogd pid file - mkdir /var/run/rsyslogd && \ - chown -R postgres:postgres /var/run/rsyslogd +# directory for rsyslogd pid file +RUN mkdir /var/run/rsyslogd && \ + chown -R postgres:postgres /var/run/rsyslogd && \ + chown -R postgres:postgres /etc/rsyslog.d/ ENV LANG=en_US.utf8 diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 74ff3a8b6d..e6707381ac 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -39,6 +39,10 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -69,6 +73,12 @@ files: } memory {} } +# Create dummy rsyslog config, because 
it refuses to start without at least one action configured. +# compute_ctl will rewrite this file with the actual configuration, if needed. + - filename: compute_rsyslog.conf + content: | + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -132,6 +142,12 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN chmod 0666 /var/log/ + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index c1787ab018..c89ee112dc 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -39,6 +39,10 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -69,6 +73,12 @@ files: } memory {} } +# Create dummy rsyslog config, because it refuses to start without at least one action configured. +# compute_ctl will rewrite this file with the actual configuration, if needed. + - filename: compute_rsyslog.conf + content: | + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -128,6 +138,11 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN chmod 0666 /var/log/ + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index ddcfe12330..fed97ee2b2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -37,7 +37,7 @@ use crate::logger::startup_context_from_env; use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::monitor::launch_monitor; use crate::pg_helpers::*; -use crate::rsyslog::configure_and_start_rsyslog; +use crate::rsyslog::configure_audit_rsyslog; use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -689,7 +689,7 @@ impl ComputeNode { let log_directory_path = Path::new(&self.params.pgdata).join("log"); // TODO: make this more robust // now rsyslog starts once and there is no monitoring or restart if it fails - configure_and_start_rsyslog( + configure_audit_rsyslog( log_directory_path.to_str().unwrap(), "hipaa", &remote_endpoint, diff --git a/compute_tools/src/config_template/compute_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf similarity index 100% rename from compute_tools/src/config_template/compute_rsyslog_template.conf rename to compute_tools/src/config_template/compute_audit_rsyslog_template.conf diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 776ff14644..c8fba4fdcd 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -21,40 +21,34 @@ fn 
get_rsyslog_pid() -> Option { } } -// Start rsyslogd with the specified configuration file -// If it is already running, do nothing. -fn start_rsyslog(rsyslog_conf_path: &str) -> Result<()> { - let pid = get_rsyslog_pid(); - if let Some(pid) = pid { - info!("rsyslogd is already running with pid: {}", pid); - return Ok(()); - } +// Restart rsyslogd to apply the new configuration. +// This is necessary, because there is no other way to reload the rsyslog configuration. +// +// Rsyslogd shouldn't lose any messages, because of the restart, +// because it tracks the last read position in the log files +// and will continue reading from that position. +// TODO: test it properly +// +fn restart_rsyslog() -> Result<()> { + let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?; + info!("rsyslogd is running with pid: {}, restart it", old_pid); - let _ = Command::new("/usr/sbin/rsyslogd") - .arg("-f") - .arg(rsyslog_conf_path) - .arg("-i") - .arg("/var/run/rsyslogd/rsyslogd.pid") + // kill it to restart + let _ = Command::new("pkill") + .arg("rsyslogd") .output() - .context("Failed to start rsyslogd")?; - - // Check that rsyslogd is running - if let Some(pid) = get_rsyslog_pid() { - info!("rsyslogd started successfully with pid: {}", pid); - } else { - return Err(anyhow::anyhow!("Failed to start rsyslogd")); - } + .context("Failed to stop rsyslogd")?; Ok(()) } -pub fn configure_and_start_rsyslog( +pub fn configure_audit_rsyslog( log_directory: &str, tag: &str, remote_endpoint: &str, ) -> Result<()> { let config_content: String = format!( - include_str!("config_template/compute_rsyslog_template.conf"), + include_str!("config_template/compute_audit_rsyslog_template.conf"), log_directory = log_directory, tag = tag, remote_endpoint = remote_endpoint @@ -62,7 +56,7 @@ pub fn configure_and_start_rsyslog( info!("rsyslog config_content: {}", config_content); - let rsyslog_conf_path = "/etc/compute_rsyslog.conf"; + let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf"; let mut file = OpenOptions::new() .create(true) .write(true) @@ -71,10 +65,13 @@ pub fn configure_and_start_rsyslog( file.write_all(config_content.as_bytes())?; - info!("rsyslog configuration added successfully. Starting rsyslogd"); + info!( + "rsyslog configuration file {} added successfully. Starting rsyslogd", + rsyslog_conf_path + ); // start the service, using the configuration - start_rsyslog(rsyslog_conf_path)?; + restart_rsyslog()?; Ok(()) } From a485022300005491a62e3371764f029367380be0 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 6 Mar 2025 20:54:29 +0100 Subject: [PATCH 58/61] fix(compute_ctl): Properly escape identifiers inside PL/pgSQL blocks (#11045) ## Problem In f37eeb56, I properly escaped the identifier, but I haven't noticed that the resulting string is used in the `format('...')`, so it needs additional escaping. Yet, after looking at it closer and with Heikki's and Tristan's help, it appeared to be that it's a full can of worms and we have problems all over the code in places where we use PL/pgSQL blocks. ## Summary of changes Add a new `pg_quote_dollar()` helper to deal with it, as dollar-quoting of strings seems to be the only robust way to escape strings in dynamic PL/pgSQL blocks. We mimic the Postgres' `pg_get_functiondef` logic here [1]. While on it, I added more tests and caught a couple of more bugs with string escaping: 1. `get_existing_dbs_async()` was wrapping `owner` in additional double-quotes if it contained special characters 2. 
`construct_superuser_query()` was flawed in even more ways than the rest of the code. It wasn't realistic to fix it quickly, but after thinking about it more, I realized that we could drop most of it altogether. IIUC, it was added as some sort of migration, probably back when we haven't had migrations yet. So all the complicated code was needed to properly update existing roles and DBs. In the current Neon, this code only runs before we create the very first DB and role. When we create roles and DBs, all `neon_superuser` grants are added in the different places. So the worst thing that could happen is that there is an ancient branch somewhere, so when users poke it, they will realize that not all Neon features work as expected. Yet, the fix is simple and self-serve -- just create a new role via UI or API, and it will get a proper `neon_superuser` grant. [1]: https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L3153 Closes neondatabase/cloud#25048 --- compute_tools/src/compute.rs | 73 --------- compute_tools/src/pg_helpers.rs | 34 +++- compute_tools/src/spec_apply.rs | 49 +++--- .../src/sql/create_neon_superuser.sql | 8 + compute_tools/src/sql/drop_subscriptions.sql | 4 +- .../sql/pre_drop_role_revoke_privileges.sql | 10 +- .../src/sql/set_public_schema_owner.sql | 7 +- .../src/sql/unset_template_for_drop_dbs.sql | 8 +- compute_tools/tests/pg_helpers_tests.rs | 17 ++ test_runner/regress/test_compute_catalog.py | 150 +++++++++++++----- 10 files changed, 210 insertions(+), 150 deletions(-) create mode 100644 compute_tools/src/sql/create_neon_superuser.sql diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index fed97ee2b2..354528e2cd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -297,79 +297,6 @@ struct StartVmMonitorResult { vm_monitor: Option>>, } -pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { - let roles = spec - .cluster - .roles - .iter() - .map(|r| escape_literal(&r.name)) - .collect::>(); - - let dbs = spec - .cluster - .databases - .iter() - .map(|db| escape_literal(&db.name)) - .collect::>(); - - let roles_decl = if roles.is_empty() { - String::from("roles text[] := NULL;") - } else { - format!( - r#" - roles text[] := ARRAY(SELECT rolname - FROM pg_catalog.pg_roles - WHERE rolname IN ({}));"#, - roles.join(", ") - ) - }; - - let database_decl = if dbs.is_empty() { - String::from("dbs text[] := NULL;") - } else { - format!( - r#" - dbs text[] := ARRAY(SELECT datname - FROM pg_catalog.pg_database - WHERE datname IN ({}));"#, - dbs.join(", ") - ) - }; - - // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases - // (see https://www.postgresql.org/docs/current/ddl-priv.html) - let query = format!( - r#" - DO $$ - DECLARE - r text; - {} - {} - BEGIN - IF NOT EXISTS ( - SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') - THEN - CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; - IF array_length(roles, 1) IS NOT NULL THEN - EXECUTE format('GRANT neon_superuser TO %s', - array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', ')); - FOREACH r IN ARRAY roles LOOP - EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r)); - END LOOP; - END IF; - IF array_length(dbs, 1) IS NOT NULL THEN - EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser', - array_to_string(ARRAY(SELECT quote_ident(x) FROM 
unnest(dbs) as x), ', ')); - END IF; - END IF; - END - $$;"#, - roles_decl, database_decl, - ); - - query -} - impl ComputeNode { pub fn new( params: ComputeNodeParams, diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5a2e305e1d..dd8d8e9b8b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -186,15 +186,40 @@ impl DatabaseExt for Database { /// Postgres SQL queries and DATABASE_URL. pub trait Escaping { fn pg_quote(&self) -> String; + fn pg_quote_dollar(&self) -> (String, String); } impl Escaping for PgIdent { /// This is intended to mimic Postgres quote_ident(), but for simplicity it /// always quotes provided string with `""` and escapes every `"`. /// **Not idempotent**, i.e. if string is already escaped it will be escaped again. + /// N.B. it's not useful for escaping identifiers that are used inside WHERE + /// clause, use `escape_literal()` instead. fn pg_quote(&self) -> String { - let result = format!("\"{}\"", self.replace('"', "\"\"")); - result + format!("\"{}\"", self.replace('"', "\"\"")) + } + + /// This helper is intended to be used for dollar-escaping strings for usage + /// inside PL/pgSQL procedures. In addition to dollar-escaping the string, + /// it also returns a tag that is intended to be used inside the outer + /// PL/pgSQL procedure. If you do not need an outer tag, just discard it. + /// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`, + /// + fn pg_quote_dollar(&self) -> (String, String) { + let mut tag: String = "".to_string(); + let mut outer_tag = "x".to_string(); + + // Find the first suitable tag that is not present in the string. + // Postgres' max role/DB name length is 63 bytes, so even in the + // worst case it won't take long. + while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) { + tag += "x"; + outer_tag = tag.clone() + "x"; + } + + let escaped = format!("${tag}${self}${tag}$"); + + (escaped, outer_tag) } } @@ -226,10 +251,13 @@ pub async fn get_existing_dbs_async( // invalid state. See: // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 let rowstream = client + // We use a subquery instead of a fancy `datdba::regrole::text AS owner`, + // because the latter automatically wraps the result in double quotes, + // if the role name contains special characters. 
.query_raw::( "SELECT datname AS name, - datdba::regrole::text AS owner, + (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, NOT datallowconn AS restrict_conn, datconnlimit = - 2 AS invalid FROM diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index dbc02c8d02..e5f7aebbf8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -13,16 +13,17 @@ use tokio_postgres::Client; use tokio_postgres::error::SqlState; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; -use crate::compute::{ComputeNode, ComputeState, construct_superuser_query}; +use crate::compute::{ComputeNode, ComputeState}; use crate::pg_helpers::{ - DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async, + DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async, get_existing_roles_async, }; use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension, - CreatePgauditlogtofileExtension, CreateSchemaNeon, CreateSuperUser, DisablePostgresDBPgAudit, - DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension, - HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser, + CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon, + DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, + RunInEachDatabase, }; use crate::spec_apply::PerDatabasePhase::{ ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, @@ -187,7 +188,7 @@ impl ComputeNode { } for phase in [ - CreateSuperUser, + CreateNeonSuperuser, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -468,7 +469,7 @@ pub enum PerDatabasePhase { #[derive(Clone, Debug)] pub enum ApplySpecPhase { - CreateSuperUser, + CreateNeonSuperuser, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -595,14 +596,10 @@ async fn get_operations<'a>( apply_spec_phase: &'a ApplySpecPhase, ) -> Result + 'a + Send>> { match apply_spec_phase { - ApplySpecPhase::CreateSuperUser => { - let query = construct_superuser_query(spec); - - Ok(Box::new(once(Operation { - query, - comment: None, - }))) - } + ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation { + query: include_str!("sql/create_neon_superuser.sql").to_string(), + comment: None, + }))), ApplySpecPhase::DropInvalidDatabases => { let mut ctx = ctx.write().await; let databases = &mut ctx.dbs; @@ -736,14 +733,15 @@ async fn get_operations<'a>( // We do not check whether the DB exists or not, // Postgres will take care of it for us "delete_db" => { + let (db_name, outer_tag) = op.name.pg_quote_dollar(); // In Postgres we can't drop a database if it is a template. // So we need to unset the template flag first, but it could // be a retry, so we could've already dropped the database. // Check that database exists first to make it idempotent. let unset_template_query: String = format!( include_str!("sql/unset_template_for_drop_dbs.sql"), - datname_str = escape_literal(&op.name), - datname = &op.name.pg_quote() + datname = db_name, + outer_tag = outer_tag, ); // Use FORCE to drop database even if there are active connections. 
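Aside: as a standalone illustration of the dollar-quoting scheme that `pg_quote_dollar()` introduces — an inner tag wraps the untrusted value, an outer tag wraps the surrounding `DO` block so the two can never collide — here is a sketch that mirrors the tag-search loop and reuses two cases from the patch's own unit tests. The `quote_dollar` name and the hostile role name are illustrative, not from the patch:

```rust
// Minimal re-implementation of the tag-search logic in `pg_quote_dollar()`:
// grow the tag until neither `$tag$` nor `$outer_tag$` occurs in the input.
fn quote_dollar(s: &str) -> (String, String) {
    let mut tag = String::new();
    let mut outer_tag = "x".to_string();
    while s.contains(&format!("${tag}$")) || s.contains(&format!("${outer_tag}$")) {
        tag += "x";
        outer_tag = tag.clone() + "x";
    }
    (format!("${tag}${s}${tag}$"), outer_tag)
}

fn main() {
    // Two cases taken from the patch's unit tests.
    assert_eq!(
        quote_dollar("name"),
        ("$$name$$".to_string(), "x".to_string())
    );
    assert_eq!(
        quote_dollar("name$$"),
        ("$x$name$$$x$".to_string(), "xx".to_string())
    );

    // A hostile role name becomes an inert dollar-quoted literal inside the
    // generated DO block; `%I` then quotes it as an identifier at execution time.
    let (role, outer) = quote_dollar("naughty'; $$ role");
    let sql = format!(
        "DO ${outer}$ BEGIN EXECUTE format('ALTER ROLE %I NOLOGIN', {role}); END ${outer}$;"
    );
    println!("{sql}");
}
```

Growing the tag one `x` at a time mirrors the `pg_get_functiondef()` behaviour referenced in the commit message, so even a value that itself contains `$$` or `$x$` can never terminate the quoting early.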
@@ -850,6 +848,8 @@ async fn get_operations<'a>( comment: None, }, Operation { + // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database + // (see https://www.postgresql.org/docs/current/ddl-priv.html) query: format!( "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", db.name.pg_quote() @@ -909,9 +909,11 @@ async fn get_operations<'a>( PerDatabasePhase::DropLogicalSubscriptions => { match &db { DB::UserDB(db) => { + let (db_name, outer_tag) = db.name.pg_quote_dollar(); let drop_subscription_query: String = format!( include_str!("sql/drop_subscriptions.sql"), - datname_str = escape_literal(&db.name), + datname_str = db_name, + outer_tag = outer_tag, ); let operations = vec![Operation { @@ -950,6 +952,7 @@ async fn get_operations<'a>( DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), DB::UserDB(db) => db.owner.pg_quote(), }; + let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); Some(vec![ // This will reassign all dependent objects to the db owner @@ -964,7 +967,9 @@ async fn get_operations<'a>( Operation { query: format!( include_str!("sql/pre_drop_role_revoke_privileges.sql"), - role_name = quoted, + // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + role_name = escaped_role, + outer_tag = outer_tag, ), comment: None, }, @@ -989,12 +994,14 @@ async fn get_operations<'a>( DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; + let (db_owner, outer_tag) = db.owner.pg_quote_dollar(); let operations = vec![ Operation { query: format!( include_str!("sql/set_public_schema_owner.sql"), - db_owner = db.owner.pg_quote() + db_owner = db_owner, + outer_tag = outer_tag, ), comment: None, }, diff --git a/compute_tools/src/sql/create_neon_superuser.sql b/compute_tools/src/sql/create_neon_superuser.sql new file mode 100644 index 0000000000..300645627b --- /dev/null +++ b/compute_tools/src/sql/create_neon_superuser.sql @@ -0,0 +1,8 @@ +DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') + THEN + CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; + END IF; + END +$$; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index 03e8e158fa..f5d9420130 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -1,4 +1,4 @@ -DO $$ +DO ${outer_tag}$ DECLARE subname TEXT; BEGIN @@ -9,4 +9,4 @@ BEGIN EXECUTE format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; -$$; +${outer_tag}$; diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index cdaa7071d3..4342650591 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -1,6 +1,6 @@ SET SESSION ROLE neon_superuser; -DO $$ +DO ${outer_tag}$ DECLARE schema TEXT; revoke_query TEXT; @@ -16,13 +16,15 @@ BEGIN WHERE schema_name IN ('public') LOOP revoke_query := format( - 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;', - schema + 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY neon_superuser;', + schema, + -- N.B. 
this has to be properly dollar-escaped with `pg_quote_dollar()` + {role_name} ); EXECUTE revoke_query; END LOOP; END; -$$; +${outer_tag}$; RESET ROLE; diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index fd061a713e..dc502c6d2d 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -1,5 +1,4 @@ -DO -$$ +DO ${outer_tag}$ DECLARE schema_owner TEXT; BEGIN @@ -16,8 +15,8 @@ $$ IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' THEN - ALTER SCHEMA public OWNER TO {db_owner}; + EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END -$$; \ No newline at end of file +${outer_tag}$; \ No newline at end of file diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 6c4343a589..36dc648beb 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -1,12 +1,12 @@ -DO $$ +DO ${outer_tag}$ BEGIN IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname_str} + WHERE datname = {datname} ) THEN - ALTER DATABASE {datname} is_template false; + EXECUTE format('ALTER DATABASE %I is_template false', {datname}); END IF; END -$$; \ No newline at end of file +${outer_tag}$; diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 4961bc293d..f2d74ff384 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -61,6 +61,23 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } + #[test] + fn ident_pg_quote_dollar() { + let test_cases = vec![ + ("name", ("$$name$$", "x")), + ("name$$", ("$x$name$$$x$", "xx")), + ("name$$$", ("$x$name$$$$x$", "xx")), + ("name$$$$", ("$x$name$$$$$x$", "xx")), + ("name$x$", ("$xx$name$x$$xx$", "xxx")), + ]; + + for (input, expected) in test_cases { + let (escaped, tag) = PgIdent::from(input).pg_quote_dollar(); + assert_eq!(escaped, expected.0); + assert_eq!(tag, expected.1); + } + } + #[test] fn generic_options_search() { let generic_options: GenericOptions = Some(vec![ diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 3a08671bbf..ce655d22b5 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -5,34 +5,59 @@ import logging import requests from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +TEST_ROLE_NAMES = [ + {"name": "neondb_owner"}, + {"name": "role with spaces"}, + {"name": "role with%20spaces "}, + {"name": "role with whitespaces "}, + {"name": "injective role with spaces'; SELECT pg_sleep(1000);"}, + {"name": "role with #pound-sign and &ersands=true"}, + {"name": "role with emoji 🌍"}, + {"name": "role \";with ';injections $$ $x$ $ %I !/\\&#@"}, + {"name": '"role in double quotes"'}, + {"name": "'role in single quotes'"}, +] + TEST_DB_NAMES = [ { "name": "neondb", - "owner": "cloud_admin", + "owner": "neondb_owner", }, { "name": "db with spaces", - "owner": "cloud_admin", + "owner": "role with spaces", }, { "name": "db with%20spaces ", - "owner": "cloud_admin", + "owner": "role with%20spaces ", }, { "name": "db with whitespaces ", - "owner": "cloud_admin", + "owner": "role with whitespaces ", }, { - "name": "injective db with spaces'; 
SELECT pg_sleep(10);", - "owner": "cloud_admin", + "name": "injective db with spaces'; SELECT pg_sleep(1000);", + "owner": "injective role with spaces'; SELECT pg_sleep(1000);", }, { "name": "db with #pound-sign and &ersands=true", - "owner": "cloud_admin", + "owner": "role with #pound-sign and &ersands=true", }, { "name": "db with emoji 🌍", - "owner": "cloud_admin", + "owner": "role with emoji 🌍", + }, + { + "name": "db \";with ';injections $$ $x$ $ %I !/\\&#@", + "owner": "role \";with ';injections $$ $x$ $ %I !/\\&#@", + }, + { + "name": '"db in double quotes"', + "owner": '"role in double quotes"', + }, + { + "name": "'db in single quotes'", + "owner": "'role in single quotes'", }, ] @@ -52,6 +77,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -99,10 +125,10 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ), f"Expected 404 status code, but got {e.response.status_code}" -def test_compute_create_databases(neon_simple_env: NeonEnv): +def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): """ - Test that compute_ctl can create and work with databases with special - characters (whitespaces, %, tabs, etc.) in the name. + Test that compute_ctl can create and work with databases and roles + with special characters (whitespaces, %, tabs, etc.) in the name. """ env = neon_simple_env @@ -116,6 +142,7 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -139,6 +166,43 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): assert len(curr_db) == 1 assert curr_db[0] == db["name"] + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is not None + assert catalog_role[0] == role["name"] + + delta_operations = [] + for db in TEST_DB_NAMES: + delta_operations.append({"action": "delete_db", "name": db["name"]}) + for role in TEST_ROLE_NAMES: + delta_operations.append({"action": "delete_role", "name": role["name"]}) + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [], + "databases": [], + }, + "delta_operations": delta_operations, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is None + + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is None + def test_dropdb_with_subscription(neon_simple_env: NeonEnv): """ @@ -150,17 +214,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # stuff into the spec.json file. 
endpoint = env.endpoints.create_start("main") + SUB_DB_NAME = "';subscriber_db $$ $x$ $;" + PUB_DB_NAME = "publisher_db" TEST_DB_NAMES = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "subscriber_db", + "name": SUB_DB_NAME, "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -177,47 +243,47 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): ) endpoint.reconfigure() - # connect to the publisher_db and create a publication - with endpoint.cursor(dbname="publisher_db") as cursor: + # Connect to the PUB_DB_NAME and create a publication + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES") cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") cursor.execute("CHECKPOINT") - # connect to the subscriber_db and create a subscription - # Note that we need to create subscription with - connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") - with endpoint.cursor(dbname="subscriber_db") as cursor: + # Connect to the SUB_DB_NAME and create a subscription + # Note that we need to create subscription with the following connstr: + connstr = endpoint.connstr(dbname=PUB_DB_NAME).replace("'", "''") + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("CREATE TABLE t(a int)") cursor.execute( - f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " + f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " ) - # wait for the subscription to be active + # Wait for the subscription to be active logical_replication_sync( endpoint, endpoint, "mysub", - sub_dbname="subscriber_db", - pub_dbname="publisher_db", + sub_dbname=SUB_DB_NAME, + pub_dbname=PUB_DB_NAME, ) # Check that replication is working - with endpoint.cursor(dbname="subscriber_db") as cursor: + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM t") rows = cursor.fetchall() assert len(rows) == 1 assert rows[0][0] == 1 - # drop the subscriber_db from the list + # Drop the SUB_DB_NAME from the list TEST_DB_NAMES_NEW = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -230,7 +296,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): "databases": TEST_DB_NAMES_NEW, }, "delta_operations": [ - {"action": "delete_db", "name": "subscriber_db"}, + {"action": "delete_db", "name": SUB_DB_NAME}, # also test the case when we try to delete a non-existent database # shouldn't happen in normal operation, # but can occur when failed operations are retried @@ -239,22 +305,22 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): } ) - logging.info("Reconfiguring the endpoint to drop the subscriber_db") + logging.info(f"Reconfiguring the endpoint to drop the {SUB_DB_NAME} database") endpoint.reconfigure() - # Check that the subscriber_db is dropped + # Check that the SUB_DB_NAME is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",)) + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (SUB_DB_NAME,)) catalog_db = cursor.fetchone() assert catalog_db is None - # Check that we can still connect to the publisher_db - with endpoint.cursor(dbname="publisher_db") as cursor: + # 
Check that we can still connect to the PUB_DB_NAME + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM current_database()") curr_db = cursor.fetchone() assert curr_db is not None assert len(curr_db) == 1 - assert curr_db[0] == "publisher_db" + assert curr_db[0] == PUB_DB_NAME def test_compute_drop_role(neon_simple_env: NeonEnv): @@ -265,6 +331,7 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): """ env = neon_simple_env TEST_DB_NAME = "db_with_permissions" + TEST_GRANTEE = "'); MALFORMED SQL $$ $x$ $/;5%$ %I" endpoint = env.endpoints.create_start("main") @@ -301,16 +368,18 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): cursor.execute("create view test_view as select * from test_table") with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor: - cursor.execute("create role readonly") + cursor.execute(f'create role "{TEST_GRANTEE}"') # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database. # Postgres has all sorts of permissions and grants that we may not handle well, # but this is the shortest repro grant for the issue # https://github.com/neondatabase/cloud/issues/13582 - cursor.execute("grant select on all tables in schema public to readonly") + cursor.execute(f'grant select on all tables in schema public to "{TEST_GRANTEE}"') # Check that role was created with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is not None @@ -318,7 +387,8 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # that may block our ability to drop the role. with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: cursor.execute( - "select grantor from information_schema.role_table_grants where grantee = 'readonly'" + "select grantor from information_schema.role_table_grants where grantee = %(grantee)s", + {"grantee": TEST_GRANTEE}, ) res = cursor.fetchall() assert len(res) == 2, f"Expected 2 table grants, got {len(res)}" @@ -332,7 +402,7 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): "delta_operations": [ { "action": "delete_role", - "name": "readonly", + "name": TEST_GRANTEE, }, ], } @@ -341,7 +411,9 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # Check that role is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is None From 50d883d516b620c7f91ce20adf777c4c724f0b37 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Thu, 6 Mar 2025 20:59:17 +0100 Subject: [PATCH 59/61] Add performance-correctness to the CODEOWNERS (#11124) ## Problem After splitting teams it became a bit more complicated for the PerfCorr team to work on tests changes. ## Summary of changes 1. Add PerfCorr team co-owners for `.github/` folder 2. 
Add PerCorr team as owner for `test_runner/` folder --- CODEOWNERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 71b5e65f94..ab6d2257a4 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,8 +1,9 @@ # Autoscaling /libs/vm_monitor/ @neondatabase/autoscaling -# DevProd -/.github/ @neondatabase/developer-productivity +# DevProd & PerfCorr +/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness +/test_runner/ @neondatabase/performance-correctness # Compute /pgxn/ @neondatabase/compute From e825974a2d2791611c5278e36e67560f61100ebd Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 6 Mar 2025 15:30:11 -0500 Subject: [PATCH 60/61] feat(pageserver): yield gc-compaction to L0 compaction (#11120) ## Problem Part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes gc-compaction could take a long time in some cases, for example, if the job split heuristics is wrong and we selected a too large region for compaction that can't be finished within a reasonable amount of time. We will give up such tasks and yield to L0 compaction. Each gc-compaction sub-compaction job is atomic and cannot be split further so we have to give up (instead of storing a state and continue later as in image creation). --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 62 ++++++++++++++++---- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8fa79ddb22..42b36f7252 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -20,6 +20,7 @@ use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; @@ -443,6 +444,7 @@ impl GcCompactionQueue { )); }; let has_pending_tasks; + let mut yield_for_l0 = false; let Some((id, item)) = ({ let mut guard = self.inner.lock().unwrap(); if let Some((id, item)) = guard.queued.pop_front() { @@ -492,13 +494,23 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); } - let _ = timeline.compact_with_options(cancel, options, ctx).await?; + let compaction_result = + timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); + if compaction_result == CompactionOutcome::YieldForL0 { + yield_for_l0 = true; + } } } GcCompactionQueueItem::SubCompactionJob(options) => { // TODO: error handling, clear the queue if any task fails? - let _ = timeline.compact_with_options(cancel, options, ctx).await?; + let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?; + if compaction_result == CompactionOutcome::YieldForL0 { + // We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running + // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because + // we need to clean things up before returning from the function. 
+ yield_for_l0 = true; + } } GcCompactionQueueItem::Notify(id, l2_lsn) => { self.notify_and_unblock(id); @@ -527,7 +539,10 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(if has_pending_tasks { + Ok(if yield_for_l0 { + tracing::info!("give up gc-compaction: yield for L0 compaction"); + CompactionOutcome::YieldForL0 + } else if has_pending_tasks { CompactionOutcome::Pending } else { CompactionOutcome::Done @@ -2598,7 +2613,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); if sub_compaction { @@ -2620,7 +2635,7 @@ impl Timeline { if jobs_len == 0 { info!("no jobs to run, skipping gc bottom-most compaction"); } - return Ok(()); + return Ok(CompactionOutcome::Done); } self.compact_with_gc_inner(cancel, job, ctx).await } @@ -2630,7 +2645,7 @@ impl Timeline { cancel: &CancellationToken, job: GcCompactJob, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. @@ -2699,7 +2714,7 @@ impl Timeline { tracing::warn!( "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" ); - return Ok(()); + return Ok(CompactionOutcome::Skipped); } real_gc_cutoff } else { @@ -2737,7 +2752,7 @@ impl Timeline { "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff ); - return Ok(()); + return Ok(CompactionOutcome::Done); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if @@ -2758,7 +2773,7 @@ impl Timeline { "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end ); - return Ok(()); + return Ok(CompactionOutcome::Done); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. @@ -2784,7 +2799,7 @@ impl Timeline { "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end ); - return Ok(()); + return Ok(CompactionOutcome::Done); } retain_lsns_below_horizon.sort(); GcCompactionJobDescription { @@ -2899,6 +2914,15 @@ impl Timeline { if cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); + } let resident_layer = layer .download_and_keep_resident(ctx) .await @@ -3019,6 +3043,8 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. 
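Aside: the `l0_compaction_trigger.notified().now_or_never()` checks added in this patch are non-blocking polls of a `tokio::sync::Notify`. A self-contained sketch of that cooperative-yield pattern, assuming the `tokio` and `futures` crates are available (`Outcome`, `long_job`, and the item count are illustrative, not from the patch):

```rust
use futures::FutureExt;
use tokio::sync::Notify;

#[derive(Debug, PartialEq)]
enum Outcome {
    Done,
    YieldForL0,
}

/// A long-running job that periodically polls the trigger without blocking and
/// gives up early when higher-priority (L0) work has been signalled.
async fn long_job(trigger: &Notify) -> Outcome {
    for processed in 1..=10_000u32 {
        // Gate the poll behind a modulo so it stays off the per-item hot path,
        // as the patch does with `keys_processed % 1000`.
        if processed % 1000 == 0 && trigger.notified().now_or_never().is_some() {
            return Outcome::YieldForL0;
        }
        // ... one unit of real work would go here ...
    }
    Outcome::Done
}

#[tokio::main]
async fn main() {
    let trigger = Notify::new();
    trigger.notify_one(); // simulate pending L0 work
    assert_eq!(long_job(&trigger).await, Outcome::YieldForL0);
    println!("job yielded as expected");
}
```

Because `now_or_never()` only polls the `Notified` future once and then drops it, the preemption check itself is cheap; the modulo gating just keeps it out of the innermost per-key loop.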
+ let mut keys_processed = 0; + while let Some(((key, lsn, val), desc)) = merge_iter .next_with_trace() .await @@ -3028,6 +3054,18 @@ impl Timeline { if cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } + keys_processed += 1; + if keys_processed % 1000 == 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction in the main loop: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); + } + } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. // @@ -3360,7 +3398,7 @@ impl Timeline { ); if dry_run { - return Ok(()); + return Ok(CompactionOutcome::Done); } info!( @@ -3479,7 +3517,7 @@ impl Timeline { drop(gc_lock); - Ok(()) + Ok(CompactionOutcome::Done) } } From 8542507ee554715af6f1060c94dc1153c8172753 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 07:00:52 +0000 Subject: [PATCH 61/61] Compute release 2025-03-07