From c71aea02238909e4107ef4f750a41b9e35ef4cc3 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Wed, 16 Jul 2025 14:29:18 +0100
Subject: [PATCH 01/39] proxy: for json logging, only use callsite IDs if span
 name is duplicated (#12625)

## Problem

We run multiple proxies, so we get logs like

```
... spans={"http_conn#22":{"conn_id": ...
... spans={"http_conn#24":{"conn_id": ...
```

These are the same span, and the difference is confusing.

## Summary of changes

Introduce a counter per span name, rather than a global counter. If the
counter is 0, no change to the span name is made.

To follow up: see which span names are duplicated at different callsites
within the codebase

---
 proxy/src/logging.rs | 58 +++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
index a87b0f1175..d4fd826c13 100644
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -1,7 +1,6 @@
 use std::cell::RefCell;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::sync::atomic::{AtomicU32, Ordering};
 use std::{env, io};
 
 use chrono::{DateTime, Utc};
@@ -211,6 +210,9 @@ struct JsonLoggingLayer {
     /// tracks which fields of each **event** are duplicates
     skipped_field_indices: CallsiteMap,
 
+    /// tracks callsite names to an ID.
+    callsite_name_ids: papaya::HashMap<&'static str, u32, ahash::RandomState>,
+
     span_info: CallsiteMap,
 
     /// Fields we want to keep track of in a separate json object.
@@ -223,6 +225,7 @@ impl JsonLoggingLayer {
             clock,
             skipped_field_indices: CallsiteMap::default(),
             span_info: CallsiteMap::default(),
+            callsite_name_ids: papaya::HashMap::default(),
             writer,
             extract_fields,
         }
@@ -233,7 +236,7 @@ impl JsonLoggingLayer {
         self.span_info
             .pin()
             .get_or_insert_with(metadata.callsite(), || {
-                CallsiteSpanInfo::new(metadata, self.extract_fields)
+                CallsiteSpanInfo::new(&self.callsite_name_ids, metadata, self.extract_fields)
             })
             .clone()
     }
@@ -345,10 +348,11 @@ struct CallsiteSpanInfo {
 }
 
 impl CallsiteSpanInfo {
-    fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self {
-        // Start at 1 to reserve 0 for default.
-        static COUNTER: AtomicU32 = AtomicU32::new(1);
-
+    fn new(
+        callsite_name_ids: &papaya::HashMap<&'static str, u32, ahash::RandomState>,
+        metadata: &'static Metadata<'static>,
+        extract_fields: &[&'static str],
+    ) -> Self {
         let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect();
 
         // get all the indices of span fields we want to focus
@@ -361,8 +365,18 @@ impl CallsiteSpanInfo {
         // normalized_name is unique for each callsite, but it is not
         // unified across separate proxy instances.
         // todo: can we do better here?
-        let cid = COUNTER.fetch_add(1, Ordering::Relaxed);
-        let normalized_name = format!("{}#{cid}", metadata.name()).into();
+        let cid = *callsite_name_ids
+            .pin()
+            .update_or_insert(metadata.name(), |&cid| cid + 1, 0);
+
+        // we hope that most span names are unique, in which case this will always be 0
+        let normalized_name = if cid == 0 {
+            metadata.name().into()
+        } else {
+            // if the span name is not unique, add the numeric ID to the span name to distinguish it.
+            // sadly this is non-deterministic across restarts, but we should fix it by disambiguating re-used span names instead.
+                format!("{}#{cid}", metadata.name()).into()
+            };
 
         Self {
             extract,
@@ -914,6 +928,7 @@ mod tests {
             clock: clock.clone(),
             skipped_field_indices: papaya::HashMap::default(),
             span_info: papaya::HashMap::default(),
+            callsite_name_ids: papaya::HashMap::default(),
             writer: buffer.clone(),
             extract_fields: &["x"],
         };
@@ -922,14 +937,16 @@ mod tests {
 
         tracing::subscriber::with_default(registry, || {
             info_span!("some_span", x = 24).in_scope(|| {
-                info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| {
-                    tracing::error!(
-                        a = 1,
-                        a = 2,
-                        a = 3,
-                        message = "explicit message field",
-                        "implicit message field"
-                    );
+                info_span!("some_other_span", y = 30).in_scope(|| {
+                    info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| {
+                        tracing::error!(
+                            a = 1,
+                            a = 2,
+                            a = 3,
+                            message = "explicit message field",
+                            "implicit message field"
+                        );
+                    });
                 });
             });
         });
@@ -948,12 +965,15 @@ mod tests {
                 "a": 3,
             },
             "spans": {
-                "some_span#1":{
+                "some_span":{
                     "x": 24,
                 },
-                "some_span#2": {
+                "some_other_span": {
+                    "y": 30,
+                },
+                "some_span#1": {
                     "x": 42,
-                }
+                },
             },
             "extract": {
                 "x": 42,

From 3e4cbaed6727f4440dc8711df59d3852c2f2f159 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 16 Jul 2025 15:37:40 +0100
Subject: [PATCH 02/39] storcon: validate intent state before applying
 optimization (#12593)

## Problem

In the gap between picking an optimization and applying it, something might
insert a change to the intent state that makes it incompatible. If the change
is done via the `schedule()` method, we are covered by the increased sequence
number, but otherwise we can panic if we violate the intent state invariants.

## Summary of Changes

Validate the optimization right before applying it. Since we hold the service
lock at that point, nothing else can sneak in.

Closes LKB-65

---
 storage_controller/src/tenant_shard.rs        | 41 ++++++++++++++++++-
 .../performance/test_sharding_autosplit.py    |  5 +++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 0bfca5385e..99079c57b0 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1272,7 +1272,9 @@ impl TenantShard {
     }
 
     /// Return true if the optimization was really applied: it will not be applied if the optimization's
-    /// sequence is behind this tenant shard's
+    /// sequence is behind this tenant shard's or if the intent state proposed by the optimization
+    /// is not compatible with the current intent state. The latter may happen when the background
+    /// reconcile loop runs concurrently with HTTP-driven optimisations.
pub(crate) fn apply_optimization( &mut self, scheduler: &mut Scheduler, @@ -1282,6 +1284,15 @@ impl TenantShard { return false; } + if !self.validate_optimization(&optimization) { + tracing::info!( + "Skipping optimization for {} because it does not match current intent: {:?}", + self.tenant_shard_id, + optimization, + ); + return false; + } + metrics::METRICS_REGISTRY .metrics_group .storage_controller_schedule_optimization @@ -1322,6 +1333,34 @@ impl TenantShard { true } + /// Check that the desired modifications to the intent state are compatible with + /// the current intent state + fn validate_optimization(&self, optimization: &ScheduleOptimization) -> bool { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id, + new_attached_node_id, + }) => { + self.intent.attached == Some(old_attached_node_id) + && self.intent.secondary.contains(&new_attached_node_id) + } + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: _, + new_node_id, + }) => { + // It's legal to remove a secondary that is not present in the intent state + !self.intent.secondary.contains(&new_node_id) + } + ScheduleOptimizationAction::CreateSecondary(new_node_id) => { + !self.intent.secondary.contains(&new_node_id) + } + ScheduleOptimizationAction::RemoveSecondary(_) => { + // It's legal to remove a secondary that is not present in the intent state + true + } + } + } + /// When a shard has several secondary locations, we need to pick one in situations where /// we promote one of them to an attached location: /// - When draining a node for restart diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 0bb210db23..1b77831b75 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -73,6 +73,11 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ".*Local notification hook failed.*", ".*Marking shard.*for notification retry.*", ".*Failed to notify compute.*", + # As an optimization, the storage controller kicks the downloads on the secondary + # after the shard split. However, secondaries are created async, so it's possible + # that the intent state was modified, but the actual secondary hasn't been created, + # which results in an error. + ".*Error calling secondary download after shard split.*", ] ) From 8b18d8b31b608a54ce936f5eb893e2ae11a52a04 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 16 Jul 2025 15:43:17 +0100 Subject: [PATCH 03/39] safekeeper: add global disk usage utilization limit (#12605) N.B: No-op for the neon-env. ## Problem We added a per-timeline disk utilization protection circuit breaker, which will stop the safekeeper from accepting more WAL writes if the disk utilization by the timeline has exceeded a configured limit. We mainly designed the mechanism as a guard against WAL upload/backup bugs, and we assumed that as long as WAL uploads are proceeding as normal we will not run into disk pressure. This turned out to be not true. In one of our load tests where we have 500 PGs ingesting data at the same time, safekeeper disk utilization started to creep up even though WAL uploads were completely normal (we likely just maxed out our S3 upload bandwidth from the single SK). This means the per-timeline disk utilization protection won't be enough if too many timelines are ingesting data at the same time. 
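At its core, the global check introduced below reduces to comparing total filesystem usage against a fixed fraction of its capacity, both derived from statvfs(3) (the same mechanism the helpers added in this patch use). The following is a minimal sketch of that computation; the function name `global_disk_limit_exceeded` and its `ratio` argument are invented here for illustration, with `ratio` playing the role of the new `--max-global-disk-usage-ratio` setting:

```
use nix::sys::statvfs::statvfs;
use std::path::Path;

// Sketch only: combine capacity and usage from statvfs(3) into a single
// over-limit decision. A ratio of 0 disables the check, matching the
// default described below.
fn global_disk_limit_exceeded(data_dir: &Path, ratio: f64) -> anyhow::Result<bool> {
    let stat = statvfs(data_dir)?;
    // Prefer the fragment size (f_frsize); fall back to f_bsize when it is zero.
    let frsize = stat.fragment_size();
    let blocksz = if frsize > 0 { frsize } else { stat.block_size() };
    let capacity = stat.blocks() as u64 * blocksz as u64;
    // "Used" counts blocks no longer available to unprivileged processes.
    let used = stat.blocks().saturating_sub(stat.blocks_available()) as u64 * blocksz as u64;
    let limit = (ratio * capacity as f64) as u64;
    Ok(limit > 0 && used > limit)
}
```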
## Summary of changes Added a global disk utilization protection circuit breaker which will stop a safekeeper from accepting more WAL writes if the total disk usage on the safekeeper (across all tenants) exceeds a limit. We implemented this circuit breaker through two parts: 1. A "global disk usage watcher" background task that runs at a configured interval (default every minute) to see how much disk space is being used in the safekeeper's filesystem. This background task also performs the check against the limit and publishes the result to a global atomic boolean flag. 2. The `hadron_check_disk_usage()` routine (in `timeline.rs`) now also checks this global boolean flag published in the step above, and fails the `WalAcceptor` (triggers the circuit breaker) if the flag was raised. The disk usage limit is disabled by default. It can be tuned with the `--max-global-disk-usage-ratio` CLI arg. ## How is this tested? Added integration test `test_wal_acceptor.py::test_global_disk_usage_limit`. Also noticed that I haven't been using the `wait_until(f)` test function correctly (the `f` passed in is supposed to raise an exception if the condition is not met, instead of returning `False`...). Fixed it in both circuit breaker tests. --------- Co-authored-by: William Huang --- Cargo.lock | 1 + safekeeper/Cargo.toml | 1 + safekeeper/src/bin/safekeeper.rs | 65 +++++++++++++- safekeeper/src/hadron.rs | 75 +++++++++++++++- safekeeper/src/http/routes.rs | 22 +++++ safekeeper/src/lib.rs | 18 +++- safekeeper/src/metrics.rs | 14 +++ safekeeper/src/timeline.rs | 7 ++ .../tests/walproposer_sim/safekeeper.rs | 2 + test_runner/regress/test_wal_acceptor.py | 87 ++++++++++++++++++- 10 files changed, 284 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3474211ac6..e5f39658a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6204,6 +6204,7 @@ dependencies = [ "itertools 0.10.5", "jsonwebtoken", "metrics", + "nix 0.30.1", "once_cell", "pageserver_api", "parking_lot 0.12.1", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 539e931983..56822b5c25 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -72,6 +72,7 @@ http-utils.workspace = true utils.workspace = true wal_decoder.workspace = true env_logger.workspace = true +nix.workspace = true workspace_hack.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 79cf2f9149..2ec541b6f0 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -17,8 +17,9 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ - DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, + DEFAULT_GLOBAL_DISK_CHECK_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, + DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, @@ -42,6 +43,12 @@ use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version, 
tcp_listener}; +use safekeeper::hadron::{ + GLOBAL_DISK_LIMIT_EXCEEDED, get_filesystem_capacity, get_filesystem_usage, +}; +use safekeeper::metrics::GLOBAL_DISK_UTIL_CHECK_SECONDS; +use std::sync::atomic::Ordering; + #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -256,6 +263,15 @@ struct Args { /* BEGIN_HADRON */ #[arg(long)] enable_pull_timeline_on_startup: bool, + /// How often to scan entire data-dir for total disk usage + #[arg(long, value_parser=humantime::parse_duration, default_value = DEFAULT_GLOBAL_DISK_CHECK_INTERVAL)] + global_disk_check_interval: Duration, + /// The portion of the filesystem capacity that can be used by all timelines. + /// A circuit breaker will trip and reject all WAL writes if the total usage + /// exceeds this ratio. + /// Set to 0 to disable the global disk usage limit. + #[arg(long, default_value_t = DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO)] + max_global_disk_usage_ratio: f64, /* END_HADRON */ } @@ -444,6 +460,8 @@ async fn main() -> anyhow::Result<()> { advertise_pg_addr_tenant_only: None, enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup, hcc_base_url: None, + global_disk_check_interval: args.global_disk_check_interval, + max_global_disk_usage_ratio: args.max_global_disk_usage_ratio, /* END_HADRON */ }); @@ -618,6 +636,49 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .map(|res| ("Timeline map housekeeping".to_owned(), res)); tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + /* BEGIN_HADRON */ + // Spawn global disk usage watcher task, if a global disk usage limit is specified. + let interval = conf.global_disk_check_interval; + let data_dir = conf.workdir.clone(); + // Use the safekeeper data directory to compute filesystem capacity. This only runs once on startup, so + // there is little point to continue if we can't have the proper protections in place. + let fs_capacity_bytes = get_filesystem_capacity(data_dir.as_std_path()) + .expect("Failed to get filesystem capacity for data directory"); + let limit: u64 = (conf.max_global_disk_usage_ratio * fs_capacity_bytes as f64) as u64; + if limit > 0 { + let disk_usage_watch_handle = BACKGROUND_RUNTIME + .handle() + .spawn(async move { + // Use Tokio interval to preserve fixed cadence between filesystem utilization checks + let mut ticker = tokio::time::interval(interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + + loop { + ticker.tick().await; + let data_dir_clone = data_dir.clone(); + let check_start = Instant::now(); + + let usage = tokio::task::spawn_blocking(move || { + get_filesystem_usage(data_dir_clone.as_std_path()) + }) + .await + .unwrap_or(0); + + let elapsed = check_start.elapsed().as_secs_f64(); + GLOBAL_DISK_UTIL_CHECK_SECONDS.observe(elapsed); + if usage > limit { + warn!( + "Global disk usage exceeded limit. 
Usage: {} bytes, limit: {} bytes",
+                            usage, limit
+                        );
+                    }
+                    GLOBAL_DISK_LIMIT_EXCEEDED.store(usage > limit, Ordering::Relaxed);
+                }
+            })
+            .map(|res| ("Global disk usage watcher".to_string(), res));
+        tasks_handles.push(Box::pin(disk_usage_watch_handle));
+    }
+    /* END_HADRON */
 
     if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
         let wal_service_handle = current_thread_rt
             .as_ref()
diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs
index b41bf2c3da..8c6a912166 100644
--- a/safekeeper/src/hadron.rs
+++ b/safekeeper/src/hadron.rs
@@ -1,12 +1,17 @@
+use once_cell::sync::Lazy;
 use pem::Pem;
 use safekeeper_api::models::PullTimelineRequest;
-use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration};
+use std::{
+    collections::HashMap, env::VarError, net::IpAddr, sync::Arc, sync::atomic::AtomicBool,
+    time::Duration,
+};
 use tokio::time::sleep;
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, id::TenantTimelineId, ip_address};
+use utils::{backoff, critical_timeline, id::TenantTimelineId, ip_address};
+
+use anyhow::{Result, anyhow};
 
-use anyhow::Result;
 use pageserver_api::controller_api::{
     AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse,
 };
@@ -346,6 +351,70 @@ pub async fn hcc_pull_timelines(
     Ok(())
 }
 
+/// true if the last background scan found total usage > limit
+pub static GLOBAL_DISK_LIMIT_EXCEEDED: Lazy<AtomicBool> = Lazy::new(|| AtomicBool::new(false));
+
+/// Returns filesystem usage in bytes for the filesystem containing the given path.
+// Need to suppress the clippy::unnecessary_cast warning because the casts on the block count and the
+// block size are required on macOS (they are 32-bit integers on macOS, apparently).
+#[allow(clippy::unnecessary_cast)]
+pub fn get_filesystem_usage(path: &std::path::Path) -> u64 {
+    // Allow overriding disk usage via failpoint for tests
+    fail::fail_point!("sk-global-disk-usage", |val| {
+        // val is the optional failpoint payload; parse it if present
+        val.and_then(|s| s.parse::<u64>().ok()).unwrap_or(0)
+    });
+
+    // Call statvfs(3) for filesystem usage
+    use nix::sys::statvfs::statvfs;
+    match statvfs(path) {
+        Ok(stat) => {
+            // fragment size (f_frsize) if non-zero else block size (f_bsize)
+            let frsize = stat.fragment_size();
+            let blocksz = if frsize > 0 {
+                frsize
+            } else {
+                stat.block_size()
+            };
+            // used blocks = total blocks - available blocks for unprivileged
+            let used_blocks = stat.blocks().saturating_sub(stat.blocks_available());
+            used_blocks as u64 * blocksz as u64
+        }
+        Err(e) => {
+            // The global disk usage watcher isn't associated with a tenant or timeline, so we just
+            // pass placeholder (all-zero) tenant and timeline IDs to the critical!() macro.
+            let placeholder_ttid = TenantTimelineId::empty();
+            critical_timeline!(
+                placeholder_ttid.tenant_id,
+                placeholder_ttid.timeline_id,
+                "Global disk usage watcher failed to read filesystem usage: {:?}",
+                e
+            );
+            0
+        }
+    }
+}
+
+/// Returns the total capacity of the filesystem containing the given path, in bytes.
+#[allow(clippy::unnecessary_cast)] +pub fn get_filesystem_capacity(path: &std::path::Path) -> Result { + // Call statvfs(3) for filesystem stats + use nix::sys::statvfs::statvfs; + match statvfs(path) { + Ok(stat) => { + // fragment size (f_frsize) if non-zero else block size (f_bsize) + let frsize = stat.fragment_size(); + let blocksz = if frsize > 0 { + frsize + } else { + stat.block_size() + }; + Ok(stat.blocks() as u64 * blocksz as u64) + } + Err(e) => Err(anyhow!("Failed to read filesystem capacity: {:?}", e)), + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a0ee2facb5..c9d8e7d3b0 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -33,11 +33,13 @@ use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; +use crate::hadron::{get_filesystem_capacity, get_filesystem_usage}; use crate::safekeeper::TermLsn; use crate::timelines_global_map::DeleteOrExclude; use crate::{ GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, }; +use serde_json::json; /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { @@ -127,6 +129,21 @@ async fn utilization_handler(request: Request) -> Result, A json_response(StatusCode::OK, utilization) } +/// Returns filesystem capacity and current utilization for the safekeeper data directory. +async fn filesystem_usage_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + let conf = get_conf(&request); + let path = conf.workdir.as_std_path(); + let capacity = get_filesystem_capacity(path).map_err(ApiError::InternalServerError)?; + let usage = get_filesystem_usage(path); + let resp = json!({ + "data_dir": path, + "capacity_bytes": capacity, + "usage_bytes": usage, + }); + json_response(StatusCode::OK, resp) +} + /// List all (not deleted) timelines. /// Note: it is possible to do the same with debug_dump. async fn timeline_list_handler(request: Request) -> Result, ApiError> { @@ -730,6 +747,11 @@ pub fn make_router( }) }) .get("/v1/utilization", |r| request_span(r, utilization_handler)) + /* BEGIN_HADRON */ + .get("/v1/debug/filesystem_usage", |r| { + request_span(r, filesystem_usage_handler) + }) + /* END_HADRON */ .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 02533b804d..c6f9cc29e5 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -50,6 +50,7 @@ pub mod wal_storage; pub mod test_utils; mod timelines_global_map; + use std::sync::Arc; pub use timelines_global_map::GlobalTimelines; @@ -83,6 +84,10 @@ pub mod defaults { pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; pub const DEFAULT_SSL_CERT_RELOAD_PERIOD: &str = "60s"; + + // Global disk watcher defaults + pub const DEFAULT_GLOBAL_DISK_CHECK_INTERVAL: &str = "60s"; + pub const DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO: f64 = 0.0; } #[derive(Debug, Clone)] @@ -116,6 +121,10 @@ pub struct SafeKeeperConf { /* BEGIN_HADRON */ pub max_reelect_offloader_lag_bytes: u64, pub max_timeline_disk_usage_bytes: u64, + /// How often to check the working directory's filesystem for total disk usage. + pub global_disk_check_interval: Duration, + /// The portion of the filesystem capacity that can be used by all timelines. 
+    pub max_global_disk_usage_ratio: f64,
     /* END_HADRON */
     pub backup_parallel_jobs: usize,
     pub wal_backup_enabled: bool,
@@ -173,6 +182,8 @@ impl SafeKeeperConf {
             /* BEGIN_HADRON */
             max_reelect_offloader_lag_bytes: defaults::DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES,
             max_timeline_disk_usage_bytes: defaults::DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES,
+            global_disk_check_interval: Duration::from_secs(60),
+            max_global_disk_usage_ratio: defaults::DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO,
             /* END_HADRON */
             current_thread_runtime: false,
             walsenders_keep_horizon: false,
@@ -235,10 +246,13 @@ pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
         .expect("Failed to create WAL backup runtime")
 });
 
+/// Hadron: Dedicated runtime for infrequent background tasks.
 pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
     tokio::runtime::Builder::new_multi_thread()
-        .thread_name("background worker")
-        .worker_threads(1) // there is only one task now (ssl certificate reloading), having more threads doesn't make sense
+        .thread_name("Hadron background worker")
+        // One worker thread is enough, as most of the actual tasks run on blocking threads,
+        // which have their own thread pool.
+        .worker_threads(1)
         .enable_all()
         .build()
         .expect("Failed to create background runtime")
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index e1af51c115..b07852aaee 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -963,3 +963,17 @@ async fn collect_timeline_metrics(global_timelines: Arc) -> Vec
     }
     res
 }
+
+/* BEGIN_HADRON */
+// Metrics reporting the time spent to perform each safekeeper filesystem utilization check.
+pub static GLOBAL_DISK_UTIL_CHECK_SECONDS: Lazy<Histogram> = Lazy::new(|| {
+    // Buckets from 1ms up to 10s
+    let buckets = vec![0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0];
+    register_histogram!(
+        "safekeeper_global_disk_utilization_check_seconds",
+        "Seconds spent to perform each safekeeper filesystem utilization check",
+        buckets
+    )
+    .expect("Failed to register safekeeper_global_disk_utilization_check_seconds histogram")
+});
+/* END_HADRON */
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index dbe510a019..a1a0aab9fd 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -29,6 +29,8 @@ use utils::sync::gate::Gate;
 use crate::metrics::{
     FullTimelineInfo, MISC_OPERATION_SECONDS, WAL_STORAGE_LIMIT_ERRORS, WalStorageMetrics,
 };
+
+use crate::hadron::GLOBAL_DISK_LIMIT_EXCEEDED;
 use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
@@ -1081,6 +1083,11 @@ impl WalResidentTimeline {
                 );
             }
         }
+
+        if GLOBAL_DISK_LIMIT_EXCEEDED.load(Ordering::Relaxed) {
+            bail!("Global disk usage exceeded limit");
+        }
+
         Ok(())
     }
     // END HADRON
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 393df6228e..30d3ab1a87 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -195,6 +195,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> {
         enable_pull_timeline_on_startup: false,
         advertise_pg_addr_tenant_only: None,
         hcc_base_url: None,
+        global_disk_check_interval: Duration::from_secs(10),
+        max_global_disk_usage_ratio: 0.0,
         /* END_HADRON */
     };
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 22e6d2e1c3..c691087259 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ 
b/test_runner/regress/test_wal_acceptor.py @@ -2788,7 +2788,8 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): # Wait for the error message to appear in the compute log def error_logged(): - return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None + if endpoint.log_contains("WAL storage utilization exceeds configured limit") is None: + raise Exception("Expected error message not found in compute log yet") wait_until(error_logged) log.info("Found expected error message in compute log, resuming.") @@ -2822,3 +2823,87 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): cur.execute("select count(*) from t") # 2000 rows from first insert + 1000 from last insert assert cur.fetchone() == (3000,) + + +def test_global_disk_usage_limit(neon_env_builder: NeonEnvBuilder): + """ + Similar to `test_timeline_disk_usage_limit`, but test that the global disk usage circuit breaker + also works as expected. The test scenario: + 1. Create a timeline and endpoint. + 2. Mock high disk usage via failpoint + 3. Write data to the timeline so that disk usage exceeds the limit. + 4. Verify that the writes hang and the expected error message appears in the compute log. + 5. Mock low disk usage via failpoint + 6. Verify that the hanging writes unblock and we can continue to write as normal. + """ + neon_env_builder.num_safekeepers = 1 + remote_storage_kind = s3_storage() + neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + + env.create_branch("test_global_disk_usage_limit") + endpoint = env.endpoints.create_start("test_global_disk_usage_limit") + + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("create table t2(key int, value text)") + + for sk in env.safekeepers: + sk.stop().start( + extra_opts=["--global-disk-check-interval=1s", "--max-global-disk-usage-ratio=0.8"] + ) + + # Set the failpoint to have the disk usage check return u64::MAX, which definitely exceeds the practical + # limits in the test environment. + for sk in env.safekeepers: + sk.http_client().configure_failpoints( + [("sk-global-disk-usage", "return(18446744073709551615)")] + ) + + # Wait until the global disk usage limit watcher trips the circuit breaker. + def error_logged_in_sk(): + for sk in env.safekeepers: + if sk.log_contains("Global disk usage exceeded limit") is None: + raise Exception("Expected error message not found in safekeeper log yet") + + wait_until(error_logged_in_sk) + + def run_hanging_insert_global(): + with closing(endpoint.connect()) as bg_conn: + with bg_conn.cursor() as bg_cur: + # This should generate more than 1KiB of WAL + bg_cur.execute("insert into t2 select generate_series(1,2000), 'payload'") + + bg_thread_global = threading.Thread(target=run_hanging_insert_global) + bg_thread_global.start() + + def error_logged_in_compute(): + if endpoint.log_contains("Global disk usage exceeded limit") is None: + raise Exception("Expected error message not found in compute log yet") + + wait_until(error_logged_in_compute) + log.info("Found the expected error message in compute log, resuming.") + + time.sleep(2) + assert bg_thread_global.is_alive(), "Global hanging insert unblocked prematurely!" + + # Make the disk usage check always return 0 through the failpoint to simulate the disk pressure easing. + # The SKs should resume accepting WAL writes without restarting. 
+    for sk in env.safekeepers:
+        sk.http_client().configure_failpoints([("sk-global-disk-usage", "return(0)")])
+
+    bg_thread_global.join(timeout=120)
+    assert not bg_thread_global.is_alive(), "Hanging global insert did not complete after the limit was lifted"
+    log.info("Global hanging insert unblocked.")
+
+    # Verify that we can continue to write as normal and we don't have obvious data corruption
+    # following the recovery.
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("insert into t2 select generate_series(2001,3000), 'payload'")
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("select count(*) from t2")
+            assert cur.fetchone() == (3000,)

From 1178f6fe7c1a7359acda31a499e821c3429bbe65 Mon Sep 17 00:00:00 2001
From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com>
Date: Wed, 16 Jul 2025 19:02:01 +0400
Subject: [PATCH 04/39] pageserver: Downgrade log level of 'No broker updates'
 (#12627)

## Problem

The warning message was seen during deployment, but it's actually OK.

## Summary of changes

- Treat `"No broker updates received for a while ..."` as an info message.

Co-authored-by: Aleksandr Sarantsev

---
 .../src/tenant/timeline/walreceiver/connection_manager.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index aba94244a3..f33f47a956 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -184,7 +184,7 @@ pub(super) async fn connection_manager_loop_step(
 
         // If we've not received any updates from the broker from a while, are waiting for WAL
         // and have no safekeeper connection or connection candidates, then it might be that
-        // the broker subscription is wedged. Drop the currrent subscription and re-subscribe
+        // the broker subscription is wedged. Drop the current subscription and re-subscribe
        // with the goal of unblocking it.
         _ = broker_reset_interval.tick() => {
             let awaiting_lsn = wait_lsn_status.borrow().is_some();
@@ -192,7 +192,7 @@ pub(super) async fn connection_manager_loop_step(
             let no_connection = connection_manager_state.wal_connection.is_none();
 
             if awaiting_lsn && no_candidates && no_connection {
-                tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
+                tracing::info!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
                 broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
             }
         },

From 80e5771c675ffcac2025664fef002c9d3332cbf5 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 16 Jul 2025 11:51:20 -0400
Subject: [PATCH 05/39] fix(storcon): passthrough 404 as 503 during migrations
 (#12620)

## Problem

close LKB-270, close LKB-253

We periodically saw a pageserver return 404, which storcon converted into a
500 towards cplane, causing branch operations to fail. This happens because
storcon migrates tenants across pageservers, and the request can be forwarded
from storcon to a pageserver before the tenant is attached there. Such
operations should be retried by cplane, so storcon should return 503 in these
cases.
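For context, the retry contract this change relies on looks roughly like the sketch below: callers treat 503 as "retry later" and anything else as a permanent failure. This is a hypothetical caller-side loop, for illustration only; `call_with_retry`, the attempt count, and the backoff values are all made up for this example and are not part of the patch:

```
use std::time::Duration;

// Hypothetical caller-side loop: retry on 503 with a capped exponential
// backoff, and treat any other error status (404 included) as permanent.
fn call_with_retry(mut call: impl FnMut() -> reqwest::StatusCode) -> Result<(), String> {
    let mut delay = Duration::from_millis(100);
    for _ in 0..10 {
        let status = call();
        if status.is_success() {
            return Ok(());
        } else if status == reqwest::StatusCode::SERVICE_UNAVAILABLE {
            // 503 signals a transient state such as an in-progress shard
            // migration: wait and try again.
            std::thread::sleep(delay);
            delay = (delay * 2).min(Duration::from_secs(5));
        } else {
            return Err(format!("permanent failure: {status}"));
        }
    }
    Err("retries exhausted".into())
}
```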
## Summary of changes

- Refactor `tenant_timeline_lsn_lease` so that a single code path processes
  and passes through such requests: `collect_tenant_shards` collects all the
  shards and checks whether they are consistent with the observed state, and
  `process_result_and_passthrough_errors` converts 404 into 503 if necessary.
- `tenant_shard_node` also checks the observed state now.

Note that for the shard 0 passthrough, we originally had a check to convert
404 to 503:

```
// Transform 404 into 503 if we raced with a migration
if resp.status() == reqwest::StatusCode::NOT_FOUND {
    // Look up node again: if we migrated it will be different
    let new_node = service.tenant_shard_node(tenant_shard_id).await?;
    if new_node.get_id() != node.get_id() {
        // Rather than retry here, send the client a 503 to prompt a retry: this matches
        // the pageserver's use of 503, and all clients calling this API should retry on 503.
        return Err(ApiError::ResourceUnavailable(
            format!("Pageserver {node} returned 404, was migrated to {new_node}").into(),
        ));
    }
}
```

However, this only checks the intent state. It is possible for a migration to
be in progress before/after the request is processed while the intent state
stays the same throughout the API call, in which case the 404 is never handled
by this branch.

Also, I am not sure whether this new code is correct; it needs a second pair
of eyes:

```
// As a reconciliation is in flight, we do not have the observed state yet, and therefore we assume it is always inconsistent.
Ok((node.clone(), false))
```

---------

Signed-off-by: Alex Chi Z

---
 storage_controller/src/http.rs    |  46 ++++---
 storage_controller/src/service.rs | 192 +++++++++++++++++++-----------
 2 files changed, 141 insertions(+), 97 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 62fc212e12..c8227f0219 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -735,15 +735,13 @@ async fn handle_tenant_timeline_passthrough(
     );
 
     // Find the node that holds shard zero
-    let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() {
+    let (node, tenant_shard_id, consistent) = if tenant_or_shard_id.is_unsharded() {
         service
             .tenant_shard0_node(tenant_or_shard_id.tenant_id)
             .await?
     } else {
-        (
-            service.tenant_shard_node(tenant_or_shard_id).await?,
-            tenant_or_shard_id,
-        )
+        let (node, consistent) = service.tenant_shard_node(tenant_or_shard_id).await?;
+        (node, tenant_or_shard_id, consistent)
     };
 
     // Callers will always pass an unsharded tenant ID. Before proxying, we must
@@ -788,16 +786,12 @@ async fn handle_tenant_timeline_passthrough(
     }
 
     // Transform 404 into 503 if we raced with a migration
-    if resp.status() == reqwest::StatusCode::NOT_FOUND {
-        // Look up node again: if we migrated it will be different
-        let new_node = service.tenant_shard_node(tenant_shard_id).await?;
-        if new_node.get_id() != node.get_id() {
-            // Rather than retry here, send the client a 503 to prompt a retry: this matches
-            // the pageserver's use of 503, and all clients calling this API should retry on 503.
-            return Err(ApiError::ResourceUnavailable(
-                format!("Pageserver {node} returned 404, was migrated to {new_node}").into(),
-            ));
-        }
-    }
+    if resp.status() == reqwest::StatusCode::NOT_FOUND && !consistent {
+        // Rather than retry here, send the client a 503 to prompt a retry: this matches
+        // the pageserver's use of 503, and all clients calling this API should retry on 503.
+ return Err(ApiError::ResourceUnavailable( + format!("Pageserver {node} returned 404 due to ongoing migration, retry later").into(), + )); } // We have a reqest::Response, would like a http::Response @@ -2597,6 +2591,17 @@ pub fn make_router( ) }, ) + // Tenant timeline mark_invisible passthrough to shard zero + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_timeline_mark_invisible_passthrough"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( @@ -2615,17 +2620,6 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) - // Tenant timeline mark_invisible passthrough to shard zero - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible", - |r| { - tenant_service_handler( - r, - handle_tenant_timeline_passthrough, - RequestName("v1_tenant_timeline_mark_invisible_passthrough"), - ) - }, - ) } #[cfg(test)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 638cb410fa..0c5d7f44d4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -207,6 +207,27 @@ enum ShardGenerationValidity { }, } +/// We collect the state of attachments for some operations to determine if the operation +/// needs to be retried when it fails. +struct TenantShardAttachState { + /// The targets of the operation. + /// + /// Tenant shard ID, node ID, node, is intent node observed primary. + targets: Vec<(TenantShardId, NodeId, Node, bool)>, + + /// The targets grouped by node ID. + by_node_id: HashMap, +} + +impl TenantShardAttachState { + fn for_api_call(&self) -> Vec<(TenantShardId, Node)> { + self.targets + .iter() + .map(|(tenant_shard_id, _, node, _)| (*tenant_shard_id, node.clone())) + .collect() + } +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; @@ -4752,6 +4773,86 @@ impl Service { Ok(()) } + fn is_observed_consistent_with_intent( + &self, + shard: &TenantShard, + intent_node_id: NodeId, + ) -> bool { + if let Some(location) = shard.observed.locations.get(&intent_node_id) + && let Some(ref conf) = location.conf + && (conf.mode == LocationConfigMode::AttachedSingle + || conf.mode == LocationConfigMode::AttachedMulti) + { + true + } else { + false + } + } + + fn collect_tenant_shards( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + let mut by_node_id = HashMap::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. 
+ let shards_range = TenantShardId::tenant_range(tenant_id); + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + let consistent = self.is_observed_consistent_with_intent(shard, *node_id); + + targets.push((*tenant_shard_id, *node_id, node.clone(), consistent)); + by_node_id.insert(*node_id, (*tenant_shard_id, node.clone(), consistent)); + } + } + + Ok(TenantShardAttachState { + targets, + by_node_id, + }) + } + + fn process_result_and_passthrough_errors( + &self, + results: Vec<(Node, Result)>, + attach_state: TenantShardAttachState, + ) -> Result, ApiError> { + let mut processed_results: Vec<(Node, T)> = Vec::with_capacity(results.len()); + debug_assert_eq!(results.len(), attach_state.targets.len()); + for (node, res) in results { + let is_consistent = attach_state + .by_node_id + .get(&node.get_id()) + .map(|(_, _, consistent)| *consistent); + match res { + Ok(res) => processed_results.push((node, res)), + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) + if is_consistent == Some(false) => + { + // This is expected if the attach is not finished yet. Return 503 so that the client can retry. + return Err(ApiError::ResourceUnavailable( + format!( + "Timeline is not attached to the pageserver {} yet, please retry", + node.get_id() + ) + .into(), + )); + } + Err(e) => return Err(passthrough_api_error(&node, e)), + } + } + Ok(processed_results) + } + pub(crate) async fn tenant_timeline_lsn_lease( &self, tenant_id: TenantId, @@ -4765,49 +4866,11 @@ impl Service { ) .await; - let mut retry_if_not_attached = false; - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); + let attach_state = self.collect_tenant_shards(tenant_id)?; - // If the request got an unsharded tenant id, then apply - // the operation to all shards. Otherwise, apply it to a specific shard. - let shards_range = TenantShardId::tenant_range(tenant_id); - - for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { - if let Some(node_id) = shard.intent.get_attached() { - let node = locked - .nodes - .get(node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); - - if let Some(location) = shard.observed.locations.get(node_id) { - if let Some(ref conf) = location.conf { - if conf.mode != LocationConfigMode::AttachedSingle - && conf.mode != LocationConfigMode::AttachedMulti - { - // If the shard is attached as secondary, we need to retry if 404. - retry_if_not_attached = true; - } - // If the shard is attached as primary, we should succeed. - } else { - // Location conf is not available yet, retry if 404. - retry_if_not_attached = true; - } - } else { - // The shard is not attached to the intended pageserver yet, retry if 404. 
- retry_if_not_attached = true; - } - } - } - targets - }; - - let res = self + let results = self .tenant_for_shards_api( - targets, + attach_state.for_api_call(), |tenant_shard_id, client| async move { client .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn) @@ -4820,31 +4883,13 @@ impl Service { ) .await; + let leases = self.process_result_and_passthrough_errors(results, attach_state)?; let mut valid_until = None; - for (node, r) in res { - match r { - Ok(lease) => { - if let Some(ref mut valid_until) = valid_until { - *valid_until = std::cmp::min(*valid_until, lease.valid_until); - } else { - valid_until = Some(lease.valid_until); - } - } - Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) - if retry_if_not_attached => - { - // This is expected if the attach is not finished yet. Return 503 so that the client can retry. - return Err(ApiError::ResourceUnavailable( - format!( - "Timeline is not attached to the pageserver {} yet, please retry", - node.get_id() - ) - .into(), - )); - } - Err(e) => { - return Err(passthrough_api_error(&node, e)); - } + for (_, lease) in leases { + if let Some(ref mut valid_until) = valid_until { + *valid_until = std::cmp::min(*valid_until, lease.valid_until); + } else { + valid_until = Some(lease.valid_until); } } Ok(LsnLease { @@ -5267,10 +5312,12 @@ impl Service { status_code } /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. + /// + /// Returns the node, tenant shard id, and whether it is consistent with the observed state. pub(crate) async fn tenant_shard0_node( &self, tenant_id: TenantId, - ) -> Result<(Node, TenantShardId), ApiError> { + ) -> Result<(Node, TenantShardId, bool), ApiError> { let tenant_shard_id = { let locked = self.inner.read().unwrap(); let Some((tenant_shard_id, _shard)) = locked @@ -5288,15 +5335,17 @@ impl Service { self.tenant_shard_node(tenant_shard_id) .await - .map(|node| (node, tenant_shard_id)) + .map(|(node, consistent)| (node, tenant_shard_id, consistent)) } /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound) + /// + /// Returns the intent node and whether it is consistent with the observed state. pub(crate) async fn tenant_shard_node( &self, tenant_shard_id: TenantShardId, - ) -> Result { + ) -> Result<(Node, bool), ApiError> { // Look up in-memory state and maybe use the node from there. { let locked = self.inner.read().unwrap(); @@ -5326,7 +5375,8 @@ impl Service { "Shard refers to nonexistent node" ))); }; - return Ok(node.clone()); + let consistent = self.is_observed_consistent_with_intent(shard, *intent_node_id); + return Ok((node.clone(), consistent)); } }; @@ -5360,8 +5410,8 @@ impl Service { "Shard refers to nonexistent node" ))); }; - - Ok(node.clone()) + // As a reconciliation is in flight, we do not have the observed state yet, and therefore we assume it is always inconsistent. + Ok((node.clone(), false)) } pub(crate) fn tenant_locate( From 79d72c94e86d0205f98b526e9d51ab723335e094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 16 Jul 2025 18:02:07 +0200 Subject: [PATCH 06/39] reformat cargo install invocations in build-tools image (#12629) ## Problem Same change with different formatting happened in multiple branches. ## Summary of changes Realign formatting with the other branch. 
---
 build-tools/Dockerfile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/build-tools/Dockerfile b/build-tools/Dockerfile
index e02707a5eb..b5fe642e6f 100644
--- a/build-tools/Dockerfile
+++ b/build-tools/Dockerfile
@@ -317,14 +317,14 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     . "$HOME/.cargo/env" && \
     cargo --version && rustup --version && \
     rustup component add llvm-tools rustfmt clippy && \
-    cargo install rustfilt --version ${RUSTFILT_VERSION} --locked && \
-    cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} --locked && \
-    cargo install cargo-deny --version ${CARGO_DENY_VERSION} --locked && \
-    cargo install cargo-hack --version ${CARGO_HACK_VERSION} --locked && \
-    cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} --locked && \
-    cargo install cargo-chef --version ${CARGO_CHEF_VERSION} --locked && \
-    cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} --locked \
-        --features postgres-bundled --no-default-features && \
+    cargo install rustfilt --locked --version ${RUSTFILT_VERSION} && \
+    cargo install cargo-hakari --locked --version ${CARGO_HAKARI_VERSION} && \
+    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
+    cargo install cargo-hack --locked --version ${CARGO_HACK_VERSION} && \
+    cargo install cargo-nextest --locked --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
+    cargo install diesel_cli --locked --version ${CARGO_DIESEL_CLI_VERSION} \
+        --features postgres-bundled --no-default-features && \
     rm -rf /home/nonroot/.cargo/registry && \
     rm -rf /home/nonroot/.cargo/git

From 9e154a8130ebd82e042f83d62165291fa9355ccd Mon Sep 17 00:00:00 2001
From: Tristan Partin
Date: Wed, 16 Jul 2025 10:11:25 -0600
Subject: [PATCH 07/39] PG: smooth max wal rate (#12514)

## Problem

We were only resetting the limit in the wal proposer. If backends are
backpressured, it might take a while for the wal proposer to receive new WAL
and reset the limit.

## Summary of changes

Backends also check the time and reset the limit.

## How is this tested?

pgbench shows smoother TPS.

Signed-off-by: Tristan Partin
Co-authored-by: Haoyu Huang

---
 libs/walproposer/src/api_bindings.rs |  2 +-
 pgxn/neon/walproposer.h              | 12 +++++++++++-
 pgxn/neon/walproposer_pg.c           | 26 ++++++++++++++++++++------
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 5f856a44d4..825a137d0f 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -431,7 +431,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
     let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
         should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
         sent_bytes: 0,
-        last_recorded_time_us: 0,
+        last_recorded_time_us: crate::bindings::pg_atomic_uint64 { value: 0 },
     };
 
     crate::bindings::WalproposerShmemState {
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index e3a4022664..19d23925a5 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -377,6 +377,16 @@ typedef struct PageserverFeedback
 } PageserverFeedback;
 
 /* BEGIN_HADRON */
+/**
+ * WAL proposer is the only backend that will update `sent_bytes` and `last_recorded_time_us`.
+ * Once the `sent_bytes` reaches the limit, it puts backpressure on PG backends.
+ *
+ * A PG backend checks `should_limit` to see if it should hit backpressure.
+ * - If yes, it also checks the `last_recorded_time_us` to see
+ *   if it's time to push more WALs. This is because the WAL proposer
+ *   only resets `should_limit` to 0 after it is notified about new WALs,
+ *   which might take a while.
+ */
 typedef struct WalRateLimiter
 {
 	/* If the value is 1, PG backends will hit backpressure. */
@@ -384,7 +394,7 @@ typedef struct WalRateLimiter
 	/* The number of bytes sent in the current second. */
 	uint64		sent_bytes;
 	/* The last recorded time in microsecond. */
-	TimestampTz last_recorded_time_us;
+	pg_atomic_uint64 last_recorded_time_us;
 } WalRateLimiter;
 
 /* END_HADRON */
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index aaf8f43eeb..18655d4c6c 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -449,8 +449,20 @@ backpressure_lag_impl(void)
 	}
 
 	state = GetWalpropShmemState();
-	if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1)
+	if (state != NULL && !!pg_atomic_read_u32(&state->wal_rate_limiter.should_limit))
 	{
+		TimestampTz now = GetCurrentTimestamp();
+		struct WalRateLimiter *limiter = &state->wal_rate_limiter;
+		uint64		last_recorded_time = pg_atomic_read_u64(&limiter->last_recorded_time_us);
+		if (now - last_recorded_time > USECS_PER_SEC)
+		{
+			/*
+			 * More than 1 second has passed since the last recorded time, so it's time to push more WALs.
+			 * If the backends are pushing WALs too fast, the wal proposer will rate limit them again.
+			 */
+			uint32		expected = true;
+			pg_atomic_compare_exchange_u32(&state->wal_rate_limiter.should_limit, &expected, false);
+		}
 		return 1;
 	}
 	/* END_HADRON */
@@ -502,6 +514,7 @@ WalproposerShmemInit(void)
 		pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
 		/* BEGIN_HADRON */
 		pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+		pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0);
 		/* END_HADRON */
 	}
 	LWLockRelease(AddinShmemInitLock);
@@ -520,6 +533,7 @@ WalproposerShmemInit_SyncSafekeeper(void)
 	pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
 	/* BEGIN_HADRON */
 	pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+	pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0);
 	/* END_HADRON */
 }
 
@@ -1551,18 +1565,18 @@ XLogBroadcastWalProposer(WalProposer *wp)
 	{
 		uint64		max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024;
 		struct WalRateLimiter *limiter = &state->wal_rate_limiter;
-
-		if (now - limiter->last_recorded_time_us > USECS_PER_SEC)
+		uint64		last_recorded_time = pg_atomic_read_u64(&limiter->last_recorded_time_us);
+		if (now - last_recorded_time > USECS_PER_SEC)
 		{
 			/* Reset the rate limiter */
-			limiter->last_recorded_time_us = now;
 			limiter->sent_bytes = 0;
-			pg_atomic_exchange_u32(&limiter->should_limit, 0);
+			pg_atomic_write_u64(&limiter->last_recorded_time_us, now);
+			pg_atomic_write_u32(&limiter->should_limit, false);
 		}
 		limiter->sent_bytes += (endptr - startptr);
 		if (limiter->sent_bytes > max_wal_bytes)
 		{
-			pg_atomic_exchange_u32(&limiter->should_limit, 1);
+			pg_atomic_write_u32(&limiter->should_limit, true);
 		}
 	}
 	/* END_HADRON */

From e2982ed3ecdb8e1e67239ea84953550909c4700b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?=
Date: Wed, 16 Jul 2025 18:23:05 +0200
Subject: [PATCH 08/39] [proxy] Cache node info only for TTL, even if Redis is
 available (#12626)

This PR simplifies our node info cache. Now we'll store entries for at most
the TTL duration, even if Redis notifications are available.
This will allow us to cache intermittent errors later (e.g. due to rate limits) with more predictable behavior. Related to https://github.com/neondatabase/cloud/issues/19353 --- proxy/src/cache/project_info.rs | 107 +++++-------------------------- proxy/src/redis/notifications.rs | 7 +- 2 files changed, 16 insertions(+), 98 deletions(-) diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index d37c107323..c812779e30 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,13 +1,11 @@ use std::collections::{HashMap, HashSet, hash_map}; use std::convert::Infallible; -use std::sync::atomic::AtomicU64; use std::time::Duration; use async_trait::async_trait; use clashmap::ClashMap; use clashmap::mapref::one::Ref; use rand::{Rng, thread_rng}; -use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; @@ -22,31 +20,23 @@ pub(crate) trait ProjectInfoCache { fn invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_endpoint_access_for_org(&self, account_id: AccountIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); - async fn decrement_active_listeners(&self); - async fn increment_active_listeners(&self); } struct Entry { - created_at: Instant, + expires_at: Instant, value: T, } impl Entry { - pub(crate) fn new(value: T) -> Self { + pub(crate) fn new(value: T, ttl: Duration) -> Self { Self { - created_at: Instant::now(), + expires_at: Instant::now() + ttl, value, } } - pub(crate) fn get(&self, valid_since: Instant) -> Option<&T> { - (valid_since < self.created_at).then_some(&self.value) - } -} - -impl From for Entry { - fn from(value: T) -> Self { - Self::new(value) + pub(crate) fn get(&self) -> Option<&T> { + (self.expires_at > Instant::now()).then_some(&self.value) } } @@ -56,18 +46,12 @@ struct EndpointInfo { } impl EndpointInfo { - pub(crate) fn get_role_secret( - &self, - role_name: RoleNameInt, - valid_since: Instant, - ) -> Option { - let controls = self.role_controls.get(&role_name)?; - controls.get(valid_since).cloned() + pub(crate) fn get_role_secret(&self, role_name: RoleNameInt) -> Option { + self.role_controls.get(&role_name)?.get().cloned() } - pub(crate) fn get_controls(&self, valid_since: Instant) -> Option { - let controls = self.controls.as_ref()?; - controls.get(valid_since).cloned() + pub(crate) fn get_controls(&self) -> Option { + self.controls.as_ref()?.get().cloned() } pub(crate) fn invalidate_endpoint(&mut self) { @@ -92,11 +76,8 @@ pub struct ProjectInfoCacheImpl { project2ep: ClashMap>, // FIXME(stefan): we need a way to GC the account2ep map. 
account2ep: ClashMap>, - config: ProjectInfoCacheOptions, - start_time: Instant, - ttl_disabled_since_us: AtomicU64, - active_listeners_lock: Mutex, + config: ProjectInfoCacheOptions, } #[async_trait] @@ -152,29 +133,6 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - - async fn decrement_active_listeners(&self) { - let mut listeners_guard = self.active_listeners_lock.lock().await; - if *listeners_guard == 0 { - tracing::error!("active_listeners count is already 0, something is broken"); - return; - } - *listeners_guard -= 1; - if *listeners_guard == 0 { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); - } - } - - async fn increment_active_listeners(&self) { - let mut listeners_guard = self.active_listeners_lock.lock().await; - *listeners_guard += 1; - if *listeners_guard == 1 { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::SeqCst); - } - } } impl ProjectInfoCacheImpl { @@ -184,9 +142,6 @@ impl ProjectInfoCacheImpl { project2ep: ClashMap::new(), account2ep: ClashMap::new(), config, - ttl_disabled_since_us: AtomicU64::new(u64::MAX), - start_time: Instant::now(), - active_listeners_lock: Mutex::new(0), } } @@ -203,19 +158,17 @@ impl ProjectInfoCacheImpl { endpoint_id: &EndpointId, role_name: &RoleName, ) -> Option { - let valid_since = self.get_cache_times(); let role_name = RoleNameInt::get(role_name)?; let endpoint_info = self.get_endpoint_cache(endpoint_id)?; - endpoint_info.get_role_secret(role_name, valid_since) + endpoint_info.get_role_secret(role_name) } pub(crate) fn get_endpoint_access( &self, endpoint_id: &EndpointId, ) -> Option { - let valid_since = self.get_cache_times(); let endpoint_info = self.get_endpoint_cache(endpoint_id)?; - endpoint_info.get_controls(valid_since) + endpoint_info.get_controls() } pub(crate) fn insert_endpoint_access( @@ -237,8 +190,8 @@ impl ProjectInfoCacheImpl { return; } - let controls = Entry::from(controls); - let role_controls = Entry::from(role_controls); + let controls = Entry::new(controls, self.config.ttl); + let role_controls = Entry::new(role_controls, self.config.ttl); match self.cache.entry(endpoint_id) { clashmap::Entry::Vacant(e) => { @@ -275,27 +228,6 @@ impl ProjectInfoCacheImpl { } } - fn ignore_ttl_since(&self) -> Option { - let ttl_disabled_since_us = self - .ttl_disabled_since_us - .load(std::sync::atomic::Ordering::Relaxed); - - if ttl_disabled_since_us == u64::MAX { - return None; - } - - Some(self.start_time + Duration::from_micros(ttl_disabled_since_us)) - } - - fn get_cache_times(&self) -> Instant { - let mut valid_since = Instant::now() - self.config.ttl; - if let Some(ignore_ttl_since) = self.ignore_ttl_since() { - // We are fine if entry is not older than ttl or was added before we are getting notifications. - valid_since = valid_since.min(ignore_ttl_since); - } - valid_since - } - pub fn maybe_invalidate_role_secret(&self, endpoint_id: &EndpointId, role_name: &RoleName) { let Some(endpoint_id) = EndpointIdInt::get(endpoint_id) else { return; @@ -313,16 +245,7 @@ impl ProjectInfoCacheImpl { return; }; - let created_at = role_controls.get().created_at; - let expire = match self.ignore_ttl_since() { - // if ignoring TTL, we should still try and roll the password if it's old - // and we the client gave an incorrect password. There could be some lag on the redis channel. 
- Some(_) => created_at + self.config.ttl < Instant::now(), - // edge case: redis is down, let's be generous and invalidate the cache immediately. - None => true, - }; - - if expire { + if role_controls.get().expires_at <= Instant::now() { role_controls.remove(); } } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 973a4c5b02..a6d376562b 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -265,10 +265,7 @@ async fn handle_messages( return Ok(()); } let mut conn = match try_connect(&redis).await { - Ok(conn) => { - handler.cache.increment_active_listeners().await; - conn - } + Ok(conn) => conn, Err(e) => { tracing::error!( "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" @@ -287,11 +284,9 @@ async fn handle_messages( } } if cancellation_token.is_cancelled() { - handler.cache.decrement_active_listeners().await; return Ok(()); } } - handler.cache.decrement_active_listeners().await; } } From 267fb4990888ef2a325005b21b88cf66fd214c72 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 16 Jul 2025 20:39:54 +0200 Subject: [PATCH 09/39] Update Postgres branches. (#12628) ## Problem ## Summary of changes --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index af550a80c6..ac3c460e01 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit af550a80c6b86d0fec378ee929e2bb2e591e5cd3 +Subproject commit ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 21cb86b814..24313bf8f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 21cb86b81454522870d3634cac3e10b821da09fe +Subproject commit 24313bf8f3de722968a2fdf764de7ef77ed64f06 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index c148871ead..51194dc5ce 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit c148871eada02c0cf15d553d8ff7c389d01810f2 +Subproject commit 51194dc5ce2e3523068d8607852e6c3125a17e58 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 8de764e44b..eac5279cd1 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 8de764e44b56d1cffb3644368d4d689f482b611a +Subproject commit eac5279cd147d4086e0eb242198aae2f4b766d7b diff --git a/vendor/revisions.json b/vendor/revisions.json index 3c8067a23d..e4b6c8e23a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "8de764e44b56d1cffb3644368d4d689f482b611a" + "eac5279cd147d4086e0eb242198aae2f4b766d7b" ], "v16": [ "16.9", - "c148871eada02c0cf15d553d8ff7c389d01810f2" + "51194dc5ce2e3523068d8607852e6c3125a17e58" ], "v15": [ "15.13", - "21cb86b81454522870d3634cac3e10b821da09fe" + "24313bf8f3de722968a2fdf764de7ef77ed64f06" ], "v14": [ "14.18", - "af550a80c6b86d0fec378ee929e2bb2e591e5cd3" + "ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4" ] } From fb796229bf16d6e1684b3f498d3fb5a55f13c5ee Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 16 Jul 2025 22:20:44 +0100 Subject: [PATCH 10/39] Fix `make neon-pgindent` (#12535) ## Problem `make neon-pgindent` doesn't work: - there's no `$(BUILD_DIR)/neon-v17` dir - `make -C ...` along with relative `BUILD_DIR` resolves to a path that doesn't exist ## Summary of changes - Fix path for to neon 
extension for `make neon-pgindent`
- Make `BUILD_DIR` absolute
- Remove trailing slash from `POSTGRES_INSTALL_DIR` to avoid duplicated slashes in commands (doesn't break anything, it just makes it look nicer)
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 749e527ac3..dc8bacc78e 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # Where to install Postgres, default is ./pg_install, maybe useful for package
 # managers.
-POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
+POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install

 # Supported PostgreSQL versions
 POSTGRES_VERSIONS = v17 v16 v15 v14
@@ -14,7 +14,7 @@ POSTGRES_VERSIONS = v17 v16 v15 v14
 # it is derived from BUILD_TYPE.

 # All intermediate build artifacts are stored here.
-BUILD_DIR := build
+BUILD_DIR := $(ROOT_PROJECT_DIR)/build

 ICU_PREFIX_DIR := /usr/local/icu

@@ -212,7 +212,7 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 	FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
 	INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
 	PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
-	-C $(BUILD_DIR)/neon-v17 \
+	-C $(BUILD_DIR)/pgxn-v17/neon \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent


From f2828bbe198a45c1604e67cad60bdcb96634b64d Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 16 Jul 2025 17:52:18 -0400
Subject: [PATCH 11/39] fix(pageserver): skip gc-compaction for metadata key ranges (#12618)

## Problem

part of https://github.com/neondatabase/neon/issues/11318 ; it is not entirely safe to run gc-compaction over the metadata key range due to tombstones and implications of image layers (missing key in image layer == key not exist).

The auto gc-compaction trigger already skips metadata key ranges (see `schedule_auto_compaction` call in `trigger_auto_compaction`). In this patch we enforce it directly in gc_compact_inner so that compactions triggered via HTTP API will also be subject to this restriction.

## Summary of changes

Ensure gc-compaction only runs on rel key ranges.
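In sketch form, the enforcement clamps the requested key range before compacting. The names below are taken from the diff in this patch; treat this as an illustrative summary rather than the exact code:

```rust
// Illustrative sketch: clamp a requested gc-compaction key range so it never
// extends into the metadata half of the keyspace. Returns false when nothing
// is left to compact after clamping.
fn clamp_to_rel_keys(range: &mut std::ops::Range<Key>) -> bool {
    let metadata_start = Key::metadata_key_range().start;
    if range.end > metadata_start {
        range.end = metadata_start;
    }
    // An empty range means the request lay entirely within the metadata keyspace.
    range.start < range.end
}
```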
Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 104 ++++++++++++++----- pageserver/src/tenant/timeline.rs | 15 +++ pageserver/src/tenant/timeline/compaction.rs | 30 +++++- 4 files changed, 123 insertions(+), 27 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3e844a375d..3a08244d71 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2357,6 +2357,7 @@ async fn timeline_compact_handler( flags, sub_compaction, sub_compaction_max_job_size_mb, + gc_compaction_do_metadata_compaction: false, }; let scheduled = compact_request diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1a3016e7f1..3d66ae4719 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -9216,7 +9216,11 @@ mod tests { let cancel = CancellationToken::new(); tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -9299,7 +9303,11 @@ mod tests { guard.cutoffs.space = Lsn(0x40); } tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -9836,7 +9844,11 @@ mod tests { let cancel = CancellationToken::new(); tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -9871,7 +9883,11 @@ mod tests { guard.cutoffs.space = Lsn(0x40); } tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -10446,7 +10462,7 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -10457,14 +10473,22 @@ mod tests { verify_result().await; tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; // compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -10483,14 +10507,22 @@ mod tests { guard.cutoffs.space = Lsn(0x38); } tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result // not increasing the GC horizon and compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -10695,7 +10727,7 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -10706,14 +10738,22 @@ mod tests { verify_result().await; tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); 
verify_result().await; // compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -10913,7 +10953,11 @@ mod tests { let cancel = CancellationToken::new(); branch_tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -10926,7 +10970,7 @@ mod tests { &cancel, CompactOptions { compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11594,7 +11638,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(0)..get_key(2)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11641,7 +11685,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(2)..get_key(4)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11693,7 +11737,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(4)..get_key(9)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11744,7 +11788,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(9)..get_key(10)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11800,7 +11844,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(0)..get_key(10)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12071,7 +12115,7 @@ mod tests { &cancel, CompactOptions { compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12106,7 +12150,11 @@ mod tests { // compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -12325,7 +12373,7 @@ mod tests { CompactOptions { compact_key_range: Some((get_key(0)..get_key(2)).into()), compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12371,7 +12419,7 @@ mod tests { CompactOptions { compact_key_range: Some((get_key(3)..get_key(8)).into()), compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12419,7 +12467,7 @@ mod tests { CompactOptions { compact_key_range: Some((get_key(0)..get_key(5)).into()), compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12454,7 +12502,11 @@ mod tests { // final full compaction tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -12564,7 +12616,7 @@ mod tests { CompactOptions { compact_key_range: None, compact_lsn_range: None, - ..Default::default() + 
..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 73d2d72b59..8f25555929 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -939,6 +939,20 @@ pub(crate) struct CompactOptions { /// Set job size for the GC compaction. /// This option is only used by GC compaction. pub sub_compaction_max_job_size_mb: Option, + /// Only for GC compaction. + /// If set, the compaction will compact the metadata layers. Should be only set to true in unit tests + /// because metadata compaction is not fully supported yet. + pub gc_compaction_do_metadata_compaction: bool, +} + +impl CompactOptions { + #[cfg(test)] + pub fn default_for_gc_compaction_unit_tests() -> Self { + Self { + gc_compaction_do_metadata_compaction: true, + ..Default::default() + } + } } impl std::fmt::Debug for Timeline { @@ -2185,6 +2199,7 @@ impl Timeline { compact_lsn_range: None, sub_compaction: false, sub_compaction_max_job_size_mb: None, + gc_compaction_do_metadata_compaction: false, }, ctx, ) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index aa1aa937b6..f76ef502dc 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -396,6 +396,7 @@ impl GcCompactionQueue { }), compact_lsn_range: None, sub_compaction_max_job_size_mb: None, + gc_compaction_do_metadata_compaction: false, }, permit, ); @@ -512,6 +513,7 @@ impl GcCompactionQueue { compact_key_range: Some(job.compact_key_range.into()), compact_lsn_range: Some(job.compact_lsn_range.into()), sub_compaction_max_job_size_mb: None, + gc_compaction_do_metadata_compaction: false, }; pending_tasks.push(GcCompactionQueueItem::SubCompactionJob { options, @@ -785,6 +787,8 @@ pub(crate) struct GcCompactJob { /// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`]. /// min_lsn will always <= the lower bound specified here, and max_lsn will always >= the upper bound specified here. pub compact_lsn_range: Range, + /// See [`CompactOptions::gc_compaction_do_metadata_compaction`]. + pub do_metadata_compaction: bool, } impl GcCompactJob { @@ -799,6 +803,7 @@ impl GcCompactJob { .compact_lsn_range .map(|x| x.into()) .unwrap_or(Lsn::INVALID..Lsn::MAX), + do_metadata_compaction: options.gc_compaction_do_metadata_compaction, } } } @@ -3174,6 +3179,7 @@ impl Timeline { dry_run: job.dry_run, compact_key_range: start..end, compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + do_metadata_compaction: false, }); current_start = Some(end); } @@ -3236,7 +3242,7 @@ impl Timeline { async fn compact_with_gc_inner( self: &Arc, cancel: &CancellationToken, - job: GcCompactJob, + mut job: GcCompactJob, ctx: &RequestContext, yield_for_l0: bool, ) -> Result { @@ -3244,6 +3250,28 @@ impl Timeline { // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. + // If the job is not configured to compact the metadata key range, shrink the key range + // to exclude the metadata key range. The check is done by checking if the end of the key range + // is larger than the start of the metadata key range. Note that metadata keys cover the entire + // second half of the keyspace, so it's enough to only check the end of the key range. 
+ if !job.do_metadata_compaction + && job.compact_key_range.end > Key::metadata_key_range().start + { + tracing::info!( + "compaction for metadata key range is not supported yet, overriding compact_key_range from {} to {}", + job.compact_key_range.end, + Key::metadata_key_range().start + ); + // Shrink the key range to exclude the metadata key range. + job.compact_key_range.end = Key::metadata_key_range().start; + + // Skip the job if the key range completely lies within the metadata key range. + if job.compact_key_range.start >= job.compact_key_range.end { + tracing::info!("compact_key_range is empty, skipping compaction"); + return Ok(CompactionOutcome::Done); + } + } + let timer = Instant::now(); let begin_timer = timer; From 5dd24c7ad8ec46669ea474c544c0180b9253acd9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 17 Jul 2025 09:57:36 +0100 Subject: [PATCH 12/39] test_total_size_limit: support hosts with up to 256 GB of RAM (#12617) ## Problem `test_total_size_limit` fails on runners with 256 GB of RAM ## Summary of changes - Generate more data in `test_total_size_limit` --- test_runner/regress/test_pageserver_layer_rolling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 91c4ef521c..68f470d962 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -246,9 +246,9 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): system_memory = psutil.virtual_memory().total - # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on - # a system with 128GB of RAM). We will then write enough data to violate this limit. - max_dirty_data = 128 * 1024 * 1024 + # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 256MB on + # a system with 256GB of RAM). We will then write enough data to violate this limit. + max_dirty_data = 256 * 1024 * 1024 ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory assert ephemeral_bytes_per_memory_kb > 0 @@ -272,7 +272,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): timeline_count = 10 # This is about 2MiB of data per timeline - entries_per_timeline = 100_000 + entries_per_timeline = 200_000 last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline)) wait_until_pageserver_is_caught_up(env, last_flush_lsns) From 4559ba79b66bb19062de65fd3963543ed1b01fa2 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:51:31 +0400 Subject: [PATCH 13/39] Introduce force flag for new deletion API (#12588) ## Problem The force deletion API should behave like the graceful deletion API - it needs to support cancellation, persistence, and be non-blocking. ## Summary of Changes - Added a `force` flag to the `NodeStartDelete` command. - Passed the `force` flag through the `start_node_delete` handler in the storage controller. - Handled the `force` flag in the `delete_node` function. - Set the tombstone after removing the node from memory. - Minor cleanup, like adding a `get_error_on_cancel` closure. 
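For illustration, starting a forced deletion boils down to the HTTP call below (the endpoint path and query parameter are as parsed by the handler in this patch; the node ID is a placeholder):

```
PUT /control/v1/node/123/delete?force=true
```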
--------- Co-authored-by: Aleksandr Sarantsev --- control_plane/storcon_cli/src/main.rs | 19 +++-- storage_controller/src/http.rs | 3 +- storage_controller/src/service.rs | 74 ++++++++++--------- test_runner/fixtures/neon_fixtures.py | 7 +- .../regress/test_storage_controller.py | 58 +++++++++++++-- 5 files changed, 111 insertions(+), 50 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index fcc5549beb..a4d1030488 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -76,6 +76,12 @@ enum Command { NodeStartDelete { #[arg(long)] node_id: NodeId, + /// When `force` is true, skip waiting for shards to prewarm during migration. + /// This can significantly speed up node deletion since prewarming all shards + /// can take considerable time, but may result in slower initial access to + /// migrated shards until they warm up naturally. + #[arg(long)] + force: bool, }, /// Cancel deletion of the specified pageserver and wait for `timeout` /// for the operation to be canceled. May be retried. @@ -952,13 +958,14 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) .await?; } - Command::NodeStartDelete { node_id } => { + Command::NodeStartDelete { node_id, force } => { + let query = if force { + format!("control/v1/node/{node_id}/delete?force=true") + } else { + format!("control/v1/node/{node_id}/delete") + }; storcon_client - .dispatch::<(), ()>( - Method::PUT, - format!("control/v1/node/{node_id}/delete"), - None, - ) + .dispatch::<(), ()>(Method::PUT, query, None) .await?; println!("Delete started for {node_id}"); } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c8227f0219..5f9a1124de 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1085,9 +1085,10 @@ async fn handle_node_delete(req: Request) -> Result, ApiErr let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; + let force: bool = parse_query_param(&req, "force")?.unwrap_or(false); json_response( StatusCode::OK, - state.service.start_node_delete(node_id).await?, + state.service.start_node_delete(node_id, force).await?, ) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0c5d7f44d4..b315b88fcc 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7385,6 +7385,7 @@ impl Service { self: &Arc, node_id: NodeId, policy_on_start: NodeSchedulingPolicy, + force: bool, cancel: CancellationToken, ) -> Result<(), OperationError> { let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal).build(); @@ -7392,23 +7393,27 @@ impl Service { let mut waiters: Vec = Vec::new(); let mut tid_iter = create_shared_shard_iterator(self.clone()); + let reset_node_policy_on_cancel = || async { + match self + .node_configure(node_id, None, Some(policy_on_start)) + .await + { + Ok(()) => OperationError::Cancelled, + Err(err) => { + OperationError::FinalizeError( + format!( + "Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}", + node_id, String::from(policy_on_start), err + ) + .into(), + ) + } + } + }; + while !tid_iter.finished() { if cancel.is_cancelled() { - match self - .node_configure(node_id, None, Some(policy_on_start)) - .await - { - Ok(()) => return Err(OperationError::Cancelled), - Err(err) => { - return Err(OperationError::FinalizeError( - format!( - "Failed to 
finalise delete cancel of {} by setting scheduling policy to {}: {}", - node_id, String::from(policy_on_start), err - ) - .into(), - )); - } - } + return Err(reset_node_policy_on_cancel().await); } operation_utils::validate_node_state( @@ -7477,8 +7482,18 @@ impl Service { nodes, reconciler_config, ); - if let Some(some) = waiter { - waiters.push(some); + + if force { + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_configured_reconcile_shard: + // that means any reconciles we spawned will know about the node we're deleting, + // enabling them to do live migrations if it's still online. + tenant_shard.observed.locations.remove(&node_id); + } else if let Some(waiter) = waiter { + waiters.push(waiter); } } } @@ -7492,21 +7507,7 @@ impl Service { while !waiters.is_empty() { if cancel.is_cancelled() { - match self - .node_configure(node_id, None, Some(policy_on_start)) - .await - { - Ok(()) => return Err(OperationError::Cancelled), - Err(err) => { - return Err(OperationError::FinalizeError( - format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to {}: {}", - node_id, String::from(policy_on_start), err - ) - .into(), - )); - } - } + return Err(reset_node_policy_on_cancel().await); } tracing::info!("Awaiting {} pending delete reconciliations", waiters.len()); @@ -7516,6 +7517,12 @@ impl Service { .await; } + let pf = pausable_failpoint!("delete-node-after-reconciles-spawned", &cancel); + if pf.is_err() { + // An error from pausable_failpoint indicates the cancel token was triggered. + return Err(reset_node_policy_on_cancel().await); + } + self.persistence .set_tombstone(node_id) .await @@ -8111,6 +8118,7 @@ impl Service { pub(crate) async fn start_node_delete( self: &Arc, node_id: NodeId, + force: bool, ) -> Result<(), ApiError> { let (ongoing_op, node_policy, schedulable_nodes_count) = { let locked = self.inner.read().unwrap(); @@ -8180,7 +8188,7 @@ impl Service { tracing::info!("Delete background operation starting"); let res = service - .delete_node(node_id, policy_on_start, cancel) + .delete_node(node_id, policy_on_start, force, cancel) .await; match res { Ok(()) => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ae73ace9bb..86ffa9e4d4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2119,11 +2119,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) - def node_delete(self, node_id): + def node_delete(self, node_id, force: bool = False): log.info(f"node_delete({node_id})") + query = f"{self.api}/control/v1/node/{node_id}/delete" + if force: + query += "?force=true" self.request( "PUT", - f"{self.api}/control/v1/node/{node_id}/delete", + query, headers=self.headers(TokenScope.ADMIN), ) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 10845ef02e..d1e9bbd7dc 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -72,6 +72,12 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts +class DeletionAPIKind(Enum): + OLD = "old" + FORCE = "force" + GRACEFUL = "graceful" + + @pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) def 
test_storage_controller_smoke( neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure, combination @@ -2572,9 +2578,11 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("while_offline", [True, False]) +@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE]) def test_storage_controller_node_deletion( neon_env_builder: NeonEnvBuilder, while_offline: bool, + deletion_api: DeletionAPIKind, ): """ Test that deleting a node works & properly reschedules everything that was on the node. @@ -2598,6 +2606,8 @@ def test_storage_controller_node_deletion( assert env.storage_controller.reconcile_all() == 0 victim = env.pageservers[-1] + if deletion_api == DeletionAPIKind.FORCE and not while_offline: + victim.allowed_errors.append(".*request was dropped before completing.*") # The procedure a human would follow is: # 1. Mark pageserver scheduling=pause @@ -2621,7 +2631,12 @@ def test_storage_controller_node_deletion( wait_until(assert_shards_migrated) log.info(f"Deleting pageserver {victim.id}") - env.storage_controller.node_delete_old(victim.id) + if deletion_api == DeletionAPIKind.FORCE: + env.storage_controller.node_delete(victim.id, force=True) + elif deletion_api == DeletionAPIKind.OLD: + env.storage_controller.node_delete_old(victim.id) + else: + raise AssertionError(f"Invalid deletion API: {deletion_api}") if not while_offline: @@ -2634,7 +2649,15 @@ def test_storage_controller_node_deletion( wait_until(assert_victim_evacuated) # The node should be gone from the list API - assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + def assert_node_is_gone(): + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + + if deletion_api == DeletionAPIKind.FORCE: + wait_until(assert_node_is_gone) + elif deletion_api == DeletionAPIKind.OLD: + assert_node_is_gone() + else: + raise AssertionError(f"Invalid deletion API: {deletion_api}") # No tenants should refer to the node in their intent for tenant_id in tenant_ids: @@ -2656,7 +2679,11 @@ def test_storage_controller_node_deletion( env.storage_controller.consistency_check() -def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL]) +def test_storage_controller_node_delete_cancellation( + neon_env_builder: NeonEnvBuilder, + deletion_api: DeletionAPIKind, +): neon_env_builder.num_pageservers = 3 neon_env_builder.num_azs = 3 env = neon_env_builder.init_configs() @@ -2680,12 +2707,16 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu assert len(nodes) == 3 env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)")) + env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "pause")) ps_id_to_delete = env.pageservers[0].id env.storage_controller.warm_up_all_secondaries() + + assert deletion_api in [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL] + force = deletion_api == DeletionAPIKind.FORCE env.storage_controller.retryable_node_operation( - lambda ps_id: env.storage_controller.node_delete(ps_id), + lambda ps_id: env.storage_controller.node_delete(ps_id, force), ps_id_to_delete, max_attempts=3, backoff=2, @@ -2701,6 +2732,8 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu env.storage_controller.cancel_node_delete(ps_id_to_delete) + 
env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "off")) + env.storage_controller.poll_node_status( ps_id_to_delete, PageserverAvailability.ACTIVE, @@ -3252,7 +3285,10 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB wait_until(reconfigure_node_again) -def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE]) +def test_ps_unavailable_after_delete( + neon_env_builder: NeonEnvBuilder, deletion_api: DeletionAPIKind +): neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_start() @@ -3265,10 +3301,16 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder): assert_nodes_count(3) ps = env.pageservers[0] - env.storage_controller.node_delete_old(ps.id) - # After deletion, the node count must be reduced - assert_nodes_count(2) + if deletion_api == DeletionAPIKind.FORCE: + ps.allowed_errors.append(".*request was dropped before completing.*") + env.storage_controller.node_delete(ps.id, force=True) + wait_until(lambda: assert_nodes_count(2)) + elif deletion_api == DeletionAPIKind.OLD: + env.storage_controller.node_delete_old(ps.id) + assert_nodes_count(2) + else: + raise AssertionError(f"Invalid deletion API: {deletion_api}") # Running pageserver CLI init in a separate thread with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: From b7fc5a2fe0b2a2f0e45fc88e026fa8a4b498cb5a Mon Sep 17 00:00:00 2001 From: HaoyuHuang Date: Thu, 17 Jul 2025 06:14:36 -0700 Subject: [PATCH 14/39] A few SC changes (#12615) ## Summary of changes A bunch of no-op changes. --------- Co-authored-by: Vlad Lazar --- Cargo.lock | 4 +- libs/utils/Cargo.toml | 1 + libs/utils/src/auth.rs | 38 ++++++++++++++- pageserver/src/auth.rs | 3 +- safekeeper/src/auth.rs | 3 +- storage_controller/Cargo.toml | 2 + storage_controller/src/auth.rs | 12 +++++ storage_controller/src/compute_hook.rs | 30 ++++++++++-- storage_controller/src/metrics.rs | 64 ++++++++++++++++++++++++++ storage_controller/src/node.rs | 55 ++++++++++++++++++++++ workspace_hack/Cargo.toml | 1 - 11 files changed, 203 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e5f39658a7..215b3360bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1872,6 +1872,7 @@ dependencies = [ "diesel_derives", "itoa", "serde_json", + "uuid", ] [[package]] @@ -6933,6 +6934,7 @@ dependencies = [ "tokio-util", "tracing", "utils", + "uuid", "workspace_hack", ] @@ -8206,6 +8208,7 @@ dependencies = [ "tracing-error", "tracing-subscriber", "tracing-utils", + "uuid", "walkdir", ] @@ -8807,7 +8810,6 @@ dependencies = [ "tracing-log", "tracing-subscriber", "url", - "uuid", "zeroize", "zstd", "zstd-safe", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 7b1dc56071..4b326949d7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -47,6 +47,7 @@ tracing-subscriber = { workspace = true, features = ["json", "registry"] } tracing-utils.workspace = true rand.workspace = true scopeguard.workspace = true +uuid.workspace = true strum.workspace = true strum_macros.workspace = true walkdir.workspace = true diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index de3a964d23..b2aade15de 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -12,7 +12,8 @@ use jsonwebtoken::{ Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, }; use pem::Pem; -use serde::{Deserialize, Serialize, de::DeserializeOwned}; 
+use serde::{Deserialize, Deserializer, Serialize, de::DeserializeOwned}; +use uuid::Uuid; use crate::id::TenantId; @@ -25,6 +26,11 @@ pub enum Scope { /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, + /// Provides access to all data for a specific tenant, but based on endpoint ID. This token scope + /// is only used by compute to fetch the spec for a specific endpoint. The spec contains a Tenant-scoped + /// token authorizing access to all data of a tenant, so the spec-fetch API requires a TenantEndpoint + /// scope token to ensure that untrusted compute nodes can't fetch spec for arbitrary endpoints. + TenantEndpoint, /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. /// Should only be used e.g. for status check/tenant creation/list. PageServerApi, @@ -51,17 +57,43 @@ pub enum Scope { ControllerPeer, } +fn deserialize_empty_string_as_none_uuid<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let opt = Option::::deserialize(deserializer)?; + match opt.as_deref() { + Some("") => Ok(None), + Some(s) => Uuid::parse_str(s) + .map(Some) + .map_err(serde::de::Error::custom), + None => Ok(None), + } +} + /// JWT payload. See docs/authentication.md for the format #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] pub struct Claims { #[serde(default)] pub tenant_id: Option, + #[serde( + default, + skip_serializing_if = "Option::is_none", + // Neon control plane includes this field as empty in the claims. + // Consider it None in those cases. + deserialize_with = "deserialize_empty_string_as_none_uuid" + )] + pub endpoint_id: Option, pub scope: Scope, } impl Claims { pub fn new(tenant_id: Option, scope: Scope) -> Self { - Self { tenant_id, scope } + Self { + tenant_id, + scope, + endpoint_id: None, + } } } @@ -212,6 +244,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let expected_claims = Claims { tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, + endpoint_id: None, }; // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519: @@ -240,6 +273,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let claims = Claims { tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, + endpoint_id: None, }; let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap(); diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4075427ab4..9e97fdaba8 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -20,7 +20,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::GenerationsApi | Scope::Infra | Scope::Scrubber - | Scope::ControllerPeer, + | Scope::ControllerPeer + | Scope::TenantEndpoint, _, ) => Err(AuthError( format!( diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index 81c79fae30..008f903a89 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -21,7 +21,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::GenerationsApi | Scope::Infra | Scope::Scrubber - | Scope::ControllerPeer, + | Scope::ControllerPeer + | Scope::TenantEndpoint, _, ) => Err(AuthError( format!( diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 143f4241f4..d67be6d469 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -52,6 +52,7 @@ tokio-rustls.workspace = 
true
 tokio-util.workspace = true
 tokio.workspace = true
 tracing.workspace = true
+uuid.workspace = true
 measured.workspace = true
 rustls.workspace = true
 scopeguard.workspace = true
@@ -63,6 +64,7 @@ tokio-postgres-rustls.workspace = true
 diesel = { version = "2.2.6", features = [
     "serde_json",
     "chrono",
+    "uuid",
 ] }
 diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] }
 diesel_migrations = { version = "2.2.0" }
diff --git a/storage_controller/src/auth.rs b/storage_controller/src/auth.rs
index ef47abf8c7..8f15f0f072 100644
--- a/storage_controller/src/auth.rs
+++ b/storage_controller/src/auth.rs
@@ -1,4 +1,5 @@
 use utils::auth::{AuthError, Claims, Scope};
+use uuid::Uuid;

 pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
     if claims.scope != required_scope {
@@ -7,3 +8,14 @@ pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), Au

     Ok(())
 }
+
+#[allow(dead_code)]
+pub fn check_endpoint_permission(claims: &Claims, endpoint_id: Uuid) -> Result<(), AuthError> {
+    if claims.scope != Scope::TenantEndpoint {
+        return Err(AuthError("Scope mismatch. Permission denied".into()));
+    }
+    if claims.endpoint_id != Some(endpoint_id) {
+        return Err(AuthError("Endpoint id mismatch. Permission denied".into()));
+    }
+    Ok(())
+}
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index ab37a207e4..fb03412f3c 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -810,6 +810,7 @@ impl ComputeHook {
         let send_locked = tokio::select! {
             guard = send_lock.lock_owned() => {guard},
             _ = cancel.cancelled() => {
+                tracing::info!("Notification cancelled while waiting for lock");
                 return Err(NotifyError::ShuttingDown)
             }
         };
@@ -851,11 +852,32 @@ impl ComputeHook {
             let notify_url = compute_hook_url.as_ref().unwrap();
             self.do_notify(notify_url, &request, cancel).await
         } else {
-            self.do_notify_local::(&request).await.map_err(|e| {
+            match self.do_notify_local::(&request).await.map_err(|e| {
                 // This path is for testing only, so munge the error into our prod-style error type.
-                tracing::error!("neon_local notification hook failed: {e}");
-                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
-            })
+                if e.to_string().contains("refresh-configuration-pending") {
+                    // If the error message mentions "refresh-configuration-pending", it means the compute node
+                    // rejected our notification request because it is already trying to reconfigure itself. We
+                    // can proceed with the rest of the reconciliation process as the compute node already
+                    // discovers the need to reconfigure and will eventually update its configuration once
+                    // we update the pageserver mappings. In fact, it is important that we continue with
+                    // reconciliation to make sure we update the pageserver mappings to unblock the compute node.
+                    tracing::info!("neon_local notification hook failed: {e}");
+                    tracing::info!("Notification failed likely due to compute node self-reconfiguration, will retry.");
+                    Ok(())
+                } else {
+                    tracing::error!("neon_local notification hook failed: {e}");
+                    Err(NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR))
+                }
+            }) {
+                // Compute node accepted the notification request. Ok to proceed.
+                Ok(_) => Ok(()),
+                // Compute node rejected our request but it is already self-reconfiguring. Ok to proceed.
+                Err(Ok(_)) => Ok(()),
+                // Fail the reconciliation attempt in all other cases. Recall that this whole code path involving
+                // neon_local is for testing only.
In production we always retry failed reconciliations so we
+                // don't have any dead ends here.
+                Err(Err(e)) => Err(e),
+            }
         };

         match result {
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 8738386968..0c923e742e 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -151,6 +151,29 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Indicator of completed safekeeper reconciles, broken down by safekeeper.
     pub(crate) storage_controller_safekeeper_reconciles_complete:
         measured::CounterVec,
+
+    /* BEGIN HADRON */
+    /// Hadron `config_watcher` reconciliation runs completed, broken down by success/failure.
+    pub(crate) storage_controller_config_watcher_complete:
+        measured::CounterVec,
+
+    /// Hadron long waits for node state changes during drain and fill.
+    pub(crate) storage_controller_drain_and_fill_long_waits: measured::Counter,
+
+    /// Set to 1 if we detect any page server pods with pending node pool rotation annotations.
+    /// Requires manual reset after oncall investigation.
+    pub(crate) storage_controller_ps_node_pool_rotation_pending: measured::Gauge,
+
+    /// Hadron storage scrubber status.
+    pub(crate) storage_controller_storage_scrub_status:
+        measured::CounterVec,
+
+    /// Desired number of pageservers managed by the storage controller
+    pub(crate) storage_controller_num_pageservers_desired: measured::Gauge,
+
+    /// Desired number of safekeepers managed by the storage controller
+    pub(crate) storage_controller_num_safekeeper_desired: measured::Gauge,
+    /* END HADRON */
 }

 impl StorageControllerMetrics {
@@ -173,6 +196,10 @@ impl Default for StorageControllerMetrics {
             .storage_controller_reconcile_complete
             .init_all_dense();

+        metrics_group
+            .storage_controller_config_watcher_complete
+            .init_all_dense();
+
         Self {
             metrics_group,
             encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
@@ -262,11 +289,48 @@ pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
     pub(crate) sequence: &'a str,
 }

+#[derive(measured::LabelGroup, Clone)]
+#[label(set = StorageScrubberLabelGroupSet)]
+pub(crate) struct StorageScrubberLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) tenant_id: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) shard_number: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) timeline_id: &'a str,
+    pub(crate) outcome: StorageScrubberOutcome,
+}
+
+#[derive(FixedCardinalityLabel, Clone, Copy)]
+pub(crate) enum StorageScrubberOutcome {
+    PSOk,
+    PSWarning,
+    PSError,
+    PSOrphan,
+    SKOk,
+    SKError,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = ConfigWatcherCompleteLabelGroupSet)]
+pub(crate) struct ConfigWatcherCompleteLabelGroup {
+    // Reuse the ReconcileOutcome from the SC's reconciliation metrics.
+    pub(crate) status: ReconcileOutcome,
+}
+
 #[derive(FixedCardinalityLabel, Clone, Copy)]
 pub(crate) enum ReconcileOutcome {
+    // Successfully reconciled everything.
     #[label(rename = "ok")]
     Success,
+    // Used by tenant-shard reconciler only. Reconciled pageserver state successfully,
+    // but failed to deliver the compute notification. This error is typically transient
+    // but if its occurrence keeps increasing, it should be investigated.
+    #[label(rename = "ok_no_notify")]
+    SuccessNoNotify,
+    // We failed to reconcile some state and the reconciliation will be retried.
     Error,
+    // Reconciliation was cancelled.
Cancel, } diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 6642c72f3c..63c82b5682 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -51,6 +51,39 @@ pub(crate) struct Node { cancel: CancellationToken, } +#[allow(dead_code)] +const ONE_MILLION: i64 = 1000000; + +// Converts a pool ID to a large number that can be used to assign unique IDs to pods in StatefulSets. +/// For example, if pool_id is 1, then the pods have NodeIds 1000000, 1000001, 1000002, etc. +/// If pool_id is None, then the pods have NodeIds 0, 1, 2, etc. +#[allow(dead_code)] +pub fn transform_pool_id(pool_id: Option) -> i64 { + match pool_id { + Some(id) => (id as i64) * ONE_MILLION, + None => 0, + } +} + +#[allow(dead_code)] +pub fn get_pool_id_from_node_id(node_id: i64) -> i32 { + (node_id / ONE_MILLION) as i32 +} + +/// Example pod name: page-server-0-1, safe-keeper-1-0 +#[allow(dead_code)] +pub fn get_node_id_from_pod_name(pod_name: &str) -> anyhow::Result { + let parts: Vec<&str> = pod_name.split('-').collect(); + if parts.len() != 4 { + return Err(anyhow::anyhow!("Invalid pod name: {}", pod_name)); + } + let pool_id = parts[2].parse::()?; + let node_offset = parts[3].parse::()?; + let node_id = transform_pool_id(Some(pool_id)) + node_offset; + + Ok(NodeId(node_id as u64)) +} + /// When updating [`Node::availability`] we use this type to indicate to the caller /// whether/how they changed it. pub(crate) enum AvailabilityTransition { @@ -403,3 +436,25 @@ impl std::fmt::Debug for Node { write!(f, "{} ({})", self.id, self.listen_http_addr) } } + +#[cfg(test)] +mod tests { + use utils::id::NodeId; + + use crate::node::get_node_id_from_pod_name; + + #[test] + fn test_get_node_id_from_pod_name() { + let pod_name = "page-server-3-12"; + let node_id = get_node_id_from_pod_name(pod_name).unwrap(); + assert_eq!(node_id, NodeId(3000012)); + + let pod_name = "safe-keeper-1-0"; + let node_id = get_node_id_from_pod_name(pod_name).unwrap(); + assert_eq!(node_id, NodeId(1000000)); + + let pod_name = "invalid-pod-name"; + let result = get_node_id_from_pod_name(pod_name); + assert!(result.is_err()); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c61598cdf6..d6d64a2045 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -107,7 +107,6 @@ tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } url = { version = "2", features = ["serde"] } -uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } From 8862e7c4bf16a77bd9c354f4f94e5625c86b302f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 17 Jul 2025 14:20:40 +0100 Subject: [PATCH 15/39] tests: use new snapshot in test_forward_compat (#12637) ## Problem The forward compatibility test is erroneously using the downloaded (old) compatibility data. This test is meant to test that old binaries can work with **new** data. Using the old compatibility data renders this test useless. 
## Summary of changes Use new snapshot in test_forward_compat Closes LKB-666 Co-authored-by: William Huang --- test_runner/regress/test_compatibility.py | 24 +++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index a4d2bf8d9b..a3a20cdc62 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -187,19 +187,21 @@ def test_create_snapshot( env.pageserver.stop() env.storage_controller.stop() - # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it - compatibility_snapshot_dir = ( + # Directory `new_compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it + new_compatibility_snapshot_dir = ( top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" ) - if compatibility_snapshot_dir.exists(): - shutil.rmtree(compatibility_snapshot_dir) + if new_compatibility_snapshot_dir.exists(): + shutil.rmtree(new_compatibility_snapshot_dir) shutil.copytree( test_output_dir, - compatibility_snapshot_dir, + new_compatibility_snapshot_dir, ignore=shutil.ignore_patterns("pg_dynshmem"), ) + log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}") + # check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" @@ -218,6 +220,7 @@ def test_backward_compatibility( """ Test that the new binaries can read old data """ + log.info(f"Using snapshot dir at {compatibility_snapshot_dir}") neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") env.pageserver.allowed_errors.append(ingest_lag_log_line) @@ -242,7 +245,6 @@ def test_forward_compatibility( test_output_dir: Path, top_output_dir: Path, pg_version: PgVersion, - compatibility_snapshot_dir: Path, compute_reconfigure_listener: ComputeReconfigure, ): """ @@ -266,8 +268,14 @@ def test_forward_compatibility( neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir + # Note that we are testing with new data, so we should use `new_compatibility_snapshot_dir`, which is created by test_create_snapshot. 
+ new_compatibility_snapshot_dir = ( + top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" + ) + + log.info(f"Using snapshot dir at {new_compatibility_snapshot_dir}") env = neon_env_builder.from_repo_dir( - compatibility_snapshot_dir / "repo", + new_compatibility_snapshot_dir / "repo", ) # there may be an arbitrary number of unrelated tests run between create_snapshot and here env.pageserver.allowed_errors.append(ingest_lag_log_line) @@ -296,7 +304,7 @@ def test_forward_compatibility( check_neon_works( env, test_output_dir=test_output_dir, - sql_dump_path=compatibility_snapshot_dir / "dump.sql", + sql_dump_path=new_compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) From f0c0733a64889e0e9291b08fcc471ecb502540c1 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Thu, 17 Jul 2025 18:52:57 +0400 Subject: [PATCH 16/39] storcon: Ignore stuck reconciles when considering optimizations (#12589) ## Problem The `keep_failing_reconciles` counter was introduced in #12391, but there is a special case: > if a reconciliation loop claims to have succeeded, but maybe_reconcile still thinks the tenant is in need of reconciliation, then that's a probable bug and we should activate a similar backoff to prevent flapping. This PR redefines "flapping" to include not just repeated failures, but also consecutive reconciliations of any kind (success or failure). ## Summary of Changes - Replace `keep_failing_reconciles` with a new `stuck_reconciles` metric - Replace `MAX_CONSECUTIVE_RECONCILIATION_ERRORS` with `MAX_CONSECUTIVE_RECONCILES`, and increasing that from 5 to 10 - Increment the consecutive reconciles counter for all reconciles, not just failures - Reset the counter in `reconcile_all` when no reconcile is needed for a shard - Improve and fix the related test --------- Co-authored-by: Aleksandr Sarantsev --- storage_controller/src/metrics.rs | 4 +- storage_controller/src/service.rs | 61 +++++++++---------- storage_controller/src/tenant_shard.rs | 18 +++--- .../regress/test_storage_controller.py | 8 ++- 4 files changed, 46 insertions(+), 45 deletions(-) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 0c923e742e..9c34b34044 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -76,8 +76,8 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, - /// How many shards are keep-failing and will be ignored when considering to run optimizations - pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge, + /// How many shards are stuck and will be ignored when considering to run optimizations + pub(crate) storage_controller_stuck_reconciles: measured::Gauge, /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b315b88fcc..ec3b419437 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -232,9 +232,9 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; -// Number of consecutive reconciliation errors, occured for one shard, +// Number of consecutive reconciliations that have occurred for one 
shard,
 // after which the shard is ignored when considering to run optimizations.
-const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;
+const MAX_CONSECUTIVE_RECONCILES: usize = 10;

 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
@@ -735,31 +735,31 @@ struct TenantMutationLocations(BTreeMap);

 struct ReconcileAllResult {
     spawned_reconciles: usize,
-    keep_failing_reconciles: usize,
+    stuck_reconciles: usize,
     has_delayed_reconciles: bool,
 }

 impl ReconcileAllResult {
     fn new(
         spawned_reconciles: usize,
-        keep_failing_reconciles: usize,
+        stuck_reconciles: usize,
         has_delayed_reconciles: bool,
     ) -> Self {
         assert!(
-            spawned_reconciles >= keep_failing_reconciles,
-            "It is impossible to have more keep-failing reconciles than spawned reconciles"
+            spawned_reconciles >= stuck_reconciles,
+            "It is impossible to have fewer spawned reconciles than stuck reconciles"
         );
         Self {
             spawned_reconciles,
-            keep_failing_reconciles,
+            stuck_reconciles,
             has_delayed_reconciles,
         }
     }

     /// We can run optimizations only if we don't have any delayed reconciles and
-    /// all spawned reconciles are also keep-failing reconciles.
+    /// all spawned reconciles are also stuck reconciles.
     fn can_run_optimizations(&self) -> bool {
-        !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
+        !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
     }
 }

@@ -1503,7 +1503,6 @@ impl Service {
             match result.result {
                 Ok(()) => {
-                    tenant.consecutive_errors_count = 0;
                     tenant.apply_observed_deltas(deltas);
                     tenant.waiter.advance(result.sequence);
                 }
@@ -1522,8 +1521,6 @@ impl Service {
                         }
                     }

-                    tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1);
-
                     // Ordering: populate last_error before advancing error_seq,
                     // so that waiters will see the correct error after waiting.
                     tenant.set_last_error(result.sequence, e);
@@ -1535,6 +1532,8 @@ impl Service {
                 }
             }

+            tenant.consecutive_reconciles_count = tenant.consecutive_reconciles_count.saturating_add(1);
+
             // If we just finished detaching all shards for a tenant, it might be time to drop it from memory.
             if tenant.policy == PlacementPolicy::Detached {
                 // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us
@@ -8640,7 +8639,7 @@ impl Service {
         // This function is an efficient place to update lazy statistics, since we are walking
         // all tenants.
         let mut pending_reconciles = 0;
-        let mut keep_failing_reconciles = 0;
+        let mut stuck_reconciles = 0;
         let mut az_violations = 0;

         // If we find any tenants to drop from memory, stash them to offload after
@@ -8676,30 +8675,32 @@ impl Service {

             // Eventual consistency: if an earlier reconcile job failed, and the shard is still
             // dirty, spawn another one
-            let consecutive_errors_count = shard.consecutive_errors_count;
             if self
                 .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                 .is_some()
             {
                 spawned_reconciles += 1;

-                // Count shards that are keep-failing. We still want to reconcile them
-                // to avoid a situation where a shard is stuck.
-                // But we don't want to consider them when deciding to run optimizations.
-                if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS {
+                if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES {
+                    // Count shards that are stuck, but we still want to reconcile them.
+                    // We don't want to consider them when deciding to run optimizations.
                     tracing::warn!(
                         tenant_id=%shard.tenant_shard_id.tenant_id,
                         shard_id=%shard.tenant_shard_id.shard_slug(),
-                        "Shard reconciliation is keep-failing: {} errors",
-                        consecutive_errors_count
+                        "Shard reconciliation is stuck: {} consecutive launches",
+                        shard.consecutive_reconciles_count
                     );
-                    keep_failing_reconciles += 1;
+                    stuck_reconciles += 1;
+                }
+            } else {
+                if shard.delayed_reconcile {
+                    // Shard wanted to reconcile but for some reason couldn't.
+                    pending_reconciles += 1;
                 }
-            } else if shard.delayed_reconcile {
-                // Shard wanted to reconcile but for some reason couldn't.
-                pending_reconciles += 1;
-            }
+                // Reset the counter when we don't need to launch a reconcile.
+                shard.consecutive_reconciles_count = 0;
+            }
             // If this tenant is detached, try dropping it from memory. This is usually done
             // proactively in [`Self::process_results`], but we do it here to handle the edge
             // case where a reconcile completes while someone else is holding an op lock for the tenant.
@@ -8735,14 +8736,10 @@
 
         metrics::METRICS_REGISTRY
             .metrics_group
-            .storage_controller_keep_failing_reconciles
-            .set(keep_failing_reconciles as i64);
+            .storage_controller_stuck_reconciles
+            .set(stuck_reconciles as i64);
 
-        ReconcileAllResult::new(
-            spawned_reconciles,
-            keep_failing_reconciles,
-            has_delayed_reconciles,
-        )
+        ReconcileAllResult::new(spawned_reconciles, stuck_reconciles, has_delayed_reconciles)
     }
 
     /// `optimize` in this context means identifying shards which have valid scheduled locations, but
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 99079c57b0..05de155963 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -131,14 +131,16 @@ pub(crate) struct TenantShard {
     #[serde(serialize_with = "read_last_error")]
     pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
 
-    /// Number of consecutive reconciliation errors that have occurred for this shard.
+    /// Number of consecutive [`crate::service::Service::reconcile_all`] iterations that have
+    /// scheduled a reconciliation for this shard.
     ///
-    /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard
-    /// will be countered as keep-failing in `reconcile_all` calculations. This will lead to
-    /// allowing optimizations to run even with some failing shards.
+    /// If this reaches `MAX_CONSECUTIVE_RECONCILES`, the shard is considered "stuck" and will be
+    /// ignored when deciding whether optimizations can run. This includes both successful and failed
+    /// reconciliations.
     ///
-    /// The counter is reset to 0 after a successful reconciliation.
-    pub(crate) consecutive_errors_count: usize,
+    /// Incremented in [`crate::service::Service::process_results`], and reset to 0 when
+    /// [`crate::service::Service::reconcile_all`] determines no reconciliation is needed for this shard.
+    pub(crate) consecutive_reconciles_count: usize,
 
     /// If we have a pending compute notification that for some reason we weren't able to send,
     /// set this to true. 
If this is set, calls to [`Self::get_reconcile_needed`] will return Yes @@ -603,7 +605,7 @@ impl TenantShard { waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), - consecutive_errors_count: 0, + consecutive_reconciles_count: 0, pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), preferred_node: None, @@ -1908,7 +1910,7 @@ impl TenantShard { waiter: Arc::new(SeqWait::new(Sequence::initial())), error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), - consecutive_errors_count: 0, + consecutive_reconciles_count: 0, pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d1e9bbd7dc..fbdb14b6bb 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -996,7 +996,7 @@ def test_storage_controller_compute_hook_retry( @run_only_on_default_postgres("postgres behavior is not relevant") -def test_storage_controller_compute_hook_keep_failing( +def test_storage_controller_compute_hook_stuck_reconciles( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address: ListenAddress, @@ -1046,7 +1046,7 @@ def test_storage_controller_compute_hook_keep_failing( env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*") - env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*") + env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*") env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"}) # Migrate all allowed tenant shards to the first alive pageserver @@ -1061,7 +1061,7 @@ def test_storage_controller_compute_hook_keep_failing( # Make some reconcile_all calls to trigger optimizations # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS - RECONCILE_COUNT = 12 + RECONCILE_COUNT = 20 for i in range(RECONCILE_COUNT): try: n = env.storage_controller.reconcile_all() @@ -1074,6 +1074,8 @@ def test_storage_controller_compute_hook_keep_failing( assert banned_descr["shards"][0]["is_pending_compute_notification"] is True time.sleep(2) + env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*") + # Check that the allowed tenant shards are optimized due to affinity rules locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"] not_optimized_shard_count = 0 From b309cbc6e9d2cf540f5c081969bbc62f34351f73 Mon Sep 17 00:00:00 2001 From: quantumish Date: Thu, 17 Jul 2025 10:40:53 -0700 Subject: [PATCH 17/39] Add resizable hashmap and RwLock implementations to `neon-shmem` (#12596) Second PR for the hashmap behind the updated LFC implementation ([see first here](https://github.com/neondatabase/neon/pull/12595)). This only adds the raw code for the hashmap/lock implementations and doesn't plug it into the crate (that's dependent on the previous PR and should probably be done when the full integration into the new communicator is merged alongside `communicator-rewrite` changes?). 
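
For orientation, the lookup path that falls out of this layout can be condensed as follows. This is a simplified sketch of the `CoreHashMap::get_with_hash` function added in `libs/neon-shmem/src/hash/core.rs` below; the free-standing function signature is just for exposition:

```rust
// Each bucket stores an optional key-value pair plus the index of the next
// bucket in its chain: a hash chain when occupied, the freelist when empty.
struct Bucket<K, V> {
    next: u32,
    inner: Option<(K, V)>,
}

const INVALID_POS: u32 = u32::MAX;

// Lookup: pick a dictionary slot from the hash, then walk the bucket chain.
fn get<'a, K: PartialEq, V>(
    dictionary: &[u32],
    buckets: &'a [Bucket<K, V>],
    hash: u64,
    key: &K,
) -> Option<&'a V> {
    let mut next = dictionary[hash as usize % dictionary.len()];
    while next != INVALID_POS {
        let bucket = &buckets[next as usize];
        match &bucket.inner {
            Some((k, v)) if k == key => return Some(v),
            _ => next = bucket.next,
        }
    }
    None
}
```

Because only the dictionary is rebuilt on resize, the `(K, V)` payloads stay put, which is what lets callers hold long-lived references into the map.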
Some high level details: the communicator codebase expects to be able to store references to entries within this hashmap for arbitrary periods of time and so the hashmap cannot be allowed to move them during a rehash. As a result, this implementation has a slightly unusual structure where key-value pairs (and hash chains) are allocated in a separate region with a freelist. The core hashmap structure is then an array of "dictionary entries" that are just indexes into this region of key-value pairs. Concurrency support is very naive at the moment with the entire map guarded by one big `RwLock` (which is implemented on top of a `pthread_rwlock_t` since Rust doesn't guarantee that a `std::sync::RwLock` is safe to use in shared memory). This (along with a lot of other things) is being changed on the `quantumish/lfc-resizable-map` branch. --- Cargo.lock | 90 ++++- Cargo.toml | 3 +- libs/neon-shmem/Cargo.toml | 7 + libs/neon-shmem/src/hash.rs | 583 ++++++++++++++++++++++++++++++ libs/neon-shmem/src/hash/core.rs | 174 +++++++++ libs/neon-shmem/src/hash/entry.rs | 130 +++++++ libs/neon-shmem/src/hash/tests.rs | 428 ++++++++++++++++++++++ libs/neon-shmem/src/lib.rs | 2 + libs/neon-shmem/src/sync.rs | 111 ++++++ 9 files changed, 1522 insertions(+), 6 deletions(-) create mode 100644 libs/neon-shmem/src/hash.rs create mode 100644 libs/neon-shmem/src/hash/core.rs create mode 100644 libs/neon-shmem/src/hash/entry.rs create mode 100644 libs/neon-shmem/src/hash/tests.rs create mode 100644 libs/neon-shmem/src/sync.rs diff --git a/Cargo.lock b/Cargo.lock index 215b3360bc..137b883a6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2534,6 +2534,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", +] + [[package]] name = "gettid" version = "0.1.3" @@ -3607,9 +3619,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" dependencies = [ "autocfg", "scopeguard", @@ -3759,7 +3771,7 @@ dependencies = [ "procfs", "prometheus", "rand 0.8.5", - "rand_distr", + "rand_distr 0.4.3", "twox-hash", ] @@ -3847,7 +3859,12 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" name = "neon-shmem" version = "0.1.0" dependencies = [ + "libc", + "lock_api", "nix 0.30.1", + "rand 0.9.1", + "rand_distr 0.5.1", + "rustc-hash 2.1.1", "tempfile", "thiserror 1.0.69", "workspace_hack", @@ -5348,7 +5365,7 @@ dependencies = [ "postgres_backend", "pq_proto", "rand 0.8.5", - "rand_distr", + "rand_distr 0.4.3", "rcgen", "redis", "regex", @@ -5359,7 +5376,7 @@ dependencies = [ "reqwest-tracing", "rsa", "rstest", - "rustc-hash 1.1.0", + "rustc-hash 2.1.1", "rustls 0.23.27", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", @@ -5452,6 +5469,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "rand" version = "0.7.3" @@ -5476,6 +5499,16 @@ 
dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", +] + [[package]] name = "rand_chacha" version = "0.2.2" @@ -5496,6 +5529,16 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", +] + [[package]] name = "rand_core" version = "0.5.1" @@ -5514,6 +5557,15 @@ dependencies = [ "getrandom 0.2.11", ] +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.3", +] + [[package]] name = "rand_distr" version = "0.4.3" @@ -5524,6 +5576,16 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "rand_distr" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" +dependencies = [ + "num-traits", + "rand 0.9.1", +] + [[package]] name = "rand_hc" version = "0.2.0" @@ -8351,6 +8413,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasite" version = "0.1.0" @@ -8708,6 +8779,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index df2064a4a7..6d91262882 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -130,6 +130,7 @@ jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] } jsonwebtoken = "9" lasso = "0.7" libc = "0.2" +lock_api = "0.4.13" md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } @@ -165,7 +166,7 @@ reqwest-middleware = "0.4" reqwest-retry = "0.7" routerify = "3" rpds = "0.13" -rustc-hash = "1.1.0" +rustc-hash = "2.1.1" rustls = { version = "0.23.16", default-features = false } rustls-pemfile = "2" rustls-pki-types = "1.11" diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml index 2a636bec40..7ed991502e 100644 --- a/libs/neon-shmem/Cargo.toml +++ b/libs/neon-shmem/Cargo.toml @@ -8,6 +8,13 @@ license.workspace = true thiserror.workspace = true nix.workspace=true workspace_hack = { version = "0.1", path = "../../workspace_hack" } +libc.workspace = true +lock_api.workspace = true +rustc-hash.workspace = true [target.'cfg(target_os = "macos")'.dependencies] tempfile = "3.14.0" + +[dev-dependencies] +rand = "0.9" +rand_distr = "0.5.1" diff --git a/libs/neon-shmem/src/hash.rs b/libs/neon-shmem/src/hash.rs new file mode 100644 index 
0000000000..58726b9ba3 --- /dev/null +++ b/libs/neon-shmem/src/hash.rs @@ -0,0 +1,583 @@ +//! Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array). +//! +//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the +//! bucket array contains a `Option<(K, V)>` and an index of another bucket. In this way there is both an +//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash +//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash). +//! +//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash- +//! dependent component is done with the dictionary. When a new key is inserted into the map, a position +//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based +//! off of the freelist, and then the index of said bucket is placed in the dictionary. +//! +//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen +//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the +//! dictionary by rehashing all keys. +//! +//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock. + +use std::hash::{BuildHasher, Hash}; +use std::mem::MaybeUninit; + +use crate::shmem::ShmemHandle; +use crate::{shmem, sync::*}; + +mod core; +pub mod entry; + +#[cfg(test)] +mod tests; + +use core::{Bucket, CoreHashMap, INVALID_POS}; +use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry}; + +use thiserror::Error; + +/// Error type for a hashmap shrink operation. +#[derive(Error, Debug)] +pub enum HashMapShrinkError { + /// There was an error encountered while resizing the memory area. + #[error("shmem resize failed: {0}")] + ResizeError(shmem::Error), + /// Occupied entries in to-be-shrunk space were encountered beginning at the given index. + #[error("occupied entry in deallocated space found at {0}")] + RemainingEntries(usize), +} + +/// This represents a hash table that (possibly) lives in shared memory. +/// If a new process is launched with fork(), the child process inherits +/// this struct. +#[must_use] +pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> { + shmem_handle: Option, + shared_ptr: *mut HashMapShared<'a, K, V>, + shared_size: usize, + hasher: S, + num_buckets: u32, +} + +/// This is a per-process handle to a hash table that (possibly) lives in shared memory. +/// If a child process is launched with fork(), the child process should +/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader(). +/// +/// XXX: We're not making use of it at the moment, but this struct could +/// hold process-local information in the future. +pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> { + shmem_handle: Option, + shared_ptr: *mut HashMapShared<'a, K, V>, + hasher: S, +} + +unsafe impl Sync for HashMapAccess<'_, K, V, S> {} +unsafe impl Send for HashMapAccess<'_, K, V, S> {} + +impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> { + /// Change the 'hasher' used by the hash table. + /// + /// NOTE: This must be called right after creating the hash table, + /// before inserting any entries and before calling attach_writer/reader. + /// Otherwise different accessors could be using different hash function, + /// with confusing results. 
+ pub fn with_hasher(self, hasher: T) -> HashMapInit<'a, K, V, T> { + HashMapInit { + hasher, + shmem_handle: self.shmem_handle, + shared_ptr: self.shared_ptr, + shared_size: self.shared_size, + num_buckets: self.num_buckets, + } + } + + /// Loosely (over)estimate the size needed to store a hash table with `num_buckets` buckets. + pub fn estimate_size(num_buckets: u32) -> usize { + // add some margin to cover alignment etc. + CoreHashMap::::estimate_size(num_buckets) + size_of::>() + 1000 + } + + fn new( + num_buckets: u32, + shmem_handle: Option, + area_ptr: *mut u8, + area_size: usize, + hasher: S, + ) -> Self { + let mut ptr: *mut u8 = area_ptr; + let end_ptr: *mut u8 = unsafe { ptr.add(area_size) }; + + // carve out area for the One Big Lock (TM) and the HashMapShared. + ptr = unsafe { ptr.add(ptr.align_offset(align_of::())) }; + let raw_lock_ptr = ptr; + ptr = unsafe { ptr.add(size_of::()) }; + ptr = unsafe { ptr.add(ptr.align_offset(align_of::>())) }; + let shared_ptr: *mut HashMapShared = ptr.cast(); + ptr = unsafe { ptr.add(size_of::>()) }; + + // carve out the buckets + ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::>())) }; + let buckets_ptr = ptr; + ptr = unsafe { ptr.add(size_of::>() * num_buckets as usize) }; + + // use remaining space for the dictionary + ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::())) }; + assert!(ptr.addr() < end_ptr.addr()); + let dictionary_ptr = ptr; + let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::() as isize }; + assert!(dictionary_size > 0); + + let buckets = + unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }; + let dictionary = unsafe { + std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize) + }; + + let hashmap = CoreHashMap::new(buckets, dictionary); + unsafe { + let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap); + std::ptr::write(shared_ptr, lock); + } + + Self { + num_buckets, + shmem_handle, + shared_ptr, + shared_size: area_size, + hasher, + } + } + + /// Attach to a hash table for writing. + pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> { + HashMapAccess { + shmem_handle: self.shmem_handle, + shared_ptr: self.shared_ptr, + hasher: self.hasher, + } + } + + /// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`]. + /// + /// This is a holdover from a previous implementation and is being kept around for + /// backwards compatibility reasons. + pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> { + self.attach_writer() + } +} + +/// Hash table data that is actually stored in the shared memory area. +/// +/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table +/// relies on the memory layout! The data structures are laid out in the contiguous shared memory +/// area as follows: +/// +/// [`libc::pthread_rwlock_t`] +/// [`HashMapShared`] +/// buckets +/// dictionary +/// +/// In between the above parts, there can be padding bytes to align the parts correctly. +type HashMapShared<'a, K, V> = RwLock>; + +impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher> +where + K: Clone + Hash + Eq, +{ + /// Place the hash table within a user-supplied fixed memory area. 
+    pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
+        Self::new(
+            num_buckets,
+            None,
+            area.as_mut_ptr().cast(),
+            area.len(),
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Place a new hash map in the given shared memory area
+    ///
+    /// # Panics
+    /// Will panic on failure to resize area to expected map size.
+    pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self {
+        let size = Self::estimate_size(num_buckets);
+        shmem
+            .set_size(size)
+            .expect("could not resize shared memory area");
+        let ptr = shmem.data_ptr.as_ptr().cast();
+        Self::new(
+            num_buckets,
+            Some(shmem),
+            ptr,
+            size,
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Make a resizable hash map within a new shared memory area with the given name.
+    pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self {
+        let size = Self::estimate_size(num_buckets);
+        let max_size = Self::estimate_size(max_buckets);
+        let shmem =
+            ShmemHandle::new(name, size, max_size).expect("failed to make shared memory area");
+        let ptr = shmem.data_ptr.as_ptr().cast();
+
+        Self::new(
+            num_buckets,
+            Some(shmem),
+            ptr,
+            size,
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Make a resizable hash map within a new anonymous shared memory area.
+    pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+        static COUNTER: AtomicUsize = AtomicUsize::new(0);
+        let val = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let name = format!("neon_shmem_hmap{val}");
+        Self::new_resizeable_named(num_buckets, max_buckets, &name)
+    }
+}
+
+impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Hash a key using the map's hasher.
+    #[inline]
+    fn get_hash_value(&self, key: &K) -> u64 {
+        self.hasher.hash_one(key)
+    }
+
+    fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        let dict_pos = hash as usize % map.dictionary.len();
+        let first = map.dictionary[dict_pos];
+        if first == INVALID_POS {
+            // no existing entry
+            return Entry::Vacant(VacantEntry {
+                map,
+                key,
+                dict_pos: dict_pos as u32,
+            });
+        }
+
+        let mut prev_pos = PrevPos::First(dict_pos as u32);
+        let mut next = first;
+        loop {
+            let bucket = &mut map.buckets[next as usize];
+            let (bucket_key, _bucket_value) = bucket.inner.as_mut().expect("entry is in use");
+            if *bucket_key == key {
+                // found existing entry
+                return Entry::Occupied(OccupiedEntry {
+                    map,
+                    _key: key,
+                    prev_pos,
+                    bucket_pos: next,
+                });
+            }
+
+            if bucket.next == INVALID_POS {
+                // No existing entry
+                return Entry::Vacant(VacantEntry {
+                    map,
+                    key,
+                    dict_pos: dict_pos as u32,
+                });
+            }
+            prev_pos = PrevPos::Chained(next);
+            next = bucket.next;
+        }
+    }
+
+    /// Get a reference to the corresponding value for a key.
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
+        let hash = self.get_hash_value(key);
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        RwLockReadGuard::try_map(map, |m| m.get_with_hash(key, hash)).ok()
+    }
+
+    /// Get a reference to the entry containing a key.
+    ///
+    /// NB: This takes a write lock as there's no way to distinguish whether the intention
+    /// is to use the entry for reading or for writing in advance.
+    pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
+        let hash = self.get_hash_value(&key);
+        self.entry_with_hash(key, hash)
+    }
+
+    /// Remove a key from the map. Returns the associated value if it existed. 
+    pub fn remove(&self, key: &K) -> Option<V> {
+        let hash = self.get_hash_value(key);
+        match self.entry_with_hash(key.clone(), hash) {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None,
+        }
+    }
+
+    /// Insert/update a key. Returns the previous associated value if it existed.
+    ///
+    /// # Errors
+    /// Will return [`core::FullError`] if there is no more space left in the map.
+    pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
+        let hash = self.get_hash_value(&key);
+        match self.entry_with_hash(key.clone(), hash) {
+            Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
+            Entry::Vacant(e) => {
+                _ = e.insert(value)?;
+                Ok(None)
+            }
+        }
+    }
+
+    /// Optionally return the entry for a bucket at a given index if it exists.
+    ///
+    /// Has more overhead than one would intuitively expect: performs both a clone of the key
+    /// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
+    /// to enable repairing the hash chain if the entry is removed.
+    pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        if pos >= map.buckets.len() {
+            return None;
+        }
+
+        let entry = map.buckets[pos].inner.as_ref();
+        match entry {
+            Some((key, _)) => Some(OccupiedEntry {
+                _key: key.clone(),
+                bucket_pos: pos as u32,
+                prev_pos: entry::PrevPos::Unknown(self.get_hash_value(key)),
+                map,
+            }),
+            _ => None,
+        }
+    }
+
+    /// Returns the number of buckets in the table.
+    pub fn get_num_buckets(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        map.get_num_buckets()
+    }
+
+    /// Return the key and value stored in bucket with given index. This can be used to
+    /// iterate through the hash map.
+    // TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
+    // _slowly_ iterate through all buckets with its clock hand, without holding a lock.
+    // If we switch to an Iterator, it must not hold the lock.
+    pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<'_, (K, V)>> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        if pos >= map.buckets.len() {
+            return None;
+        }
+        RwLockReadGuard::try_map(map, |m| m.buckets[pos].inner.as_ref()).ok()
+    }
+
+    /// Returns the index of the bucket a given value corresponds to.
+    pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+
+        let origin = map.buckets.as_ptr();
+        let idx = (val_ptr as usize - origin as usize) / size_of::<Bucket<K, V>>();
+        assert!(idx < map.buckets.len());
+
+        idx
+    }
+
+    /// Returns the number of occupied buckets in the table.
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        map.buckets_in_use as usize
+    }
+
+    /// Clears all entries in a table. Does not reset any shrinking operations.
+    pub fn clear(&self) {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        map.clear();
+    }
+
+    /// Perform an in-place rehash of some region (0..`rehash_buckets`) of the table and reset
+    /// the `buckets` and `dictionary` slices to be as long as `num_buckets`. Resets the freelist
+    /// in the process. 
+    fn rehash_dict(
+        &self,
+        inner: &mut CoreHashMap<'a, K, V>,
+        buckets_ptr: *mut core::Bucket<K, V>,
+        end_ptr: *mut u8,
+        num_buckets: u32,
+        rehash_buckets: u32,
+    ) {
+        inner.free_head = INVALID_POS;
+
+        let buckets;
+        let dictionary;
+        unsafe {
+            let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
+            let dictionary_ptr: *mut u32 = buckets_end_ptr
+                .byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
+                .cast();
+            let dictionary_size: usize =
+                end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
+
+            buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
+            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
+        }
+        for e in dictionary.iter_mut() {
+            *e = INVALID_POS;
+        }
+
+        for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) {
+            if bucket.inner.is_none() {
+                bucket.next = inner.free_head;
+                inner.free_head = i as u32;
+                continue;
+            }
+
+            let hash = self.hasher.hash_one(&bucket.inner.as_ref().unwrap().0);
+            let pos: usize = (hash % dictionary.len() as u64) as usize;
+            bucket.next = dictionary[pos];
+            dictionary[pos] = i as u32;
+        }
+
+        inner.dictionary = dictionary;
+        inner.buckets = buckets;
+    }
+
+    /// Rehash the map without growing or shrinking.
+    pub fn shuffle(&self) {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        let num_buckets = map.get_num_buckets() as u32;
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        let end_ptr: *mut u8 = unsafe { self.shared_ptr.byte_add(size_bytes).cast() };
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+    }
+
+    /// Grow the number of buckets within the table.
+    ///
+    /// 1. Grows the underlying shared memory area
+    /// 2. Initializes new buckets and overwrites the current dictionary
+    /// 3. Rehashes the dictionary
+    ///
+    /// # Panics
+    /// Panics if called on a map initialized with [`HashMapInit::with_fixed`].
+    ///
+    /// # Errors
+    /// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
+    pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        let old_num_buckets = map.buckets.len() as u32;
+
+        assert!(
+            num_buckets >= old_num_buckets,
+            "grow called with a smaller number of buckets"
+        );
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("grow called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+        // Initialize new buckets. The new buckets are linked to the free list.
+        // NB: This overwrites the dictionary!
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        unsafe {
+            for i in old_num_buckets..num_buckets {
+                let bucket = buckets_ptr.add(i as usize);
+                bucket.write(core::Bucket {
+                    next: if i < num_buckets - 1 {
+                        i + 1
+                    } else {
+                        map.free_head
+                    },
+                    inner: None,
+                });
+            }
+        }
+
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets);
+        map.free_head = old_num_buckets;
+
+        Ok(())
+    }
+
+    /// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`.
+    ///
+    /// # Panics
+    /// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
+    /// greater than the number of buckets in the map. 
+    pub fn begin_shrink(&mut self, num_buckets: u32) {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        assert!(
+            num_buckets <= map.get_num_buckets() as u32,
+            "shrink called with a larger number of buckets"
+        );
+        _ = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+        map.alloc_limit = num_buckets;
+    }
+
+    /// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None.
+    pub fn shrink_goal(&self) -> Option<usize> {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap().read();
+        let goal = map.alloc_limit;
+        if goal == INVALID_POS {
+            None
+        } else {
+            Some(goal as usize)
+        }
+    }
+
+    /// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
+    ///
+    /// # Panics
+    /// The following cases result in a panic:
+    /// - Calling this function on a map initialized with [`HashMapInit::with_fixed`].
+    /// - Calling this function on a map when no shrink operation is in progress.
+    pub fn finish_shrink(&self) -> Result<(), HashMapShrinkError> {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        assert!(
+            map.alloc_limit != INVALID_POS,
+            "called finish_shrink when no shrink is in progress"
+        );
+
+        let num_buckets = map.alloc_limit;
+
+        if map.get_num_buckets() == num_buckets as usize {
+            return Ok(());
+        }
+
+        assert!(
+            map.buckets_in_use <= num_buckets,
+            "called finish_shrink before enough entries were removed"
+        );
+
+        for i in (num_buckets as usize)..map.buckets.len() {
+            if map.buckets[i].inner.is_some() {
+                return Err(HashMapShrinkError::RemainingEntries(i));
+            }
+        }
+
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        if let Err(e) = shmem_handle.set_size(size_bytes) {
+            return Err(HashMapShrinkError::ResizeError(e));
+        }
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+        map.alloc_limit = INVALID_POS;
+
+        Ok(())
+    }
+}
diff --git a/libs/neon-shmem/src/hash/core.rs b/libs/neon-shmem/src/hash/core.rs
new file mode 100644
index 0000000000..4665c36adb
--- /dev/null
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,174 @@
+//! Simple hash table with chaining.
+
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+
+use crate::hash::entry::*;
+
+/// Invalid position within the map (either within the dictionary or bucket array).
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+/// Fundamental storage unit within the hash table. Either empty or contains a key-value pair.
+/// Always part of a chain of some kind (either a freelist if empty or a hash chain if full).
+pub(crate) struct Bucket<K, V> {
+    /// Index of next bucket in the chain.
+    pub(crate) next: u32,
+    /// Key-value pair contained within bucket.
+    pub(crate) inner: Option<(K, V)>,
+}
+
+/// Core hash table implementation.
+pub(crate) struct CoreHashMap<'a, K, V> {
+    /// Dictionary used to map hashes to bucket indices.
+    pub(crate) dictionary: &'a mut [u32],
+    /// Buckets containing key-value pairs.
+    pub(crate) buckets: &'a mut [Bucket<K, V>],
+    /// Head of the freelist.
+    pub(crate) free_head: u32,
+    /// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
+    pub(crate) alloc_limit: u32,
+    /// The number of currently occupied buckets. 
+    pub(crate) buckets_in_use: u32,
+}
+
+/// Error for when there are no empty buckets left but one is needed.
+#[derive(Debug, PartialEq)]
+pub struct FullError;
+
+impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
+    const FILL_FACTOR: f32 = 0.60;
+
+    /// Estimate the size of data contained within the hash map.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let mut size = 0;
+
+        // buckets
+        size += size_of::<Bucket<K, V>>() * num_buckets as usize;
+
+        // dictionary
+        size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
+            as usize;
+
+        size
+    }
+
+    pub fn new(
+        buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
+        dictionary: &'a mut [MaybeUninit<u32>],
+    ) -> Self {
+        // Initialize the buckets
+        for i in 0..buckets.len() {
+            buckets[i].write(Bucket {
+                next: if i < buckets.len() - 1 {
+                    i as u32 + 1
+                } else {
+                    INVALID_POS
+                },
+                inner: None,
+            });
+        }
+
+        // Initialize the dictionary
+        for e in dictionary.iter_mut() {
+            e.write(INVALID_POS);
+        }
+
+        // TODO: use std::slice::assume_init_mut() once it stabilizes
+        let buckets =
+            unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
+        };
+
+        Self {
+            dictionary,
+            buckets,
+            free_head: 0,
+            buckets_in_use: 0,
+            alloc_limit: INVALID_POS,
+        }
+    }
+
+    /// Get the value associated with a key (if it exists) given its hash.
+    pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
+        let mut next = self.dictionary[hash as usize % self.dictionary.len()];
+        loop {
+            if next == INVALID_POS {
+                return None;
+            }
+
+            let bucket = &self.buckets[next as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                return Some(bucket_value);
+            }
+            next = bucket.next;
+        }
+    }
+
+    /// Get number of buckets in map.
+    pub fn get_num_buckets(&self) -> usize {
+        self.buckets.len()
+    }
+
+    /// Clears all entries from the hashmap.
+    ///
+    /// Does not reset any allocation limits, but does clear any entries beyond them.
+    pub fn clear(&mut self) {
+        for i in 0..self.buckets.len() {
+            self.buckets[i] = Bucket {
+                next: if i < self.buckets.len() - 1 {
+                    i as u32 + 1
+                } else {
+                    INVALID_POS
+                },
+                inner: None,
+            }
+        }
+        for i in 0..self.dictionary.len() {
+            self.dictionary[i] = INVALID_POS;
+        }
+
+        self.free_head = 0;
+        self.buckets_in_use = 0;
+    }
+
+    /// Find the position of an unused bucket via the freelist and initialize it.
+    pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
+        let mut pos = self.free_head;
+
+        // Find the first bucket we're *allowed* to use.
+        let mut prev = PrevPos::First(self.free_head);
+        while pos != INVALID_POS && pos >= self.alloc_limit {
+            let bucket = &mut self.buckets[pos as usize];
+            prev = PrevPos::Chained(pos);
+            pos = bucket.next;
+        }
+        if pos == INVALID_POS {
+            return Err(FullError);
+        }
+
+        // Repair the freelist.
+        match prev {
+            PrevPos::First(_) => {
+                let next_pos = self.buckets[pos as usize].next;
+                self.free_head = next_pos;
+            }
+            PrevPos::Chained(p) => {
+                if p != INVALID_POS {
+                    let next_pos = self.buckets[pos as usize].next;
+                    self.buckets[p as usize].next = next_pos;
+                }
+            }
+            _ => unreachable!(),
+        }
+
+        // Initialize the bucket. 
+        let bucket = &mut self.buckets[pos as usize];
+        self.buckets_in_use += 1;
+        bucket.next = INVALID_POS;
+        bucket.inner = Some((key, value));
+
+        Ok(pos)
+    }
+}
diff --git a/libs/neon-shmem/src/hash/entry.rs b/libs/neon-shmem/src/hash/entry.rs
new file mode 100644
index 0000000000..560a20db1d
--- /dev/null
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -0,0 +1,130 @@
+//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
+
+use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
+use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
+
+use std::hash::Hash;
+use std::mem;
+
+pub enum Entry<'a, 'b, K, V> {
+    Occupied(OccupiedEntry<'a, 'b, K, V>),
+    Vacant(VacantEntry<'a, 'b, K, V>),
+}
+
+/// Enum representing the previous position within a chain.
+#[derive(Clone, Copy)]
+pub(crate) enum PrevPos {
+    /// Starting index within the dictionary.
+    First(u32),
+    /// Regular index within the buckets.
+    Chained(u32),
+    /// Unknown - e.g. the associated entry was retrieved by index instead of chain.
+    Unknown(u64),
+}
+
+pub struct OccupiedEntry<'a, 'b, K, V> {
+    /// Mutable reference to the map containing this entry.
+    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+    /// The key of the occupied entry
+    pub(crate) _key: K,
+    /// The index of the previous entry in the chain.
+    pub(crate) prev_pos: PrevPos,
+    /// The position of the bucket in the [`CoreHashMap`] bucket array.
+    pub(crate) bucket_pos: u32,
+}
+
+impl<K, V> OccupiedEntry<'_, '_, K, V> {
+    pub fn get(&self) -> &V {
+        &self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_ref()
+            .unwrap()
+            .1
+    }
+
+    pub fn get_mut(&mut self) -> &mut V {
+        &mut self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_mut()
+            .unwrap()
+            .1
+    }
+
+    /// Inserts a value into the entry, replacing (and returning) the existing value.
+    pub fn insert(&mut self, value: V) -> V {
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        // This assumes inner is Some, which it must be for an OccupiedEntry
+        mem::replace(&mut bucket.inner.as_mut().unwrap().1, value)
+    }
+
+    /// Removes the entry from the hash map, returning the value originally stored within it.
+    ///
+    /// This may result in multiple bucket accesses if the entry was obtained by index as the
+    /// previous chain entry needs to be discovered in this case.
+    pub fn remove(mut self) -> V {
+        // If this bucket was queried by index, go ahead and follow its chain from the start.
+        let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
+            let dict_idx = hash as usize % self.map.dictionary.len();
+            let mut prev = PrevPos::First(dict_idx as u32);
+            let mut curr = self.map.dictionary[dict_idx];
+            while curr != self.bucket_pos {
+                assert!(curr != INVALID_POS);
+                prev = PrevPos::Chained(curr);
+                curr = self.map.buckets[curr as usize].next;
+            }
+            prev
+        } else {
+            self.prev_pos
+        };
+
+        // CoreHashMap::remove returns Option<(K, V)>. We know it's Some for an OccupiedEntry. 
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+
+        // unlink it from the chain
+        match prev {
+            PrevPos::First(dict_pos) => {
+                self.map.dictionary[dict_pos as usize] = bucket.next;
+            }
+            PrevPos::Chained(bucket_pos) => {
+                self.map.buckets[bucket_pos as usize].next = bucket.next;
+            }
+            _ => unreachable!(),
+        }
+
+        // and add it to the freelist
+        let free = self.map.free_head;
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        let old_value = bucket.inner.take();
+        bucket.next = free;
+        self.map.free_head = self.bucket_pos;
+        self.map.buckets_in_use -= 1;
+
+        old_value.unwrap().1
+    }
+}
+
+/// An abstract view into a vacant entry within the map.
+pub struct VacantEntry<'a, 'b, K, V> {
+    /// Mutable reference to the map containing this entry.
+    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+    /// The key to be inserted into this entry.
+    pub(crate) key: K,
+    /// The position within the dictionary corresponding to the key's hash.
+    pub(crate) dict_pos: u32,
+}
+
+impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
+    /// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
+    ///
+    /// # Errors
+    /// Will return [`FullError`] if there are no unoccupied buckets in the map.
+    pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
+        let pos = self.map.alloc_bucket(self.key, value)?;
+        self.map.buckets[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
+        self.map.dictionary[self.dict_pos as usize] = pos;
+
+        Ok(RwLockWriteGuard::map(self.map, |m| {
+            &mut m.buckets[pos as usize].inner.as_mut().unwrap().1
+        }))
+    }
+}
diff --git a/libs/neon-shmem/src/hash/tests.rs b/libs/neon-shmem/src/hash/tests.rs
new file mode 100644
index 0000000000..92233e8140
--- /dev/null
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,428 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::Debug;
+use std::mem::MaybeUninit;
+
+use crate::hash::Entry;
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::core::FullError;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts")
+        .attach_writer();
+
+    for (idx, k) in keys.iter().enumerate() {
+        let res = w.entry((*k).into());
+        match res {
+            Entry::Occupied(mut e) => {
+                e.insert(idx);
+            }
+            Entry::Vacant(e) => {
+                let res = e.insert(idx);
+                assert!(res.is_ok());
+            }
+        };
+    }
+
+    for (idx, k) in keys.iter().enumerate() {
+        let x = w.get(&(*k).into());
+        let value = x.as_deref().copied();
+        assert_eq!(value, Some(idx));
+    }
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // sparse 
keys
+    let mut keys: Vec<TestKey> = Vec::new();
+    let mut used_keys = HashSet::new();
+    for _ in 0..10000 {
+        loop {
+            let key = rand::random::<u128>();
+            if used_keys.contains(&key) {
+                continue;
+            }
+            used_keys.insert(key);
+            keys.push(key.into());
+            break;
+        }
+    }
+    test_inserts(&keys);
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+fn apply_op(
+    op: &TestOp,
+    map: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    // apply the change to the shadow tree first
+    let shadow_existing = if let Some(v) = op.1 {
+        shadow.insert(op.0, v)
+    } else {
+        shadow.remove(&op.0)
+    };
+
+    let entry = map.entry(op.0);
+    let hash_existing = match op.1 {
+        Some(new) => match entry {
+            Entry::Occupied(mut e) => Some(e.insert(new)),
+            Entry::Vacant(e) => {
+                _ = e.insert(new).unwrap();
+                None
+            }
+        },
+        None => match entry {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None,
+        },
+    };
+
+    assert_eq!(shadow_existing, hash_existing);
+}
+
+fn do_random_ops(
+    num_ops: usize,
+    size: u32,
+    del_prob: f64,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    rng: &mut rand::rngs::ThreadRng,
+) {
+    for i in 0..num_ops {
+        let key: TestKey = ((rng.next_u32() % size) as u128).into();
+        let op = TestOp(
+            key,
+            if rng.random_bool(del_prob) {
+                Some(i)
+            } else {
+                None
+            },
+        );
+        apply_op(&op, writer, shadow);
+    }
+}
+
+fn do_deletes(
+    num_ops: usize,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    for _ in 0..num_ops {
+        let (k, _) = shadow.pop_first().unwrap();
+        writer.remove(&k);
+    }
+}
+
+fn do_shrink(
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    from: u32,
+    to: u32,
+) {
+    assert!(writer.shrink_goal().is_none());
+    writer.begin_shrink(to);
+    assert_eq!(writer.shrink_goal(), Some(to as usize));
+    for i in to..from {
+        if let Some(entry) = writer.entry_at_bucket(i as usize) {
+            shadow.remove(&entry._key);
+            entry.remove();
+        }
+    }
+    let old_usage = writer.get_num_buckets_in_use();
+    writer.finish_shrink().unwrap();
+    assert!(writer.shrink_goal().is_none());
+    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
+}
+
+#[test]
+fn random_ops() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random")
+            .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let key: TestKey = (rng.sample(distribution) as u128).into();
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &mut writer, &mut shadow);
+    }
+}
+
+#[test]
+fn test_shuffle() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.shuffle();
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+#[test]
+fn test_grow() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    let old_usage = writer.get_num_buckets_in_use();
+    writer.grow(1500).unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
+    assert_eq!(writer.get_num_buckets(), 1500);
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+} 
+
+#[test]
+fn test_clear() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.clear();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+    assert_eq!(writer.get_num_buckets(), 1500);
+    while let Some((key, _)) = shadow.pop_first() {
+        assert!(writer.get(&key).is_none());
+    }
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    for i in 0..(1500 - writer.get_num_buckets_in_use()) {
+        writer.insert((1500 + i as u128).into(), 0).unwrap();
+    }
+    assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
+    writer.clear();
+    assert!(writer.insert(5000.into(), 0).is_ok());
+}
+
+#[test]
+fn test_idx_remove() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        if let Some(e) = writer.entry_at_bucket(idx) {
+            shadow.remove(&e._key);
+            e.remove();
+        }
+    }
+    while let Some((key, val)) = shadow.pop_first() {
+        assert_eq!(*writer.get(&key).unwrap(), val);
+    }
+}
+
+#[test]
+fn test_idx_get() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        if let Some(pair) = writer.get_at_bucket(idx) {
+            {
+                let v: *const usize = &pair.1;
+                assert_eq!(writer.get_bucket_for_value(v), idx);
+            }
+            {
+                let v: *const usize = &pair.1;
+                assert_eq!(writer.get_bucket_for_value(v), idx);
+            }
+        }
+    }
+}
+
+#[test]
+fn test_shrink() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    do_shrink(&mut writer, &mut shadow, 1500, 1000);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    do_deletes(500, &mut writer, &mut shadow);
+    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
+    assert!(writer.get_num_buckets_in_use() <= 1000);
+}
+
+#[test]
+fn test_shrink_grow_seq() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
+            .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 750");
+    do_shrink(&mut writer, &mut shadow, 1000, 750);
+    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 1500");
+    writer.grow(1500).unwrap();
+    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 200");
+    while shadow.len() > 100 {
+        do_deletes(1, &mut writer, &mut shadow);
+    }
+    do_shrink(&mut writer, &mut shadow, 1500, 200);
+    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 10k");
+    writer.grow(10000).unwrap();
+    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
+}
+
+#[test]
+fn 
test_bucket_ops() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops")
+        .attach_writer();
+    match writer.entry(1.into()) {
+        Entry::Occupied(mut e) => {
+            e.insert(2);
+        }
+        Entry::Vacant(e) => {
+            _ = e.insert(2).unwrap();
+        }
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    assert_eq!(*writer.get(&1.into()).unwrap(), 2);
+    let pos = match writer.entry(1.into()) {
+        Entry::Occupied(e) => {
+            assert_eq!(e._key, 1.into());
+            e.bucket_pos as usize
+        }
+        Entry::Vacant(_) => {
+            panic!("Insert didn't affect entry");
+        }
+    };
+    assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
+    assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2));
+    {
+        let ptr: *const usize = &*writer.get(&1.into()).unwrap();
+        assert_eq!(writer.get_bucket_for_value(ptr), pos);
+    }
+    writer.remove(&1.into());
+    assert!(writer.get(&1.into()).is_none());
+}
+
+#[test]
+fn test_shrink_zero() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
+            .attach_writer();
+    writer.begin_shrink(0);
+    for i in 0..1500 {
+        writer.entry_at_bucket(i).map(|x| x.remove());
+    }
+    writer.finish_shrink().unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+    let entry = writer.entry(1.into());
+    if let Entry::Vacant(v) = entry {
+        assert!(v.insert(2).is_err());
+    } else {
+        panic!("Somehow got non-vacant entry in empty map.")
+    }
+    writer.grow(50).unwrap();
+    let entry = writer.entry(1.into());
+    if let Entry::Vacant(v) = entry {
+        assert!(v.insert(2).is_ok());
+    } else {
+        panic!("Somehow got non-vacant entry in empty map.")
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+}
+
+#[test]
+#[should_panic]
+fn test_grow_oom() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
+        .attach_writer();
+    writer.grow(20000).unwrap();
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_bigger() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
+            .attach_writer();
+    writer.begin_shrink(2000);
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_early_finish() {
+    let writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish")
+            .attach_writer();
+    writer.finish_shrink().unwrap();
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_fixed_size() {
+    let mut area = [MaybeUninit::uninit(); 10000];
+    let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
+    let mut writer = init_struct.attach_writer();
+    writer.begin_shrink(1);
+}
diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs
index 50d3fbb3cf..226cc0c22d 100644
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1 +1,3 @@
+pub mod hash;
 pub mod shmem;
+pub mod sync;
diff --git a/libs/neon-shmem/src/sync.rs b/libs/neon-shmem/src/sync.rs
new file mode 100644
index 0000000000..95719778ba
--- /dev/null
+++ b/libs/neon-shmem/src/sync.rs
@@ -0,0 +1,111 @@
+//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory. 
+
+use std::mem::MaybeUninit;
+use std::ptr::NonNull;
+
+use nix::errno::Errno;
+
+pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
+pub type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
+pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
+pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
+pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
+
+/// Shared memory read-write lock.
+pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
+
+/// Simple macro that calls a function in the libc namespace and panics if return value is nonzero.
+macro_rules! libc_checked {
+    ($fn_name:ident ( $($arg:expr),* )) => {{
+        let res = libc::$fn_name($($arg),*);
+        if res != 0 {
+            panic!("{} failed with {}", stringify!($fn_name), Errno::from_raw(res));
+        }
+    }};
+}
+
+impl PthreadRwLock {
+    /// Creates a new `PthreadRwLock` on top of a pointer to a pthread rwlock.
+    ///
+    /// # Safety
+    /// `lock` must be non-null. Every unsafe operation will panic in the event of an error.
+    pub unsafe fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
+        unsafe {
+            let mut attrs = MaybeUninit::uninit();
+            libc_checked!(pthread_rwlockattr_init(attrs.as_mut_ptr()));
+            libc_checked!(pthread_rwlockattr_setpshared(
+                attrs.as_mut_ptr(),
+                libc::PTHREAD_PROCESS_SHARED
+            ));
+            libc_checked!(pthread_rwlock_init(lock, attrs.as_mut_ptr()));
+            // Safety: POSIX specifies that "any function affecting the attributes
+            // object (including destruction) shall not affect any previously
+            // initialized read-write locks".
+            libc_checked!(pthread_rwlockattr_destroy(attrs.as_mut_ptr()));
+            Self(Some(NonNull::new_unchecked(lock)))
+        }
+    }
+
+    fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
+        match self.0 {
+            None => {
+                panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
+            }
+            Some(x) => x,
+        }
+    }
+}
+
+unsafe impl lock_api::RawRwLock for PthreadRwLock {
+    type GuardMarker = lock_api::GuardSend;
+    const INIT: Self = Self(None);
+
+    fn try_lock_shared(&self) -> bool {
+        unsafe {
+            let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr());
+            match res {
+                0 => true,
+                libc::EAGAIN => false,
+                _ => panic!(
+                    "pthread_rwlock_tryrdlock failed with {}",
+                    Errno::from_raw(res)
+                ),
+            }
+        }
+    }
+
+    fn try_lock_exclusive(&self) -> bool {
+        unsafe {
+            let res = libc::pthread_rwlock_trywrlock(self.inner().as_ptr());
+            match res {
+                0 => true,
+                libc::EAGAIN => false,
+                _ => panic!("try_wrlock failed with {}", Errno::from_raw(res)),
+            }
+        }
+    }
+
+    fn lock_shared(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_rdlock(self.inner().as_ptr()));
+        }
+    }
+
+    fn lock_exclusive(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_wrlock(self.inner().as_ptr()));
+        }
+    }
+
+    unsafe fn unlock_exclusive(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
+        }
+    }
+
+    unsafe fn unlock_shared(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
+        }
+    }
+}

From 8b0f2efa573834a11f9bd01a673fac87970023fb Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Thu, 17 Jul 2025 18:58:47 +0100
Subject: [PATCH 18/39] experiment with an InfoMetrics metric family (#12612)

Putting this in the neon codebase for now, to experiment. Can be lifted
into measured at a later date.

This metric family is like a MetricVec, but it only supports 1 label
being set at a time. It is useful for reporting info, rather than
reporting metrics. 
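
A sketch of the intended usage, based on the `InfoMetric` and `BuildInfo` types in the diff below (the metric name and label values here are illustrative, not taken from the patch):

```rust
// The gauge value is fixed at 1; the label set carries the information,
// following the Prometheus "info metric" pattern, e.g.:
//   build_info{revision="abc123", build_tag="release-1234"} 1
let info = InfoMetric::new(BuildInfo {
    revision: "abc123",
    build_tag: "release-1234",
});

// Only one label set is reported at a time: set_label() replaces the
// whole set instead of adding a second time series, unlike a MetricVec.
info.set_label(BuildInfo {
    revision: "abc123",
    build_tag: "release-1235",
});
```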
https://www.robustperception.io/exposing-the-software-version-to-prometheus/
---
 libs/metrics/src/lib.rs | 68 ++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 5d028ee041..41873cdcd6 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -4,12 +4,14 @@
 //! a default registry.
 
 #![deny(clippy::undocumented_unsafe_blocks)]
 
+use std::sync::RwLock;
+
 use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels};
 use measured::metric::counter::CounterState;
 use measured::metric::gauge::GaugeState;
 use measured::metric::group::Encoding;
 use measured::metric::name::{MetricName, MetricNameEncoder};
-use measured::metric::{MetricEncoding, MetricFamilyEncoding};
+use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType};
 use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup};
 use once_cell::sync::Lazy;
 use prometheus::Registry;
@@ -116,12 +118,52 @@ pub fn pow2_buckets(start: usize, end: usize) -> Vec<usize> {
         .collect()
 }
 
+pub struct InfoMetric<L, M = GaugeState> {
+    label: RwLock<L>,
+    metric: M,
+}
+
+impl<L> InfoMetric<L> {
+    pub fn new(label: L) -> Self {
+        Self::with_metric(label, GaugeState::new(1))
+    }
+}
+
+impl<L, M: MetricType<Metadata = ()>> InfoMetric<L, M> {
+    pub fn with_metric(label: L, metric: M) -> Self {
+        Self {
+            label: RwLock::new(label),
+            metric,
+        }
+    }
+
+    pub fn set_label(&self, label: L) {
+        *self.label.write().unwrap() = label;
+    }
+}
+
+impl<L, M, E> MetricFamilyEncoding<E> for InfoMetric<L, M>
+where
+    L: LabelGroup,
+    M: MetricEncoding<E>,
+    E: Encoding,
+{
+    fn collect_family_into(
+        &self,
+        name: impl measured::metric::name::MetricNameEncoder,
+        enc: &mut E,
+    ) -> Result<(), E::Err> {
+        M::write_type(&name, enc)?;
+        self.metric
+            .collect_into(&(), &*self.label.read().unwrap(), name, enc)
+    }
+}
+
 pub struct BuildInfo {
     pub revision: &'static str,
     pub build_tag: &'static str,
 }
 
-// todo: allow label group without the set
 impl LabelGroup for BuildInfo {
     fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
         const REVISION: &LabelName = LabelName::from_str("revision");
@@ -131,24 +173,6 @@ impl LabelGroup for BuildInfo {
     }
 }
 
-impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
-where
-    GaugeState: MetricEncoding<T>,
-{
-    fn collect_family_into(
-        &self,
-        name: impl measured::metric::name::MetricNameEncoder,
-        enc: &mut T,
-    ) -> Result<(), T::Err> {
-        enc.write_help(&name, "Build/version information")?;
-        GaugeState::write_type(&name, enc)?;
-        GaugeState {
-            count: std::sync::atomic::AtomicI64::new(1),
-        }
-        .collect_into(&(), self, name, enc)
-    }
-}
-
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct NeonMetrics {
@@ -165,8 +189,8 @@ pub struct NeonMetrics {
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct LibMetrics {
-    #[metric(init = build_info)]
-    build_info: BuildInfo,
+    #[metric(init = InfoMetric::new(build_info))]
+    build_info: InfoMetric<BuildInfo>,
 
     #[metric(flatten)]
     rusage: Rusage,
From 29ee273d780e70471286ac9238c70894eba7b6e2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 17 Jul 2025 15:42:48 -0400
Subject: [PATCH 19/39] fix(storcon): correctly convert 404s for tenant passthrough requests (#12631)

## Problem

Follow up of https://github.com/neondatabase/neon/pull/12620

Discussions: https://databricks.slack.com/archives/C09254R641L/p1752677940697529

Both the original code and the code after the patch above convert 404s into
503s regardless of the type of 404. We should only do that for
tenant-not-found errors.
For other 404s like timeline not found, we should not prompt clients to retry. ## Summary of changes - Inspect the response body to figure out the type of 404. If it's a tenant not found error, return 503. - Otherwise, fallthrough and return 404 as-is. - Add `tenant_shard_remote_mutation` that manipulates a single shard. - Use `Service::tenant_shard_remote_mutation` for tenant shard passthrough requests. This prevents us from another race that the attach state changes within the request. (This patch mainly addresses the case that the tenant is "not yet attached"). - TODO: lease API is still using the old code path. We should refactor it to use `tenant_remote_mutation`. --------- Signed-off-by: Alex Chi Z --- storage_controller/src/http.rs | 148 +++++++++++------- storage_controller/src/service.rs | 110 ++++++++++--- test_runner/fixtures/pageserver/http.py | 3 +- .../regress/test_storage_controller.py | 103 +++++++++++- 4 files changed, 284 insertions(+), 80 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 5f9a1124de..6b6d081dcd 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -48,7 +48,10 @@ use crate::metrics::{ }; use crate::persistence::SafekeeperUpsert; use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service}; +use crate::service::{ + LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service, + TenantMutationLocations, +}; /// State available to HTTP request handlers pub struct HttpState { @@ -734,77 +737,104 @@ async fn handle_tenant_timeline_passthrough( path ); - // Find the node that holds shard zero - let (node, tenant_shard_id, consistent) = if tenant_or_shard_id.is_unsharded() { - service + let tenant_shard_id = if tenant_or_shard_id.is_unsharded() { + // If the request contains only tenant ID, find the node that holds shard zero + let (_, shard_id) = service .tenant_shard0_node(tenant_or_shard_id.tenant_id) - .await? + .await?; + shard_id } else { - let (node, consistent) = service.tenant_shard_node(tenant_or_shard_id).await?; - (node, tenant_or_shard_id, consistent) + tenant_or_shard_id }; - // Callers will always pass an unsharded tenant ID. Before proxying, we must - // rewrite this to a shard-aware shard zero ID. - let path = format!("{path}"); - let tenant_str = tenant_or_shard_id.tenant_id.to_string(); - let tenant_shard_str = format!("{tenant_shard_id}"); - let path = path.replace(&tenant_str, &tenant_shard_str); + let service_inner = service.clone(); - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_passthrough_request_latency; + service.tenant_shard_remote_mutation(tenant_shard_id, |locations| async move { + let TenantMutationLocations(locations) = locations; + if locations.is_empty() { + return Err(ApiError::NotFound(anyhow::anyhow!("Tenant {} not found", tenant_or_shard_id.tenant_id).into())); + } - let path_label = path_without_ids(&path) - .split('/') - .filter(|token| !token.is_empty()) - .collect::>() - .join("_"); - let labels = PageserverRequestLabelGroup { - pageserver_id: &node.get_id().to_string(), - path: &path_label, - method: crate::metrics::Method::Get, - }; + let (tenant_or_shard_id, locations) = locations.into_iter().next().unwrap(); + let node = locations.latest.node; - let _timer = latency.start_timer(labels.clone()); + // Callers will always pass an unsharded tenant ID. Before proxying, we must + // rewrite this to a shard-aware shard zero ID. 
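+        // For example, `/v1/tenant/{tenant_id}/timeline/{timeline_id}` becomes
+        // `/v1/tenant/{tenant_id}-{shard_slug}/timeline/{timeline_id}` (illustrative;
+        // the exact suffix comes from TenantShardId's Display impl).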
+        let path = format!("{path}");
+        let tenant_str = tenant_or_shard_id.tenant_id.to_string();
+        let tenant_shard_str = format!("{tenant_shard_id}");
+        let path = path.replace(&tenant_str, &tenant_shard_str);
 
-    let latency = &METRICS_REGISTRY
-        .metrics_group
-        .storage_controller_passthrough_request_latency;
+        let latency = &METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_passthrough_request_latency;
 
-    let path_label = path_without_ids(&path)
-        .split('/')
-        .filter(|token| !token.is_empty())
-        .collect::<Vec<_>>()
-        .join("_");
-    let labels = PageserverRequestLabelGroup {
-        pageserver_id: &node.get_id().to_string(),
-        path: &path_label,
-        method: crate::metrics::Method::Get,
-    };
+        let path_label = path_without_ids(&path)
+            .split('/')
+            .filter(|token| !token.is_empty())
+            .collect::<Vec<_>>()
+            .join("_");
+        let labels = PageserverRequestLabelGroup {
+            pageserver_id: &node.get_id().to_string(),
+            path: &path_label,
+            method: crate::metrics::Method::Get,
+        };
 
-    let _timer = latency.start_timer(labels.clone());
+        let _timer = latency.start_timer(labels.clone());
 
-    let client = mgmt_api::Client::new(
-        service.get_http_client().clone(),
-        node.base_url(),
-        service.get_config().pageserver_jwt_token.as_deref(),
-    );
-    let resp = client.op_raw(method, path).await.map_err(|e|
-        // We return 503 here because if we can't successfully send a request to the pageserver,
-        // either we aren't available or the pageserver is unavailable.
-        ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
-
-    if !resp.status().is_success() {
-        let error_counter = &METRICS_REGISTRY
+        let client = mgmt_api::Client::new(
+            service_inner.get_http_client().clone(),
+            node.base_url(),
+            service_inner.get_config().pageserver_jwt_token.as_deref(),
+        );
+        let resp = client.op_raw(method, path).await.map_err(|e|
+            // We return 503 here because if we can't successfully send a request to the pageserver,
+            // either we aren't available or the pageserver is unavailable.
+            ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
 
+        if !resp.status().is_success() {
+            let error_counter = &METRICS_REGISTRY
                 .metrics_group
-            .storage_controller_passthrough_request_error;
-        error_counter.inc(labels);
-    }
+                .storage_controller_passthrough_request_error;
+            error_counter.inc(labels);
+        }
+        let resp_status = resp.status();
+
+        // We have a reqwest::Response, would like a http::Response
+        let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp_status)?);
+        for (k, v) in resp.headers() {
+            builder = builder.header(k.as_str(), v.as_bytes());
+        }
+        let resp_bytes = resp
+            .bytes()
+            .await
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+        // Inspect 404 errors: at this point, we know that the tenant exists, but the pageserver we route
+        // the request to might not yet be ready. Therefore, if it is a _tenant_ not found error, we can
+        // convert it into a 503. TODO: we should make this part of the check in `tenant_shard_remote_mutation`.
+        // However, `tenant_shard_remote_mutation` currently cannot inspect the HTTP error response body,
+        // so we have to do it here instead.
+        if resp_status == reqwest::StatusCode::NOT_FOUND {
+            let resp_str = std::str::from_utf8(&resp_bytes)
+                .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            // We only handle "tenant not found" errors; other 404s like timeline not found should
+            // be forwarded as-is.
+            if resp_str.contains(&format!("tenant {tenant_or_shard_id}")) {
+                // Rather than retry here, send the client a 503 to prompt a retry: this matches
+                // the pageserver's use of 503, and all clients calling this API should retry on 503.
+                return Err(ApiError::ResourceUnavailable(
+                    format!(
+                        "Pageserver {node} returned tenant 404 due to ongoing migration, retry later"
+                    )
+                    .into(),
+                ));
+            }
+        }
+        let response = builder
+            .body(Body::from(resp_bytes))
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+        Ok(response)
+    }).await?
 }
 
 async fn handle_tenant_locate(
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ec3b419437..a1ff9b3c61 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -719,19 +719,19 @@ pub(crate) enum ReconcileResultRequest {
 }
 
 #[derive(Clone)]
-struct MutationLocation {
-    node: Node,
-    generation: Generation,
+pub(crate) struct MutationLocation {
+    pub(crate) node: Node,
+    pub(crate) generation: Generation,
 }
 
 #[derive(Clone)]
-struct ShardMutationLocations {
-    latest: MutationLocation,
-    other: Vec<MutationLocation>,
+pub(crate) struct ShardMutationLocations {
+    pub(crate) latest: MutationLocation,
+    pub(crate) other: Vec<MutationLocation>,
 }
 
 #[derive(Default, Clone)]
-struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
+pub(crate) struct TenantMutationLocations(pub BTreeMap<TenantShardId, ShardMutationLocations>);
 
 struct ReconcileAllResult {
     spawned_reconciles: usize,
@@ -763,6 +763,29 @@ impl ReconcileAllResult {
     }
 }
 
+enum TenantIdOrShardId {
+    TenantId(TenantId),
+    TenantShardId(TenantShardId),
+}
+
+impl TenantIdOrShardId {
+    fn tenant_id(&self) -> TenantId {
+        match self {
+            TenantIdOrShardId::TenantId(tenant_id) => *tenant_id,
+            TenantIdOrShardId::TenantShardId(tenant_shard_id) => tenant_shard_id.tenant_id,
+        }
+    }
+
+    fn matches(&self, tenant_shard_id: &TenantShardId) -> bool {
+        match self {
+            TenantIdOrShardId::TenantId(tenant_id) => tenant_shard_id.tenant_id == *tenant_id,
+            TenantIdOrShardId::TenantShardId(this_tenant_shard_id) => {
+                this_tenant_shard_id == tenant_shard_id
+            }
+        }
+    }
+}
+
 impl Service {
     pub fn get_config(&self) -> &Config {
         &self.config
@@ -4814,6 +4837,12 @@ impl Service {
         }
     }
 
+        if targets.is_empty() {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
+            ));
+        }
+
         Ok(TenantShardAttachState {
             targets,
             by_node_id,
@@ -5040,11 +5069,37 @@ impl Service {
     /// - Looks up the shards and the nodes where they were most recently attached
     /// - Guarantees that after the inner function returns, the shards' generations haven't moved on: this
    ///    ensures that the remote operation acted on the most recent generation, and is therefore durable.
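+    ///
+    /// Illustrative call shape (a sketch only; `svc` and `talk_to_pageserver`
+    /// are placeholders, not names from this codebase):
+    ///
+    /// ```ignore
+    /// let out = svc
+    ///     .tenant_shard_remote_mutation(tenant_shard_id, |locations| async move {
+    ///         // locations.0 maps TenantShardId -> ShardMutationLocations;
+    ///         // locations.0[&tenant_shard_id].latest.node is the attached pageserver.
+    ///         talk_to_pageserver(locations).await
+    ///     })
+    ///     .await?;
+    /// ```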
-    async fn tenant_remote_mutation(
+    pub(crate) async fn tenant_remote_mutation<R, O, F>(
         &self,
         tenant_id: TenantId,
         op: O,
     ) -> Result<R, ApiError>
+    where
+        O: FnOnce(TenantMutationLocations) -> F,
+        F: std::future::Future<Output = R>,
+    {
+        self.tenant_remote_mutation_inner(TenantIdOrShardId::TenantId(tenant_id), op)
+            .await
+    }
+
+    pub(crate) async fn tenant_shard_remote_mutation<R, O, F>(
+        &self,
+        tenant_shard_id: TenantShardId,
+        op: O,
+    ) -> Result<R, ApiError>
+    where
+        O: FnOnce(TenantMutationLocations) -> F,
+        F: std::future::Future<Output = R>,
+    {
+        self.tenant_remote_mutation_inner(TenantIdOrShardId::TenantShardId(tenant_shard_id), op)
+            .await
+    }
+
+    async fn tenant_remote_mutation_inner<R, O, F>(
+        &self,
+        tenant_id_or_shard_id: TenantIdOrShardId,
+        op: O,
+    ) -> Result<R, ApiError>
     where
         O: FnOnce(TenantMutationLocations) -> F,
        F: std::future::Future<Output = R>,
@@ -5056,7 +5111,13 @@ impl Service {
         // run concurrently with reconciliations, and it is not guaranteed that the node we find here
         // will still be the latest when we're done: we will check generations again at the end of
         // this function to handle that.
-        let generations = self.persistence.tenant_generations(tenant_id).await?;
+        let generations = self
+            .persistence
+            .tenant_generations(tenant_id_or_shard_id.tenant_id())
+            .await?
+            .into_iter()
+            .filter(|i| tenant_id_or_shard_id.matches(&i.tenant_shard_id))
+            .collect::<Vec<_>>();
 
         if generations
             .iter()
@@ -5070,9 +5131,14 @@
             // One or more shards have not been attached to a pageserver. Check if this is because it's configured
             // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry)
             let locked = self.inner.read().unwrap();
-            for (shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
+            let tenant_shards = locked
+                .tenants
+                .range(TenantShardId::tenant_range(
+                    tenant_id_or_shard_id.tenant_id(),
+                ))
+                .filter(|(shard_id, _)| tenant_id_or_shard_id.matches(shard_id))
+                .collect::<Vec<_>>();
+            for (shard_id, shard) in tenant_shards {
                 match shard.policy {
                     PlacementPolicy::Attached(_) => {
                         // This shard is meant to be attached: the caller is not wrong to try and
@@ -5182,7 +5248,14 @@
         // Post-check: are all the generations of all the shards the same as they were initially? This proves that
         // our remote operation executed on the latest generation and is therefore persistent.
         {
-            let latest_generations = self.persistence.tenant_generations(tenant_id).await?;
+            let latest_generations = self
+                .persistence
+                .tenant_generations(tenant_id_or_shard_id.tenant_id())
+                .await?
+                .into_iter()
+                .filter(|i| tenant_id_or_shard_id.matches(&i.tenant_shard_id))
+                .collect::<Vec<_>>();
+
             if latest_generations
                 .into_iter()
                 .map(
@@ -5316,7 +5389,7 @@
     pub(crate) async fn tenant_shard0_node(
         &self,
         tenant_id: TenantId,
-    ) -> Result<(Node, TenantShardId, bool), ApiError> {
+    ) -> Result<(Node, TenantShardId), ApiError> {
         let tenant_shard_id = {
             let locked = self.inner.read().unwrap();
             let Some((tenant_shard_id, _shard)) = locked
@@ -5334,7 +5407,7 @@
 
         self.tenant_shard_node(tenant_shard_id)
             .await
-            .map(|(node, consistent)| (node, tenant_shard_id, consistent))
+            .map(|node| (node, tenant_shard_id))
     }
 
     /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this
@@ -5344,7 +5417,7 @@
     pub(crate) async fn tenant_shard_node(
        &self,
         tenant_shard_id: TenantShardId,
-    ) -> Result<(Node, bool), ApiError> {
+    ) -> Result<Node, ApiError> {
        // Look up in-memory state and maybe use the node from there.
{ let locked = self.inner.read().unwrap(); @@ -5374,8 +5447,7 @@ impl Service { "Shard refers to nonexistent node" ))); }; - let consistent = self.is_observed_consistent_with_intent(shard, *intent_node_id); - return Ok((node.clone(), consistent)); + return Ok(node.clone()); } }; @@ -5410,7 +5482,7 @@ impl Service { ))); }; // As a reconciliation is in flight, we do not have the observed state yet, and therefore we assume it is always inconsistent. - Ok((node.clone(), false)) + Ok(node.clone()) } pub(crate) fn tenant_locate( diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 23b9d1c8c9..f95b0ee4d1 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -847,7 +847,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_lsn_lease( - self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn, **kwargs ): data = { "lsn": str(lsn), @@ -857,6 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", json=data, + **kwargs, ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index fbdb14b6bb..9986c1f24a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING import fixtures.utils import pytest from fixtures.auth_tokens import TokenScope -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_AZ_ID, @@ -47,6 +47,7 @@ from fixtures.utils import ( wait_until, ) from fixtures.workload import Workload +from requests.adapters import HTTPAdapter from urllib3 import Retry from werkzeug.wrappers.response import Response @@ -4858,3 +4859,103 @@ def test_storage_controller_migrate_with_pageserver_restart( "shards": [{"node_id": int(secondary.id), "shard_number": 0}], "preferred_az": DEFAULT_AZ_ID, } + + +@run_only_on_default_postgres("PG version is not important for this test") +def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder): + """ + Ensures that the storage controller correctly forwards 404s and converts some of them + into 503s before forwarding to the client. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.num_azs = 2 + + env = neon_env_builder.init_start() + env.storage_controller.allowed_errors.append(".*Reconcile error.*") + env.storage_controller.allowed_errors.append(".*Timed out.*") + + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + # 404s on tenants and timelines are forwarded as-is when reconciler is not running. 
+ + # Access a non-existing timeline -> 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, TimelineId.generate() + ) + assert e.value.status_code == 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, TimelineId.generate(), Lsn(0) + ) + assert e.value.status_code == 404 + + # Access a non-existing tenant when reconciler is not running -> 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_detail( + TenantId.generate(), env.initial_timeline + ) + assert e.value.status_code == 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_lsn_lease( + TenantId.generate(), env.initial_timeline, Lsn(0) + ) + assert e.value.status_code == 404 + + # Normal requests should succeed + detail = env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, env.initial_timeline + ) + last_record_lsn = Lsn(detail["last_record_lsn"]) + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, last_record_lsn + ) + + # Get into a situation where the intent state is not the same as the observed state. + describe = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + current_primary = describe["node_attached"] + current_secondary = describe["node_secondary"][0] + assert current_primary != current_secondary + + # Pause the reconciler so that the generation number won't be updated. + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "pause") + ) + + # Do the migration in another thread; the request will be dropped as we don't wait. + shard_zero = TenantShardId(env.initial_tenant, 0, 0) + concurrent.futures.ThreadPoolExecutor(max_workers=1).submit( + env.storage_controller.tenant_shard_migrate, + shard_zero, + current_secondary, + StorageControllerMigrationConfig(override_scheduler=True), + ) + # Not the best way to do this, we should wait until the migration gets started. + time.sleep(1) + placement = env.storage_controller.get_tenants_placement()[str(shard_zero)] + assert placement["observed"] != placement["intent"] + assert placement["observed"]["attached"] == current_primary + assert placement["intent"]["attached"] == current_secondary + + # Now we issue requests that would cause 404 again + retry_strategy = Retry(total=0) + adapter = HTTPAdapter(max_retries=retry_strategy) + + no_retry_api = env.storage_controller.pageserver_api() + no_retry_api.mount("http://", adapter) + no_retry_api.mount("https://", adapter) + + # As intent state != observed state, tenant not found error should return 503, + # so that the client can retry once we've successfully migrated. 
+    with pytest.raises(PageserverApiException) as e:
+        no_retry_api.timeline_detail(env.initial_tenant, TimelineId.generate())
+    assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
+    with pytest.raises(PageserverApiException) as e:
+        no_retry_api.timeline_lsn_lease(env.initial_tenant, TimelineId.generate(), Lsn(0))
+    assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
+
+    # Unblock reconcile operations
+    env.storage_controller.configure_failpoints(
+        ("reconciler-live-migrate-post-generation-inc", "off")
+    )
From 43fd5b218b28267aa500de9907a2bcfc325f3eb1 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik 
Date: Thu, 17 Jul 2025 23:20:38 +0300
Subject: [PATCH 20/39] Refactor shmem initialization in Neon extension
 (#12630)

## Problem

Initializing shared memory in an extension is complex and non-portable.
In the neon extension this boilerplate code is duplicated in several files.

## Summary of changes

Perform all initialization in one place - neon.c.
All other modules provide *ShmemRequest() and *ShmemInit() functions,
which are called from neon.c.

---------

Co-authored-by: Kosntantin Knizhnik 
Co-authored-by: Heikki Linnakangas 
---
 pgxn/neon/file_cache.c         |  45 ++++-----------
 pgxn/neon/libpagestore.c       |  54 ++----------------
 pgxn/neon/neon.c               | 101 ++++++++++++++++++++++++++++-----
 pgxn/neon/neon.h               |  15 +++++
 pgxn/neon/neon_lwlsncache.c    |  37 +++----------
 pgxn/neon/neon_perf_counters.c |  26 ++++++---
 pgxn/neon/relsize_cache.c      |  48 ++++------------
 pgxn/neon/walproposer_pg.c     |  48 ++--------------
 8 files changed, 164 insertions(+), 210 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 2c87f139af..7cfa769959 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -219,10 +219,6 @@ static char *lfc_path;
 static uint64 lfc_generation;
 static FileCacheControl *lfc_ctl;
 static bool lfc_do_prewarm;
-static shmem_startup_hook_type prev_shmem_startup_hook;
-#if PG_VERSION_NUM>=150000
-static shmem_request_hook_type prev_shmem_request_hook;
-#endif
 
 bool		lfc_store_prefetch_result;
 bool		lfc_prewarm_update_ws_estimation;
@@ -342,18 +338,14 @@ lfc_ensure_opened(void)
 	return true;
 }
 
-static void
-lfc_shmem_startup(void)
+void
+LfcShmemInit(void)
 {
 	bool		found;
 	static HASHCTL info;
 
-	if (prev_shmem_startup_hook)
-	{
-		prev_shmem_startup_hook();
-	}
-
-	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+	if (lfc_max_size <= 0)
+		return;
 
 	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
 	if (!found)
@@ -398,19 +390,16 @@ lfc_shmem_startup(void)
 			ConditionVariableInit(&lfc_ctl->cv[i]);
 	}
 
-	LWLockRelease(AddinShmemInitLock);
 }
 
-static void
-lfc_shmem_request(void)
+void
+LfcShmemRequest(void)
 {
-#if PG_VERSION_NUM>=150000
-	if (prev_shmem_request_hook)
-		prev_shmem_request_hook();
-#endif
-
-	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
-	RequestNamedLWLockTranche("lfc_lock", 1);
+	if (lfc_max_size > 0)
+	{
+		RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
+		RequestNamedLWLockTranche("lfc_lock", 1);
+	}
 }
 
 static bool
@@ -642,18 +631,6 @@ lfc_init(void)
 							NULL,
 							NULL,
 							NULL);
-
-	if (lfc_max_size == 0)
-		return;
-
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = lfc_shmem_startup;
-#if PG_VERSION_NUM>=150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = lfc_shmem_request;
-#else
-
lfc_shmem_request(); -#endif } FileCacheState* diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 05ba6da663..596258007a 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -118,10 +118,6 @@ typedef struct ShardMap shard_map; } PagestoreShmemState; -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; -#endif -static shmem_startup_hook_type prev_shmem_startup_hook; static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; @@ -1284,18 +1280,12 @@ check_neon_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(id, *newval, 16); } -static Size -PagestoreShmemSize(void) -{ - return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize()); -} -static bool +void PagestoreShmemInit(void) { bool found; - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); pagestore_shared = ShmemInitStruct("libpagestore shared state", sizeof(PagestoreShmemState), &found); @@ -1306,44 +1296,12 @@ PagestoreShmemInit(void) memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap)); AssignPageserverConnstring(page_server_connstring, NULL); } - - NeonPerfCountersShmemInit(); - - LWLockRelease(AddinShmemInitLock); - return found; } -static void -pagestore_shmem_startup_hook(void) +void +PagestoreShmemRequest(void) { - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); - - PagestoreShmemInit(); -} - -static void -pagestore_shmem_request(void) -{ -#if PG_VERSION_NUM >= 150000 - if (prev_shmem_request_hook) - prev_shmem_request_hook(); -#endif - - RequestAddinShmemSpace(PagestoreShmemSize()); -} - -static void -pagestore_prepare_shmem(void) -{ -#if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = pagestore_shmem_request; -#else - pagestore_shmem_request(); -#endif - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = pagestore_shmem_startup_hook; + RequestAddinShmemSpace(sizeof(PagestoreShmemState)); } /* @@ -1352,8 +1310,6 @@ pagestore_prepare_shmem(void) void pg_init_libpagestore(void) { - pagestore_prepare_shmem(); - DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, @@ -1504,8 +1460,6 @@ pg_init_libpagestore(void) 0, NULL, NULL, NULL); - relsize_hash_init(); - if (page_server != NULL) neon_log(ERROR, "libpagestore already loaded"); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index df5dcf5334..4e4320e498 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -22,6 +22,7 @@ #include "replication/slot.h" #include "replication/walsender.h" #include "storage/proc.h" +#include "storage/ipc.h" #include "funcapi.h" #include "access/htup_details.h" #include "utils/builtins.h" @@ -59,11 +60,15 @@ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags); static void neon_ExecutorEnd(QueryDesc *queryDesc); -#if PG_MAJORVERSION_NUM >= 16 static shmem_startup_hook_type prev_shmem_startup_hook; - static void neon_shmem_startup_hook(void); +static void neon_shmem_request_hook(void); + +#if PG_MAJORVERSION_NUM >= 15 +static shmem_request_hook_type prev_shmem_request_hook = NULL; #endif + + #if PG_MAJORVERSION_NUM >= 17 uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; uint32 WAIT_EVENT_NEON_LFC_READ; @@ -450,15 +455,44 @@ _PG_init(void) */ #if PG_VERSION_NUM >= 160000 load_file("$libdir/neon_rmgr", false); - - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = neon_shmem_startup_hook; 
#endif /* dummy call to a Rust function in the communicator library, to check that it works */ (void) communicator_dummy(123); + /* + * Initializing a pre-loaded Postgres extension happens in three stages: + * + * 1. _PG_init() is called early at postmaster startup. In this stage, no + * shared memory has been allocated yet. Core Postgres GUCs have been + * initialized from the config files, but notably, MaxBackends has not + * calculated yet. In this stage, we must register any extension GUCs + * and can do other early initialization that doesn't depend on shared + * memory. In this stage we must also register "shmem request" and + * "shmem starutup" hooks, to be called in stages 2 and 3. + * + * 2. After MaxBackends have been calculated, the "shmem request" hooks + * are called. The hooks can reserve shared memory by calling + * RequestAddinShmemSpace and RequestNamedLWLockTranche(). The "shmem + * request hooks" are a new mechanism in Postgres v15. In v14 and + * below, you had to make those Requests in stage 1 already, which + * means they could not depend on MaxBackends. (See hack in + * NeonPerfCountersShmemRequest()) + * + * 3. After some more runtime-computed GUCs that affect the amount of + * shared memory needed have been calculated, the "shmem startup" hooks + * are called. In this stage, we allocate any shared memory, LWLocks + * and other shared resources. + * + * Here, in the 'neon' extension, we register just one shmem request hook + * and one startup hook, which call into functions in all the subsystems + * that are part of the extension. On v14, the ShmemRequest functions are + * called in stage 1, and on v15 onwards they are called in stage 2. + */ + + /* Stage 1: Define GUCs, and other early intialization */ pg_init_libpagestore(); + relsize_hash_init(); lfc_init(); pg_init_walproposer(); init_lwlsncache(); @@ -561,6 +595,22 @@ _PG_init(void) ReportSearchPath(); + /* + * Register initialization hooks for stage 2. (On v14, there's no "shmem + * request" hooks, so call the ShmemRequest functions immediately.) + */ +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = neon_shmem_request_hook; +#else + neon_shmem_request_hook(); +#endif + + /* Register hooks for stage 3 */ + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = neon_shmem_startup_hook; + + /* Other misc initialization */ prev_ExecutorStart = ExecutorStart_hook; ExecutorStart_hook = neon_ExecutorStart; prev_ExecutorEnd = ExecutorEnd_hook; @@ -646,7 +696,34 @@ approximate_working_set_size(PG_FUNCTION_ARGS) PG_RETURN_INT32(dc); } -#if PG_MAJORVERSION_NUM >= 16 +/* + * Initialization stage 2: make requests for the amount of shared memory we + * will need. + * + * For a high-level explanation of the initialization process, see _PG_init(). + */ +static void +neon_shmem_request_hook(void) +{ +#if PG_VERSION_NUM >= 150000 + if (prev_shmem_request_hook) + prev_shmem_request_hook(); +#endif + + LfcShmemRequest(); + NeonPerfCountersShmemRequest(); + PagestoreShmemRequest(); + RelsizeCacheShmemRequest(); + WalproposerShmemRequest(); + LwLsnCacheShmemRequest(); +} + + +/* + * Initialization stage 3: Initialize shared memory. + * + * For a high-level explanation of the initialization process, see _PG_init(). 
+ */ static void neon_shmem_startup_hook(void) { @@ -654,6 +731,15 @@ neon_shmem_startup_hook(void) if (prev_shmem_startup_hook) prev_shmem_startup_hook(); + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + LfcShmemInit(); + NeonPerfCountersShmemInit(); + PagestoreShmemInit(); + RelsizeCacheShmemInit(); + WalproposerShmemInit(); + LwLsnCacheShmemInit(); + #if PG_MAJORVERSION_NUM >= 17 WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance"); WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); @@ -666,8 +752,9 @@ neon_shmem_startup_hook(void) WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO"); WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download"); #endif + + LWLockRelease(AddinShmemInitLock); } -#endif /* * ExecutorStart hook: start up tracking if needed diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 215396ef7a..20c850864a 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -70,4 +70,19 @@ extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]); extern PGDLLEXPORT void WalProposerMain(Datum main_arg); extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); +extern void LfcShmemRequest(void); +extern void PagestoreShmemRequest(void); +extern void RelsizeCacheShmemRequest(void); +extern void WalproposerShmemRequest(void); +extern void LwLsnCacheShmemRequest(void); +extern void NeonPerfCountersShmemRequest(void); + +extern void LfcShmemInit(void); +extern void PagestoreShmemInit(void); +extern void RelsizeCacheShmemInit(void); +extern void WalproposerShmemInit(void); +extern void LwLsnCacheShmemInit(void); +extern void NeonPerfCountersShmemInit(void); + + #endif /* NEON_H */ diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index a8cfa0f825..5887c02c36 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -1,5 +1,6 @@ #include "postgres.h" +#include "neon.h" #include "neon_lwlsncache.h" #include "miscadmin.h" @@ -81,14 +82,6 @@ static set_max_lwlsn_hook_type prev_set_max_lwlsn_hook = NULL; static set_lwlsn_relation_hook_type prev_set_lwlsn_relation_hook = NULL; static set_lwlsn_db_hook_type prev_set_lwlsn_db_hook = NULL; -static shmem_startup_hook_type prev_shmem_startup_hook; - -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook; -#endif - -static void shmemrequest(void); -static void shmeminit(void); static void neon_set_max_lwlsn(XLogRecPtr lsn); void @@ -99,16 +92,6 @@ init_lwlsncache(void) lwlc_register_gucs(); - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = shmeminit; - - #if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = shmemrequest; - #else - shmemrequest(); - #endif - prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook; set_lwlsn_block_range_hook = neon_set_lwlsn_block_range; prev_set_lwlsn_block_v_hook = set_lwlsn_block_v_hook; @@ -124,20 +107,19 @@ init_lwlsncache(void) } -static void shmemrequest(void) { +void +LwLsnCacheShmemRequest(void) +{ Size requested_size = sizeof(LwLsnCacheCtl); - + requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry)); RequestAddinShmemSpace(requested_size); - - #if PG_VERSION_NUM >= 150000 - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - #endif } -static void shmeminit(void) { +void +LwLsnCacheShmemInit(void) +{ static HASHCTL info; bool found; if (lwlsn_cache_size > 0) @@ -157,9 +139,6 @@ static void shmeminit(void) { } 
dlist_init(&LwLsnCache->lastWrittenLsnLRU); LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr(); - if (prev_shmem_startup_hook) { - prev_shmem_startup_hook(); - } } /* diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index d0a3d15108..dd576e4e73 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -17,22 +17,32 @@ #include "storage/shmem.h" #include "utils/builtins.h" +#include "neon.h" #include "neon_perf_counters.h" #include "neon_pgversioncompat.h" neon_per_backend_counters *neon_per_backend_counters_shared; -Size -NeonPerfCountersShmemSize(void) +void +NeonPerfCountersShmemRequest(void) { - Size size = 0; - - size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS, - sizeof(neon_per_backend_counters))); - - return size; + Size size; +#if PG_MAJORVERSION_NUM < 15 + /* Hack: in PG14 MaxBackends is not initialized at the time of calling NeonPerfCountersShmemRequest function. + * Do it ourselves and then undo to prevent assertion failure + */ + Assert(MaxBackends == 0); /* not initialized yet */ + InitializeMaxBackends(); + size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)); + MaxBackends = 0; +#else + size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)); +#endif + RequestAddinShmemSpace(size); } + + void NeonPerfCountersShmemInit(void) { diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 60ca1675d9..bf7961574a 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -10,6 +10,7 @@ */ #include "postgres.h" +#include "neon.h" #include "neon_pgversioncompat.h" #include "pagestore_client.h" @@ -49,32 +50,23 @@ typedef struct * algorithm */ } RelSizeHashControl; -static HTAB *relsize_hash; -static LWLockId relsize_lock; -static int relsize_hash_size; -static RelSizeHashControl* relsize_ctl; -static shmem_startup_hook_type prev_shmem_startup_hook = NULL; -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; -static void relsize_shmem_request(void); -#endif - /* * Size of a cache entry is 36 bytes. So this default will take about 2.3 MB, * which seems reasonable. 
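 * (Arithmetic: 64 * 1024 entries * 36 bytes each = 2359296 bytes, i.e. ~2.3 MB.)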
*/ #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) -static void -neon_smgr_shmem_startup(void) +static HTAB *relsize_hash; +static LWLockId relsize_lock; +static int relsize_hash_size = DEFAULT_RELSIZE_HASH_SIZE; +static RelSizeHashControl* relsize_ctl; + +void +RelsizeCacheShmemInit(void) { static HASHCTL info; bool found; - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found); if (!found) { @@ -85,7 +77,6 @@ neon_smgr_shmem_startup(void) relsize_hash_size, relsize_hash_size, &info, HASH_ELEM | HASH_BLOBS); - LWLockRelease(AddinShmemInitLock); relsize_ctl->size = 0; relsize_ctl->hits = 0; relsize_ctl->misses = 0; @@ -242,34 +233,15 @@ relsize_hash_init(void) PGC_POSTMASTER, 0, NULL, NULL, NULL); - - if (relsize_hash_size > 0) - { -#if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = relsize_shmem_request; -#else - RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); - RequestNamedLWLockTranche("neon_relsize", 1); -#endif - - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = neon_smgr_shmem_startup; - } } -#if PG_VERSION_NUM >= 150000 /* * shmem_request hook: request additional shared resources. We'll allocate or * attach to the shared resources in neon_smgr_shmem_startup(). */ -static void -relsize_shmem_request(void) +void +RelsizeCacheShmemRequest(void) { - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); RequestNamedLWLockTranche("neon_relsize", 1); } -#endif diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 18655d4c6c..9ed8d0d2d2 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -83,10 +83,8 @@ static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; static HotStandbyFeedback agg_hs_feedback; -static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); -static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); @@ -99,11 +97,6 @@ static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; -static shmem_startup_hook_type prev_shmem_startup_hook_type; -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; -static void walproposer_shmem_request(void); -#endif static void WalproposerShmemInit_SyncSafekeeper(void); @@ -193,8 +186,6 @@ pg_init_walproposer(void) nwp_register_gucs(); - nwp_prepare_shmem(); - delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; @@ -494,12 +485,11 @@ WalproposerShmemSize(void) return sizeof(WalproposerShmemState); } -static bool +void WalproposerShmemInit(void) { bool found; - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); walprop_shared = ShmemInitStruct("Walproposer shared state", sizeof(WalproposerShmemState), &found); @@ -517,9 +507,6 @@ WalproposerShmemInit(void) 
pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0); /* END_HADRON */ } - LWLockRelease(AddinShmemInitLock); - - return found; } static void @@ -623,42 +610,15 @@ walprop_register_bgworker(void) /* shmem handling */ -static void -nwp_prepare_shmem(void) -{ -#if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = walproposer_shmem_request; -#else - RequestAddinShmemSpace(WalproposerShmemSize()); -#endif - prev_shmem_startup_hook_type = shmem_startup_hook; - shmem_startup_hook = nwp_shmem_startup_hook; -} - -#if PG_VERSION_NUM >= 150000 /* * shmem_request hook: request additional shared resources. We'll allocate or - * attach to the shared resources in nwp_shmem_startup_hook(). + * attach to the shared resources in WalproposerShmemInit(). */ -static void -walproposer_shmem_request(void) +void +WalproposerShmemRequest(void) { - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - RequestAddinShmemSpace(WalproposerShmemSize()); } -#endif - -static void -nwp_shmem_startup_hook(void) -{ - if (prev_shmem_startup_hook_type) - prev_shmem_startup_hook_type(); - - WalproposerShmemInit(); -} WalproposerShmemState * GetWalpropShmemState(void) From 7fef4435c19c053f89af6f27cbb9750d3c7bbadc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Jul 2025 23:32:34 +0300 Subject: [PATCH 21/39] Store stripe_size in shared memory (#12560) ## Problem See https://databricks.slack.com/archives/C09254R641L/p1752004515032899 stripe_size GUC update may be delayed at different backends and so cause inconsistency with connection strings (shard map). ## Summary of changes Postmaster should store stripe_size in shared memory as well as connection strings. It should be also enforced that stripe size is defined prior to connection strings in postgresql.conf --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Kosntantin Knizhnik --- compute_tools/src/config.rs | 7 ++++--- pgxn/neon/libpagestore.c | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f6487d33b3..dd46353343 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -56,14 +56,15 @@ pub fn write_postgres_conf( writeln!(file, "{conf}")?; } + // Stripe size GUC should be defined prior to connection string + if let Some(stripe_size) = spec.shard_stripe_size { + writeln!(file, "neon.stripe_size={stripe_size}")?; + } // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; if let Some(s) = &spec.pageserver_connstring { writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; } - if let Some(stripe_size) = spec.shard_stripe_size { - writeln!(file, "neon.stripe_size={stripe_size}")?; - } if !spec.safekeeper_connstrings.is_empty() { let mut neon_safekeepers_value = String::new(); tracing::info!( diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 596258007a..acb8092990 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -90,6 +90,7 @@ typedef struct { char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; size_t num_shards; + size_t stripe_size; } ShardMap; /* @@ -110,6 +111,11 @@ typedef struct * has changed since last access, and to detect and retry copying the value if * the postmaster changes the value concurrently. (Postmaster doesn't have a * PGPROC entry and therefore cannot use LWLocks.) 
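+ *
+ * (The reader side is a classic seqlock: read the begin counter, copy the
+ * data, read the end counter, and retry if the two don't match; see
+ * load_shard_map().)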
+ *
+ * stripe_size is now also part of the ShardMap, although it is defined by a
+ * separate GUC. Postgres doesn't provide any mechanism to enforce dependencies
+ * between GUCs, so we have to rely on the order of GUC definitions in the
+ * config file: "neon.stripe_size" should be defined prior to
+ * "neon.pageserver_connstring".
  */
 typedef struct
 {
@@ -230,7 +236,10 @@ ParseShardMap(const char *connstr, ShardMap *result)
 		p = sep + 1;
 	}
 	if (result)
+	{
 		result->num_shards = nshards;
+		result->stripe_size = stripe_size;
+	}
 
 	return true;
 }
@@ -291,12 +300,13 @@ AssignPageserverConnstring(const char *newval, void *extra)
  * last call, terminates all existing connections to all pageservers.
  */
 static void
-load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
+load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p, size_t* stripe_size_p)
 {
 	uint64		begin_update_counter;
 	uint64		end_update_counter;
 	ShardMap   *shard_map = &pagestore_shared->shard_map;
 	shardno_t	num_shards;
+	size_t		stripe_size;
 
 	/*
 	 * Postmaster can update the shared memory values concurrently, in which
@@ -311,6 +321,7 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
 
 		num_shards = shard_map->num_shards;
+		stripe_size = shard_map->stripe_size;
 		if (connstr_p && shard_no < MAX_SHARDS)
 			strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
 		pg_memory_barrier();
@@ -345,6 +356,8 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 
 	if (num_shards_p)
 		*num_shards_p = num_shards;
+	if (stripe_size_p)
+		*stripe_size_p = stripe_size;
 }
 
 #define MB (1024*1024)
@@ -353,9 +366,10 @@
 shardno_t
 get_shard_number(BufferTag *tag)
 {
 	shardno_t	n_shards;
+	size_t		stripe_size;
 	uint32		hash;
 
-	load_shard_map(0, NULL, &n_shards);
+	load_shard_map(0, NULL, &n_shards, &stripe_size);
 
 #if PG_MAJORVERSION_NUM < 16
 	hash = murmurhash32(tag->rnode.relNode);
@@ -408,7 +422,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	 * Note that connstr is used both during connection start, and when we
 	 * log the successful connection.
 	 */
-	load_shard_map(shard_no, connstr, NULL);
+	load_shard_map(shard_no, connstr, NULL, NULL);
 
 	switch (shard->state)
 	{
From 62c0152e6bbb00b6fdd1061516317383a2e0ad82 Mon Sep 17 00:00:00 2001
From: Vlad Lazar 
Date: Thu, 17 Jul 2025 22:03:55 +0100
Subject: [PATCH 22/39] pageserver: shut down compute connections at libpq
 level (#12642)

## Problem

Previously, if a get page failure was caused by timeline shutdown, the
pageserver would attempt to tear down the connection gracefully:
`shutdown(SHUT_WR)` followed by `close()`. This triggers a code path on
the compute where it has to distinguish between an idle connection and a
closed one. That code is bug prone, so we can just side-step the issue
by shutting down the connection via a libpq error message.

This surfaced as instability in test_shard_resolve_during_split_abort.
It's a new test, but the issue existed for ages.

## Summary of Changes

Send a libpq error message instead of doing a graceful TCP connection
shutdown.
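At the protocol level, the compute now observes an explicit ErrorResponse
message followed by the close, instead of a bare EOF, roughly (sketch; the
exact error text comes from `short_error(&err)`):

```
<- ErrorResponse (severity=ERROR, message=<short error for Shutdown>)
<- connection closed
```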
Closes LKB-648 --- libs/postgres_backend/src/lib.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 851d824291..20afa8bb46 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -749,7 +749,18 @@ impl PostgresBackend { trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { match e { - QueryError::Shutdown => return Ok(ProcessMsgResult::Break), + err @ QueryError::Shutdown => { + // Notify postgres of the connection shutdown at the libpq + // protocol level. This avoids postgres having to tell apart + // from an idle connection and a stale one, which is bug prone. + let shutdown_error = short_error(&err); + self.write_message_noflush(&BeMessage::ErrorResponse( + &shutdown_error, + Some(err.pg_error_code()), + ))?; + + return Ok(ProcessMsgResult::Break); + } QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError); } From 53a05e8ccbb8b17a5eec07d96c0a1182cf717ffd Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 17 Jul 2025 23:43:43 +0200 Subject: [PATCH 23/39] fix(compute_ctl): Only offload LFC state if no prewarming is in progress (#12645) ## Problem We currently offload LFC state unconditionally, which can cause problems. Imagine a situation: 1. Endpoint started with `autoprewarm: true`. 2. While prewarming is not completed, we upload the new incomplete state. 3. Compute gets interrupted and restarts. 4. We start again and try to prewarm with the state from 2. instead of the previous complete state. During the orchestrated prewarming, it's probably not a big issue, but it's still better to do not interfere with the prewarm process. ## Summary of changes Do not offload LFC state if we are currently prewarming or any issue occurred. While on it, also introduce `Skipped` LFC prewarm status, which is used when the corresponding LFC state is not present in the endpoint storage. It's primarily needed to distinguish the first compute start for particular endpoint, as it's completely valid to do not have LFC state yet. --- compute_tools/src/compute.rs | 21 +++++++- compute_tools/src/compute_prewarm.rs | 61 +++++++++++++++++------- compute_tools/src/http/openapi_spec.yaml | 10 ++-- libs/compute_api/src/responses.rs | 24 ++++++++-- 4 files changed, 88 insertions(+), 28 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 941a21806f..3ae946c10e 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2450,14 +2450,31 @@ LIMIT 100", pub fn spawn_lfc_offload_task(self: &Arc, interval: Duration) { self.terminate_lfc_offload_task(); let secs = interval.as_secs(); - info!("spawning lfc offload worker with {secs}s interval"); let this = self.clone(); + + info!("spawning LFC offload worker with {secs}s interval"); let handle = spawn(async move { let mut interval = time::interval(interval); interval.tick().await; // returns immediately loop { interval.tick().await; - this.offload_lfc_async().await; + + let prewarm_state = this.state.lock().unwrap().lfc_prewarm_state.clone(); + // Do not offload LFC state if we are currently prewarming or any issue occurred. + // If we'd do that, we might override the LFC state in endpoint storage with some + // incomplete state. Imagine a situation: + // 1. Endpoint started with `autoprewarm: true` + // 2. While prewarming is not completed, we upload the new incomplete state + // 3. 
Compute gets interrupted and restarts + // 4. We start again and try to prewarm with the state from 2. instead of the previous complete state + if matches!( + prewarm_state, + LfcPrewarmState::Completed + | LfcPrewarmState::NotPrewarmed + | LfcPrewarmState::Skipped + ) { + this.offload_lfc_async().await; + } } }); *self.lfc_offload_task.lock().unwrap() = Some(handle); diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index d014a5bb72..07b4a596cc 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -89,7 +89,7 @@ impl ComputeNode { self.state.lock().unwrap().lfc_offload_state.clone() } - /// If there is a prewarm request ongoing, return false, true otherwise + /// If there is a prewarm request ongoing, return `false`, `true` otherwise. pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { { let state = &mut self.state.lock().unwrap().lfc_prewarm_state; @@ -101,15 +101,25 @@ impl ComputeNode { let cloned = self.clone(); spawn(async move { - let Err(err) = cloned.prewarm_impl(from_endpoint).await else { - cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; - return; - }; - crate::metrics::LFC_PREWARM_ERRORS.inc(); - error!(%err, "prewarming lfc"); - cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { - error: err.to_string(), + let state = match cloned.prewarm_impl(from_endpoint).await { + Ok(true) => LfcPrewarmState::Completed, + Ok(false) => { + info!( + "skipping LFC prewarm because LFC state is not found in endpoint storage" + ); + LfcPrewarmState::Skipped + } + Err(err) => { + crate::metrics::LFC_PREWARM_ERRORS.inc(); + error!(%err, "could not prewarm LFC"); + + LfcPrewarmState::Failed { + error: err.to_string(), + } + } }; + + cloned.state.lock().unwrap().lfc_prewarm_state = state; }); true } @@ -120,15 +130,21 @@ impl ComputeNode { EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint) } - async fn prewarm_impl(&self, from_endpoint: Option) -> Result<()> { + /// Request LFC state from endpoint storage and load corresponding pages into Postgres. + /// Returns a result with `false` if the LFC state is not found in endpoint storage. 
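+    /// A `404 Not Found` from endpoint storage is expected on the first start of
+    /// an endpoint (nothing has been offloaded yet) and is mapped to `Ok(false)`
+    /// rather than an error.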
+ async fn prewarm_impl(&self, from_endpoint: Option) -> Result { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; - info!(%url, "requesting LFC state from endpoint storage"); + info!(%url, "requesting LFC state from endpoint storage"); let request = Client::new().get(&url).bearer_auth(token); let res = request.send().await.context("querying endpoint storage")?; let status = res.status(); - if status != StatusCode::OK { - bail!("{status} querying endpoint storage") + match status { + StatusCode::OK => (), + StatusCode::NOT_FOUND => { + return Ok(false); + } + _ => bail!("{status} querying endpoint storage"), } let mut uncompressed = Vec::new(); @@ -141,7 +157,8 @@ impl ComputeNode { .await .context("decoding LFC state")?; let uncompressed_len = uncompressed.len(); - info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres"); + + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres"); ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await @@ -149,7 +166,9 @@ impl ComputeNode { .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) .await .context("loading LFC state into postgres") - .map(|_| ()) + .map(|_| ())?; + + Ok(true) } /// If offload request is ongoing, return false, true otherwise @@ -177,12 +196,14 @@ impl ComputeNode { async fn offload_lfc_with_state_update(&self) { crate::metrics::LFC_OFFLOADS.inc(); + let Err(err) = self.offload_lfc_impl().await else { self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; return; }; + crate::metrics::LFC_OFFLOAD_ERRORS.inc(); - error!(%err, "offloading lfc"); + error!(%err, "could not offload LFC state to endpoint storage"); self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { error: err.to_string(), }; @@ -190,7 +211,7 @@ impl ComputeNode { async fn offload_lfc_impl(&self) -> Result<()> { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; - info!(%url, "requesting LFC state from postgres"); + info!(%url, "requesting LFC state from Postgres"); let mut compressed = Vec::new(); ComputeNode::get_maintenance_client(&self.tokio_conn_conf) @@ -205,13 +226,17 @@ impl ComputeNode { .read_to_end(&mut compressed) .await .context("compressing LFC state")?; + let compressed_len = compressed.len(); info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); let request = Client::new().put(url).bearer_auth(token).body(compressed); match request.send().await { Ok(res) if res.status() == StatusCode::OK => Ok(()), - Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), + Ok(res) => bail!( + "Request to endpoint storage failed with status: {}", + res.status() + ), Err(err) => Err(err).context("writing to endpoint storage"), } } diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 93a357e160..3cf5ea7c51 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -613,11 +613,11 @@ components: - skipped properties: status: - description: Lfc prewarm status - enum: [not_prewarmed, prewarming, completed, failed] + description: LFC prewarm status + enum: [not_prewarmed, prewarming, completed, failed, skipped] type: string error: - description: Lfc prewarm error, if any + description: LFC prewarm error, if any type: string total: description: Total pages processed @@ -635,11 +635,11 @@ components: - status 
      properties:
        status:
-          description: Lfc offload status
+          description: LFC offload status
          enum: [not_offloaded, offloading, completed, failed]
          type: string
        error:
-          description: Lfc offload error, if any
+          description: LFC offload error, if any
          type: string

    PromoteState:
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 2fe233214a..5b8fc49750 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -46,16 +46,33 @@ pub struct ExtensionInstallResponse {
     pub version: ExtVersion,
 }

+/// Status of the LFC prewarm process. The same state machine is reused for
+/// both autoprewarm (prewarm after compute/Postgres start using the previously
+/// stored LFC state) and explicit prewarming via API.
 #[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
+    /// Default value when compute boots up.
     #[default]
     NotPrewarmed,
+    /// Prewarming thread is active and loading pages into LFC.
     Prewarming,
+    /// We found the requested LFC state in the endpoint storage and
+    /// completed prewarming successfully.
     Completed,
-    Failed {
-        error: String,
-    },
+    /// Unexpected error happened during prewarming. Note that a `Not Found 404`
+    /// response from the endpoint storage is explicitly excluded here
+    /// because it can normally happen on the first compute start,
+    /// since the LFC state is not available yet.
+    Failed { error: String },
+    /// We tried to fetch the corresponding LFC state from the endpoint storage,
+    /// but received `Not Found 404`. This should normally happen only during the
+    /// first endpoint start after creation with `autoprewarm: true`.
+    ///
+    /// During an orchestrated prewarm via the API, when a caller explicitly
+    /// provides the LFC state key to prewarm from, it is the caller's
+    /// responsibility to handle this status as an error state.
+    Skipped,
 }

 impl Display for LfcPrewarmState {
@@ -64,6 +81,7 @@ impl Display for LfcPrewarmState {
             LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
             LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
             LfcPrewarmState::Completed => f.write_str("Completed"),
+            LfcPrewarmState::Skipped => f.write_str("Skipped"),
             LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
         }
     }

From 64d0008389849f11c31b6253ea00e86c224caaaf Mon Sep 17 00:00:00 2001
From: Folke Behrens
Date: Thu, 17 Jul 2025 23:52:20 +0200
Subject: [PATCH 24/39] proxy: Shorten the initial TTL of cancel keys (#12647)

## Problem

A high rate of short-lived connections means that there are a lot of cancel
keys in Redis with TTL=10min; this could be avoided by having a much shorter
initial TTL.

## Summary of changes

* Introduce an initial TTL of 1min used with the SET command.
* Fix: don't delay repushing cancel data when expired.
* Prepare for exponentially increasing TTLs.

## Alternatives

A best-effort UNLINK command on connection termination would clean up cancel
keys right away. This needs a bigger refactor due to how batching is handled.
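
For illustration, here is a minimal, self-contained sketch of the resulting TTL
schedule (the constants mirror those introduced by this patch; `ttl_for` is a
hypothetical helper, not code from this diff):

```rust
use std::time::Duration;

// Constants as introduced by this patch.
const CANCEL_KEY_INITIAL_PERIOD: Duration = Duration::from_secs(60);
const CANCEL_KEY_REFRESH_PERIOD: Duration = Duration::from_secs(10 * 60);
const CANCEL_KEY_TTL_SLACK: Duration = Duration::from_secs(30);

/// The TTL handed to Redis is the period until the next refresh plus some
/// slack, so a key left behind by a short-lived connection now expires after
/// ~90s instead of the old fixed 10min.
fn ttl_for(initial: bool) -> Duration {
    let period = if initial {
        CANCEL_KEY_INITIAL_PERIOD // first SET when the connection registers
    } else {
        CANCEL_KEY_REFRESH_PERIOD // subsequent EXPIRE refreshes
    };
    period + CANCEL_KEY_TTL_SLACK
}

fn main() {
    assert_eq!(ttl_for(true), Duration::from_secs(90));
    assert_eq!(ttl_for(false), Duration::from_secs(630));
}
```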
--- proxy/src/cancellation.rs | 82 ++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 77062d3bb4..f25121331f 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -32,8 +32,11 @@ use crate::util::run_until; type IpSubnetKey = IpNet; -const CANCEL_KEY_TTL: Duration = Duration::from_secs(600); -const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570); +/// Initial period and TTL is shorter to clear keys of short-lived connections faster. +const CANCEL_KEY_INITIAL_PERIOD: Duration = Duration::from_secs(60); +const CANCEL_KEY_REFRESH_PERIOD: Duration = Duration::from_secs(10 * 60); +/// `CANCEL_KEY_TTL_SLACK` is added to the periods to determine the actual TTL. +const CANCEL_KEY_TTL_SLACK: Duration = Duration::from_secs(30); // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -54,6 +57,24 @@ pub enum CancelKeyOp { }, } +impl CancelKeyOp { + const fn redis_msg_kind(&self) -> RedisMsgKind { + match self { + CancelKeyOp::Store { .. } => RedisMsgKind::Set, + CancelKeyOp::Refresh { .. } => RedisMsgKind::Expire, + CancelKeyOp::Get { .. } => RedisMsgKind::Get, + CancelKeyOp::GetOld { .. } => RedisMsgKind::HGet, + } + } + + fn cancel_channel_metric_guard(&self) -> CancelChannelSizeGuard<'static> { + Metrics::get() + .proxy + .cancel_channel_size + .guard(self.redis_msg_kind()) + } +} + #[derive(thiserror::Error, Debug, Clone)] pub enum PipelineError { #[error("could not send cmd to redis: {0}")] @@ -483,50 +504,49 @@ impl Session { let mut cancel = pin!(cancel); enum State { - Set, + Init, Refresh, } - let mut state = State::Set; + let mut state = State::Init; loop { - let guard_op = match state { - State::Set => { - let guard = Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::Set); - let op = CancelKeyOp::Store { - key: self.key, - value: closure_json.clone(), - expire: CANCEL_KEY_TTL, - }; + let (op, mut wait_interval) = match state { + State::Init => { tracing::debug!( src=%self.key, dest=?cancel_closure.cancel_token, "registering cancellation key" ); - (guard, op) + ( + CancelKeyOp::Store { + key: self.key, + value: closure_json.clone(), + expire: CANCEL_KEY_INITIAL_PERIOD + CANCEL_KEY_TTL_SLACK, + }, + CANCEL_KEY_INITIAL_PERIOD, + ) } State::Refresh => { - let guard = Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::Expire); - let op = CancelKeyOp::Refresh { - key: self.key, - expire: CANCEL_KEY_TTL, - }; tracing::debug!( src=%self.key, dest=?cancel_closure.cancel_token, "refreshing cancellation key" ); - (guard, op) + ( + CancelKeyOp::Refresh { + key: self.key, + expire: CANCEL_KEY_REFRESH_PERIOD + CANCEL_KEY_TTL_SLACK, + }, + CANCEL_KEY_REFRESH_PERIOD, + ) } }; - match tx.call(guard_op, cancel.as_mut()).await { + match tx + .call((op.cancel_channel_metric_guard(), op), cancel.as_mut()) + .await + { // SET returns OK Ok(Value::Okay) => { tracing::debug!( @@ -549,23 +569,23 @@ impl Session { Ok(_) => { // Any other response likely means the key expired. tracing::warn!(src=%self.key, "refreshing cancellation key failed"); - // Re-enter the SET loop to repush full data. - state = State::Set; + // Re-enter the SET loop quickly to repush full data. + state = State::Init; + wait_interval = Duration::ZERO; } // retry immediately. Err(BatchQueueError::Result(error)) => { tracing::warn!(?error, "error refreshing cancellation key"); // Small delay to prevent busy loop with high cpu and logging. 
- tokio::time::sleep(Duration::from_millis(10)).await; - continue; + wait_interval = Duration::from_millis(10); } Err(BatchQueueError::Cancelled(Err(_cancelled))) => break, } // wait before continuing. break immediately if cancelled. - if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut()) + if run_until(tokio::time::sleep(wait_interval), cancel.as_mut()) .await .is_err() { From 6a353c33e3fe074f2083b315646cc6602a05350a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 18 Jul 2025 00:13:21 +0200 Subject: [PATCH 25/39] print more timestamps in find_lsn_for_timestamp (#12641) Observability of `find_lsn_for_timestamp` is lacking, as well as how and when we update gc space and time cutoffs. Log them. --- pageserver/src/pgdatadir_mapping.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 08828ec4eb..cda08f2cc4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -813,6 +813,7 @@ impl Timeline { let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn(); let gc_cutoff_planned = { let gc_info = self.gc_info.read().unwrap(); + info!(cutoffs=?gc_info.cutoffs, applied_cutoff=%*gc_cutoff_lsn_guard, "starting find_lsn_for_timestamp"); gc_info.min_cutoff() }; // Usually the planned cutoff is newer than the cutoff of the last gc run, From 8f627ea0abbe8079052061bd25f9cec321a775bd Mon Sep 17 00:00:00 2001 From: HaoyuHuang Date: Thu, 17 Jul 2025 16:17:01 -0700 Subject: [PATCH 26/39] A few more SC changes (#12649) ## Problem ## Summary of changes --- pageserver/client/src/mgmt_api.rs | 16 +++++++ .../down.sql | 2 + .../up.sql | 17 +++++++ storage_controller/src/hadron_utils.rs | 44 +++++++++++++++++ storage_controller/src/lib.rs | 1 + storage_controller/src/pageserver_client.rs | 48 +++++++++++++++++++ storage_controller/src/reconciler.rs | 13 +++-- storage_controller/src/schema.rs | 20 ++++++++ storage_controller/src/tenant_shard.rs | 8 +++- 9 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql create mode 100644 storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql create mode 100644 storage_controller/src/hadron_utils.rs diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index fe1ddc2e7d..3867e536f4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -873,6 +873,22 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn reset_alert_gauges(&self) -> Result<()> { + let uri = format!( + "{}/hadron-internal/reset_alert_gauges", + self.mgmt_api_endpoint + ); + self.start_request(Method::POST, uri) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn wait_lsn( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql new file mode 100644 index 0000000000..b45b45e438 --- /dev/null +++ b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql @@ -0,0 +1,2 @@ +DROP TABLE hadron_safekeepers; +DROP TABLE hadron_timeline_safekeepers; diff --git a/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql new file mode 100644 index 0000000000..6cee981efc --- /dev/null +++ b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql @@ -0,0 +1,17 @@ +-- hadron_safekeepers keep track of all Safe Keeper nodes that exist in the system. +-- Upon startup, each Safe Keeper reaches out to the hadron cluster coordinator to register its node ID and listen addresses. + +CREATE TABLE hadron_safekeepers ( + sk_node_id BIGINT PRIMARY KEY NOT NULL, + listen_http_addr VARCHAR NOT NULL, + listen_http_port INTEGER NOT NULL, + listen_pg_addr VARCHAR NOT NULL, + listen_pg_port INTEGER NOT NULL +); + +CREATE TABLE hadron_timeline_safekeepers ( + timeline_id VARCHAR NOT NULL, + sk_node_id BIGINT NOT NULL, + legacy_endpoint_id UUID DEFAULT NULL, + PRIMARY KEY(timeline_id, sk_node_id) +); diff --git a/storage_controller/src/hadron_utils.rs b/storage_controller/src/hadron_utils.rs new file mode 100644 index 0000000000..871e21c367 --- /dev/null +++ b/storage_controller/src/hadron_utils.rs @@ -0,0 +1,44 @@ +use std::collections::BTreeMap; + +use rand::Rng; +use utils::shard::TenantShardId; + +static CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()"; + +/// Generate a random string of `length` that can be used as a password. 
The generated string +/// contains alphanumeric characters and special characters (!@#$%^&*()) +pub fn generate_random_password(length: usize) -> String { + let mut rng = rand::thread_rng(); + (0..length) + .map(|_| { + let idx = rng.gen_range(0..CHARSET.len()); + CHARSET[idx] as char + }) + .collect() +} + +pub(crate) struct TenantShardSizeMap { + #[expect(dead_code)] + pub map: BTreeMap, +} + +impl TenantShardSizeMap { + pub fn new(map: BTreeMap) -> Self { + Self { map } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_generate_random_password() { + let pwd1 = generate_random_password(10); + assert_eq!(pwd1.len(), 10); + let pwd2 = generate_random_password(10); + assert_ne!(pwd1, pwd2); + assert!(pwd1.chars().all(|c| CHARSET.contains(&(c as u8)))); + assert!(pwd2.chars().all(|c| CHARSET.contains(&(c as u8)))); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 36e3c5dc6c..24b06da83a 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -6,6 +6,7 @@ extern crate hyper0 as hyper; mod auth; mod background_node_operations; mod compute_hook; +pub mod hadron_utils; mod heartbeater; pub mod http; mod id_lock_map; diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index da0687895a..9e829e252d 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -14,6 +14,8 @@ use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; +use crate::hadron_utils::TenantShardSizeMap; + /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. #[derive(Debug, Clone)] @@ -86,6 +88,31 @@ impl PageserverClient { ) } + #[expect(dead_code)] + pub(crate) async fn tenant_timeline_compact( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + force_image_layer_creation: bool, + wait_until_done: bool, + ) -> Result<()> { + measured_request!( + "tenant_timeline_compact", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .tenant_timeline_compact( + tenant_shard_id, + timeline_id, + force_image_layer_creation, + true, + false, + wait_until_done, + ) + .await + ) + } + /* BEGIN_HADRON */ pub(crate) async fn tenant_timeline_describe( &self, @@ -101,6 +128,17 @@ impl PageserverClient { .await ) } + + #[expect(dead_code)] + pub(crate) async fn list_tenant_visible_size(&self) -> Result { + measured_request!( + "list_tenant_visible_size", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.list_tenant_visible_size().await + ) + .map(TenantShardSizeMap::new) + } /* END_HADRON */ pub(crate) async fn tenant_scan_remote_storage( @@ -365,6 +403,16 @@ impl PageserverClient { ) } + #[expect(dead_code)] + pub(crate) async fn reset_alert_gauges(&self) -> Result<()> { + measured_request!( + "reset_alert_gauges", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.reset_alert_gauges().await + ) + } + pub(crate) async fn wait_lsn( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index a2fba0fa56..d1590ec75e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -862,11 +862,11 @@ impl Reconciler { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { if refreshed { tracing::info!( - node_id=%node.get_id(), "Observed configuration correct 
after refresh. Notifying compute."); + node_id=%node.get_id(), "[Attached] Observed configuration correct after refresh. Notifying compute."); self.compute_notify().await?; } else { // Nothing to do - tracing::info!(node_id=%node.get_id(), "Observed configuration already correct."); + tracing::info!(node_id=%node.get_id(), "[Attached] Observed configuration already correct."); } } observed => { @@ -945,17 +945,17 @@ impl Reconciler { match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "[Secondary] Observed configuration already correct.") } _ => { // Only try and configure secondary locations on nodes that are available. This // allows the reconciler to "succeed" while some secondaries are offline (e.g. after // a node failure, where the failed node will have a secondary intent) if node.is_available() { - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + tracing::info!(node_id=%node.get_id(), "[Secondary] Observed configuration requires update."); changes.push((node.clone(), wanted_conf)) } else { - tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable"); + tracing::info!(node_id=%node.get_id(), "[Secondary] Skipping configuration as secondary, node is unavailable"); self.observed .locations .insert(node.get_id(), ObservedStateLocation { conf: None }); @@ -1066,6 +1066,9 @@ impl Reconciler { } result } else { + tracing::info!( + "Compute notification is skipped because the tenant shard does not have an attached (primary) location" + ); Ok(()) } } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 312f7e0b0e..f3dcdaf798 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -13,6 +13,24 @@ diesel::table! { } } +diesel::table! { + hadron_safekeepers (sk_node_id) { + sk_node_id -> Int8, + listen_http_addr -> Varchar, + listen_http_port -> Int4, + listen_pg_addr -> Varchar, + listen_pg_port -> Int4, + } +} + +diesel::table! { + hadron_timeline_safekeepers (timeline_id, sk_node_id) { + timeline_id -> Varchar, + sk_node_id -> Int8, + legacy_endpoint_id -> Nullable, + } +} + diesel::table! { metadata_health (tenant_id, shard_number, shard_count) { tenant_id -> Varchar, @@ -105,6 +123,8 @@ diesel::table! { diesel::allow_tables_to_appear_in_same_query!( controllers, + hadron_safekeepers, + hadron_timeline_safekeepers, metadata_health, nodes, safekeeper_timeline_pending_ops, diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 05de155963..f60378470e 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1611,7 +1611,13 @@ impl TenantShard { // Update result counter let outcome_label = match &result { - Ok(_) => ReconcileOutcome::Success, + Ok(_) => { + if reconciler.compute_notify_failure { + ReconcileOutcome::SuccessNoNotify + } else { + ReconcileOutcome::Success + } + } Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, Err(_) => ReconcileOutcome::Error, }; From f3ef60d236300ed15d72b26215092052ed253895 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 18 Jul 2025 00:40:35 -0400 Subject: [PATCH 27/39] fix(storcon): use unified interface to handle 404 lsn lease (#12650) ## Problem Close LKB-270. 
This is part of our series of efforts to make sure lsn_lease API prompts clients to retry. Follow up of https://github.com/neondatabase/neon/pull/12631. Slack thread w/ Vlad: https://databricks.slack.com/archives/C09254R641L/p1752677940697529 ## Summary of changes - Use `tenant_remote_mutation` API for LSN leases. Makes it consistent with new APIs added to storcon. - For 404, we now always retry because we know the tenant is to-be-attached and will eventually reach a point that we can find that tenant on the intent pageserver. - Using the `tenant_remote_mutation` API also prevents us from the case where the intent pageserver changes within the lease request. The wrapper function will error with 503 if such things happen. --------- Signed-off-by: Alex Chi Z --- storage_controller/src/http.rs | 2 +- storage_controller/src/service.rs | 156 +++++++++--------------------- 2 files changed, 48 insertions(+), 110 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 6b6d081dcd..ff73719adb 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -819,7 +819,7 @@ async fn handle_tenant_timeline_passthrough( .map_err(|e| ApiError::InternalServerError(e.into()))?; // We only handle "tenant not found" errors; other 404s like timeline not found should // be forwarded as-is. - if resp_str.contains(&format!("tenant {tenant_or_shard_id}")) { + if Service::is_tenant_not_found_error(resp_str, tenant_or_shard_id.tenant_id) { // Rather than retry here, send the client a 503 to prompt a retry: this matches // the pageserver's use of 503, and all clients calling this API should retry on 503. return Err(ApiError::ResourceUnavailable( diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a1ff9b3c61..71186076ec 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -207,27 +207,6 @@ enum ShardGenerationValidity { }, } -/// We collect the state of attachments for some operations to determine if the operation -/// needs to be retried when it fails. -struct TenantShardAttachState { - /// The targets of the operation. - /// - /// Tenant shard ID, node ID, node, is intent node observed primary. - targets: Vec<(TenantShardId, NodeId, Node, bool)>, - - /// The targets grouped by node ID. - by_node_id: HashMap, -} - -impl TenantShardAttachState { - fn for_api_call(&self) -> Vec<(TenantShardId, Node)> { - self.targets - .iter() - .map(|(tenant_shard_id, _, node, _)| (*tenant_shard_id, node.clone())) - .collect() - } -} - pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; @@ -4795,78 +4774,24 @@ impl Service { Ok(()) } - fn is_observed_consistent_with_intent( - &self, - shard: &TenantShard, - intent_node_id: NodeId, - ) -> bool { - if let Some(location) = shard.observed.locations.get(&intent_node_id) - && let Some(ref conf) = location.conf - && (conf.mode == LocationConfigMode::AttachedSingle - || conf.mode == LocationConfigMode::AttachedMulti) - { - true - } else { - false - } - } - - fn collect_tenant_shards( - &self, - tenant_id: TenantId, - ) -> Result { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - let mut by_node_id = HashMap::new(); - - // If the request got an unsharded tenant id, then apply - // the operation to all shards. Otherwise, apply it to a specific shard. 
- let shards_range = TenantShardId::tenant_range(tenant_id); - - for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { - if let Some(node_id) = shard.intent.get_attached() { - let node = locked - .nodes - .get(node_id) - .expect("Pageservers may not be deleted while referenced"); - - let consistent = self.is_observed_consistent_with_intent(shard, *node_id); - - targets.push((*tenant_shard_id, *node_id, node.clone(), consistent)); - by_node_id.insert(*node_id, (*tenant_shard_id, node.clone(), consistent)); - } - } - - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {tenant_id} not found").into(), - )); - } - - Ok(TenantShardAttachState { - targets, - by_node_id, - }) + pub(crate) fn is_tenant_not_found_error(body: &str, tenant_id: TenantId) -> bool { + body.contains(&format!("tenant {tenant_id}")) } fn process_result_and_passthrough_errors( &self, + tenant_id: TenantId, results: Vec<(Node, Result)>, - attach_state: TenantShardAttachState, ) -> Result, ApiError> { let mut processed_results: Vec<(Node, T)> = Vec::with_capacity(results.len()); - debug_assert_eq!(results.len(), attach_state.targets.len()); for (node, res) in results { - let is_consistent = attach_state - .by_node_id - .get(&node.get_id()) - .map(|(_, _, consistent)| *consistent); match res { Ok(res) => processed_results.push((node, res)), - Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) - if is_consistent == Some(false) => + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, body)) + if Self::is_tenant_not_found_error(&body, tenant_id) => { - // This is expected if the attach is not finished yet. Return 503 so that the client can retry. + // If there's a tenant not found, we are still in the process of attaching the tenant. + // Return 503 so that the client can retry. return Err(ApiError::ResourceUnavailable( format!( "Timeline is not attached to the pageserver {} yet, please retry", @@ -4894,35 +4819,48 @@ impl Service { ) .await; - let attach_state = self.collect_tenant_shards(tenant_id)?; - - let results = self - .tenant_for_shards_api( - attach_state.for_api_call(), - |tenant_shard_id, client| async move { - client - .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn) - .await - }, - 1, - 1, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - - let leases = self.process_result_and_passthrough_errors(results, attach_state)?; - let mut valid_until = None; - for (_, lease) in leases { - if let Some(ref mut valid_until) = valid_until { - *valid_until = std::cmp::min(*valid_until, lease.valid_until); - } else { - valid_until = Some(lease.valid_until); + self.tenant_remote_mutation(tenant_id, |locations| async move { + if locations.0.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); } - } - Ok(LsnLease { - valid_until: valid_until.unwrap_or_else(SystemTime::now), + + let results = self + .tenant_for_shards_api( + locations + .0 + .iter() + .map(|(tenant_shard_id, ShardMutationLocations { latest, .. 
})| { + (*tenant_shard_id, latest.node.clone()) + }) + .collect(), + |tenant_shard_id, client| async move { + client + .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + let leases = self.process_result_and_passthrough_errors(tenant_id, results)?; + let mut valid_until = None; + for (_, lease) in leases { + if let Some(ref mut valid_until) = valid_until { + *valid_until = std::cmp::min(*valid_until, lease.valid_until); + } else { + valid_until = Some(lease.valid_until); + } + } + Ok(LsnLease { + valid_until: valid_until.unwrap_or_else(SystemTime::now), + }) }) + .await? } pub(crate) async fn tenant_timeline_download_heatmap_layers( From 8e95455aef9e18d8b9df5af2388828832c50ec82 Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Fri, 18 Jul 2025 10:21:22 +0200 Subject: [PATCH 28/39] Update the postgres submodules (#12636) Synchronises the main branch's postgres submodules with the `neondatabase/postgres` repository state. --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ac3c460e01..47304b9215 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4 +Subproject commit 47304b921555b3f33eb3b49daada3078e774cfd7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 24313bf8f3..cef72d5308 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 24313bf8f3de722968a2fdf764de7ef77ed64f06 +Subproject commit cef72d5308ddce3795a9043fcd94f8849f7f4800 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 51194dc5ce..e9db1ff5a6 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 51194dc5ce2e3523068d8607852e6c3125a17e58 +Subproject commit e9db1ff5a6f3ca18f626ba3d62ab475e6c688a96 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index eac5279cd1..a50d80c750 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit eac5279cd147d4086e0eb242198aae2f4b766d7b +Subproject commit a50d80c7507e8ae9fc37bf1869051cf2d51370ab diff --git a/vendor/revisions.json b/vendor/revisions.json index e4b6c8e23a..24a33dec42 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "eac5279cd147d4086e0eb242198aae2f4b766d7b" + "a50d80c7507e8ae9fc37bf1869051cf2d51370ab" ], "v16": [ "16.9", - "51194dc5ce2e3523068d8607852e6c3125a17e58" + "e9db1ff5a6f3ca18f626ba3d62ab475e6c688a96" ], "v15": [ "15.13", - "24313bf8f3de722968a2fdf764de7ef77ed64f06" + "cef72d5308ddce3795a9043fcd94f8849f7f4800" ], "v14": [ "14.18", - "ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4" + "47304b921555b3f33eb3b49daada3078e774cfd7" ] } From 96bcfba79e4919a7a5b8fddd2149231b42059883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?= Date: Fri, 18 Jul 2025 12:17:58 +0200 Subject: [PATCH 29/39] [proxy] Cache GetEndpointAccessControl errors (#12571) Related to https://github.com/neondatabase/cloud/issues/19353 --- proxy/src/cache/project_info.rs | 286 +++++++++++++++--- .../control_plane/client/cplane_proxy_v1.rs | 185 ++++++----- proxy/src/control_plane/errors.rs | 2 +- proxy/src/control_plane/messages.rs | 16 +- proxy/src/control_plane/mod.rs | 6 +- 5 files changed, 376 insertions(+), 119 deletions(-) diff --git 
a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index c812779e30..0ef09a8a9a 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -10,6 +10,7 @@ use tokio::time::Instant;
 use tracing::{debug, info};

 use crate::config::ProjectInfoCacheOptions;
+use crate::control_plane::messages::{ControlPlaneErrorMessage, Reason};
 use crate::control_plane::{EndpointAccessControl, RoleAccessControl};
 use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::types::{EndpointId, RoleName};
@@ -36,22 +37,37 @@ impl<T> Entry<T> {
     }

     pub(crate) fn get(&self) -> Option<&T> {
-        (self.expires_at > Instant::now()).then_some(&self.value)
+        (!self.is_expired()).then_some(&self.value)
+    }
+
+    fn is_expired(&self) -> bool {
+        self.expires_at <= Instant::now()
     }
 }

 struct EndpointInfo {
-    role_controls: HashMap<RoleNameInt, Entry<RoleAccessControl>>,
-    controls: Option<Entry<EndpointAccessControl>>,
+    role_controls: HashMap<RoleNameInt, Entry<ControlPlaneResult<RoleAccessControl>>>,
+    controls: Option<Entry<ControlPlaneResult<EndpointAccessControl>>>,
 }

+type ControlPlaneResult<T> = Result<T, Box<ControlPlaneErrorMessage>>;
+
 impl EndpointInfo {
-    pub(crate) fn get_role_secret(&self, role_name: RoleNameInt) -> Option<RoleAccessControl> {
-        self.role_controls.get(&role_name)?.get().cloned()
+    pub(crate) fn get_role_secret_with_ttl(
+        &self,
+        role_name: RoleNameInt,
+    ) -> Option<(ControlPlaneResult<RoleAccessControl>, Duration)> {
+        let entry = self.role_controls.get(&role_name)?;
+        let ttl = entry.expires_at - Instant::now();
+        Some((entry.get()?.clone(), ttl))
     }

-    pub(crate) fn get_controls(&self) -> Option<EndpointAccessControl> {
-        self.controls.as_ref()?.get().cloned()
+    pub(crate) fn get_controls_with_ttl(
+        &self,
+    ) -> Option<(ControlPlaneResult<EndpointAccessControl>, Duration)> {
+        let entry = self.controls.as_ref()?;
+        let ttl = entry.expires_at - Instant::now();
+        Some((entry.get()?.clone(), ttl))
     }

     pub(crate) fn invalidate_endpoint(&mut self) {
@@ -153,28 +169,28 @@ impl ProjectInfoCacheImpl {
         self.cache.get(&endpoint_id)
     }

-    pub(crate) fn get_role_secret(
+    pub(crate) fn get_role_secret_with_ttl(
         &self,
         endpoint_id: &EndpointId,
         role_name: &RoleName,
-    ) -> Option<RoleAccessControl> {
+    ) -> Option<(ControlPlaneResult<RoleAccessControl>, Duration)> {
         let role_name = RoleNameInt::get(role_name)?;
         let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_role_secret(role_name)
+        endpoint_info.get_role_secret_with_ttl(role_name)
     }

-    pub(crate) fn get_endpoint_access(
+    pub(crate) fn get_endpoint_access_with_ttl(
         &self,
         endpoint_id: &EndpointId,
-    ) -> Option<EndpointAccessControl> {
+    ) -> Option<(ControlPlaneResult<EndpointAccessControl>, Duration)> {
         let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_controls()
+        endpoint_info.get_controls_with_ttl()
     }

     pub(crate) fn insert_endpoint_access(
         &self,
         account_id: Option<AccountIdInt>,
-        project_id: ProjectIdInt,
+        project_id: Option<ProjectIdInt>,
         endpoint_id: EndpointIdInt,
         role_name: RoleNameInt,
         controls: EndpointAccessControl,
@@ -183,26 +199,89 @@ impl ProjectInfoCacheImpl {
         if let Some(account_id) = account_id {
             self.insert_account2endpoint(account_id, endpoint_id);
         }
-        self.insert_project2endpoint(project_id, endpoint_id);
+        if let Some(project_id) = project_id {
+            self.insert_project2endpoint(project_id, endpoint_id);
+        }
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
             return;
         }

-        let controls = Entry::new(controls, self.config.ttl);
-        let role_controls = Entry::new(role_controls, self.config.ttl);
+        debug!(
+            key = &*endpoint_id,
+            "created a cache entry for endpoint access"
+        );
+
+        let controls = Some(Entry::new(Ok(controls), self.config.ttl));
+        let role_controls = Entry::new(Ok(role_controls), self.config.ttl);

         match self.cache.entry(endpoint_id) {
             clashmap::Entry::Vacant(e) => {
                 e.insert(EndpointInfo {
                     role_controls: HashMap::from_iter([(role_name, role_controls)]),
-                    controls: Some(controls),
+                    controls,
                 });
             }
             clashmap::Entry::Occupied(mut e) => {
                 let ep = e.get_mut();
-                ep.controls = Some(controls);
+                ep.controls = controls;
                 if ep.role_controls.len() < self.config.max_roles {
                     ep.role_controls.insert(role_name, role_controls);
                 }
             }
         }
     }

+    pub(crate) fn insert_endpoint_access_err(
+        &self,
+        endpoint_id: EndpointIdInt,
+        role_name: RoleNameInt,
+        msg: Box<ControlPlaneErrorMessage>,
+        ttl: Option<Duration>,
+    ) {
+        if self.cache.len() >= self.config.size {
+            // If there are too many entries, wait until the next gc cycle.
+            return;
+        }
+
+        debug!(
+            key = &*endpoint_id,
+            "created a cache entry for an endpoint access error"
+        );
+
+        let ttl = ttl.unwrap_or(self.config.ttl);
+
+        let controls = if msg.get_reason() == Reason::RoleProtected {
+            // RoleProtected is the only role-specific error that control plane can give us.
+            // If a given role name does not exist, it still returns a successful response,
+            // just with an empty secret.
+            None
+        } else {
+            // We can cache all the other errors in EndpointInfo.controls,
+            // because they don't depend on what role name we pass to control plane.
+            Some(Entry::new(Err(msg.clone()), ttl))
+        };
+
+        let role_controls = Entry::new(Err(msg), ttl);
+
+        match self.cache.entry(endpoint_id) {
+            clashmap::Entry::Vacant(e) => {
+                e.insert(EndpointInfo {
+                    role_controls: HashMap::from_iter([(role_name, role_controls)]),
+                    controls,
+                });
+            }
+            clashmap::Entry::Occupied(mut e) => {
+                let ep = e.get_mut();
+                if let Some(entry) = &ep.controls
+                    && !entry.is_expired()
+                    && entry.value.is_ok()
+                {
+                    // If we have cached non-expired, non-error controls, keep them.
+                } else {
+                    ep.controls = controls;
+                }
+                if ep.role_controls.len() < self.config.max_roles {
+                    ep.role_controls.insert(role_name, role_controls);
+                }
+            }
+        }
+    }
@@ -245,7 +324,7 @@ impl ProjectInfoCacheImpl {
             return;
         };

-        if role_controls.get().expires_at <= Instant::now() {
+        if role_controls.get().is_expired() {
             role_controls.remove();
         }
     }
@@ -284,13 +363,11 @@ impl ProjectInfoCacheImpl {

 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
     use super::*;
-    use crate::control_plane::messages::EndpointRateLimitConfig;
+    use crate::control_plane::messages::{Details, EndpointRateLimitConfig, ErrorInfo, Status};
     use crate::control_plane::{AccessBlockerFlags, AuthSecret};
     use crate::scram::ServerSecret;
-    use crate::types::ProjectId;
+    use std::sync::Arc;

     #[tokio::test]
     async fn test_project_info_cache_settings() {
@@ -301,9 +378,9 @@ mod tests {
             ttl: Duration::from_secs(1),
             gc_interval: Duration::from_secs(600),
         });
-        let project_id: ProjectId = "project".into();
+        let project_id: Option<ProjectIdInt> = Some(ProjectIdInt::from(&"project".into()));
         let endpoint_id: EndpointId = "endpoint".into();
-        let account_id: Option<AccountIdInt> = None;
+        let account_id = None;

         let user1: RoleName = "user1".into();
         let user2: RoleName = "user2".into();
@@ -316,7 +393,7 @@ mod tests {

         cache.insert_endpoint_access(
             account_id,
-            (&project_id).into(),
+            project_id,
             (&endpoint_id).into(),
             (&user1).into(),
             EndpointAccessControl {
@@ -332,7 +409,7 @@ mod tests {

         cache.insert_endpoint_access(
             account_id,
-            (&project_id).into(),
+            project_id,
             (&endpoint_id).into(),
             (&user2).into(),
             EndpointAccessControl {
@@ -346,11 +423,17 @@ mod tests {
             },
         );

-        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
-        assert_eq!(cached.secret, secret1);
+        let (cached, ttl) = cache
+            .get_role_secret_with_ttl(&endpoint_id, &user1)
+            .unwrap();
+        assert_eq!(cached.unwrap().secret, secret1);
+        assert_eq!(ttl, cache.config.ttl);

-        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
-        assert_eq!(cached.secret, secret2);
+        let (cached, ttl) = cache
+            .get_role_secret_with_ttl(&endpoint_id, &user2)
+            .unwrap();
+        assert_eq!(cached.unwrap().secret, secret2);
+        assert_eq!(ttl, cache.config.ttl);

         // Shouldn't add more than 2 roles.
let user3: RoleName = "user3".into(); @@ -358,7 +441,7 @@ mod tests { cache.insert_endpoint_access( account_id, - (&project_id).into(), + project_id, (&endpoint_id).into(), (&user3).into(), EndpointAccessControl { @@ -372,17 +455,144 @@ mod tests { }, ); - assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); + assert!( + cache + .get_role_secret_with_ttl(&endpoint_id, &user3) + .is_none() + ); - let cached = cache.get_endpoint_access(&endpoint_id).unwrap(); + let cached = cache + .get_endpoint_access_with_ttl(&endpoint_id) + .unwrap() + .0 + .unwrap(); assert_eq!(cached.allowed_ips, allowed_ips); tokio::time::advance(Duration::from_secs(2)).await; - let cached = cache.get_role_secret(&endpoint_id, &user1); + let cached = cache.get_role_secret_with_ttl(&endpoint_id, &user1); assert!(cached.is_none()); - let cached = cache.get_role_secret(&endpoint_id, &user2); + let cached = cache.get_role_secret_with_ttl(&endpoint_id, &user2); assert!(cached.is_none()); - let cached = cache.get_endpoint_access(&endpoint_id); + let cached = cache.get_endpoint_access_with_ttl(&endpoint_id); assert!(cached.is_none()); } + + #[tokio::test] + async fn test_caching_project_info_errors() { + let cache = ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 10, + max_roles: 10, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + }); + let project_id = Some(ProjectIdInt::from(&"project".into())); + let endpoint_id: EndpointId = "endpoint".into(); + let account_id = None; + + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + + let role_msg = Box::new(ControlPlaneErrorMessage { + error: "role is protected and cannot be used for password-based authentication" + .to_owned() + .into_boxed_str(), + http_status_code: http::StatusCode::NOT_FOUND, + status: Some(Status { + code: "PERMISSION_DENIED".to_owned().into_boxed_str(), + message: "role is protected and cannot be used for password-based authentication" + .to_owned() + .into_boxed_str(), + details: Details { + error_info: Some(ErrorInfo { + reason: Reason::RoleProtected, + }), + retry_info: None, + user_facing_message: None, + }, + }), + }); + + let generic_msg = Box::new(ControlPlaneErrorMessage { + error: "oh noes".to_owned().into_boxed_str(), + http_status_code: http::StatusCode::NOT_FOUND, + status: None, + }); + + let get_role_secret = |endpoint_id, role_name| { + cache + .get_role_secret_with_ttl(endpoint_id, role_name) + .unwrap() + .0 + }; + let get_endpoint_access = + |endpoint_id| cache.get_endpoint_access_with_ttl(endpoint_id).unwrap().0; + + // stores role-specific errors only for get_role_secret + cache.insert_endpoint_access_err( + (&endpoint_id).into(), + (&user1).into(), + role_msg.clone(), + None, + ); + assert_eq!( + get_role_secret(&endpoint_id, &user1).unwrap_err().error, + role_msg.error + ); + assert!(cache.get_endpoint_access_with_ttl(&endpoint_id).is_none()); + + // stores non-role specific errors for both get_role_secret and get_endpoint_access + cache.insert_endpoint_access_err( + (&endpoint_id).into(), + (&user1).into(), + generic_msg.clone(), + None, + ); + assert_eq!( + get_role_secret(&endpoint_id, &user1).unwrap_err().error, + generic_msg.error + ); + assert_eq!( + get_endpoint_access(&endpoint_id).unwrap_err().error, + generic_msg.error + ); + + // error isn't returned for other roles in the same endpoint + assert!( + cache + .get_role_secret_with_ttl(&endpoint_id, &user2) + .is_none() + ); + + // 
success for a role does not overwrite errors for other roles + cache.insert_endpoint_access( + account_id, + project_id, + (&endpoint_id).into(), + (&user2).into(), + EndpointAccessControl { + allowed_ips: Arc::new(vec![]), + allowed_vpce: Arc::new(vec![]), + flags: AccessBlockerFlags::default(), + rate_limits: EndpointRateLimitConfig::default(), + }, + RoleAccessControl { + secret: secret.clone(), + }, + ); + assert!(get_role_secret(&endpoint_id, &user1).is_err()); + assert!(get_role_secret(&endpoint_id, &user2).is_ok()); + // ...but does clear the access control error + assert!(get_endpoint_access(&endpoint_id).is_ok()); + + // storing an error does not overwrite successful access control response + cache.insert_endpoint_access_err( + (&endpoint_id).into(), + (&user2).into(), + generic_msg.clone(), + None, + ); + assert!(get_role_secret(&endpoint_id, &user2).is_err()); + assert!(get_endpoint_access(&endpoint_id).is_ok()); + } } diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index bb785b8b0c..8a0403c0b0 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -68,6 +68,66 @@ impl NeonControlPlaneClient { self.endpoint.url().as_str() } + async fn get_and_cache_auth_info( + &self, + ctx: &RequestContext, + endpoint: &EndpointId, + role: &RoleName, + cache_key: &EndpointId, + extract: impl FnOnce(&EndpointAccessControl, &RoleAccessControl) -> T, + ) -> Result { + match self.do_get_auth_req(ctx, endpoint, role).await { + Ok(auth_info) => { + let control = EndpointAccessControl { + allowed_ips: Arc::new(auth_info.allowed_ips), + allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), + flags: auth_info.access_blocker_flags, + rate_limits: auth_info.rate_limits, + }; + let role_control = RoleAccessControl { + secret: auth_info.secret, + }; + let res = extract(&control, &role_control); + + self.caches.project_info.insert_endpoint_access( + auth_info.account_id, + auth_info.project_id, + cache_key.into(), + role.into(), + control, + role_control, + ); + + if let Some(project_id) = auth_info.project_id { + ctx.set_project_id(project_id); + } + + Ok(res) + } + Err(err) => match err { + GetAuthInfoError::ApiError(ControlPlaneError::Message(ref msg)) => { + let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info); + + // If we can retry this error, do not cache it, + // unless we were given a retry delay. 
+ if msg.could_retry() && retry_info.is_none() { + return Err(err); + } + + self.caches.project_info.insert_endpoint_access_err( + cache_key.into(), + role.into(), + msg.clone(), + retry_info.map(|r| Duration::from_millis(r.retry_delay_ms)), + ); + + Err(err) + } + err => Err(err), + }, + } + } + async fn do_get_auth_req( &self, ctx: &RequestContext, @@ -284,43 +344,34 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { ctx: &RequestContext, endpoint: &EndpointId, role: &RoleName, - ) -> Result { - let normalized_ep = &endpoint.normalize(); - if let Some(secret) = self + ) -> Result { + let key = endpoint.normalize(); + + if let Some((role_control, ttl)) = self .caches .project_info - .get_role_secret(normalized_ep, role) + .get_role_secret_with_ttl(&key, role) { - return Ok(secret); + return match role_control { + Err(mut msg) => { + info!(key = &*key, "found cached get_role_access_control error"); + + // if retry_delay_ms is set change it to the remaining TTL + replace_retry_delay_ms(&mut msg, |_| ttl.as_millis() as u64); + + Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg))) + } + Ok(role_control) => { + debug!(key = &*key, "found cached role access control"); + Ok(role_control) + } + }; } - let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?; - - let control = EndpointAccessControl { - allowed_ips: Arc::new(auth_info.allowed_ips), - allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), - flags: auth_info.access_blocker_flags, - rate_limits: auth_info.rate_limits, - }; - let role_control = RoleAccessControl { - secret: auth_info.secret, - }; - - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - - self.caches.project_info.insert_endpoint_access( - auth_info.account_id, - project_id, - normalized_ep_int, - role.into(), - control, - role_control.clone(), - ); - ctx.set_project_id(project_id); - } - - Ok(role_control) + self.get_and_cache_auth_info(ctx, endpoint, role, &key, |_, role_control| { + role_control.clone() + }) + .await } #[tracing::instrument(skip_all)] @@ -330,38 +381,30 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { endpoint: &EndpointId, role: &RoleName, ) -> Result { - let normalized_ep = &endpoint.normalize(); - if let Some(control) = self.caches.project_info.get_endpoint_access(normalized_ep) { - return Ok(control); + let key = endpoint.normalize(); + + if let Some((control, ttl)) = self.caches.project_info.get_endpoint_access_with_ttl(&key) { + return match control { + Err(mut msg) => { + info!( + key = &*key, + "found cached get_endpoint_access_control error" + ); + + // if retry_delay_ms is set change it to the remaining TTL + replace_retry_delay_ms(&mut msg, |_| ttl.as_millis() as u64); + + Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg))) + } + Ok(control) => { + debug!(key = &*key, "found cached endpoint access control"); + Ok(control) + } + }; } - let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?; - - let control = EndpointAccessControl { - allowed_ips: Arc::new(auth_info.allowed_ips), - allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), - flags: auth_info.access_blocker_flags, - rate_limits: auth_info.rate_limits, - }; - let role_control = RoleAccessControl { - secret: auth_info.secret, - }; - - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - - self.caches.project_info.insert_endpoint_access( - auth_info.account_id, - project_id, - normalized_ep_int, - role.into(), - 
control.clone(), - role_control, - ); - ctx.set_project_id(project_id); - } - - Ok(control) + self.get_and_cache_auth_info(ctx, endpoint, role, &key, |control, _| control.clone()) + .await } #[tracing::instrument(skip_all)] @@ -390,13 +433,9 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { info!(key = &*key, "found cached wake_compute error"); // if retry_delay_ms is set, reduce it by the amount of time it spent in cache - if let Some(status) = &mut msg.status { - if let Some(retry_info) = &mut status.details.retry_info { - retry_info.retry_delay_ms = retry_info - .retry_delay_ms - .saturating_sub(created_at.elapsed().as_millis() as u64) - } - } + replace_retry_delay_ms(&mut msg, |delay| { + delay.saturating_sub(created_at.elapsed().as_millis() as u64) + }); Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( msg, @@ -478,6 +517,14 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { } } +fn replace_retry_delay_ms(msg: &mut ControlPlaneErrorMessage, f: impl FnOnce(u64) -> u64) { + if let Some(status) = &mut msg.status + && let Some(retry_info) = &mut status.details.retry_info + { + retry_info.retry_delay_ms = f(retry_info.retry_delay_ms); + } +} + /// Parse http response body, taking status code into account. fn parse_body serde::Deserialize<'a>>( status: StatusCode, diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index 12843e48c7..1e43010957 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -52,7 +52,7 @@ impl ReportableError for ControlPlaneError { | Reason::EndpointNotFound | Reason::EndpointDisabled | Reason::BranchNotFound - | Reason::InvalidEphemeralEndpointOptions => ErrorKind::User, + | Reason::WrongLsnOrTimestamp => ErrorKind::User, Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index cf193ed268..d44d7efcc3 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -107,7 +107,7 @@ pub(crate) struct ErrorInfo { // Schema could also have `metadata` field, but it's not structured. Skip it for now. } -#[derive(Clone, Copy, Debug, Deserialize, Default)] +#[derive(Clone, Copy, Debug, Deserialize, Default, PartialEq, Eq)] pub(crate) enum Reason { /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles. #[serde(rename = "ROLE_PROTECTED")] @@ -133,9 +133,9 @@ pub(crate) enum Reason { /// or that the subject doesn't have enough permissions to access the requested branch. #[serde(rename = "BRANCH_NOT_FOUND")] BranchNotFound, - /// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong. - #[serde(rename = "INVALID_EPHEMERAL_OPTIONS")] - InvalidEphemeralEndpointOptions, + /// WrongLsnOrTimestamp indicates that the specified LSN or timestamp are wrong. + #[serde(rename = "WRONG_LSN_OR_TIMESTAMP")] + WrongLsnOrTimestamp, /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded. 
#[serde(rename = "RATE_LIMIT_EXCEEDED")] RateLimitExceeded, @@ -205,7 +205,7 @@ impl Reason { | Reason::EndpointNotFound | Reason::EndpointDisabled | Reason::BranchNotFound - | Reason::InvalidEphemeralEndpointOptions => false, + | Reason::WrongLsnOrTimestamp => false, // we were asked to go away Reason::RateLimitExceeded | Reason::NonDefaultBranchComputeTimeExceeded @@ -257,19 +257,19 @@ pub(crate) struct GetEndpointAccessControl { pub(crate) rate_limits: EndpointRateLimitConfig, } -#[derive(Copy, Clone, Deserialize, Default)] +#[derive(Copy, Clone, Deserialize, Default, Debug)] pub struct EndpointRateLimitConfig { pub connection_attempts: ConnectionAttemptsLimit, } -#[derive(Copy, Clone, Deserialize, Default)] +#[derive(Copy, Clone, Deserialize, Default, Debug)] pub struct ConnectionAttemptsLimit { pub tcp: Option, pub ws: Option, pub http: Option, } -#[derive(Copy, Clone, Deserialize)] +#[derive(Copy, Clone, Deserialize, Debug)] pub struct LeakyBucketSetting { pub rps: f64, pub burst: f64, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index a8c59dad0c..9bbd3f4fb7 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -82,7 +82,7 @@ impl NodeInfo { } } -#[derive(Copy, Clone, Default)] +#[derive(Copy, Clone, Default, Debug)] pub(crate) struct AccessBlockerFlags { pub public_access_blocked: bool, pub vpc_access_blocked: bool, @@ -92,12 +92,12 @@ pub(crate) type NodeInfoCache = TimedLru>>; pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct RoleAccessControl { pub secret: Option, } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct EndpointAccessControl { pub allowed_ips: Arc>, pub allowed_vpce: Arc>, From 791b5d736b921d54aed868a944f522d551ad0a8e Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Fri, 18 Jul 2025 18:09:20 +0100 Subject: [PATCH 30/39] Fixes #10441: control_plane README incorrect neon init args (#12646) ## Problem As reported in #10441 the `control_plane/README/md` incorrectly specified that `--pg-version` should be specified in the `cargo neon init` command. This is not the case and causes an invalid argument error. ## Summary of changes Fix the README ## Test Plan I verified that the steps in the README now work locally. I connected to the started postgres endpoint and executed some basic metadata queries. --- control_plane/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/control_plane/README.md b/control_plane/README.md index aa6f935e27..60c6120d82 100644 --- a/control_plane/README.md +++ b/control_plane/README.md @@ -8,10 +8,10 @@ code changes locally, but not suitable for running production systems. ## Example: Start with Postgres 16 -To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands. +To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 2 of the start-up commands. ```shell -cargo neon init --pg-version 16 +cargo neon init cargo neon start cargo neon tenant create --set-default --pg-version 16 cargo neon endpoint create main --pg-version 16 From 1406bdc6a831423b8f0a70a63260c77da50afefb Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 21 Jul 2025 12:52:18 +0200 Subject: [PATCH 31/39] pageserver: improve gRPC cancellation (#12635) ## Problem The gRPC page service does not properly react to shutdown cancellation. 
In particular, Tonic considers an open GetPage stream to be an in-flight request, so it will wait for it to complete before shutting down. Touches [LKB-191](https://databricks.atlassian.net/browse/LKB-191). ## Summary of changes Properly react to the server's cancellation token and take out gate guards in gRPC request handlers. Also document cancellation handling. In particular, that Tonic will drop futures when clients go away (e.g. on timeout or shutdown), so the read path must be cancellation-safe. It is believed to be (modulo possible logging noise), but this will be verified later. --- pageserver/src/page_service.rs | 72 ++++++++++++++++++++++------- pageserver/src/pgdatadir_mapping.rs | 4 ++ pageserver/src/tenant/timeline.rs | 3 ++ 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1fc7e4eac7..2b266c6811 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3218,13 +3218,25 @@ where pub struct GrpcPageServiceHandler { tenant_manager: Arc, ctx: RequestContext, + + /// Cancelled to shut down the server. Tonic will shut down in response to this, but wait for + /// in-flight requests to complete. Any tasks we spawn ourselves must respect this token. + cancel: CancellationToken, + + /// Any tasks we spawn ourselves should clone this gate guard, so that we can wait for them to + /// complete during shutdown. Request handlers implicitly hold this guard already. gate_guard: GateGuard, + + /// `get_vectored` concurrency setting. get_vectored_concurrent_io: GetVectoredConcurrentIo, } impl GrpcPageServiceHandler { /// Spawns a gRPC server for the page service. /// + /// Returns a `CancellableTask` handle that can be used to shut down the server. It waits for + /// any in-flight requests and tasks to complete first. + /// /// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we /// need to reimplement the TCP+TLS accept loop ourselves. pub fn spawn( @@ -3234,12 +3246,15 @@ impl GrpcPageServiceHandler { get_vectored_concurrent_io: GetVectoredConcurrentIo, listener: std::net::TcpListener, ) -> anyhow::Result { + // Set up a cancellation token for shutting down the server, and a gate to wait for all + // requests and spawned tasks to complete. let cancel = CancellationToken::new(); + let gate = Gate::default(); + let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler) .download_behavior(DownloadBehavior::Download) .perf_span_dispatch(perf_trace_dispatch) .detached_child(); - let gate = Gate::default(); // Set up the TCP socket. We take a preconfigured TcpListener to bind the // port early during startup. @@ -3270,6 +3285,7 @@ impl GrpcPageServiceHandler { let page_service_handler = GrpcPageServiceHandler { tenant_manager, ctx, + cancel: cancel.clone(), gate_guard: gate.enter().expect("gate was just created"), get_vectored_concurrent_io, }; @@ -3306,19 +3322,20 @@ impl GrpcPageServiceHandler { .build_v1()?; let server = server.add_service(reflection_service); - // Spawn server task. + // Spawn server task. It runs until the cancellation token fires and in-flight requests and + // tasks complete. The `CancellableTask` will wait for the task's join handle, which + // implicitly waits for the gate to close. 
let task_cancel = cancel.clone(); let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "grpc listener", + "grpc pageservice listener", async move { - let result = server + server .serve_with_incoming_shutdown(incoming, task_cancel.cancelled()) - .await; - if result.is_ok() { - // TODO: revisit shutdown logic once page service is implemented. - gate.close().await; - } - result + .await?; + // Server exited cleanly. All requests should have completed by now. Wait for any + // spawned tasks to complete as well (e.g. IoConcurrency sidecars) via the gate. + gate.close().await; + anyhow::Ok(()) }, )); @@ -3508,7 +3525,10 @@ impl GrpcPageServiceHandler { /// Implements the gRPC page service. /// -/// TODO: cancellation. +/// On client disconnect (e.g. timeout or client shutdown), Tonic will drop the request handler +/// futures, so the read path must be cancellation-safe. On server shutdown, Tonic will wait for +/// in-flight requests to complete. +/// /// TODO: when the libpq impl is removed, remove the Pagestream types and inline the handler code. #[tonic::async_trait] impl proto::PageService for GrpcPageServiceHandler { @@ -3593,8 +3613,14 @@ impl proto::PageService for GrpcPageServiceHandler { // Spawn a task to run the basebackup. let span = Span::current(); + let gate_guard = self + .gate_guard + .try_clone() + .map_err(|_| tonic::Status::unavailable("shutting down"))?; let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE); let jh = tokio::spawn(async move { + let _gate_guard = gate_guard; // keep gate open until task completes + let gzip_level = match req.compression { page_api::BaseBackupCompression::None => None, // NB: using fast compression because it's on the critical path for compute @@ -3718,15 +3744,17 @@ impl proto::PageService for GrpcPageServiceHandler { .await?; // Spawn an IoConcurrency sidecar, if enabled. - let Ok(gate_guard) = self.gate_guard.try_clone() else { - return Err(tonic::Status::unavailable("shutting down")); - }; + let gate_guard = self + .gate_guard + .try_clone() + .map_err(|_| tonic::Status::unavailable("shutting down"))?; let io_concurrency = IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard); - // Spawn a task to handle the GetPageRequest stream. + // Construct the GetPageRequest stream handler. let span = Span::current(); let ctx = self.ctx.attached_child(); + let cancel = self.cancel.clone(); let mut reqs = req.into_inner(); let resps = async_stream::try_stream! { @@ -3734,7 +3762,19 @@ impl proto::PageService for GrpcPageServiceHandler { .get(ttid.tenant_id, ttid.timeline_id, shard_selector) .await? .downgrade(); - while let Some(req) = reqs.message().await? { + loop { + // NB: Tonic considers the entire stream to be an in-flight request and will wait + // for it to complete before shutting down. React to cancellation between requests. + let req = tokio::select! 
{ + biased; + _ = cancel.cancelled() => Err(tonic::Status::unavailable("shutting down")), + + result = reqs.message() => match result { + Ok(Some(req)) => Ok(req), + Ok(None) => break, // client closed the stream + Err(err) => Err(err), + }, + }?; let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default(); let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone()) .instrument(span.clone()) // propagate request span diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index cda08f2cc4..8b76d980fc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -286,6 +286,10 @@ impl Timeline { /// Like [`Self::get_rel_page_at_lsn`], but returns a batch of pages. /// /// The ordering of the returned vec corresponds to the ordering of `pages`. + /// + /// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future + /// if the client goes away (e.g. due to timeout or cancellation). + /// TODO: verify that it actually is cancellation-safe. pub(crate) async fn get_rel_page_at_lsn_batched( &self, pages: impl ExactSizeIterator, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f25555929..06e02a7386 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1324,6 +1324,9 @@ impl Timeline { /// /// This naive implementation will be replaced with a more efficient one /// which actually vectorizes the read path. + /// + /// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future + /// if the client goes away (e.g. due to timeout or cancellation). pub(crate) async fn get_vectored( &self, query: VersionedKeySpaceQuery, From e181b996c3e6ef78a66ad25f4a7583f2e6016193 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 21 Jul 2025 12:56:20 +0200 Subject: [PATCH 32/39] utils: move `ShardStripeSize` into `shard` module (#12640) ## Problem `ShardStripeSize` will be used in the compute spec and internally in the communicator. It shouldn't require pulling in all of `pageserver_api`. ## Summary of changes Move `ShardStripeSize` into `utils::shard`, along with other basic shard types. Also remove the `Default` implementation, to discourage clients from falling back to a default (it's generally a footgun). The type is still re-exported from `pageserver_api::shard`, along with all the other shard types. 
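In miniature, the footgun this removes: with no `Default` impl, any fallback stripe size has to be a named constant at the call site. A sketch (the 2048 value mirrors `DEFAULT_STRIPE_SIZE` but is only illustrative here):

```
/// Stand-in for utils::shard::ShardStripeSize: no Default impl, so callers
/// can't silently fall back to a stale or wrong stripe size.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct ShardStripeSize(pub u32);

/// Any fallback must now be a named, greppable constant (value illustrative).
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(2048);

fn main() {
    // let s = ShardStripeSize::default(); // no longer compiles
    let s = DEFAULT_STRIPE_SIZE; // the fallback is an explicit, visible choice
    assert_eq!(s, ShardStripeSize(2048));
}
```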
--- control_plane/src/endpoint.rs | 2 +- libs/pageserver_api/src/shard.rs | 16 ------ libs/utils/src/shard.rs | 12 +++++ pageserver/client_grpc/src/client.rs | 6 +-- pageserver/client_grpc/src/split.rs | 4 +- pageserver/ctl/src/key.rs | 6 ++- pageserver/src/pgdatadir_mapping.rs | 3 +- pageserver/src/tenant/mgr.rs | 2 +- .../tenant/remote_timeline_client/manifest.rs | 2 +- pageserver/src/tenant/timeline/handle.rs | 4 +- safekeeper/src/send_interpreted_wal.rs | 51 +++++++------------ storage_controller/src/scheduler.rs | 8 +-- 12 files changed, 49 insertions(+), 67 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 24956e3ac9..4c569d7005 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -65,7 +65,6 @@ use jsonwebtoken::jwk::{ OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse, }; use nix::sys::signal::{Signal, kill}; -use pageserver_api::shard::ShardStripeSize; use pem::Pem; use reqwest::header::CONTENT_TYPE; use safekeeper_api::PgMajorVersion; @@ -77,6 +76,7 @@ use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef}; use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; +use utils::shard::ShardStripeSize; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index d6f4cd5e66..74f5f14f87 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -69,22 +69,6 @@ impl Hash for ShardIdentity { } } -/// Stripe size in number of pages -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardStripeSize(pub u32); - -impl Default for ShardStripeSize { - fn default() -> Self { - DEFAULT_STRIPE_SIZE - } -} - -impl std::fmt::Display for ShardStripeSize { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } -} - /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 5a0edf8cea..6ad6cab3a8 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -25,6 +25,12 @@ pub struct ShardIndex { pub shard_count: ShardCount, } +/// Stripe size as number of pages. +/// +/// NB: don't implement Default, so callers don't lazily use it by mistake. See DEFAULT_STRIPE_SIZE. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardStripeSize(pub u32); + /// Formatting helper, for generating the `shard_id` label in traces. 
pub struct ShardSlug<'a>(&'a TenantShardId); @@ -177,6 +183,12 @@ impl std::fmt::Display for ShardCount { } } +impl std::fmt::Display for ShardStripeSize { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 3a9edc7092..e4670f74cc 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -8,6 +8,7 @@ use anyhow::anyhow; use arc_swap::ArcSwap; use futures::stream::FuturesUnordered; use futures::{FutureExt as _, StreamExt as _}; +use pageserver_api::shard::DEFAULT_STRIPE_SIZE; use tonic::codec::CompressionEncoding; use tracing::{debug, instrument}; use utils::logging::warn_slow; @@ -16,10 +17,9 @@ use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool} use crate::retry::Retry; use crate::split::GetPageSplitter; use compute_api::spec::PageserverProtocol; -use pageserver_api::shard::ShardStripeSize; use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; -use utils::shard::{ShardCount, ShardIndex, ShardNumber}; +use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize}; /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up /// when full. @@ -418,7 +418,7 @@ impl ShardSpec { if stripe_size.is_none() && !count.is_unsharded() { return Err(anyhow!("stripe size must be given for sharded tenants")); } - let stripe_size = stripe_size.unwrap_or_default(); + let stripe_size = stripe_size.unwrap_or(DEFAULT_STRIPE_SIZE); // Validate the shard spec. for (shard_id, url) in &urls { diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs index b7539b900c..ca8965b8dd 100644 --- a/pageserver/client_grpc/src/split.rs +++ b/pageserver/client_grpc/src/split.rs @@ -3,9 +3,9 @@ use std::collections::HashMap; use bytes::Bytes; use pageserver_api::key::rel_block_to_key; -use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; +use pageserver_api::shard::key_to_shard_number; use pageserver_page_api as page_api; -use utils::shard::{ShardCount, ShardIndex, ShardNumber}; +use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize}; /// Splits GetPageRequests that straddle shard boundaries and assembles the responses. /// TODO: add tests for this. diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index c4daafdfd0..75bab94757 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -4,7 +4,7 @@ use anyhow::Context; use clap::Parser; use pageserver_api::key::Key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::shard::{ShardCount, ShardStripeSize}; +use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize}; #[derive(Parser)] pub(super) struct DescribeKeyCommand { @@ -128,7 +128,9 @@ impl DescribeKeyCommand { // seeing the sharding placement might be confusing, so leave it out unless shard // count was given. 
- let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default(); + let stripe_size = stripe_size + .map(ShardStripeSize) + .unwrap_or(DEFAULT_STRIPE_SIZE); println!( "# placement with shard_count: {} and stripe_size: {}:", shard_count.0, stripe_size.0 diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 8b76d980fc..ea0fb5de2f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -2912,9 +2912,8 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); mod tests { use hex_literal::hex; use pageserver_api::models::ShardParameters; - use pageserver_api::shard::ShardStripeSize; use utils::id::TimelineId; - use utils::shard::{ShardCount, ShardNumber}; + use utils::shard::{ShardCount, ShardNumber, ShardStripeSize}; use super::*; use crate::DEFAULT_PG_VERSION; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 52f67abde5..01db09ed59 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -328,7 +328,7 @@ fn emergency_generations( LocationMode::Attached(alc) => TenantStartupMode::Attached(( alc.attach_mode, alc.generation, - ShardStripeSize::default(), + lc.shard.stripe_size, )), LocationMode::Secondary(_) => TenantStartupMode::Secondary, }, diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 7dba4508e2..41e9647d8f 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,8 +1,8 @@ use chrono::NaiveDateTime; -use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; use utils::lsn::Lsn; +use utils::shard::ShardStripeSize; /// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant /// shard-wide information that must be persisted in remote storage. 
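A side note on why the move is safe for persisted data such as the tenant shard manifest: a serde newtype serializes as its inner value, so relocating the type between crates leaves the stored JSON unchanged. A quick sketch with simplified stand-in structs:

```
use serde::{Deserialize, Serialize};

// Stand-in for utils::shard::ShardStripeSize. A serde newtype struct
// serializes as its inner value, so the on-disk format is just a number.
#[derive(Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Debug)]
struct ShardStripeSize(pub u32);

#[derive(Serialize, Deserialize, Debug)]
struct Manifest {
    stripe_size: ShardStripeSize,
}

fn main() {
    let json =
        serde_json::to_string(&Manifest { stripe_size: ShardStripeSize(2048) }).unwrap();
    assert_eq!(json, r#"{"stripe_size":2048}"#);
    let back: Manifest = serde_json::from_str(&json).unwrap();
    assert_eq!(back.stripe_size, ShardStripeSize(2048));
}
```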
diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 7bca66190f..0b118dd65d 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -654,7 +654,7 @@ mod tests { use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; use pageserver_api::models::ShardParameters; use pageserver_api::reltag::RelTag; - use pageserver_api::shard::ShardStripeSize; + use pageserver_api::shard::DEFAULT_STRIPE_SIZE; use utils::shard::ShardCount; use utils::sync::gate::GateGuard; @@ -955,7 +955,7 @@ mod tests { }); let child_params = ShardParameters { count: ShardCount(2), - stripe_size: ShardStripeSize::default(), + stripe_size: DEFAULT_STRIPE_SIZE, }; let child0 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 72a436e25f..671798298b 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -742,7 +742,7 @@ mod tests { use std::str::FromStr; use std::time::Duration; - use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardIdentity}; use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion}; use tokio::sync::mpsc::error::TryRecvError; use utils::id::{NodeId, TenantTimelineId}; @@ -786,19 +786,13 @@ mod tests { MAX_SEND_SIZE, ); - let shard_0 = ShardIdentity::new( - ShardNumber(0), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_0 = + ShardIdentity::new(ShardNumber(0), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); - let shard_1 = ShardIdentity::new( - ShardNumber(1), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_1 = + ShardIdentity::new(ShardNumber(1), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); let mut shards = HashMap::new(); @@ -806,7 +800,7 @@ mod tests { let shard_id = ShardIdentity::new( ShardNumber(shard_number), ShardCount(SHARD_COUNT), - ShardStripeSize::default(), + DEFAULT_STRIPE_SIZE, ) .unwrap(); let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); @@ -934,12 +928,9 @@ mod tests { MAX_SEND_SIZE, ); - let shard_0 = ShardIdentity::new( - ShardNumber(0), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_0 = + ShardIdentity::new(ShardNumber(0), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); struct Sender { tx: Option>, @@ -1088,19 +1079,13 @@ mod tests { WAL_READER_BATCH_SIZE, ); - let shard_0 = ShardIdentity::new( - ShardNumber(0), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_0 = + ShardIdentity::new(ShardNumber(0), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); - let shard_1 = ShardIdentity::new( - ShardNumber(1), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_1 = + ShardIdentity::new(ShardNumber(1), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); let mut shards = HashMap::new(); @@ -1108,7 +1093,7 @@ mod tests { let shard_id = ShardIdentity::new( ShardNumber(shard_number), ShardCount(SHARD_COUNT), - ShardStripeSize::default(), + DEFAULT_STRIPE_SIZE, ) .unwrap(); let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index b86b4dfab1..23f002d32a 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -981,7 
+981,7 @@ mod tests { use pageserver_api::models::utilization::test_utilization; use pageserver_api::shard::ShardIdentity; use utils::id::TenantId; - use utils::shard::{ShardCount, ShardNumber, TenantShardId}; + use utils::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use super::*; use crate::tenant_shard::IntentState; @@ -1337,7 +1337,7 @@ mod tests { let shard_identity = ShardIdentity::new( tenant_shard_id.shard_number, tenant_shard_id.shard_count, - pageserver_api::shard::ShardStripeSize(1), + ShardStripeSize(1), ) .unwrap(); let mut shard = TenantShard::new( @@ -1411,7 +1411,7 @@ mod tests { let shard_identity = ShardIdentity::new( tenant_shard_id.shard_number, tenant_shard_id.shard_count, - pageserver_api::shard::ShardStripeSize(1), + ShardStripeSize(1), ) .unwrap(); let mut shard = TenantShard::new( @@ -1573,7 +1573,7 @@ mod tests { let shard_identity = ShardIdentity::new( tenant_shard_id.shard_number, tenant_shard_id.shard_count, - pageserver_api::shard::ShardStripeSize(1), + ShardStripeSize(1), ) .unwrap(); // 1 attached and 1 secondary. From 1e30b31fa75c07a1faf57fdf104543c1fc7c9d01 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Mon, 21 Jul 2025 13:10:10 +0200 Subject: [PATCH 33/39] Cherry pick: pg hooks for online table. (#12654) ## Problem ## Summary of changes --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 47304b9215..4cacada8bd 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 47304b921555b3f33eb3b49daada3078e774cfd7 +Subproject commit 4cacada8bde7f6424751a0727a657783c6a1d20b diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index cef72d5308..e5ee23d998 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit cef72d5308ddce3795a9043fcd94f8849f7f4800 +Subproject commit e5ee23d99874ea9f5b62f8acc7d076162ae95d6c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e9db1ff5a6..ad2b69b582 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e9db1ff5a6f3ca18f626ba3d62ab475e6c688a96 +Subproject commit ad2b69b58230290fc44c08fbe0c97981c64f6c7d diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a50d80c750..ba750903a9 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a50d80c7507e8ae9fc37bf1869051cf2d51370ab +Subproject commit ba750903a90dded8098f2f56d0b2a9012e6166af diff --git a/vendor/revisions.json b/vendor/revisions.json index 24a33dec42..d62f8e5736 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "a50d80c7507e8ae9fc37bf1869051cf2d51370ab" + "ba750903a90dded8098f2f56d0b2a9012e6166af" ], "v16": [ "16.9", - "e9db1ff5a6f3ca18f626ba3d62ab475e6c688a96" + "ad2b69b58230290fc44c08fbe0c97981c64f6c7d" ], "v15": [ "15.13", - "cef72d5308ddce3795a9043fcd94f8849f7f4800" + "e5ee23d99874ea9f5b62f8acc7d076162ae95d6c" ], "v14": [ "14.18", - "47304b921555b3f33eb3b49daada3078e774cfd7" + "4cacada8bde7f6424751a0727a657783c6a1d20b" ] } From 194b9ffc41434d4b574d9c0836f663d99db6c027 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 21 Jul 2025 13:43:26 +0200 Subject: [PATCH 34/39] pageserver: remove gRPC `CheckRelExists` (#12616) ## Problem Postgres will often immediately follow a relation existence check with a relation size query. 
This incurs two roundtrips, and may prevent effective caching. See [Slack thread](https://databricks.slack.com/archives/C091SDX74SC/p1751951732136139). Touches #11728. ## Summary of changes For the gRPC API: * Add an `allow_missing` parameter to `GetRelSize`, which returns `missing=true` instead of a `NotFound` error. * Remove `CheckRelExists`. There are no changes to libpq behavior. --- pageserver/client_grpc/src/client.rs | 17 ----- pageserver/page_api/proto/page_service.proto | 26 +++----- pageserver/page_api/src/client.rs | 13 +--- pageserver/page_api/src/model.rs | 65 +++++--------------- pageserver/src/page_service.rs | 65 +++++++------------- pageserver/src/pgdatadir_mapping.rs | 32 +++++++--- 6 files changed, 75 insertions(+), 143 deletions(-) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index e4670f74cc..42bc3c40ac 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -157,23 +157,6 @@ impl PageserverClient { Ok(()) } - /// Returns whether a relation exists. - #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] - pub async fn check_rel_exists( - &self, - req: page_api::CheckRelExistsRequest, - ) -> tonic::Result { - debug!("sending request: {req:?}"); - let resp = Self::with_retries(CALL_TIMEOUT, async |_| { - // Relation metadata is only available on shard 0. - let mut client = self.shards.load_full().get_zero().client().await?; - Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await - }) - .await?; - debug!("received response: {resp:?}"); - Ok(resp) - } - /// Returns the total size of a database, as # of bytes. #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))] pub async fn get_db_size( diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index d113a04a42..aaccbd5ef0 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -17,11 +17,11 @@ // grpcurl \ // -plaintext \ // -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \ -// -H "neon-shard-id: 0b10" \ +// -H "neon-shard-id: 0000" \ // -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \ // -H "authorization: Bearer $JWT" \ -// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' -// localhost:51051 page_api.PageService/CheckRelExists +// -d '{"read_lsn": {"request_lsn": 100000000, "not_modified_since_lsn": 1}, "db_oid": 1}' \ +// localhost:51051 page_api.PageService/GetDbSize // ``` // // TODO: consider adding neon-compute-mode ("primary", "static", "replica"). @@ -38,8 +38,8 @@ package page_api; import "google/protobuf/timestamp.proto"; service PageService { - // Returns whether a relation exists. - rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); + // NB: unlike libpq, there is no CheckRelExists in gRPC, at the compute team's request. Instead, + // use GetRelSize with allow_missing=true to check existence. // Fetches a base backup. rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); @@ -97,17 +97,6 @@ message RelTag { uint32 fork_number = 4; } -// Checks whether a relation exists, at the given LSN. Only valid on shard 0, -// other shards will error. -message CheckRelExistsRequest { - ReadLsn read_lsn = 1; - RelTag rel = 2; -} - -message CheckRelExistsResponse { - bool exists = 1; -} - // Requests a base backup. 
message GetBaseBackupRequest { // The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver. @@ -260,10 +249,15 @@ enum GetPageStatusCode { message GetRelSizeRequest { ReadLsn read_lsn = 1; RelTag rel = 2; + // If true, return missing=true for missing relations instead of a NotFound error. + bool allow_missing = 3; } message GetRelSizeResponse { + // The number of blocks in the relation. uint32 num_blocks = 1; + // If allow_missing=true, this is true for missing relations. + bool missing = 2; } // Requests an SLRU segment. Only valid on shard 0, other shards will error. diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index f70d0e7b28..fc27ea448b 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -69,16 +69,6 @@ impl Client { Ok(Self { inner }) } - /// Returns whether a relation exists. - pub async fn check_rel_exists( - &mut self, - req: CheckRelExistsRequest, - ) -> tonic::Result { - let req = proto::CheckRelExistsRequest::from(req); - let resp = self.inner.check_rel_exists(req).await?.into_inner(); - Ok(resp.into()) - } - /// Fetches a base backup. pub async fn get_base_backup( &mut self, @@ -114,7 +104,8 @@ impl Client { Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into())))) } - /// Returns the size of a relation, as # of blocks. + /// Returns the size of a relation as # of blocks, or None if allow_missing=true and the + /// relation does not exist. pub async fn get_rel_size( &mut self, req: GetRelSizeRequest, diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index a3286ecf15..6375c47998 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -139,50 +139,6 @@ impl From for proto::RelTag { } } -/// Checks whether a relation exists, at the given LSN. Only valid on shard 0, other shards error. -#[derive(Clone, Copy, Debug)] -pub struct CheckRelExistsRequest { - pub read_lsn: ReadLsn, - pub rel: RelTag, -} - -impl TryFrom for CheckRelExistsRequest { - type Error = ProtocolError; - - fn try_from(pb: proto::CheckRelExistsRequest) -> Result { - Ok(Self { - read_lsn: pb - .read_lsn - .ok_or(ProtocolError::Missing("read_lsn"))? - .try_into()?, - rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, - }) - } -} - -impl From for proto::CheckRelExistsRequest { - fn from(request: CheckRelExistsRequest) -> Self { - Self { - read_lsn: Some(request.read_lsn.into()), - rel: Some(request.rel.into()), - } - } -} - -pub type CheckRelExistsResponse = bool; - -impl From for CheckRelExistsResponse { - fn from(pb: proto::CheckRelExistsResponse) -> Self { - pb.exists - } -} - -impl From for proto::CheckRelExistsResponse { - fn from(exists: CheckRelExistsResponse) -> Self { - Self { exists } - } -} - /// Requests a base backup. #[derive(Clone, Copy, Debug)] pub struct GetBaseBackupRequest { @@ -707,6 +663,8 @@ impl From for tonic::Code { pub struct GetRelSizeRequest { pub read_lsn: ReadLsn, pub rel: RelTag, + /// If true, return missing=true for missing relations instead of a NotFound error. + pub allow_missing: bool, } impl TryFrom for GetRelSizeRequest { @@ -719,6 +677,7 @@ impl TryFrom for GetRelSizeRequest { .ok_or(ProtocolError::Missing("read_lsn"))? 
.try_into()?, rel: proto.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, + allow_missing: proto.allow_missing, }) } } @@ -728,21 +687,29 @@ impl From for proto::GetRelSizeRequest { Self { read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), + allow_missing: request.allow_missing, } } } -pub type GetRelSizeResponse = u32; +/// The size of a relation as number of blocks, or None if `allow_missing=true` and the relation +/// does not exist. +/// +/// INVARIANT: never None if `allow_missing=false` (returns `NotFound` error instead). +pub type GetRelSizeResponse = Option; impl From for GetRelSizeResponse { - fn from(proto: proto::GetRelSizeResponse) -> Self { - proto.num_blocks + fn from(pb: proto::GetRelSizeResponse) -> Self { + (!pb.missing).then_some(pb.num_blocks) } } impl From for proto::GetRelSizeResponse { - fn from(num_blocks: GetRelSizeResponse) -> Self { - Self { num_blocks } + fn from(resp: GetRelSizeResponse) -> Self { + Self { + num_blocks: resp.unwrap_or_default(), + missing: resp.is_none(), + } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2b266c6811..26a23da66f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1636,9 +1636,10 @@ impl PageServerHandler { let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ - Self::handle_get_nblocks_request(&shard, &req, &ctx) + Self::handle_get_nblocks_request(&shard, &req, false, &ctx) .instrument(span.clone()) .await + .map(|msg| msg.expect("allow_missing=false")) .map(|msg| (PagestreamBeMessage::Nblocks(msg), timer, ctx)) .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), ], @@ -2303,12 +2304,16 @@ impl PageServerHandler { Ok(PagestreamExistsResponse { req: *req, exists }) } + /// If `allow_missing` is true, returns None instead of Err on missing relations. Otherwise, + /// never returns None. It is only supported by the gRPC protocol, so we pass it separately to + /// avoid changing the libpq protocol types. #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( timeline: &Timeline, req: &PagestreamNblocksRequest, + allow_missing: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result, PageStreamError> { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -2320,20 +2325,25 @@ impl PageServerHandler { .await?; let n_blocks = timeline - .get_rel_size( + .get_rel_size_in_reldir( req.rel, Version::LsnRange(LsnRange { effective_lsn: lsn, request_lsn: req.hdr.request_lsn, }), + None, + allow_missing, ctx, ) .await?; + let Some(n_blocks) = n_blocks else { + return Ok(None); + }; - Ok(PagestreamNblocksResponse { + Ok(Some(PagestreamNblocksResponse { req: *req, n_blocks, - }) + })) } #[instrument(skip_all, fields(shard_id))] @@ -3539,39 +3549,6 @@ impl proto::PageService for GrpcPageServiceHandler { type GetPagesStream = Pin> + Send>>; - #[instrument(skip_all, fields(rel, lsn))] - async fn check_rel_exists( - &self, - req: tonic::Request, - ) -> Result, tonic::Status> { - let received_at = extract::(&req).0; - let timeline = self.get_request_timeline(&req).await?; - let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); - - // Validate the request, decorate the span, and convert it to a Pagestream request. 
- Self::ensure_shard_zero(&timeline)?; - let req: page_api::CheckRelExistsRequest = req.into_inner().try_into()?; - - span_record!(rel=%req.rel, lsn=%req.read_lsn); - - let req = PagestreamExistsRequest { - hdr: Self::make_hdr(req.read_lsn, None), - rel: req.rel, - }; - - // Execute the request and convert the response. - let _timer = Self::record_op_start_and_throttle( - &timeline, - metrics::SmgrQueryType::GetRelExists, - received_at, - ) - .await?; - - let resp = PageServerHandler::handle_get_rel_exists_request(&timeline, &req, &ctx).await?; - let resp: page_api::CheckRelExistsResponse = resp.exists; - Ok(tonic::Response::new(resp.into())) - } - #[instrument(skip_all, fields(lsn))] async fn get_base_backup( &self, @@ -3798,7 +3775,7 @@ impl proto::PageService for GrpcPageServiceHandler { Ok(tonic::Response::new(Box::pin(resps))) } - #[instrument(skip_all, fields(rel, lsn))] + #[instrument(skip_all, fields(rel, lsn, allow_missing))] async fn get_rel_size( &self, req: tonic::Request, @@ -3810,8 +3787,9 @@ impl proto::PageService for GrpcPageServiceHandler { // Validate the request, decorate the span, and convert it to a Pagestream request. Self::ensure_shard_zero(&timeline)?; let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?; + let allow_missing = req.allow_missing; - span_record!(rel=%req.rel, lsn=%req.read_lsn); + span_record!(rel=%req.rel, lsn=%req.read_lsn, allow_missing=%req.allow_missing); let req = PagestreamNblocksRequest { hdr: Self::make_hdr(req.read_lsn, None), @@ -3826,8 +3804,11 @@ impl proto::PageService for GrpcPageServiceHandler { ) .await?; - let resp = PageServerHandler::handle_get_nblocks_request(&timeline, &req, &ctx).await?; - let resp: page_api::GetRelSizeResponse = resp.n_blocks; + let resp = + PageServerHandler::handle_get_nblocks_request(&timeline, &req, allow_missing, &ctx) + .await?; + let resp: page_api::GetRelSizeResponse = resp.map(|resp| resp.n_blocks); + Ok(tonic::Response::new(resp.into())) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ea0fb5de2f..ab9cc88e5f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -504,8 +504,9 @@ impl Timeline { for rel in rels { let n_blocks = self - .get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), ctx) - .await?; + .get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), false, ctx) + .await? + .expect("allow_missing=false"); total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -521,10 +522,16 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { - self.get_rel_size_in_reldir(tag, version, None, ctx).await + Ok(self + .get_rel_size_in_reldir(tag, version, None, false, ctx) + .await? + .expect("allow_missing=false")) } - /// Get size of a relation file. The relation must exist, otherwise an error is returned. + /// Get size of a relation file. If `allow_missing` is true, returns None for missing relations, + /// otherwise errors. + /// + /// INVARIANT: never returns None if `allow_missing=false`. /// /// See [`Self::get_rel_exists_in_reldir`] on why we need `deserialized_reldir_v1`. 
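To make the new calling convention concrete, a hypothetical client-side helper (not part of this patch, but written against the request/response types it defines) covering the old `CheckRelExists` use case:

```
// Hypothetical helper on top of the gRPC client: existence is now derived
// from GetRelSize rather than a separate CheckRelExists RPC.
async fn rel_exists(
    client: &mut page_api::Client,
    read_lsn: page_api::ReadLsn,
    rel: page_api::RelTag,
) -> tonic::Result<bool> {
    let resp = client
        .get_rel_size(page_api::GetRelSizeRequest { read_lsn, rel, allow_missing: true })
        .await?;
    // `None` means "missing" (only possible with allow_missing=true);
    // `Some(_)` means the relation exists, and we got its size for free.
    Ok(resp.is_some())
}
```

If the caller needs the size anyway (the common pattern motivating this change), it can use the `Some(num_blocks)` value directly and save the second roundtrip.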
pub(crate) async fn get_rel_size_in_reldir( @@ -532,8 +539,9 @@ impl Timeline { tag: RelTag, version: Version<'_>, deserialized_reldir_v1: Option<(Key, &RelDirectory)>, + allow_missing: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result, PageReconstructError> { if tag.relnode == 0 { return Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -541,7 +549,15 @@ impl Timeline { } if let Some(nblocks) = self.get_cached_rel_size(&tag, version) { - return Ok(nblocks); + return Ok(Some(nblocks)); + } + + if allow_missing + && !self + .get_rel_exists_in_reldir(tag, version, deserialized_reldir_v1, ctx) + .await? + { + return Ok(None); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) @@ -553,7 +569,7 @@ impl Timeline { // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. Tolerate that by claiming that // any non-existent FSM fork has size 0. - return Ok(0); + return Ok(Some(0)); } let key = rel_size_to_key(tag); @@ -562,7 +578,7 @@ impl Timeline { self.update_cached_rel_size(tag, version, nblocks); - Ok(nblocks) + Ok(Some(nblocks)) } /// Does the relation exist? From 5a48365fb92124d6367ae8953dc9d97e76fd3f06 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 21 Jul 2025 14:28:39 +0200 Subject: [PATCH 35/39] pageserver/client_grpc: don't set stripe size for unsharded tenants (#12639) ## Problem We've had bugs where the compute would use the stale default stripe size from an unsharded tenant after the tenant split with a new stripe size. ## Summary of changes Never specify a stripe size for unsharded tenants, to guard against misuse. Only specify it once tenants are sharded and the stripe size can't change. Also opportunistically changes `GetPageSplitter` to return `anyhow::Result`, since we'll be using this in other code paths as well (specifically during server-side shard splits). --- pageserver/client_grpc/src/client.rs | 35 +++++++---- pageserver/client_grpc/src/split.rs | 90 +++++++++++++++++----------- 2 files changed, 78 insertions(+), 47 deletions(-) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 42bc3c40ac..e6a90fb582 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -8,7 +8,6 @@ use anyhow::anyhow; use arc_swap::ArcSwap; use futures::stream::FuturesUnordered; use futures::{FutureExt as _, StreamExt as _}; -use pageserver_api::shard::DEFAULT_STRIPE_SIZE; use tonic::codec::CompressionEncoding; use tracing::{debug, instrument}; use utils::logging::warn_slow; @@ -141,8 +140,8 @@ impl PageserverClient { if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size { return Err(anyhow!( "can't change stripe size from {} to {}", - old.stripe_size, - shard_spec.stripe_size + old.stripe_size.expect("always Some when sharded"), + shard_spec.stripe_size.expect("always Some when sharded") )); } @@ -232,13 +231,15 @@ impl PageserverClient { // Fast path: request is for a single shard. if let Some(shard_id) = GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size) + .map_err(|err| tonic::Status::internal(err.to_string()))? { return Self::get_page_with_shard(req, shards.get(shard_id)?).await; } // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and // reassemble the responses. 
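The new invariant in miniature: stripe size must be `Some` exactly when the tenant is sharded. A self-contained sketch of the validation, with plain integers standing in for `ShardCount` and `ShardStripeSize` (shard count 0 means unsharded, following the `ShardIndex::unsharded()` convention):

```
// Minimal sketch of the invariant introduced here: stripe size is Some iff
// the tenant is sharded.
fn validate(shard_count: u8, stripe_size: Option<u32>) -> Result<(), String> {
    let unsharded = shard_count == 0;
    match (unsharded, stripe_size) {
        (true, Some(_)) => Err("stripe size can't be given for unsharded tenants".into()),
        (false, None) => Err("stripe size must be given for sharded tenants".into()),
        _ => Ok(()),
    }
}

fn main() {
    assert!(validate(0, None).is_ok());
    assert!(validate(8, Some(2048)).is_ok());
    assert!(validate(0, Some(2048)).is_err());
    assert!(validate(8, None).is_err());
}
```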
- let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size); + let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size) + .map_err(|err| tonic::Status::internal(err.to_string()))?; let mut shard_requests = FuturesUnordered::new(); for (shard_id, shard_req) in splitter.drain_requests() { @@ -248,10 +249,14 @@ impl PageserverClient { } while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? { - splitter.add_response(shard_id, shard_response)?; + splitter + .add_response(shard_id, shard_response) + .map_err(|err| tonic::Status::internal(err.to_string()))?; } - splitter.get_response() + splitter + .get_response() + .map_err(|err| tonic::Status::internal(err.to_string())) } /// Fetches pages on the given shard. Does not retry internally. @@ -379,12 +384,14 @@ pub struct ShardSpec { /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, /// The stripe size for these shards. - stripe_size: ShardStripeSize, + /// + /// INVARIANT: None for unsharded tenants, Some for sharded. + stripe_size: Option, } impl ShardSpec { /// Creates a new shard spec with the given URLs and stripe size. All shards must be given. - /// The stripe size may be omitted for unsharded tenants. + /// The stripe size must be Some for sharded tenants, or None for unsharded tenants. pub fn new( urls: HashMap, stripe_size: Option, @@ -397,11 +404,13 @@ impl ShardSpec { n => ShardCount::new(n as u8), }; - // Determine the stripe size. It doesn't matter for unsharded tenants. + // Validate the stripe size. if stripe_size.is_none() && !count.is_unsharded() { return Err(anyhow!("stripe size must be given for sharded tenants")); } - let stripe_size = stripe_size.unwrap_or(DEFAULT_STRIPE_SIZE); + if stripe_size.is_some() && count.is_unsharded() { + return Err(anyhow!("stripe size can't be given for unsharded tenants")); + } // Validate the shard spec. for (shard_id, url) in &urls { @@ -441,8 +450,10 @@ struct Shards { /// /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, - /// The stripe size. Only used for sharded tenants. - stripe_size: ShardStripeSize, + /// The stripe size. + /// + /// INVARIANT: None for unsharded tenants, Some for sharded. + stripe_size: Option, } impl Shards { diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs index ca8965b8dd..8631638686 100644 --- a/pageserver/client_grpc/src/split.rs +++ b/pageserver/client_grpc/src/split.rs @@ -1,11 +1,12 @@ use std::collections::HashMap; +use anyhow::anyhow; use bytes::Bytes; use pageserver_api::key::rel_block_to_key; use pageserver_api::shard::key_to_shard_number; use pageserver_page_api as page_api; -use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize}; +use utils::shard::{ShardCount, ShardIndex, ShardStripeSize}; /// Splits GetPageRequests that straddle shard boundaries and assembles the responses. /// TODO: add tests for this. @@ -25,43 +26,54 @@ impl GetPageSplitter { pub fn for_single_shard( req: &page_api::GetPageRequest, count: ShardCount, - stripe_size: ShardStripeSize, - ) -> Option { + stripe_size: Option, + ) -> anyhow::Result> { // Fast path: unsharded tenant. if count.is_unsharded() { - return Some(ShardIndex::unsharded()); + return Ok(Some(ShardIndex::unsharded())); } - // Find the first page's shard, for comparison. 
If there are no pages, just return the first - // shard (caller likely checked already, otherwise the server will reject it). + let Some(stripe_size) = stripe_size else { + return Err(anyhow!("stripe size must be given for sharded tenants")); + }; + + // Find the first page's shard, for comparison. let Some(&first_page) = req.block_numbers.first() else { - return Some(ShardIndex::new(ShardNumber(0), count)); + return Err(anyhow!("no block numbers in request")); }; let key = rel_block_to_key(req.rel, first_page); let shard_number = key_to_shard_number(count, stripe_size, &key); - req.block_numbers + Ok(req + .block_numbers .iter() .skip(1) // computed above .all(|&blkno| { let key = rel_block_to_key(req.rel, blkno); key_to_shard_number(count, stripe_size, &key) == shard_number }) - .then_some(ShardIndex::new(shard_number, count)) + .then_some(ShardIndex::new(shard_number, count))) } /// Splits the given request. pub fn split( req: page_api::GetPageRequest, count: ShardCount, - stripe_size: ShardStripeSize, - ) -> Self { + stripe_size: Option, + ) -> anyhow::Result { // The caller should make sure we don't split requests unnecessarily. debug_assert!( - Self::for_single_shard(&req, count, stripe_size).is_none(), + Self::for_single_shard(&req, count, stripe_size)?.is_none(), "unnecessary request split" ); + if count.is_unsharded() { + return Err(anyhow!("unsharded tenant, no point in splitting request")); + } + let Some(stripe_size) = stripe_size else { + return Err(anyhow!("stripe size must be given for sharded tenants")); + }; + // Split the requests by shard index. let mut requests = HashMap::with_capacity(2); // common case let mut block_shards = Vec::with_capacity(req.block_numbers.len()); @@ -103,11 +115,11 @@ impl GetPageSplitter { .collect(), }; - Self { + Ok(Self { requests, response, block_shards, - } + }) } /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations. @@ -124,21 +136,30 @@ impl GetPageSplitter { &mut self, shard_id: ShardIndex, response: page_api::GetPageResponse, - ) -> tonic::Result<()> { + ) -> anyhow::Result<()> { // The caller should already have converted status codes into tonic::Status. if response.status_code != page_api::GetPageStatusCode::Ok { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "unexpected non-OK response for shard {shard_id}: {} {}", response.status_code, response.reason.unwrap_or_default() - ))); + )); } if response.request_id != self.response.request_id { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "response ID mismatch for shard {shard_id}: expected {}, got {}", - self.response.request_id, response.request_id - ))); + self.response.request_id, + response.request_id + )); + } + + if response.request_id != self.response.request_id { + return Err(anyhow!( + "response ID mismatch for shard {shard_id}: expected {}, got {}", + self.response.request_id, + response.request_id + )); } // Place the shard response pages into the assembled response, in request order. 
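For intuition about when a request straddles shards at all: blocks map to shards stripe by stripe. A deliberately simplified sketch; the real `key_to_shard_number` also hashes the key, so stripes land pseudo-randomly across shards rather than round-robin as shown here:

```
// Illustrative only: consecutive stripes assigned round-robin. The real
// mapping hashes the key, but the "straddling" behavior is the same.
fn shard_for_block(blkno: u32, shard_count: u8, stripe_size: u32) -> u8 {
    ((blkno / stripe_size) % shard_count as u32) as u8
}

fn main() {
    // With 4 shards and a 2-page stripe, nearby blocks land on different
    // shards, which is exactly when GetPageSplitter must split a request.
    let shards: Vec<u8> = (0..8).map(|b| shard_for_block(b, 4, 2)).collect();
    assert_eq!(shards, vec![0, 0, 1, 1, 2, 2, 3, 3]);
}
```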
@@ -150,27 +171,26 @@ impl GetPageSplitter { } let Some(slot) = self.response.pages.get_mut(i) else { - return Err(tonic::Status::internal(format!( - "no block_shards slot {i} for shard {shard_id}" - ))); + return Err(anyhow!("no block_shards slot {i} for shard {shard_id}")); }; let Some(page) = pages.next() else { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "missing page {} in shard {shard_id} response", slot.block_number - ))); + )); }; if page.block_number != slot.block_number { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "shard {shard_id} returned wrong page at index {i}, expected {} got {}", - slot.block_number, page.block_number - ))); + slot.block_number, + page.block_number + )); } if !slot.image.is_empty() { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "shard {shard_id} returned duplicate page {} at index {i}", slot.block_number - ))); + )); } *slot = page; @@ -178,10 +198,10 @@ impl GetPageSplitter { // Make sure we've consumed all pages from the shard response. if let Some(extra_page) = pages.next() { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "shard {shard_id} returned extra page: {}", extra_page.block_number - ))); + )); } Ok(()) @@ -189,18 +209,18 @@ impl GetPageSplitter { /// Fetches the final, assembled response. #[allow(clippy::result_large_err)] - pub fn get_response(self) -> tonic::Result { + pub fn get_response(self) -> anyhow::Result { // Check that the response is complete. for (i, page) in self.response.pages.iter().enumerate() { if page.image.is_empty() { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "missing page {} for shard {}", page.block_number, self.block_shards .get(i) .map(|s| s.to_string()) .unwrap_or_else(|| "?".to_string()) - ))); + )); } } From b2ecb10f914bd7f1ce1245a1a8d5f85411a06901 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 21 Jul 2025 13:50:13 +0100 Subject: [PATCH 36/39] [proxy] rework handling of notices in sql-over-http (#12659) A replacement for #10254 which allows us to introduce notice messages for sql-over-http in the future if we want to. This also removes the `ParameterStatus` and `Notification` handling as there's nothing we could/should do for those. --- .../postgres-protocol2/src/message/backend.rs | 35 +--- libs/proxy/tokio-postgres2/src/client.rs | 14 +- libs/proxy/tokio-postgres2/src/codec.rs | 16 +- libs/proxy/tokio-postgres2/src/connect.rs | 14 +- libs/proxy/tokio-postgres2/src/connect_raw.rs | 28 +-- libs/proxy/tokio-postgres2/src/connection.rs | 164 ++++++++---------- libs/proxy/tokio-postgres2/src/lib.rs | 16 -- proxy/src/serverless/conn_pool.rs | 53 ++---- proxy/src/serverless/local_conn_pool.rs | 62 +++---- 9 files changed, 150 insertions(+), 252 deletions(-) diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 3fc9a9335c..b1728ef37d 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -74,7 +74,6 @@ impl Header { } /// An enum representing Postgres backend messages. 
-#[non_exhaustive] pub enum Message { AuthenticationCleartextPassword, AuthenticationGss, @@ -145,16 +144,7 @@ impl Message { PARSE_COMPLETE_TAG => Message::ParseComplete, BIND_COMPLETE_TAG => Message::BindComplete, CLOSE_COMPLETE_TAG => Message::CloseComplete, - NOTIFICATION_RESPONSE_TAG => { - let process_id = buf.read_i32::()?; - let channel = buf.read_cstr()?; - let message = buf.read_cstr()?; - Message::NotificationResponse(NotificationResponseBody { - process_id, - channel, - message, - }) - } + NOTIFICATION_RESPONSE_TAG => Message::NotificationResponse(NotificationResponseBody {}), COPY_DONE_TAG => Message::CopyDone, COMMAND_COMPLETE_TAG => { let tag = buf.read_cstr()?; @@ -543,28 +533,7 @@ impl NoticeResponseBody { } } -pub struct NotificationResponseBody { - process_id: i32, - channel: Bytes, - message: Bytes, -} - -impl NotificationResponseBody { - #[inline] - pub fn process_id(&self) -> i32 { - self.process_id - } - - #[inline] - pub fn channel(&self) -> io::Result<&str> { - get_str(&self.channel) - } - - #[inline] - pub fn message(&self) -> io::Result<&str> { - get_str(&self.message) - } -} +pub struct NotificationResponseBody {} pub struct ParameterDescriptionBody { storage: Bytes, diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 828884ffd8..068566e955 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; use crate::cancel_token::RawCancelToken; -use crate::codec::{BackendMessages, FrontendMessage}; +use crate::codec::{BackendMessages, FrontendMessage, RecordNotices}; use crate::config::{Host, SslMode}; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; @@ -221,6 +221,18 @@ impl Client { &mut self.inner } + pub fn record_notices(&mut self, limit: usize) -> mpsc::UnboundedReceiver> { + let (tx, rx) = mpsc::unbounded_channel(); + + let notices = RecordNotices { sender: tx, limit }; + self.inner + .sender + .send(FrontendMessage::RecordNotices(notices)) + .ok(); + + rx + } + /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip pub async fn query_raw_txt( diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index daa5371426..813faa0e35 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -3,10 +3,17 @@ use std::io; use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; +use tokio::sync::mpsc::UnboundedSender; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { Raw(Bytes), + RecordNotices(RecordNotices), +} + +pub struct RecordNotices { + pub sender: UnboundedSender>, + pub limit: usize, } pub enum BackendMessage { @@ -33,14 +40,11 @@ impl FallibleIterator for BackendMessages { pub struct PostgresCodec; -impl Encoder for PostgresCodec { +impl Encoder for PostgresCodec { type Error = io::Error; - fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> { - match item { - FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf), - } - + fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> io::Result<()> { + dst.extend_from_slice(&item); Ok(()) } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 4a07eccf9a..2f718e1e7d 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ 
b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,11 +1,9 @@ use std::net::IpAddr; -use postgres_protocol2::message::backend::Message; use tokio::net::TcpStream; use tokio::sync::mpsc; use crate::client::SocketConfig; -use crate::codec::BackendMessage; use crate::config::Host; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; @@ -48,8 +46,8 @@ where let stream = connect_tls(socket, config.ssl_mode, tls).await?; let RawConnection { stream, - parameters, - delayed_notice, + parameters: _, + delayed_notice: _, process_id, secret_key, } = connect_raw(stream, config).await?; @@ -72,13 +70,7 @@ where secret_key, ); - // delayed notices are always sent as "Async" messages. - let delayed = delayed_notice - .into_iter() - .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) - .collect(); - - let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx); + let connection = Connection::new(stream, conn_tx, conn_rx); Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index b89a600a2e..462e1be1aa 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -3,7 +3,7 @@ use std::io; use std::pin::Pin; use std::task::{Context, Poll}; -use bytes::BytesMut; +use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use futures_util::{Sink, SinkExt, Stream, TryStreamExt, ready}; use postgres_protocol2::authentication::sasl; @@ -14,7 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::codec::Framed; use crate::Error; -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::codec::{BackendMessage, BackendMessages, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::TlsStream; @@ -25,7 +25,7 @@ pub struct StartupStream { delayed_notice: Vec, } -impl Sink for StartupStream +impl Sink for StartupStream where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, @@ -36,7 +36,7 @@ where Pin::new(&mut self.inner).poll_ready(cx) } - fn start_send(mut self: Pin<&mut Self>, item: FrontendMessage) -> io::Result<()> { + fn start_send(mut self: Pin<&mut Self>, item: Bytes) -> io::Result<()> { Pin::new(&mut self.inner).start_send(item) } @@ -120,10 +120,7 @@ where let mut buf = BytesMut::new(); frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io) + stream.send(buf.freeze()).await.map_err(Error::io) } async fn authenticate(stream: &mut StartupStream, config: &Config) -> Result<(), Error> @@ -191,10 +188,7 @@ where let mut buf = BytesMut::new(); frontend::password_message(password, &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io) + stream.send(buf.freeze()).await.map_err(Error::io) } async fn authenticate_sasl( @@ -253,10 +247,7 @@ where let mut buf = BytesMut::new(); frontend::sasl_initial_response(mechanism, scram.message(), &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io)?; + stream.send(buf.freeze()).await.map_err(Error::io)?; let body = match stream.try_next().await.map_err(Error::io)? 
{ Some(Message::AuthenticationSaslContinue(body)) => body, @@ -272,10 +263,7 @@ where let mut buf = BytesMut::new(); frontend::sasl_response(scram.message(), &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io)?; + stream.send(buf.freeze()).await.map_err(Error::io)?; let body = match stream.try_next().await.map_err(Error::io)? { Some(Message::AuthenticationSaslFinal(body)) => body, diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index fe0372b266..c43a22ffe7 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -1,22 +1,23 @@ -use std::collections::{HashMap, VecDeque}; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; use bytes::BytesMut; -use futures_util::{Sink, Stream, ready}; -use postgres_protocol2::message::backend::Message; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, StreamExt, ready}; +use postgres_protocol2::message::backend::{Message, NoticeResponseBody}; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; -use tracing::{info, trace}; +use tracing::trace; -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::error::DbError; +use crate::Error; +use crate::codec::{ + BackendMessage, BackendMessages, FrontendMessage, PostgresCodec, RecordNotices, +}; use crate::maybe_tls_stream::MaybeTlsStream; -use crate::{AsyncMessage, Error, Notification}; #[derive(PartialEq, Debug)] enum State { @@ -33,18 +34,18 @@ enum State { /// occurred, or because its associated `Client` has dropped and all outstanding work has completed. #[must_use = "futures do nothing unless polled"] pub struct Connection { - /// HACK: we need this in the Neon Proxy. - pub stream: Framed, PostgresCodec>, - /// HACK: we need this in the Neon Proxy to forward params. - pub parameters: HashMap, + stream: Framed, PostgresCodec>, sender: PollSender, receiver: mpsc::UnboundedReceiver, + notices: Option, - pending_responses: VecDeque, + pending_response: Option, state: State, } +pub enum Never {} + impl Connection where S: AsyncRead + AsyncWrite + Unpin, @@ -52,70 +53,42 @@ where { pub(crate) fn new( stream: Framed, PostgresCodec>, - pending_responses: VecDeque, - parameters: HashMap, sender: mpsc::Sender, receiver: mpsc::UnboundedReceiver, ) -> Connection { Connection { stream, - parameters, sender: PollSender::new(sender), receiver, - pending_responses, + notices: None, + pending_response: None, state: State::Active, } } - fn poll_response( - &mut self, - cx: &mut Context<'_>, - ) -> Poll>> { - if let Some(message) = self.pending_responses.pop_front() { - trace!("retrying pending response"); - return Poll::Ready(Some(Ok(message))); - } - - Pin::new(&mut self.stream) - .poll_next(cx) - .map(|o| o.map(|r| r.map_err(Error::io))) - } - /// Read and process messages from the connection to postgres. /// client <- postgres - fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { + fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { loop { - let message = match self.poll_response(cx)? 
{ - Poll::Ready(Some(message)) => message, - Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), - Poll::Pending => { - trace!("poll_read: waiting on response"); - return Poll::Pending; - } - }; - - let messages = match message { - BackendMessage::Async(Message::NoticeResponse(body)) => { - let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; - return Poll::Ready(Ok(AsyncMessage::Notice(error))); - } - BackendMessage::Async(Message::NotificationResponse(body)) => { - let notification = Notification { - process_id: body.process_id(), - channel: body.channel().map_err(Error::parse)?.to_string(), - payload: body.message().map_err(Error::parse)?.to_string(), + let messages = match self.pending_response.take() { + Some(messages) => messages, + None => { + let message = match self.stream.poll_next_unpin(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), + Poll::Ready(Some(Err(e))) => return Poll::Ready(Err(Error::io(e))), + Poll::Ready(Some(Ok(message))) => message, }; - return Poll::Ready(Ok(AsyncMessage::Notification(notification))); + + match message { + BackendMessage::Async(Message::NoticeResponse(body)) => { + self.handle_notice(body)?; + continue; + } + BackendMessage::Async(_) => continue, + BackendMessage::Normal { messages } => messages, + } } - BackendMessage::Async(Message::ParameterStatus(body)) => { - self.parameters.insert( - body.name().map_err(Error::parse)?.to_string(), - body.value().map_err(Error::parse)?.to_string(), - ); - continue; - } - BackendMessage::Async(_) => unreachable!(), - BackendMessage::Normal { messages } => messages, }; match self.sender.poll_reserve(cx) { @@ -126,8 +99,7 @@ where return Poll::Ready(Err(Error::closed())); } Poll::Pending => { - self.pending_responses - .push_back(BackendMessage::Normal { messages }); + self.pending_response = Some(messages); trace!("poll_read: waiting on sender"); return Poll::Pending; } @@ -135,6 +107,31 @@ where } } + fn handle_notice(&mut self, body: NoticeResponseBody) -> Result<(), Error> { + let Some(notices) = &mut self.notices else { + return Ok(()); + }; + + let mut fields = body.fields(); + while let Some(field) = fields.next().map_err(Error::parse)? { + // loop until we find the message field + if field.type_() == b'M' { + // if the message field is within the limit, send it. + if let Some(new_limit) = notices.limit.checked_sub(field.value().len()) { + match notices.sender.send(field.value().into()) { + // set the new limit. + Ok(()) => notices.limit = new_limit, + // closed. + Err(_) => self.notices = None, + } + } + break; + } + } + + Ok(()) + } + /// Fetch the next client request and enqueue the response sender. fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { if self.receiver.is_closed() { @@ -168,21 +165,23 @@ where match self.poll_request(cx) { // send the message to postgres - Poll::Ready(Some(request)) => { + Poll::Ready(Some(FrontendMessage::Raw(request))) => { Pin::new(&mut self.stream) .start_send(request) .map_err(Error::io)?; } + Poll::Ready(Some(FrontendMessage::RecordNotices(notices))) => { + self.notices = Some(notices) + } // No more messages from the client, and no more responses to wait for. 
// Send a terminate message to postgres
             Poll::Ready(None) => {
                 trace!("poll_write: at eof, terminating");
                 let mut request = BytesMut::new();
                 frontend::terminate(&mut request);
-                let request = FrontendMessage::Raw(request.freeze());
 
                 Pin::new(&mut self.stream)
-                    .start_send(request)
+                    .start_send(request.freeze())
                     .map_err(Error::io)?;
 
                 trace!("poll_write: sent eof, closing");
@@ -231,34 +230,17 @@ where
         }
     }
 
-    /// Returns the value of a runtime parameter for this connection.
-    pub fn parameter(&self, name: &str) -> Option<&str> {
-        self.parameters.get(name).map(|s| &**s)
-    }
-
-    /// Polls for asynchronous messages from the server.
-    ///
-    /// The server can send notices as well as notifications asynchronously to the client. Applications that wish to
-    /// examine those messages should use this method to drive the connection rather than its `Future` implementation.
-    pub fn poll_message(
-        &mut self,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<AsyncMessage, Error>>> {
+    fn poll_message(&mut self, cx: &mut Context<'_>) -> Poll<Option<Result<Never, Error>>> {
         if self.state != State::Closing {
             // if the state is still active, try read from and write to postgres.
-            let message = self.poll_read(cx)?;
-            let closing = self.poll_write(cx)?;
-            if let Poll::Ready(()) = closing {
+            let Poll::Pending = self.poll_read(cx)?;
+            if self.poll_write(cx)?.is_ready() {
                 self.state = State::Closing;
             }
 
-            if let Poll::Ready(message) = message {
-                return Poll::Ready(Some(Ok(message)));
-            }
-
             // poll_read returned Pending.
-            // poll_write returned Pending or Ready(WriteReady::WaitingOnRead).
-            // if poll_write returned Ready(WriteReady::WaitingOnRead), then we are waiting to read more data from postgres.
+            // poll_write returned Pending or Ready(()).
+            // if poll_write returned Ready(()), then we are waiting to read more data from postgres.
             if self.state != State::Closing {
                 return Poll::Pending;
            }
@@ -280,11 +262,9 @@ where
     type Output = Result<(), Error>;
 
     fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
-        while let Some(message) = ready!(self.poll_message(cx)?) {
-            if let AsyncMessage::Notice(notice) = message {
-                info!("{}: {}", notice.severity(), notice.message());
-            }
+        match self.poll_message(cx)? {
+            Poll::Ready(None) => Poll::Ready(Ok(())),
+            Poll::Pending => Poll::Pending,
         }
-        Poll::Ready(Ok(()))
     }
 }
diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs
index 791c93b972..e3dd6d9261 100644
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -8,7 +8,6 @@ pub use crate::client::{Client, SocketConfig};
 pub use crate::config::Config;
 pub use crate::connect_raw::RawConnection;
 pub use crate::connection::Connection;
-use crate::error::DbError;
 pub use crate::error::Error;
 pub use crate::generic_client::GenericClient;
 pub use crate::query::RowStream;
@@ -93,21 +92,6 @@ impl Notification {
     }
 }
 
-/// An asynchronous message from the server.
-#[allow(clippy::large_enum_variant)]
-#[derive(Debug, Clone)]
-#[non_exhaustive]
-pub enum AsyncMessage {
-    /// A notice.
-    ///
-    /// Notices use the same format as errors, but aren't "errors" per-se.
-    Notice(DbError),
-    /// A notification.
-    ///
-    /// Connections can subscribe to notifications with the `LISTEN` command.
-    Notification(Notification),
-}
-
 /// Message returned by the `SimpleQuery` stream.
#[derive(Debug)] #[non_exhaustive] diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 672e59f81f..015c46f787 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -3,15 +3,14 @@ use std::pin::pin; use std::sync::{Arc, Weak}; use std::task::{Poll, ready}; -use futures::Future; use futures::future::poll_fn; -use postgres_client::AsyncMessage; +use futures::{Future, FutureExt}; use postgres_client::tls::MakeTlsConnect; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{Instrument, error, info, info_span, warn}; +use tracing::{error, info, info_span}; #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, @@ -85,16 +84,17 @@ pub(crate) fn poll_client( let cancel = CancellationToken::new(); let cancelled = cancel.clone().cancelled_owned(); - tokio::spawn( - async move { + tokio::spawn(async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); let mut cancelled = pin!(cancelled); poll_fn(move |cx| { + let _instrument = span.enter(); + if cancelled.as_mut().poll(cx).is_ready() { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } match rx.has_changed() { @@ -105,7 +105,7 @@ pub(crate) fn poll_client( } Err(_) => { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } _ => {} } @@ -123,41 +123,22 @@ pub(crate) fn poll_client( } } - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session_id, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) => { - warn!(%session_id, "unknown message"); - } - Some(Err(e)) => { - error!(%session_id, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } + match ready!(connection.poll_unpin(cx)) { + Err(e) => error!(%session_id, "connection error: {}", e), + Ok(()) => info!("connection closed"), } // remove from connection pool if let Some(pool) = pool.clone().upgrade() - && pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); - } + && pool.write().remove_client(db_user.clone(), conn_id) + { + info!("closed connection removed"); + } Poll::Ready(()) - }).await; - - } - .instrument(span)); + }) + .await; + }); let inner = ClientInnerCommon { inner: client, aux, diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index e4cbd02bfe..f63d84d66b 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -19,18 +19,17 @@ use std::time::Duration; use base64::Engine as _; use base64::prelude::BASE64_URL_SAFE_NO_PAD; use ed25519_dalek::{Signature, Signer, SigningKey}; -use futures::Future; use futures::future::poll_fn; +use futures::{Future, FutureExt}; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; -use postgres_client::AsyncMessage; use postgres_client::tls::NoTlsStream; use serde_json::value::RawValue; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{Instrument, debug, error, info, info_span, warn}; +use tracing::{debug, error, info, info_span}; use super::backend::HttpConnError; use 
super::conn_pool_lib::{ @@ -186,16 +185,17 @@ pub(crate) fn poll_client( let cancel = CancellationToken::new(); let cancelled = cancel.clone().cancelled_owned(); - tokio::spawn( - async move { + tokio::spawn(async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); let mut cancelled = pin!(cancelled); poll_fn(move |cx| { + let _instrument = span.enter(); + if cancelled.as_mut().poll(cx).is_ready() { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } match rx.has_changed() { @@ -206,7 +206,7 @@ pub(crate) fn poll_client( } Err(_) => { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } _ => {} } @@ -218,47 +218,35 @@ pub(crate) fn poll_client( if let Some(pool) = pool.clone().upgrade() { // remove client from pool - should close the connection if it's idle. // does nothing if the client is currently checked-out and in-use - if pool.global_pool.write().remove_client(db_user.clone(), conn_id) { + if pool + .global_pool + .write() + .remove_client(db_user.clone(), conn_id) + { info!("idle connection removed"); } } } - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session_id, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) => { - warn!(%session_id, "unknown message"); - } - Some(Err(e)) => { - error!(%session_id, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } + match ready!(connection.poll_unpin(cx)) { + Err(e) => error!(%session_id, "connection error: {}", e), + Ok(()) => info!("connection closed"), } // remove from connection pool if let Some(pool) = pool.clone().upgrade() - && pool.global_pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); - } + && pool + .global_pool + .write() + .remove_client(db_user.clone(), conn_id) + { + info!("closed connection removed"); + } Poll::Ready(()) - }).await; - - } - .instrument(span)); + }) + .await; + }); let inner = ClientInnerCommon { inner: client, From 25efbcc7f0603b106afdc890fe7ee3e28218e1bc Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 21 Jul 2025 15:47:58 +0100 Subject: [PATCH 37/39] safekeeper: parallelise segment copy (#12664) Parallelise segment copying on the SK. I'm not aware of the neon deployment using this endpoint. 
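
To make the new control flow easier to review, here is a minimal,
self-contained sketch of the pattern the diff below adopts. `copy_object`
here is a hypothetical stand-in for the real
`storage.copy_object(&from, &to, &cancel)` call; the retry budget and
concurrency limit mirror the patch:

```rust
use std::time::Duration;

use futures::stream::{self, StreamExt};

// Hypothetical stand-in for `storage.copy_object(&from, &to, &cancel)`.
async fn copy_object(segno: u64) -> Result<(), String> {
    let _ = segno;
    Ok(())
}

async fn copy_segments(from_segment: u64, to_segment: u64) -> Result<(), String> {
    const MAX_RETRIES: u32 = 10;

    // Turn the segment range into a stream of copy futures, at most 32 in flight.
    let mut copies = stream::iter(from_segment..to_segment)
        .map(|segno| async move {
            let mut attempts = 0;
            loop {
                match copy_object(segno).await {
                    Ok(()) => return Ok(()),
                    Err(e) => {
                        attempts += 1;
                        if attempts >= MAX_RETRIES {
                            return Err(e);
                        }
                        // Fixed 1s delay between attempts, as in the patch.
                        tokio::time::sleep(Duration::from_secs(1)).await;
                    }
                }
            }
        })
        .buffer_unordered(32);

    // Drain the stream, stopping on the first error; dropping the stream
    // cancels any copies still in flight.
    while let Some(result) = copies.next().await {
        result?;
    }
    Ok(())
}

#[tokio::main]
async fn main() {
    copy_segments(0, 100).await.unwrap();
}
```

`buffer_unordered(32)` drives up to 32 copy futures concurrently while keeping
the rest of the range lazy, so a failure only wastes the work already in flight.
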
---
 safekeeper/src/wal_backup.rs | 72 ++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 16 deletions(-)

diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 0e8dfd64c3..03c8f7e84a 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -8,7 +8,7 @@ use std::time::Duration;
 use anyhow::{Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
-use futures::stream::FuturesOrdered;
+use futures::stream::{self, FuturesOrdered};
 use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
 use remote_storage::{
@@ -723,8 +723,6 @@ pub async fn copy_s3_segments(
     from_segment: XLogSegNo,
     to_segment: XLogSegNo,
 ) -> Result<()> {
-    const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;
-
     let remote_dst_path = remote_timeline_path(dst_ttid)?;
 
     let cancel = CancellationToken::new();
@@ -744,27 +742,69 @@ pub async fn copy_s3_segments(
         .filter_map(|o| o.key.object_name().map(ToOwned::to_owned))
         .collect::<HashSet<_>>();
 
-    debug!(
+    info!(
         "these segments have already been uploaded: {:?}",
         uploaded_segments
     );
 
-    for segno in from_segment..to_segment {
-        if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
-            info!("copied all segments from {} until {}", from_segment, segno);
-        }
+    /* BEGIN_HADRON */
+    // Copy multiple segments asynchronously.
+    let mut copy_stream = stream::iter(from_segment..to_segment)
+        .map(|segno| {
+            let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size);
+            let remote_dst_path = remote_dst_path.clone();
+            let cancel = cancel.clone();
 
-        let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size);
-        if uploaded_segments.contains(&segment_name) {
-            continue;
-        }
-        debug!("copying segment {}", segment_name);
+            async move {
+                if uploaded_segments.contains(&segment_name) {
+                    return Ok(());
+                }
 
-        let from = remote_timeline_path(src_ttid)?.join(&segment_name);
-        let to = remote_dst_path.join(&segment_name);
+                if segno % 1000 == 0 {
+                    info!("copying segment {} {}", segno, segment_name);
+                }
 
-        storage.copy_object(&from, &to, &cancel).await?;
+                let from = remote_timeline_path(src_ttid)?.join(&segment_name);
+                let to = remote_dst_path.join(&segment_name);
+
+                // Retry logic: up to 10 attempts with a 1 second delay between them
+                let mut retry_count = 0;
+                const MAX_RETRIES: u32 = 10;
+
+                loop {
+                    match storage.copy_object(&from, &to, &cancel).await {
+                        Ok(()) => return Ok(()),
+                        Err(e) => {
+                            if cancel.is_cancelled() {
+                                // Don't retry if cancellation was requested
+                                return Err(e);
+                            }
+
+                            retry_count += 1;
+                            if retry_count >= MAX_RETRIES {
+                                error!(
+                                    "Failed to copy segment {} after {} retries: {}",
+                                    segment_name, MAX_RETRIES, e
+                                );
+                                return Err(e);
+                            }
+                            warn!(
+                                "Failed to copy segment {} (attempt {}/{}): {}, retrying...",
+                                segment_name, retry_count, MAX_RETRIES, e
+                            );
+                            tokio::time::sleep(Duration::from_secs(1)).await;
+                        }
+                    }
+                }
+            }
+        })
+        .buffer_unordered(32); // Limit to 32 concurrent copies
+
+    // Process results, stopping on first error
+    while let Some(result) = copy_stream.next().await {
+        result?;
    }
+    /* END_HADRON */
 
     info!(
         "finished copying segments from {} until {}",
From 30e1213141ee38c08c0fc69fc53843eda04488e9 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Mon, 21 Jul 2025 16:32:28 +0100
Subject: [PATCH 38/39] pageserver: check env var for ip address before node
 registration (#12666)

Include the ip address (optionally read from an env var) in the
pageserver's registration request.
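
For illustration, a rough sketch of the helper's contract (the env var name
`NODE_IP_ADDRESS` is a placeholder guess; the real helper is
`utils::ip_address::read_node_ip_addr_from_env`, which returns `Ok(None)`
when the variable is unset):

```rust
use std::env::{self, VarError};
use std::net::IpAddr;

// Hypothetical re-implementation of the lookup used below; the actual
// variable name and error type live in `utils::ip_address`.
fn read_node_ip_addr_from_env() -> Result<Option<IpAddr>, String> {
    match env::var("NODE_IP_ADDRESS") {
        // Variable present: it must parse as an IPv4 or IPv6 address.
        Ok(raw) => raw
            .parse::<IpAddr>()
            .map(Some)
            .map_err(|e| format!("invalid node IP address {raw:?}: {e}")),
        // Variable absent: not an error; registration simply omits the address.
        Err(VarError::NotPresent) => Ok(None),
        Err(e) => Err(format!("failed to read node IP address: {e}")),
    }
}

fn main() {
    // The pageserver intentionally panics on a malformed address at startup.
    let node_ip_addr = read_node_ip_addr_from_env().expect("Error reading node IP address.");
    println!("node_ip_addr = {node_ip_addr:?}");
}
```
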
Note that the ip address is ignored by the storage controller at the
moment, which makes it a no-op in the neon env.
---
 pageserver/src/controller_upcall_client.rs | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index be1de43d18..8da4cee4b9 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::net::IpAddr;
 
 use futures::Future;
 use pageserver_api::config::NodeMetadata;
@@ -16,7 +17,7 @@ use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
-use utils::{backoff, failpoint_support};
+use utils::{backoff, failpoint_support, ip_address};
 
 use crate::config::PageServerConf;
 use crate::virtual_file::on_fatal_io_error;
@@ -27,6 +28,7 @@ pub struct StorageControllerUpcallClient {
     http_client: reqwest::Client,
     base_url: Url,
     node_id: NodeId,
+    node_ip_addr: Option<IpAddr>,
     cancel: CancellationToken,
 }
 
@@ -91,11 +93,18 @@ impl StorageControllerUpcallClient {
         );
     }
 
+        // Intentionally panics if we encounter any errors parsing or reading the IP address.
+        // Note that if the required environment variable is not set, `read_node_ip_addr_from_env` returns `Ok(None)`
+        // instead of an error.
+        let node_ip_addr =
+            ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address.");
+
         Self {
             http_client: client.build().expect("Failed to construct HTTP client"),
             base_url: url,
             node_id: conf.id,
             cancel: cancel.clone(),
+            node_ip_addr,
         }
     }
@@ -193,8 +202,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
             listen_http_addr: m.http_host,
             listen_http_port: m.http_port,
             listen_https_port: m.https_port,
+            node_ip_addr: self.node_ip_addr,
             availability_zone_id: az_id.expect("Checked above"),
-            node_ip_addr: None,
         })
     }
     Err(e) => {
From 187170be47dc4701c84cb67aeccbbb9ff55ddf51 Mon Sep 17 00:00:00 2001
From: Tristan Partin
Date: Mon, 21 Jul 2025 12:58:03 -0500
Subject: [PATCH 39/39] Add max_wal_rate test (#12621)

## Problem

Add a test for max_wal_rate

## Summary of changes

Test max_wal_rate

## How is this tested?

python test

Co-authored-by: Haoyu Huang
---
 test_runner/regress/test_pg_regress.py | 69 ++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 728241b465..a240071a7f 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -3,6 +3,7 @@
 #
 from __future__ import annotations
 
+import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import TYPE_CHECKING, Any, cast
 
@@ -356,6 +357,74 @@ def test_sql_regress(
     post_checks(env, test_output_dir, DBNAME, endpoint)
 
 
+def test_max_wal_rate(neon_simple_env: NeonEnv):
+    """
+    Test the databricks.max_wal_mb_per_second GUC and how it affects WAL rate
+    limiting.
+    """
+    env = neon_simple_env
+
+    DBNAME = "regression"
+    superuser_name = "databricks_superuser"
+
+    # Connect to postgres and create a database called "regression".
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql_many(
+        [
+            f"CREATE ROLE {superuser_name}",
+            f"CREATE DATABASE {DBNAME}",
+            "CREATE EXTENSION neon",
+        ]
+    )
+
+    endpoint.safe_psql("CREATE TABLE usertable (YCSB_KEY INT, FIELD0 TEXT);", dbname=DBNAME)
+
+    # Write ~1 MB of data.
+    with endpoint.cursor(dbname=DBNAME) as cur:
+        for _ in range(0, 1000):
+            cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")
+
+    # No backpressure expected yet.
+    tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();")
+    assert tuples[0][0] == 0, "Backpressure throttling detected"
+
+    # 0 MB/s max_wal_rate. The WAL proposer can still push some WAL, but it will be very slow.
+    endpoint.safe_psql_many(
+        [
+            "ALTER SYSTEM SET databricks.max_wal_mb_per_second = 0;",
+            "SELECT pg_reload_conf();",
+        ]
+    )
+
+    # Writing ~10 KB of data should hit backpressure.
+    with endpoint.cursor(dbname=DBNAME) as cur:
+        cur.execute("SET databricks.max_wal_mb_per_second = 0;")
+        for _ in range(0, 10):
+            cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")
+
+    tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();")
+    assert tuples[0][0] > 0, "No backpressure throttling detected"
+
+    # 1 MB/s max_wal_rate.
+    endpoint.safe_psql_many(
+        [
+            "ALTER SYSTEM SET databricks.max_wal_mb_per_second = 1;",
+            "SELECT pg_reload_conf();",
+        ]
+    )
+
+    # Write ~10 MB of data.
+    with endpoint.cursor(dbname=DBNAME) as cur:
+        start = int(time.time())
+        for _ in range(0, 10000):
+            cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")
+
+    end = int(time.time())
+    assert end - start >= 10, (
+        "Throttling should cause the previous inserts to take at least 10 seconds"
+    )
+
+
 @skip_in_debug_build("only run with release build")
 @pytest.mark.parametrize("reldir_type", ["v1", "v2"])
 def test_tx_abort_with_many_relations(