From 4bbabc092a7b675eb5b624c3d647e216e98dbe2d Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Wed, 9 Jul 2025 21:16:06 +0400
Subject: [PATCH 01/56] tests: wait for flush lsn in
 test_branch_creation_before_gc (#12527)

## Problem

Test `test_branch_creation_before_gc` is flaky in the internal repo. The
pageserver sometimes lags behind the write LSN, so when we call GC it may
not yet have reached the LSN at which we try to create the branch.

## Summary of changes

- Wait until the flush LSN on the pageserver reaches the latest LSN before
  calling GC.

---
 test_runner/regress/test_branch_and_gc.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index 8447c9bf2d..148f469a95 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
 import pytest
 from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
+from fixtures.neon_fixtures import wait_for_last_flush_lsn
 from fixtures.pageserver.http import TimelineCreate406
 from fixtures.utils import query_scalar, skip_in_debug_build

@@ -162,6 +163,9 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     )
     lsn = Lsn(res[2][0][0])

+    # Wait for all WAL to reach the pageserver, so GC cutoff LSN is greater than `lsn`.
+    wait_for_last_flush_lsn(env, endpoint0, tenant, b0)
+
     # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the
     # branch creation task but the individual timeline GC iteration happens *after*
     # the branch creation task.

From fe0ddb7169514b3c755c01a519d1067b696d2925 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 9 Jul 2025 18:41:34 +0100
Subject: [PATCH 02/56] libs: make remote storage failure injection
 probabilistic (#12526)

Change the unreliable storage wrapper to fail probabilistically while
there are failure attempts left.
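To make the new behavior concrete, here is a minimal, self-contained sketch
of the failure decision (simplified names, not the exact wrapper code; the
real wrapper tracks attempts per operation in a `HashMap` and samples with
`rand::thread_rng()` just like below):

```rust
use rand::Rng; // the patch adds rand 0.8.5 to remote_storage

/// Decide whether to inject a failure for one attempt of an operation.
/// `attempts_so_far` is how many failures were already injected for it.
fn should_fail(attempts_so_far: u64, attempts_to_fail: u64, failure_probability: u64) -> bool {
    // Only the first `attempts_to_fail` attempts are eligible to fail; among
    // those, fail with roughly `failure_probability` percent chance. Note the
    // inclusive `0..=100` range mirrored from the patch: it yields 101 values,
    // so even at probability 100 the sampled value can be 100 itself, which
    // compares `< 100` as false and lets that attempt pass.
    attempts_so_far < attempts_to_fail
        && rand::thread_rng().gen_range(0..=100) < failure_probability
}

fn main() {
    // Probability 0 never injects failures.
    assert!(!should_fail(0, 3, 0));
    // Attempts past the budget always succeed, regardless of probability.
    assert!(!should_fail(3, 3, 100));
}
```

With the default probability of 100 this approximates the old wrapper, which
failed the first N attempts unconditionally.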
Co-authored-by: Yecheng Yang
---
 libs/pageserver_api/src/config.rs            |  2 +
 libs/remote_storage/Cargo.toml               |  1 +
 libs/remote_storage/src/lib.rs               | 10 +++-
 libs/remote_storage/src/simulate_failures.rs | 31 ++++++++--
 libs/utils/src/env.rs                        | 59 ++++++++++++++++++++
 pageserver/src/bin/pageserver.rs             |  7 ++-
 pageserver/src/config.rs                     |  6 ++
 proxy/src/context/parquet.rs                 |  2 +-
 8 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index dc7e9aed7f..22815955c1 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -226,6 +226,7 @@ pub struct ConfigToml {
     pub synthetic_size_calculation_interval: Duration,
     pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
     pub test_remote_failures: u64,
+    pub test_remote_failures_probability: u64,
     pub ondemand_download_behavior_treat_error_as_warn: bool,
     #[serde(with = "humantime_serde")]
     pub background_task_maximum_delay: Duration,
@@ -758,6 +759,7 @@ impl Default for ConfigToml {
             disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),

             test_remote_failures: (0),
+            test_remote_failures_probability: (100),

             ondemand_download_behavior_treat_error_as_warn: (false),

diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 69316fd493..0ae13552b8 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -43,6 +43,7 @@ itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }

 byteorder = "1.4"
+rand = "0.8.5"

 [dev-dependencies]
 camino-tempfile.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index ed416b2811..5885c3e791 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
         })
     }

-    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
-        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    /* BEGIN_HADRON */
+    pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(
+            s,
+            fail_first,
+            fail_probability,
+        )))
     }
+    /* END_HADRON */

     /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
     pub async fn upload_storage_object(
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index f9856a5856..30d116f57c 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use rand::Rng;
+use std::cmp;
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::num::NonZeroU32;
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {

     // Tracks how many failed attempts of each operation has been made.
     attempts: Mutex<HashMap<RemoteOp, u64>>,
+
+    /* BEGIN_HADRON */
+    // This is the probability of failure for each operation, in the range [0, 100].
+    // The probability defaults to 100, which means that all operations will fail.
+    attempt_failure_probability: u64,
+    /* END_HADRON */
 }

 /// Used to identify retries of different unique operation.
@@ -40,7 +48,11 @@ enum RemoteOp { } impl UnreliableWrapper { - pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { + pub fn new( + inner: crate::GenericRemoteStorage, + attempts_to_fail: u64, + attempt_failure_probability: u64, + ) -> Self { assert!(attempts_to_fail > 0); let inner = match inner { GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s), @@ -51,9 +63,11 @@ impl UnreliableWrapper { panic!("Can't wrap unreliable wrapper unreliably") } }; + let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100); UnreliableWrapper { inner, attempts_to_fail, + attempt_failure_probability: actual_attempt_failure_probability, attempts: Mutex::new(HashMap::new()), } } @@ -66,6 +80,7 @@ impl UnreliableWrapper { /// fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); + let mut rng = rand::thread_rng(); match attempts.entry(op) { Entry::Occupied(mut e) => { @@ -75,15 +90,19 @@ impl UnreliableWrapper { *p }; - if attempts_before_this >= self.attempts_to_fail { - // let it succeed - e.remove(); - Ok(attempts_before_this) - } else { + /* BEGIN_HADRON */ + // If there are more attempts to fail, fail the request by probability. + if (attempts_before_this < self.attempts_to_fail) + && (rng.gen_range(0..=100) < self.attempt_failure_probability) + { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); Err(error) + } else { + e.remove(); + Ok(attempts_before_this) } + /* END_HADRON */ } Entry::Vacant(e) => { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index 2a85f54a01..cc1cbf8009 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -44,3 +44,62 @@ where } } } + +/* BEGIN_HADRON */ +pub enum DeploymentMode { + Dev, + Staging, + Prod, +} + +pub fn get_deployment_mode() -> Option { + match std::env::var("DEPLOYMENT_MODE") { + Ok(env) => match env.as_str() { + "development" => Some(DeploymentMode::Dev), + "staging" => Some(DeploymentMode::Staging), + "production" => Some(DeploymentMode::Prod), + _ => { + tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env); + None + } + }, + Err(_) => { + tracing::error!("DEPLOYMENT_MODE not set"); + None + } + } +} + +pub fn is_dev_or_staging() -> bool { + matches!( + get_deployment_mode(), + Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging) + ) +} + +pub enum TestingMode { + Chaos, + Stress, +} + +pub fn get_test_mode() -> Option { + match std::env::var("HADRON_TEST_MODE") { + Ok(env) => match env.as_str() { + "chaos" => Some(TestingMode::Chaos), + "stress" => Some(TestingMode::Stress), + _ => { + tracing::error!("Unexpected HADRON_TEST_MODE: {}", env); + None + } + }, + Err(_) => { + tracing::error!("HADRON_TEST_MODE not set"); + None + } + } +} + +pub fn is_chaos_testing() -> bool { + matches!(get_test_mode(), Some(TestingMode::Chaos)) +} +/* END_HADRON */ diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 327384fd82..78aba25d2e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -889,8 +889,11 @@ async fn create_remote_storage_client( "Simulating remote failures for first {} attempts of each op", conf.test_remote_failures ); - remote_storage = - GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); + remote_storage = GenericRemoteStorage::unreliable_wrapper( + remote_storage, + conf.test_remote_failures, + 
conf.test_remote_failures_probability, + ); } Ok(remote_storage) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 99d7e0ca3a..15ec31b0a6 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -147,7 +147,11 @@ pub struct PageServerConf { pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, + // The number of allowed failures in remote storage operations. pub test_remote_failures: u64, + // The probability of failure in remote storage operations. Only works when test_remote_failures > 1. + // Use 100 for 100% failure, 0 for no failure. + pub test_remote_failures_probability: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -392,6 +396,7 @@ impl PageServerConf { synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, + test_remote_failures_probability, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, control_plane_api, @@ -461,6 +466,7 @@ impl PageServerConf { synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, + test_remote_failures_probability, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, control_plane_api: control_plane_api diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b55cc14532..4d8df19476 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -267,7 +267,7 @@ async fn worker_inner( ) -> anyhow::Result<()> { #[cfg(any(test, feature = "testing"))] let storage = if config.test_remote_failures > 0 { - GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) + GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100) } else { storage }; From 28f604d628bfefa26b3016421756f91c8b6b2817 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 9 Jul 2025 13:45:50 -0500 Subject: [PATCH 03/56] Make pg_monitor neon_superuser test more robust (#12532) Make sure to check for NULL just in case. Signed-off-by: Tristan Partin Co-authored-by: Vikas Jain --- .../0004-grant_pg_monitor_to_neon_superuser.sql | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql index deb7a364af..3464a2b1cf 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql @@ -6,14 +6,18 @@ BEGIN admin_option AS admin INTO monitor FROM pg_auth_members - WHERE roleid = 'neon_superuser'::regrole - AND member = 'pg_monitor'::regrole; + WHERE roleid = 'pg_monitor'::regrole + AND member = 'neon_superuser'::regrole; - IF NOT monitor.member THEN + IF monitor IS NULL THEN + RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor'; + END IF; + + IF monitor.admin IS NULL OR NOT monitor.member THEN RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor'; END IF; - IF NOT monitor.admin THEN + IF monitor.admin IS NULL OR NOT monitor.admin THEN RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor'; END IF; END $$; From 0b639ba608d65b90df468346e72777e8953a4f42 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 9 Jul 2025 16:22:55 -0400 Subject: [PATCH 04/56] fix(storcon): correctly pass through lease error code (#12519) ## Problem close LKB-199 ## Summary of changes We always return the error as 500 to the cplane if a LSN lease request fails. This cause issues for the cplane as they don't retry on 500. This patch correctly passes through the error and assign the error code so that cplane can know if it is a retryable error. (TODO: look at the cplane code and learn the retry logic). Note that this patch does not resolve LKB-253 -- we need to handle not found error separately in the lsn lease path, like wait until the tenant gets attached, or return 503 so that cplane can retry. --------- Signed-off-by: Alex Chi Z --- storage_controller/src/service.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ed6643d641..d2f7287be9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4428,7 +4428,7 @@ impl Service { .await; let mut failed = 0; - for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) { + for (tid, (_, result)) in targeted_tenant_shards.iter().zip(results.into_iter()) { match result { Ok(ok) => { if tid.is_shard_zero() { @@ -4795,7 +4795,7 @@ impl Service { .await; let mut valid_until = None; - for r in res { + for (node, r) in res { match r { Ok(lease) => { if let Some(ref mut valid_until) = valid_until { @@ -4805,7 +4805,7 @@ impl Service { } } Err(e) => { - return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + return Err(passthrough_api_error(&node, e)); } } } @@ -4919,7 +4919,7 @@ impl Service { max_retries: u32, timeout: Duration, cancel: &CancellationToken, - ) -> Vec> + ) -> Vec<(Node, mgmt_api::Result)> where O: Fn(TenantShardId, PageserverClient) -> F + Copy, F: std::future::Future>, @@ -4940,16 +4940,16 @@ impl Service { cancel, ) .await; - (idx, r) + (idx, node, r) }); } - while let Some((idx, r)) = futs.next().await { - results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled)))); + while let Some((idx, node, r)) = futs.next().await { + results.push((idx, node, r.unwrap_or(Err(mgmt_api::Error::Cancelled)))); } - results.sort_by_key(|(idx, _)| *idx); - results.into_iter().map(|(_, r)| r).collect() + results.sort_by_key(|(idx, _, _)| *idx); + results.into_iter().map(|(_, node, r)| (node, r)).collect() } /// Helper for safely working with the shards in a tenant remotely on pageservers, for example @@ -5862,7 +5862,7 @@ impl Service { return; } - for result in self + for (_, result) in self .tenant_for_shards_api( attached, |tenant_shard_id, client| async move { @@ -5881,7 +5881,7 @@ impl Service { } } - for result in self + for (_, result) in self .tenant_for_shards_api( secondary, |tenant_shard_id, client| async move { @@ -8768,7 +8768,7 @@ impl Service { ) .await; - for ((tenant_shard_id, node, optimization), secondary_status) in + for ((tenant_shard_id, node, optimization), (_, secondary_status)) in want_secondary_status.into_iter().zip(results.into_iter()) { match secondary_status { From 2edd59aefbcd79288735cbbed335a27880597529 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 9 Jul 2025 23:15:44 +0200 Subject: [PATCH 05/56] impr(compaction): unify checking of `CompactionError` for cancellation reason (#12392) There are a couple of places that call `CompactionError::is_cancel` but don't check the `::Other` variant via 
downcasting for root cause being cancellation. The only place that does it is `log_compaction_error`. It's sad we have to do it, but, until we get around cleaning up all the culprits, a step forward is to unify the behavior so that all places that inspect a `CompactionError` for cancellation reason follow the same behavior. Thus, this PR ... - moves the downcasting checks against the `::Other` variant from `log_compaction_error` into `is_cancel()` and - enforces via type system that `.is_cancel()` is used to check whether a CompactionError is due to cancellation (matching on the `CompactionError::ShuttingDown` will cause a compile-time error). I don't think there's a _serious_ case right now where matching instead of using `is_cancel` causes problems. The worst case I could find is the circuit breaker and `compaction_failed`, which don't really matter if we're shutting down the timeline anyway. But it's unaesthetic and might cause log/alert noise down the line, so, this PR fixes that at least. Refs - https://databricks.atlassian.net/browse/LKB-182 - slack conversation about this PR: https://databricks.slack.com/archives/C09254R641L/p1751284317955159 --- pageserver/src/http/routes.rs | 11 +- pageserver/src/tenant.rs | 17 ++- pageserver/src/tenant/tasks.rs | 46 +------- pageserver/src/tenant/timeline.rs | 115 ++++++++++++++----- pageserver/src/tenant/timeline/compaction.rs | 26 ++--- 5 files changed, 117 insertions(+), 98 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3612686b5d..767bba49e2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -79,8 +79,8 @@ use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerNa use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ - CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline, - WaitLsnTimeout, WaitLsnWaiter, import_pgdata, + CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout, + WaitLsnWaiter, import_pgdata, }; use crate::tenant::{ GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, @@ -2500,9 +2500,10 @@ async fn timeline_checkpoint_handler( .compact(&cancel, flags, &ctx) .await .map_err(|e| - match e { - CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Other(e) => ApiError::InternalServerError(e), + if e.is_cancel() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e.into_anyhow()) } )?; } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f576119db8..240ba36236 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3291,7 +3291,7 @@ impl TenantShard { // Ignore this, we likely raced with unarchival. OffloadError::NotArchived => Ok(()), OffloadError::AlreadyInProgress => Ok(()), - OffloadError::Cancelled => Err(CompactionError::ShuttingDown), + OffloadError::Cancelled => Err(CompactionError::new_cancelled()), // don't break the anyhow chain OffloadError::Other(err) => Err(CompactionError::Other(err)), })?; @@ -3321,16 +3321,13 @@ impl TenantShard { /// Trips the compaction circuit breaker if appropriate. 
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { - match err { - err if err.is_cancel() => {} - CompactionError::ShuttingDown => (), - CompactionError::Other(err) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, err); - } + if err.is_cancel() { + return; } + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); } /// Cancel scheduled compaction tasks diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index bcece5589a..08fc7d61a5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -17,17 +17,14 @@ use tracing::*; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; -use utils::sync::gate::GateError; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; -use crate::tenant::blob_io::WriteBlobError; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::{TenantShard, TenantState}; -use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -310,45 +307,12 @@ pub(crate) fn log_compaction_error( task_cancelled: bool, degrade_to_warning: bool, ) { - use CompactionError::*; + let is_cancel = err.is_cancel(); - use crate::tenant::PageReconstructError; - use crate::tenant::upload_queue::NotInitialized; - - let level = match err { - e if e.is_cancel() => return, - ShuttingDown => return, - _ if task_cancelled => Level::INFO, - Other(err) => { - let root_cause = err.root_cause(); - - let upload_queue = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - let timeline = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let buffered_writer_flush_task_canelled = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let write_blob_cancelled = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let gate_closed = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let is_stopping = upload_queue - || timeline - || buffered_writer_flush_task_canelled - || write_blob_cancelled - || gate_closed; - - if is_stopping { - Level::INFO - } else { - Level::ERROR - } - } + let level = if is_cancel || task_cancelled { + Level::INFO + } else { + Level::ERROR }; if let Some((error_count, sleep_duration)) = retry_info { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6088f40669..0a026d288e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1002,7 +1002,7 @@ impl From for tonic::Status { impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { - CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Cancelled => CompactionError::new_cancelled(), CreateImageLayersError::Other(e) => { CompactionError::Other(e.context("create image layers")) } @@ -2117,12 +2117,7 @@ impl Timeline { match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), Err(e) if e.is_cancel() => {} - Err(CompactionError::ShuttingDown) => { - // Covered by the `Err(e) if e.is_cancel()` branch. 
- } - Err(CompactionError::Other(_)) => { - self.compaction_failed.store(true, AtomicOrdering::Relaxed) - } + Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed), }; result @@ -6057,26 +6052,88 @@ impl Drop for Timeline { } } -/// Top-level failure to compact. -#[derive(Debug, thiserror::Error)] -pub(crate) enum CompactionError { - #[error("The timeline or pageserver is shutting down")] - ShuttingDown, - #[error(transparent)] - Other(anyhow::Error), -} +pub(crate) use compaction_error::CompactionError; +/// In a private mod to enforce that [`CompactionError::is_cancel`] is used +/// instead of `match`ing on [`CompactionError::ShuttingDown`]. +mod compaction_error { + use utils::sync::gate::GateError; -impl CompactionError { - /// Errors that can be ignored, i.e., cancel and shutdown. - pub fn is_cancel(&self) -> bool { - matches!(self, Self::ShuttingDown) + use crate::{ + pgdatadir_mapping::CollectKeySpaceError, + tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized}, + virtual_file::owned_buffers_io::write::FlushTaskError, + }; + + /// Top-level failure to compact. Use [`Self::is_cancel`]. + #[derive(Debug, thiserror::Error)] + pub(crate) enum CompactionError { + /// Use [`Self::is_cancel`] instead of checking for this variant. + #[error("The timeline or pageserver is shutting down")] + #[allow(private_interfaces)] + ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`]. + #[error(transparent)] + Other(anyhow::Error), } - pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self { - if err.is_cancel() { - Self::ShuttingDown - } else { - Self::Other(err.into_anyhow()) + #[derive(Debug)] + struct ForbidMatching; + + impl CompactionError { + pub fn new_cancelled() -> Self { + Self::ShuttingDown(ForbidMatching) + } + /// Errors that can be ignored, i.e., cancel and shutdown. + pub fn is_cancel(&self) -> bool { + let other = match self { + CompactionError::ShuttingDown(_) => return true, + CompactionError::Other(other) => other, + }; + + // The write path of compaction in particular often lacks differentiated + // handling errors stemming from cancellation from other errors. + // So, if requested, we also check the ::Other variant by downcasting. + // The list below has been found empirically from flaky tests and production logs. + // The process is simple: on ::Other(), compaction will print the enclosed + // anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the + // line where the write path / compaction code does undifferentiated error handling + // from a non-anyhow type to an anyhow type. Add the type to the list of downcasts + // below, following the same is_cancel() pattern. 
+ + let root_cause = other.root_cause(); + + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let buffered_writer_flush_task_canelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let write_blob_cancelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let gate_closed = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + upload_queue + || timeline + || buffered_writer_flush_task_canelled + || write_blob_cancelled + || gate_closed + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self), + CompactionError::Other(e) => e, + } + } + pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self { + if err.is_cancel() { + Self::new_cancelled() + } else { + Self::Other(err.into_anyhow()) + } } } } @@ -6088,7 +6145,7 @@ impl From for CompactionError { CompactionError::Other(anyhow::anyhow!(value)) } super::upload_queue::NotInitialized::ShuttingDown - | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, + | super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(), } } } @@ -6098,7 +6155,7 @@ impl From for CompactionError { match e { super::storage_layer::layer::DownloadError::TimelineShutdown | super::storage_layer::layer::DownloadError::DownloadCancelled => { - CompactionError::ShuttingDown + CompactionError::new_cancelled() } super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | super::storage_layer::layer::DownloadError::DownloadRequired @@ -6117,14 +6174,14 @@ impl From for CompactionError { impl From for CompactionError { fn from(_: layer_manager::Shutdown) -> Self { - CompactionError::ShuttingDown + CompactionError::new_cancelled() } } impl From for CompactionError { fn from(e: super::storage_layer::errors::PutError) -> Self { if e.is_cancel() { - CompactionError::ShuttingDown + CompactionError::new_cancelled() } else { CompactionError::Other(e.into_anyhow()) } @@ -6223,7 +6280,7 @@ impl Timeline { let mut guard = tokio::select! { guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard, _ = self.cancel.cancelled() => { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } }; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index c263df1eb2..18a0ca852d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -572,8 +572,8 @@ impl GcCompactionQueue { } match res { Ok(res) => Ok(res), - Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), - Err(CompactionError::Other(_)) => { + Err(e) if e.is_cancel() => Err(e), + Err(_) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction @@ -1260,7 +1260,7 @@ impl Timeline { // Is the timeline being deleted? 
if self.is_stopping() { trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let target_file_size = self.get_checkpoint_distance(); @@ -1624,7 +1624,7 @@ impl Timeline { for (i, layer) in layers_to_rewrite.into_iter().enumerate() { if self.cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total); @@ -1722,7 +1722,7 @@ impl Timeline { Ok(()) => {}, Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } }, // Don't wait if there's L0 compaction to do. We don't need to update the outcome @@ -1985,7 +1985,7 @@ impl Timeline { let mut all_keys = Vec::new(); for l in deltas_to_compact.iter() { if self.cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; let keys = delta @@ -2078,7 +2078,7 @@ impl Timeline { stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); if self.cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); @@ -2186,7 +2186,7 @@ impl Timeline { // avoid hitting the cancellation token on every key. in benches, we end up // shuffling an order of million keys per layer, this means we'll check it // around tens of times per layer. - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let same_key = prev_key == Some(key); @@ -2271,7 +2271,7 @@ impl Timeline { if writer.is_none() { if self.cancel.is_cancelled() { // to be somewhat responsive to cancellation, check for each new layer - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } // Create writer if not initiaized yet writer = Some( @@ -2527,7 +2527,7 @@ impl Timeline { // Is the timeline being deleted? if self.is_stopping() { trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let (dense_ks, _sparse_ks) = self @@ -3189,7 +3189,7 @@ impl Timeline { let gc_lock = async { tokio::select! { guard = self.gc_lock.lock() => Ok(guard), - _ = cancel.cancelled() => Err(CompactionError::ShuttingDown), + _ = cancel.cancelled() => Err(CompactionError::new_cancelled()), } }; @@ -3462,7 +3462,7 @@ impl Timeline { } total_layer_size += layer.layer_desc().file_size; if cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let should_yield = yield_for_l0 && self @@ -3609,7 +3609,7 @@ impl Timeline { } if cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let should_yield = yield_for_l0 From 13e38a58a14c60da94486904d60a8b9e8e391503 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 9 Jul 2025 16:35:39 -0500 Subject: [PATCH 06/56] Grant pg_signal_backend to neon_superuser (#12533) Allow neon_superuser to cancel backends from non-neon_superusers, excluding Postgres superusers. 
Signed-off-by: Tristan Partin Co-authored-by: Vikas Jain --- ...nt_pg_signal_backend_to_neon_superuser.sql | 1 + ...nt_pg_signal_backend_to_neon_superuser.sql | 23 +++++++++++++++++++ compute_tools/src/spec.rs | 1 + 3 files changed, 25 insertions(+) create mode 100644 compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql new file mode 100644 index 0000000000..36e31544be --- /dev/null +++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql new file mode 100644 index 0000000000..e62b742d30 --- /dev/null +++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql @@ -0,0 +1,23 @@ +DO $$ +DECLARE + signal_backend record; +BEGIN + SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, + admin_option AS admin + INTO signal_backend + FROM pg_auth_members + WHERE roleid = 'pg_signal_backend'::regrole + AND member = 'neon_superuser'::regrole; + + IF signal_backend IS NULL THEN + RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend'; + END IF; + + IF signal_backend.member IS NULL OR NOT signal_backend.member THEN + RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend'; + END IF; + + IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN + RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend'; + END IF; +END $$; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 43cfbb48f7..b6382b2f56 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> { include_str!( "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql" ), + include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"), ]; MigrationRunner::new(client, &migrations) From 1a45b2ec900b37c40608ca94a1d9d37cad12fce8 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 10 Jul 2025 10:06:33 +0200 Subject: [PATCH 07/56] Review security model for executing Event Trigger code. (#12463) When a function is owned by a superuser (bootstrap user or otherwise), we consider it safe to run it. Only a superuser could have installed it, typically from CREATE EXTENSION script: we trust the code to execute. ## Problem This is intended to solve running pg_graphql Event Triggers graphql_watch_ddl and graphql_watch_drop which are executing the secdef function graphql.increment_schema_version(). ## Summary of changes Allow executing Event Trigger function owned by a superuser and with SECURITY DEFINER properties. The Event Trigger code runs with superuser privileges, and we consider that it's fine. 
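The resulting policy is easy to state as a function of two booleans. A
sketch of the decision logic only, in Rust pseudocode with illustrative
names (the actual implementation is the C fmgr hook in
`pgxn/neon/neon_ddl_handler.c`):

```rust
/// Sketch of the event-trigger firing policy after this change.
/// Returns true if the trigger function should run; per the removed
/// comment, the hook cannot skip execution directly, so a skipped
/// function is swapped for a no-op instead.
fn fires(current_user_is_super: bool, owner_is_super: bool) -> bool {
    // Functions owned by a superuser are trusted regardless of who is
    // connected: only a superuser could have installed them, typically
    // from a CREATE EXTENSION script. The one remaining refusal is a
    // superuser session running a non-superuser's function, keeping
    // user-defined code out of infrastructure maintenance operations.
    !(current_user_is_super && !owner_is_super)
}

fn main() {
    assert!(fires(false, true)); // the case this patch newly allows
    assert!(fires(true, true));
    assert!(fires(false, false));
    assert!(!fires(true, false)); // still skipped (forced to a noop)
}
```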
--------- Co-authored-by: Tristan Partin --- pgxn/neon/neon_ddl_handler.c | 33 +----- test_runner/fixtures/neon_fixtures.py | 27 +++++ .../test_event_trigger_extension--1.0.sql | 32 ++++++ .../test_event_trigger_extension.control | 8 ++ .../regress/test_download_extensions.py | 22 ---- .../regress/test_event_trigger_extension.py | 102 ++++++++++++++++++ 6 files changed, 174 insertions(+), 50 deletions(-) create mode 100644 test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql create mode 100644 test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control create mode 100644 test_runner/regress/test_event_trigger_extension.py diff --git a/pgxn/neon/neon_ddl_handler.c b/pgxn/neon/neon_ddl_handler.c index 2ce7b0086b..1f03e52c67 100644 --- a/pgxn/neon/neon_ddl_handler.c +++ b/pgxn/neon/neon_ddl_handler.c @@ -953,7 +953,9 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) /* * Fire Event Trigger if both function owner and current user are - * superuser, or none of them are. + * superuser. Allow executing Event Trigger function that belongs to a + * superuser when connected as a non-superuser, even when the function is + * SECURITY DEFINER. */ else if (event == FHET_START /* still enable it to pass pg_regress tests */ @@ -976,32 +978,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) function_is_owned_by_super = superuser_arg(function_owner); /* - * 1. Refuse to run SECURITY DEFINER function that belongs to a - * superuser when the current user is not a superuser itself. - */ - if (!role_is_super - && function_is_owned_by_super - && function_is_secdef) - { - char *func_name = get_func_name(flinfo->fn_oid); - - ereport(WARNING, - (errmsg("Skipping Event Trigger"), - errdetail("Event Trigger function \"%s\" is owned by \"%s\" " - "and is SECURITY DEFINER", - func_name, - GetUserNameFromId(function_owner, false)))); - - /* - * we can't skip execution directly inside the fmgr_hook so - * instead we change the event trigger function to a noop - * function. - */ - force_noop(flinfo); - } - - /* - * 2. Refuse to run functions that belongs to a non-superuser when the + * Refuse to run functions that belongs to a non-superuser when the * current user is a superuser. * * We could run a SECURITY DEFINER user-function here and be safe with @@ -1009,7 +986,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) * infrastructure maintenance operations, where we prefer to skip * running user-defined code. */ - else if (role_is_super && !function_is_owned_by_super) + if (role_is_super && !function_is_owned_by_super) { char *func_name = get_func_name(flinfo->fn_oid); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f54d5be635..42924f9b83 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1795,6 +1795,33 @@ def neon_env_builder( record_property("preserve_database_files", builder.preserve_database_files) +@pytest.fixture(scope="function") +def neon_env_builder_local( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_distrib_dir: Path, +) -> NeonEnvBuilder: + """ + Fixture to create a Neon environment for test with its own pg_install copy. 
+ + This allows the test to edit the list of available extensions in the + local instance of Postgres used for the test, and install extensions via + downloading them when a remote extension is tested, for instance, or + copying files around for local extension testing. + """ + test_local_pginstall = test_output_dir / "pg_install" + log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}") + + # We can't copy only the version that we are currently testing because other + # binaries like the storage controller need specific Postgres versions. + shutil.copytree(pg_distrib_dir, test_local_pginstall) + + neon_env_builder.pg_distrib_dir = test_local_pginstall + log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}") + + return neon_env_builder + + @dataclass class PageserverPort: pg: int diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql new file mode 100644 index 0000000000..2b82102802 --- /dev/null +++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql @@ -0,0 +1,32 @@ +\echo Use "CREATE EXTENSION test_event_trigger_extension" to load this file. \quit + +CREATE SCHEMA event_trigger; + +create sequence if not exists event_trigger.seq_schema_version as int cycle; + +create or replace function event_trigger.increment_schema_version() + returns event_trigger + security definer + language plpgsql +as $$ +begin + perform pg_catalog.nextval('event_trigger.seq_schema_version'); +end; +$$; + +create or replace function event_trigger.get_schema_version() + returns int + security definer + language sql +as $$ + select last_value from event_trigger.seq_schema_version; +$$; + +-- On DDL event, increment the schema version number +create event trigger event_trigger_watch_ddl + on ddl_command_end + execute procedure event_trigger.increment_schema_version(); + +create event trigger event_trigger_watch_drop + on sql_drop + execute procedure event_trigger.increment_schema_version(); diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control new file mode 100644 index 0000000000..4fe8c3341b --- /dev/null +++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control @@ -0,0 +1,8 @@ +default_version = '1.0' +comment = 'Test extension with Event Trigger' + +# make sure the extension objects are owned by the bootstrap user +# to check that the SECURITY DEFINER event trigger function is still +# called during non-superuser DDL events. 
+superuser = true +trusted = true diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index fe3b220c67..d7f78afac8 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -2,7 +2,6 @@ from __future__ import annotations import os import platform -import shutil import tarfile from enum import StrEnum from pathlib import Path @@ -31,27 +30,6 @@ if TYPE_CHECKING: from werkzeug.wrappers.request import Request -# use neon_env_builder_local fixture to override the default neon_env_builder fixture -# and use a test-specific pg_install instead of shared one -@pytest.fixture(scope="function") -def neon_env_builder_local( - neon_env_builder: NeonEnvBuilder, - test_output_dir: Path, - pg_distrib_dir: Path, -) -> NeonEnvBuilder: - test_local_pginstall = test_output_dir / "pg_install" - log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}") - - # We can't copy only the version that we are currently testing because other - # binaries like the storage controller need specific Postgres versions. - shutil.copytree(pg_distrib_dir, test_local_pginstall) - - neon_env_builder.pg_distrib_dir = test_local_pginstall - log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}") - - return neon_env_builder - - @final class RemoteExtension(StrEnum): SQL_ONLY = "test_extension_sql_only" diff --git a/test_runner/regress/test_event_trigger_extension.py b/test_runner/regress/test_event_trigger_extension.py new file mode 100644 index 0000000000..ac4351dcd5 --- /dev/null +++ b/test_runner/regress/test_event_trigger_extension.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import shutil +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import pytest +from fixtures.log_helper import log +from fixtures.paths import BASE_DIR + +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.neon_fixtures import ( + NeonEnvBuilder, + ) + from fixtures.pg_version import PgVersion + + +# use neon_env_builder_local fixture to override the default neon_env_builder fixture +# and use a test-specific pg_install instead of shared one +@pytest.fixture(scope="function") +def neon_env_builder_event_trigger_extension( + neon_env_builder_local: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, +) -> NeonEnvBuilder: + test_local_pginstall = test_output_dir / "pg_install" + + # Now copy the SQL only extension test_event_trigger_extension in the local + # pginstall extension directory on-disk + test_event_trigger_extension_dir = ( + BASE_DIR / "test_runner" / "regress" / "data" / "test_event_trigger_extension" + ) + + test_local_extension_dir = ( + test_local_pginstall / f"v{pg_version}" / "share" / "postgresql" / "extension" + ) + + log.info(f"copy {test_event_trigger_extension_dir} to {test_local_extension_dir}") + + for f in [ + test_event_trigger_extension_dir / "test_event_trigger_extension.control", + test_event_trigger_extension_dir / "test_event_trigger_extension--1.0.sql", + ]: + shutil.copy(f, test_local_extension_dir) + + return neon_env_builder_local + + +def test_event_trigger_extension(neon_env_builder_event_trigger_extension: NeonEnvBuilder): + """ + Test installing an extension that contains an Event Trigger. + + The Event Trigger function is owned by the extension owner, which at + CREATE EXTENSION is going to be the Postgres bootstrap user, per the + extension control file where both superuser = true and trusted = true. 
Also this function is SECURITY DEFINER, to allow for making changes to
+    the extension SQL objects, in our case a sequence.
+
+    This test makes sure that the event trigger function is fired correctly
+    by non-privileged user DDL actions such as CREATE TABLE.
+    """
+    env = neon_env_builder_event_trigger_extension.init_start()
+    env.create_branch("test_event_trigger_extension")
+
+    endpoint = env.endpoints.create_start("test_event_trigger_extension")
+    extension = "test_event_trigger_extension"
+    database = "test_event_trigger_extension"
+
+    endpoint.safe_psql(f"CREATE DATABASE {database}")
+    endpoint.safe_psql(f"CREATE EXTENSION {extension}", dbname=database)
+
+    # check that the extension is owned by the bootstrap superuser (cloud_admin)
+    pg_bootstrap_superuser_name = "cloud_admin"
+    with endpoint.connect(dbname=database) as pg_conn:
+        with pg_conn.cursor() as cur:
+            cur.execute(
+                f"select rolname from pg_roles r join pg_extension e on r.oid = e.extowner where extname = '{extension}'"
+            )
+            owner = cast("tuple[str]", cur.fetchone())[0]
+            assert owner == pg_bootstrap_superuser_name, (
+                f"extension {extension} is not owned by bootstrap user '{pg_bootstrap_superuser_name}'"
+            )
+
+    # test that the SQL-only Event Trigger (SECURITY DEFINER function) runs
+    # correctly now that the extension has been installed
+    #
+    # create table to trigger the event trigger, twice, check sequence count
+    with endpoint.connect(dbname=database) as pg_conn:
+        log.info("creating SQL objects (tables)")
+        with pg_conn.cursor() as cur:
+            cur.execute("CREATE TABLE foo1(id int primary key)")
+            cur.execute("CREATE TABLE foo2(id int)")
+
+            cur.execute("SELECT event_trigger.get_schema_version()")
+            res = cast("tuple[int]", cur.fetchone())
+            ver = res[0]
+
+            log.info(f"schema version is now {ver}")
+            assert ver == 2, "schema version is not 2"

From 08b19f001c77afbcd3fbde06a11e495d6222967a Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Thu, 10 Jul 2025 11:07:21 +0100
Subject: [PATCH 08/56] pageserver: optionally force image layer creation on
 timeout (#12529)

This PR introduces an `image_creation_timeout` knob on pageservers so that
we can force image creation after a certain period. This is set to 1 day on
dev/staging for now, and will roll out to production 1-2 weeks later.

The majority of the PR is boilerplate code to add the new knob. The
specific changes are:
1. During L0 compaction, force a compaction if min(LSN) of all delta
layers < the force_image_creation LSN.
2. During image creation, force image creation if the image's LSN < the
force_image_creation LSN and there are newer deltas with overlapping key
ranges.
3. Tweak the image creation check interval to make sure we honor
image_creation_timeout.

Vlad's note: This should be a no-op. I added an extra PS config for the
large timeline threshold to enable this.
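In essence, the knob turns a wall-clock age limit into an LSN watermark. A
condensed sketch of the two checks described above (illustrative names; the
real code caches the computed watermark for ten minutes, uses
`find_lsn_for_timestamp`, and falls back to the GC cutoff LSN when no LSN
matches the timestamp):

```rust
use std::time::{Duration, SystemTime};

/// Watermark: image coverage at or below this LSN is "old enough" to force images.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
struct Lsn(u64);

/// Translate `now - image_layer_force_creation_period` into an LSN watermark.
fn force_image_creation_lsn(
    lsn_for_timestamp: impl Fn(SystemTime) -> Lsn,
    period: Duration,
) -> Lsn {
    let cutoff_time = SystemTime::now() - period;
    lsn_for_timestamp(cutoff_time)
}

/// Force image creation for a partition whose least-recent image coverage
/// predates the watermark and which still has deltas stacked on top of it.
fn should_force_image(min_image_lsn: Lsn, max_deltas: usize, watermark: Lsn) -> bool {
    min_image_lsn < watermark && max_deltas > 0
}

fn main() {
    // Stand-in for Timeline::find_lsn_for_timestamp, with a one-day timeout.
    let watermark = force_image_creation_lsn(|_| Lsn(1000), Duration::from_secs(24 * 3600));
    assert!(should_force_image(Lsn(500), 3, watermark)); // stale image + deltas
    assert!(!should_force_image(Lsn(500), 0, watermark)); // nothing to reimage
}
```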
--------- Co-authored-by: Chen Luo --- control_plane/src/pageserver.rs | 6 ++ libs/pageserver_api/src/config.rs | 9 ++ libs/pageserver_api/src/models.rs | 18 ++++ pageserver/src/config.rs | 6 ++ pageserver/src/tenant.rs | 9 ++ pageserver/src/tenant/timeline.rs | 68 ++++++++++-- pageserver/src/tenant/timeline/compaction.rs | 100 ++++++++++++++++-- .../regress/test_attach_tenant_config.py | 1 + test_runner/regress/test_compaction.py | 75 +++++++++++++ 9 files changed, 275 insertions(+), 17 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3f66960edd..3673d1f4f2 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -452,6 +452,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, + // HADRON + image_layer_force_creation_period: settings + .remove("image_layer_force_creation_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'image_layer_force_creation_period' as duration")?, image_layer_creation_check_threshold: settings .remove("image_layer_creation_check_threshold") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 22815955c1..9e9c7a4dcb 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -272,6 +272,8 @@ pub struct ConfigToml { pub timeline_import_config: TimelineImportConfig, #[serde(skip_serializing_if = "Option::is_none")] pub basebackup_cache_config: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_layer_generation_large_timeline_threshold: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -561,6 +563,11 @@ pub struct TenantConfigToml { pub gc_period: Duration, // Delta layer churn threshold to create L1 image layers. pub image_creation_threshold: usize, + // HADRON + // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and + // (2) create image layers if there are any L1 deltas. + #[serde(with = "humantime_serde")] + pub image_layer_force_creation_period: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is time. 
@@ -823,6 +830,7 @@ impl Default for ConfigToml { }, basebackup_cache_config: None, posthog_config: None, + image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024), } } } @@ -916,6 +924,7 @@ impl Default for TenantConfigToml { gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + image_layer_force_creation_period: None, pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) .expect("cannot parse default PITR interval"), walreceiver_connect_timeout: humantime::parse_duration( diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6735320484..56dd95eab3 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -597,6 +597,9 @@ pub struct TenantConfigPatch { pub gc_period: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub image_creation_threshold: FieldPatch, + // HADRON + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_layer_force_creation_period: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub pitr_interval: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] @@ -700,6 +703,11 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub image_creation_threshold: Option, + // HADRON + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + pub image_layer_force_creation_period: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub pitr_interval: Option, @@ -798,6 +806,7 @@ impl TenantConfig { mut gc_horizon, mut gc_period, mut image_creation_threshold, + mut image_layer_force_creation_period, mut pitr_interval, mut walreceiver_connect_timeout, mut lagging_wal_timeout, @@ -861,6 +870,11 @@ impl TenantConfig { patch .image_creation_threshold .apply(&mut image_creation_threshold); + // HADRON + patch + .image_layer_force_creation_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut image_layer_force_creation_period); patch .pitr_interval .map(|v| humantime::parse_duration(&v))? @@ -942,6 +956,7 @@ impl TenantConfig { gc_horizon, gc_period, image_creation_threshold, + image_layer_force_creation_period, pitr_interval, walreceiver_connect_timeout, lagging_wal_timeout, @@ -1016,6 +1031,9 @@ impl TenantConfig { image_creation_threshold: self .image_creation_threshold .unwrap_or(global_conf.image_creation_threshold), + image_layer_force_creation_period: self + .image_layer_force_creation_period + .or(global_conf.image_layer_force_creation_period), pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), walreceiver_connect_timeout: self .walreceiver_connect_timeout diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 15ec31b0a6..f64c5838ff 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -252,6 +252,10 @@ pub struct PageServerConf { pub timeline_import_config: pageserver_api::config::TimelineImportConfig, pub basebackup_cache_config: Option, + + /// Defines what is a big tenant for the purpose of image layer generation. 
+ /// See Timeline::should_check_if_image_layers_required + pub image_layer_generation_large_timeline_threshold: Option, } /// Token for authentication to safekeepers @@ -432,6 +436,7 @@ impl PageServerConf { posthog_config, timeline_import_config, basebackup_cache_config, + image_layer_generation_large_timeline_threshold, } = config_toml; let mut conf = PageServerConf { @@ -490,6 +495,7 @@ impl PageServerConf { dev_mode, timeline_import_config, basebackup_cache_config, + image_layer_generation_large_timeline_threshold, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 240ba36236..7e2e6d96b8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4171,6 +4171,15 @@ impl TenantShard { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + // HADRON + pub fn get_image_creation_timeout(&self) -> Option { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf.image_layer_force_creation_period.or(self + .conf + .default_tenant_conf + .image_layer_force_creation_period) + } + pub fn get_pitr_interval(&self) -> Duration { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0a026d288e..a9bc0a060b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -351,6 +351,13 @@ pub struct Timeline { last_image_layer_creation_check_at: AtomicLsn, last_image_layer_creation_check_instant: std::sync::Mutex>, + // HADRON + /// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation + /// on this key range. + force_image_creation_lsn: AtomicLsn, + /// The last time instant when force_image_creation_lsn is computed. + force_image_creation_lsn_computed_at: std::sync::Mutex>, + /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -2846,6 +2853,18 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + // HADRON + fn get_image_creation_timeout(&self) -> Option { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_layer_force_creation_period + .or(self + .conf + .default_tenant_conf + .image_layer_force_creation_period) + } + fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings { let tenant_conf = &self.tenant_conf.load(); tenant_conf @@ -3115,7 +3134,9 @@ impl Timeline { repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), last_image_layer_creation_check_instant: Mutex::new(None), - + // HADRON + force_image_creation_lsn: AtomicLsn::new(0), + force_image_creation_lsn_computed_at: std::sync::Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_latest_cache: RwLock::new(HashMap::new()), rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)), @@ -5036,6 +5057,7 @@ impl Timeline { .create_image_layers( &partitions, self.initdb_lsn, + None, ImageLayerCreationMode::Initial, ctx, LastImageLayerCreationStatus::Initial, @@ -5307,14 +5329,19 @@ impl Timeline { } // Is it time to create a new image layer for the given partition? True if we want to generate. 
- async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { + async fn time_for_new_image_layer( + &self, + partition: &KeySpace, + lsn: Lsn, + force_image_creation_lsn: Option, + ) -> bool { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; let Ok(layers) = guard.layer_map() else { return false; }; - + let mut min_image_lsn: Lsn = Lsn::MAX; let mut max_deltas = 0; for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn); @@ -5349,9 +5376,22 @@ impl Timeline { return true; } } + min_image_lsn = min(min_image_lsn, img_lsn); } } + // HADRON + if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 { + info!( + "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}", + partition.ranges[0].start, + partition.ranges[0].end, + min_image_lsn, + force_image_creation_lsn.unwrap() + ); + return true; + } + debug!( max_deltas, "none of the partitioned ranges had >= {threshold} deltas" @@ -5577,7 +5617,7 @@ impl Timeline { /// suffer from the lack of image layers /// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { - const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold; let last_checks_at = self.last_image_layer_creation_check_at.load(); let distance = lsn @@ -5591,12 +5631,12 @@ impl Timeline { let mut time_based_decision = false; let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { - let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD - { - self.get_checkpoint_timeout() - } else { - Duration::from_secs(3600 * 48) - }; + let check_required_after = + if Some(Into::::into(&logical_size)) >= large_timeline_threshold { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; time_based_decision = match *last_check_instant { Some(last_check) => { @@ -5624,10 +5664,12 @@ impl Timeline { /// true = we have generate all image layers, false = we preempt the process for L0 compaction. /// /// `partition_mode` is only for logging purpose and is not used anywhere in this function. + #[allow(clippy::too_many_arguments)] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, + force_image_creation_lsn: Option, mode: ImageLayerCreationMode, ctx: &RequestContext, last_status: LastImageLayerCreationStatus, @@ -5731,7 +5773,11 @@ impl Timeline { } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate - if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + if !check_for_image_layers + || !self + .time_for_new_image_layer(partition, lsn, force_image_creation_lsn) + .await + { start = img_range.end; continue; } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 18a0ca852d..171f9d1284 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,10 +4,11 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. 
+use std::cmp::min; use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::{Duration, Instant, SystemTime}; use super::layer_manager::LayerManagerLockHolder; use super::{ @@ -33,6 +34,7 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange}; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use pageserver_compaction::helpers::{fully_contains, overlaps_with}; use pageserver_compaction::interface::*; +use postgres_ffi::to_pg_timestamp; use serde::Serialize; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; @@ -45,6 +47,7 @@ use wal_decoder::models::value::Value; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::pgdatadir_mapping::LsnForTimestamp; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -1267,6 +1270,12 @@ impl Timeline { // Define partitioning schema if needed + // HADRON + let force_image_creation_lsn = self + .get_or_compute_force_image_creation_lsn(cancel, ctx) + .await + .map_err(CompactionError::Other)?; + // 1. L0 Compact let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); @@ -1274,6 +1283,7 @@ impl Timeline { .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), + force_image_creation_lsn, ctx, ) .await?; @@ -1376,6 +1386,7 @@ impl Timeline { .create_image_layers( &partitioning, lsn, + force_image_creation_lsn, mode, &image_ctx, self.last_image_layer_creation_status @@ -1472,6 +1483,63 @@ impl Timeline { Ok(CompactionOutcome::Done) } + /* BEGIN_HADRON */ + // Get the force image creation LSN. Compute it if the last computed LSN is too old. + async fn get_or_compute_force_image_creation_lsn( + self: &Arc, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> anyhow::Result> { + const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes + let image_layer_force_creation_period = self.get_image_creation_timeout(); + if image_layer_force_creation_period.is_none() { + return Ok(None); + } + + let image_layer_force_creation_period = image_layer_force_creation_period.unwrap(); + let force_image_creation_lsn_computed_at = + *self.force_image_creation_lsn_computed_at.lock().unwrap(); + if force_image_creation_lsn_computed_at.is_none() + || force_image_creation_lsn_computed_at.unwrap().elapsed() + > FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL + { + let now: SystemTime = SystemTime::now(); + let timestamp = now + .checked_sub(image_layer_force_creation_period) + .ok_or_else(|| { + anyhow::anyhow!( + "image creation timeout is too large: {image_layer_force_creation_period:?}" + ) + })?; + let timestamp = to_pg_timestamp(timestamp); + let force_image_creation_lsn = match self + .find_lsn_for_timestamp(timestamp, cancel, ctx) + .await? 
+            {
+                LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn,
+                _ => {
+                    let gc_lsn = *self.get_applied_gc_cutoff_lsn();
+                    tracing::info!(
+                        "no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}",
+                        gc_lsn
+                    );
+                    gc_lsn
+                }
+            };
+            self.force_image_creation_lsn
+                .store(force_image_creation_lsn);
+            *self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now());
+            tracing::info!(
+                "computed force image creation LSN: {}",
+                force_image_creation_lsn
+            );
+            Ok(Some(force_image_creation_lsn))
+        } else {
+            Ok(Some(self.force_image_creation_lsn.load()))
+        }
+    }
+    /* END_HADRON */
+
     /// Check for layers that are eligible to be rewritten:
     /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
     ///   we don't indefinitely retain keys in this shard that aren't needed.
@@ -1801,6 +1869,7 @@ impl Timeline {
         self: &Arc<Self>,
         target_file_size: u64,
         force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<CompactionOutcome, CompactionError> {
         let CompactLevel0Phase1Result {
@@ -1821,6 +1890,7 @@ impl Timeline {
             stats,
             target_file_size,
             force_compaction_ignore_threshold,
+            force_compaction_lsn,
             &ctx,
         )
         .instrument(phase1_span)
@@ -1843,6 +1913,7 @@ impl Timeline {
         mut stats: CompactLevel0Phase1StatsBuilder,
         target_file_size: u64,
         force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<CompactLevel0Phase1Result, CompactionError> {
         let begin = tokio::time::Instant::now();
@@ -1872,11 +1943,28 @@ impl Timeline {
                 return Ok(CompactLevel0Phase1Result::default());
             }
         } else {
-            debug!(
-                level0_deltas = level0_deltas.len(),
-                threshold, "too few deltas to compact"
-            );
-            return Ok(CompactLevel0Phase1Result::default());
+            // HADRON
+            let min_lsn = level0_deltas
+                .iter()
+                .map(|a| a.get_lsn_range().start)
+                .reduce(min);
+            if force_compaction_lsn.is_some()
+                && min_lsn.is_some()
+                && min_lsn.unwrap() < force_compaction_lsn.unwrap()
+            {
+                info!(
+                    "forcing L0 compaction of {} L0 deltas. 
Min lsn: {}, force compaction lsn: {}", + level0_deltas.len(), + min_lsn.unwrap(), + force_compaction_lsn.unwrap() + ); + } else { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } } } diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7788faceb4..eaaa3014a5 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -165,6 +165,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "image_creation_threshold": 7, + "image_layer_force_creation_period": "1m", "pitr_interval": "1m", "lagging_wal_timeout": "23m", "lazy_slru_download": True, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 1570d40ae9..e67161c6b7 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -944,3 +944,78 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)" ) assert res[0][0] == 1 + + +# BEGIN_HADRON +def get_layer_map(env, tenant_shard_id, timeline_id, ps_id): + client = env.pageservers[ps_id].http_client() + layer_map = client.layer_map_info(tenant_shard_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + return image_layer_count, delta_layer_count + + +def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder): + """ + Tests that page server can force creating new images if image creation timeout is enabled + """ + # use large knobs to disable L0 compaction/image creation except for the force image creation + tenant_conf = { + "compaction_threshold": "100", + "image_creation_threshold": "100", + "image_layer_creation_check_threshold": "1", + "checkpoint_distance": 10 * 1024, + "checkpoint_timeout": "1s", + "image_layer_force_creation_period": "1s", + # The lsn for forced image layer creations is calculated once every 10 minutes. + # Hence, drive compaction manually such that the test doesn't compute it at the + # wrong time. + "compaction_period": "0s", + } + + # consider every tenant large to run the image layer generation check more eagerly + neon_env_builder.pageserver_config_override = ( + "image_layer_generation_large_timeline_threshold=0" + ) + + neon_env_builder.num_pageservers = 1 + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate some rows. + for v in range(10): + endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))") + + # Sleep a bit such that the inserts are considered when calculating the forced image layer creation LSN. 
+ time.sleep(2) + + def check_force_image_creation(): + ps_http = env.pageserver.http_client() + ps_http.timeline_compact(tenant_id, timeline_id) + image, delta = get_layer_map(env, tenant_id, timeline_id, 0) + log.info(f"images: {image}, deltas: {delta}") + assert image > 0 + + env.pageserver.assert_log_contains("forcing L0 compaction of") + env.pageserver.assert_log_contains("forcing image creation for partitioned range") + + wait_until(check_force_image_creation) + + endpoint.stop_and_destroy() + + env.pageserver.allowed_errors.append( + ".*created delta file of size.*larger than double of target.*" + ) + + +# END_HADRON From f4b03ddd7b4cb858430b16d642f5f80f11e8b5b1 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 10 Jul 2025 12:18:37 +0200 Subject: [PATCH 09/56] pageserver/client_grpc: reap idle pool resources (#12476) ## Problem The gRPC client pools don't reap idle resources. Touches #11735. Requires #12475. ## Summary of changes Reap idle pool resources (channels/clients/streams) after 3 minutes of inactivity. Also restructure the `StreamPool` to use a mutex rather than atomics for synchronization, for simplicity. This will be optimized later. --- Cargo.lock | 1 + pageserver/client_grpc/Cargo.toml | 4 + pageserver/client_grpc/src/pool.rs | 241 ++++++++++++++++++++++++----- 3 files changed, 207 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index caed814d5f..4150944ad0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4506,6 +4506,7 @@ dependencies = [ "pageserver_page_api", "tokio", "tokio-stream", + "tokio-util", "tonic 0.13.1", "tracing", "utils", diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml index 84e27abb84..ca224900ac 100644 --- a/pageserver/client_grpc/Cargo.toml +++ b/pageserver/client_grpc/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = ["pageserver_api/testing"] + [dependencies] anyhow.workspace = true bytes.workspace = true @@ -13,6 +16,7 @@ pageserver_api.workspace = true pageserver_page_api.workspace = true tokio.workspace = true tokio-stream.workspace = true +tokio-util.workspace = true tonic.workspace = true tracing.workspace = true utils.workspace = true diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 5a50004fd1..89b3bd646f 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -34,10 +34,12 @@ use std::num::NonZero; use std::ops::{Deref, DerefMut}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant}; use futures::StreamExt as _; use tokio::sync::mpsc::{Receiver, Sender}; use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use tokio_util::sync::CancellationToken; use tonic::transport::{Channel, Endpoint}; use tracing::{error, warn}; @@ -45,6 +47,25 @@ use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; +/// Reap channels/clients/streams that have been idle for this long. +/// +/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to +/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle. +/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we +/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to +/// account for it when setting the reap threshold. 
Alternatively, we can immediately reap empty +/// channels, and/or stream pool clients. +const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) { + false => Duration::from_secs(180), + true => Duration::from_secs(1), // exercise reaping in tests +}; + +/// Reap idle resources with this interval. +const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) { + false => Duration::from_secs(10), + true => Duration::from_secs(1), // exercise reaping in tests +}; + /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2 /// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this. /// The pool does not limit the number of channels, and instead relies on `ClientPool` or @@ -52,7 +73,6 @@ use utils::shard::ShardIndex; /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. /// -/// TODO: reap idle channels. /// TODO: consider prewarming a set of channels, to avoid initial connection latency. /// TODO: consider adding a circuit breaker for errors and fail fast. pub struct ChannelPool { @@ -62,6 +82,8 @@ pub struct ChannelPool { max_clients_per_channel: NonZero, /// Open channels. channels: Mutex>, + /// Reaps idle channels. + idle_reaper: Reaper, /// Channel ID generator. next_channel_id: AtomicUsize, } @@ -73,6 +95,9 @@ struct ChannelEntry { channel: Channel, /// Number of clients using this channel. clients: usize, + /// The channel has been idle (no clients) since this time. None if channel is in use. + /// INVARIANT: Some if clients == 0, otherwise None. + idle_since: Option, } impl ChannelPool { @@ -82,12 +107,15 @@ impl ChannelPool { E: TryInto + Send + Sync + 'static, >::Error: std::error::Error + Send + Sync, { - Ok(Arc::new(Self { + let pool = Arc::new(Self { endpoint: endpoint.try_into()?, max_clients_per_channel, channels: Mutex::default(), + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), next_channel_id: AtomicUsize::default(), - })) + }); + pool.idle_reaper.spawn(&pool); + Ok(pool) } /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel. @@ -116,8 +144,14 @@ impl ChannelPool { entry.clients <= self.max_clients_per_channel.get(), "channel overflow" ); + assert_eq!( + entry.idle_since.is_some(), + entry.clients == 0, + "incorrect channel idle state" + ); if entry.clients < self.max_clients_per_channel.get() { entry.clients += 1; + entry.idle_since = None; return ChannelGuard { pool: Arc::downgrade(self), id, @@ -134,6 +168,7 @@ impl ChannelPool { let entry = ChannelEntry { channel: channel.clone(), clients: 1, // account for the guard below + idle_since: None, }; channels.insert(id, entry); @@ -145,6 +180,20 @@ impl ChannelPool { } } +impl Reapable for ChannelPool { + /// Reaps channels that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.channels.lock().unwrap().retain(|_, entry| { + let Some(idle_since) = entry.idle_since else { + assert_ne!(entry.clients, 0, "empty channel not marked idle"); + return true; + }; + assert_eq!(entry.clients, 0, "idle channel has clients"); + idle_since >= cutoff + }) + } +} + /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`, /// since the gRPC client requires an owned `Channel`. 
pub struct ChannelGuard { @@ -167,10 +216,15 @@ impl Drop for ChannelGuard { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; + let mut channels = pool.channels.lock().unwrap(); let entry = channels.get_mut(&self.id).expect("unknown channel"); + assert!(entry.idle_since.is_none(), "active channel marked idle"); assert!(entry.clients > 0, "channel underflow"); entry.clients -= 1; + if entry.clients == 0 { + entry.idle_since = Some(Instant::now()); // mark channel as idle + } } } @@ -179,8 +233,6 @@ impl Drop for ChannelGuard { /// number of concurrent clients to `max_clients` via semaphore. /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. -/// -/// TODO: reap idle clients. pub struct ClientPool { /// Tenant ID. tenant_id: TenantId, @@ -201,6 +253,8 @@ pub struct ClientPool { /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle /// clients are reaped. idle: Mutex>, + /// Reaps idle clients. + idle_reaper: Reaper, /// Unique client ID generator. next_client_id: AtomicUsize, } @@ -212,6 +266,9 @@ struct ClientEntry { client: page_api::Client, /// The channel guard for the channel used by the client. channel_guard: ChannelGuard, + /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by + /// definition, so this is the time when it was added back to the pool. + idle_since: Instant, } impl ClientPool { @@ -226,16 +283,19 @@ impl ClientPool { auth_token: Option, max_clients: Option>, ) -> Arc { - Arc::new(Self { + let pool = Arc::new(Self { tenant_id, timeline_id, shard_id, auth_token, channel_pool, idle: Mutex::default(), + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))), next_client_id: AtomicUsize::default(), - }) + }); + pool.idle_reaper.spawn(&pool); + pool } /// Gets a client from the pool, or creates a new one if necessary. Connections are established @@ -287,6 +347,16 @@ impl ClientPool { } } +impl Reapable for ClientPool { + /// Reaps clients that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.idle + .lock() + .unwrap() + .retain(|_, entry| entry.idle_since >= cutoff) + } +} + /// A client acquired from the pool. The inner client can be accessed via Deref. The client is /// returned to the pool when dropped. pub struct ClientGuard { @@ -317,9 +387,11 @@ impl Drop for ClientGuard { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; + let entry = ClientEntry { client: self.client.take().expect("dropped once"), channel_guard: self.channel_guard.take().expect("dropped once"), + idle_since: Instant::now(), }; pool.idle.lock().unwrap().insert(self.id, entry); @@ -334,7 +406,6 @@ impl Drop for ClientGuard { /// a single request and await the response. Internally, requests are multiplexed across streams and /// channels. This allows proper queue depth enforcement and response routing. /// -/// TODO: reap idle streams. /// TODO: consider making this generic over request and response types; not currently needed. pub struct StreamPool { /// The client pool to acquire clients from. Must be unbounded. @@ -344,7 +415,7 @@ pub struct StreamPool { /// Incoming requests will be sent over an existing stream with available capacity. If all /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). 
Each /// stream has an associated Tokio task that processes requests and responses. - streams: Arc>>, + streams: Mutex>, /// The max number of concurrent streams, or None if unbounded. max_streams: Option>, /// The max number of concurrent requests per stream. @@ -352,6 +423,8 @@ pub struct StreamPool { /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`. /// None if the pool is unbounded. limiter: Option>, + /// Reaps idle streams. + idle_reaper: Reaper, /// Stream ID generator. next_stream_id: AtomicUsize, } @@ -364,9 +437,11 @@ type ResponseSender = oneshot::Sender>; struct StreamEntry { /// Sends caller requests to the stream task. The stream task exits when this is dropped. sender: RequestSender, - /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on - /// completion without acquiring the `StreamPool::streams` lock. - queue_depth: Arc, + /// Number of in-flight requests on this stream. + queue_depth: usize, + /// The time when this stream went idle (queue_depth == 0). + /// INVARIANT: Some if queue_depth == 0, otherwise None. + idle_since: Option, } impl StreamPool { @@ -383,16 +458,19 @@ impl StreamPool { max_queue_depth: NonZero, ) -> Arc { assert!(client_pool.limiter.is_none(), "bounded client pool"); - Arc::new(Self { + let pool = Arc::new(Self { client_pool, - streams: Arc::default(), + streams: Mutex::default(), limiter: max_streams.map(|max_streams| { Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get())) }), max_streams, max_queue_depth, + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), next_stream_id: AtomicUsize::default(), - }) + }); + pool.idle_reaper.spawn(&pool); + pool } /// Acquires an available stream from the pool, or spins up a new stream async if all streams @@ -412,8 +490,8 @@ impl StreamPool { /// * Allow concurrent clients to join onto streams while they're spun up. /// * Allow spinning up multiple streams concurrently, but don't overshoot limits. /// - /// For now, we just do something simple and functional, but very inefficient (linear scan). - pub async fn get(&self) -> StreamGuard { + /// For now, we just do something simple but inefficient (linear scan under mutex). + pub async fn get(self: &Arc) -> StreamGuard { // Acquire a permit if the pool is bounded. let mut permit = None; if let Some(limiter) = self.limiter.clone() { @@ -422,23 +500,23 @@ impl StreamPool { let mut streams = self.streams.lock().unwrap(); // Look for a pooled stream with available capacity. - for entry in streams.values() { + for (&id, entry) in streams.iter_mut() { assert!( - entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(), + entry.queue_depth <= self.max_queue_depth.get(), "stream queue overflow" ); - if entry - .queue_depth - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| { - // Increment the queue depth via compare-and-swap. - // TODO: review ordering. - (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1) - }) - .is_ok() - { + assert_eq!( + entry.idle_since.is_some(), + entry.queue_depth == 0, + "incorrect stream idle state" + ); + if entry.queue_depth < self.max_queue_depth.get() { + entry.queue_depth += 1; + entry.idle_since = None; return StreamGuard { + pool: Arc::downgrade(self), + id, sender: entry.sender.clone(), - queue_depth: entry.queue_depth.clone(), permit, }; } @@ -448,11 +526,11 @@ impl StreamPool { // return the guard, while spinning up the stream task async. 
This allows other callers to // join onto this stream and also create additional streams concurrently if this fills up. let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); - let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get()); let entry = StreamEntry { sender: req_tx.clone(), - queue_depth: queue_depth.clone(), + queue_depth: 1, // reserve quota for this caller + idle_since: None, }; streams.insert(id, entry); @@ -461,20 +539,23 @@ impl StreamPool { }; let client_pool = self.client_pool.clone(); - let streams = self.streams.clone(); + let pool = Arc::downgrade(self); tokio::spawn(async move { if let Err(err) = Self::run_stream(client_pool, req_rx).await { error!("stream failed: {err}"); } - // Remove stream from pool on exit. - let entry = streams.lock().unwrap().remove(&id); - assert!(entry.is_some(), "unknown stream ID: {id}"); + // Remove stream from pool on exit. Weak reference to avoid holding the pool alive. + if let Some(pool) = pool.upgrade() { + let entry = pool.streams.lock().unwrap().remove(&id); + assert!(entry.is_some(), "unknown stream ID: {id}"); + } }); StreamGuard { + pool: Arc::downgrade(self), + id, sender: req_tx, - queue_depth, permit, } } @@ -552,11 +633,26 @@ impl StreamPool { } } +impl Reapable for StreamPool { + /// Reaps streams that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.streams.lock().unwrap().retain(|_, entry| { + let Some(idle_since) = entry.idle_since else { + assert_ne!(entry.queue_depth, 0, "empty stream not marked idle"); + return true; + }; + assert_eq!(entry.queue_depth, 0, "idle stream has requests"); + idle_since >= cutoff + }); + } +} + /// A pooled stream reference. Can be used to send a single request, to properly enforce queue /// depth. Queue depth is already reserved and will be returned on drop. pub struct StreamGuard { + pool: Weak, + id: StreamID, sender: RequestSender, - queue_depth: Arc, permit: Option, // None if pool is unbounded } @@ -588,11 +684,78 @@ impl StreamGuard { impl Drop for StreamGuard { fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + // Release the queue depth reservation on drop. This can prematurely decrement it if dropped // before the response is received, but that's okay. - let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst); - assert!(prev_queue_depth > 0, "stream queue underflow"); + let mut streams = pool.streams.lock().unwrap(); + let entry = streams.get_mut(&self.id).expect("unknown stream"); + assert!(entry.idle_since.is_none(), "active stream marked idle"); + assert!(entry.queue_depth > 0, "stream queue underflow"); + entry.queue_depth -= 1; + if entry.queue_depth == 0 { + entry.idle_since = Some(Instant::now()); // mark stream as idle + } _ = self.permit; // returned on drop, referenced for visibility } } + +/// Periodically reaps idle resources from a pool. +struct Reaper { + /// The task check interval. + interval: Duration, + /// The threshold for reaping idle resources. + threshold: Duration, + /// Cancels the reaper task. Cancelled when the reaper is dropped. + cancel: CancellationToken, +} + +impl Reaper { + /// Creates a new reaper. + pub fn new(threshold: Duration, interval: Duration) -> Self { + Self { + cancel: CancellationToken::new(), + threshold, + interval, + } + } + + /// Spawns a task to periodically reap idle resources from the given task pool. 
The task is + /// cancelled when the reaper is dropped. + pub fn spawn(&self, pool: &Arc) { + // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool. + let pool = Arc::downgrade(pool); + let cancel = self.cancel.clone(); + let (interval, threshold) = (self.interval, self.threshold); + + tokio::spawn(async move { + loop { + tokio::select! { + _ = tokio::time::sleep(interval) => { + let Some(pool) = pool.upgrade() else { + return; // pool was dropped + }; + pool.reap_idle(Instant::now() - threshold); + } + + _ = cancel.cancelled() => return, + } + } + }); + } +} + +impl Drop for Reaper { + fn drop(&mut self) { + self.cancel.cancel(); // cancel reaper task + } +} + +/// A reapable resource pool. +trait Reapable: Send + Sync + 'static { + /// Reaps resources that have been idle since before the given cutoff. + fn reap_idle(&self, cutoff: Instant); +} From bdca5b500b078eb9afb528fd464f496e07c97024 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 10 Jul 2025 12:11:53 +0100 Subject: [PATCH 10/56] Fix test_lfc_prewarm: reduce number of prewarms, sleep before LFC offloading (#12515) Fixes: - Sleep before LFC offloading in `test_lfc_prewarm[autoprewarm]` to ensure offloaded LFC is the one exported after all writes finish - Reduce number of prewarms and increase timeout in `test_lfc_prewarm_under_workload` as debug builds were failing due to timeout. Additional changes: - Remove `check_pinned_entries`: https://github.com/neondatabase/neon/pull/12447#discussion_r2185946210 - Fix LFC error metrics description: https://github.com/neondatabase/neon/pull/12486#discussion_r2190763107 --- compute_tools/src/metrics.rs | 4 +-- test_runner/regress/test_lfc_prewarm.py | 44 +++++++++++-------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 91dedbb42a..6e4df73c0f 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy = Lazy::new(|| { pub(crate) static LFC_PREWARM_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( "compute_ctl_lfc_prewarm_errors_total", - "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option", + "Total number of LFC prewarm errors", ) .expect("failed to define a metric") }); @@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy = Lazy::new(|| { pub(crate) static LFC_OFFLOAD_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( "compute_ctl_lfc_offload_errors_total", - "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option", + "Total number of LFC offload errors", ) .expect("failed to define a metric") }); diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index ae36bbda79..22e5bf576f 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,6 +1,7 @@ import random import threading from enum import StrEnum +from time import sleep from typing import Any import pytest @@ -24,18 +25,7 @@ OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total" OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total" METHOD_VALUES = [e for e in PrewarmMethod] METHOD_IDS = [e.value for e in PrewarmMethod] - - -def check_pinned_entries(cur: Cursor): - """ - Wait till none of LFC buffers are pinned - """ - - def none_pinned(): - cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'") - assert cur.fetchall()[0][0] == 0 - - 
wait_until(none_pinned) +AUTOOFFLOAD_INTERVAL_SECS = 2 def prom_parse(client: EndpointHttpClient) -> dict[str, float]: @@ -49,9 +39,18 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]: def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any: + if method == PrewarmMethod.POSTGRES: + cur.execute("select get_local_cache_state()") + return cur.fetchall()[0][0] + if method == PrewarmMethod.AUTOPREWARM: + # With autoprewarm, we need to be sure LFC was offloaded after all writes + # finish, so we sleep. Otherwise we'll have less prewarmed pages than we want + sleep(AUTOOFFLOAD_INTERVAL_SECS) client.offload_lfc_wait() - elif method == PrewarmMethod.COMPUTE_CTL: + return + + if method == PrewarmMethod.COMPUTE_CTL: status = client.prewarm_lfc_status() assert status["status"] == "not_prewarmed" assert "error" not in status @@ -60,11 +59,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) parsed = prom_parse(client) desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} assert parsed == desired, f"{parsed=} != {desired=}" - elif method == PrewarmMethod.POSTGRES: - cur.execute("select get_local_cache_state()") - return cur.fetchall()[0][0] - else: - raise AssertionError(f"{method} not in PrewarmMethod") + return + + raise AssertionError(f"{method} not in PrewarmMethod") def prewarm_endpoint( @@ -106,14 +103,13 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): "neon.file_cache_size_limit=1GB", "neon.file_cache_prewarm_limit=1000", ] - offload_secs = 2 if method == PrewarmMethod.AUTOPREWARM: endpoint = env.endpoints.create_start( branch_name="main", config_lines=cfg, autoprewarm=True, - offload_lfc_interval_seconds=offload_secs, + offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS, ) else: endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) @@ -135,7 +131,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): endpoint.stop() if method == PrewarmMethod.AUTOPREWARM: - endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs) + endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS) else: endpoint.start() @@ -162,7 +158,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): lfc_cur.execute("select sum(pk) from t") assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 - check_pinned_entries(pg_cur) desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} check_prewarmed(method, client, desired) @@ -243,9 +238,9 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet prewarm_thread.start() def prewarmed(): - assert n_prewarms > 5 + assert n_prewarms > 3 - wait_until(prewarmed) + wait_until(prewarmed, timeout=40) # debug builds don't finish in 20s running = False for t in workload_threads: @@ -256,7 +251,6 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet total_balance = lfc_cur.fetchall()[0][0] assert total_balance == 0 - check_pinned_entries(pg_cur) if method == PrewarmMethod.POSTGRES: return desired = { From ffeede085e3008616872372ac98edcef573c8677 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 10 Jul 2025 12:58:22 +0100 Subject: [PATCH 11/56] libs: move metric collection for pageserver and safekeeper in a background task (#12525) ## Problem Safekeeper and pageserver metrics collection might time out. 
We've seen this in both hadron and neon. ## Summary of changes This PR moves metrics collection in PS/SK to the background so that we will always get some metrics, despite there may be some delays. Will leave it to the future work to reduce metrics collection time. --------- Co-authored-by: Chen Luo --- libs/http-utils/src/endpoint.rs | 37 ++++++++- libs/pageserver_api/src/config.rs | 2 + libs/utils/src/lib.rs | 2 + libs/utils/src/metrics_collector.rs | 75 +++++++++++++++++++ pageserver/src/bin/pageserver.rs | 41 +++++++++- pageserver/src/config.rs | 6 ++ pageserver/src/http/routes.rs | 7 +- pageserver/src/lib.rs | 12 +++ safekeeper/src/bin/safekeeper.rs | 27 +++++++ safekeeper/src/http/routes.rs | 9 ++- safekeeper/src/lib.rs | 2 + safekeeper/src/wal_backup.rs | 2 +- .../tests/walproposer_sim/safekeeper.rs | 1 + test_runner/fixtures/pageserver/http.py | 2 +- test_runner/fixtures/safekeeper/http.py | 2 +- 15 files changed, 217 insertions(+), 10 deletions(-) create mode 100644 libs/utils/src/metrics_collector.rs diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index f32ced1180..a61bf8e08a 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{Instrument, debug, info, info_span, warn}; use utils::auth::{AuthError, Claims, SwappableJwtAuth}; +use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS}; use crate::error::{ApiError, api_error_handler, route_error_handler}; use crate::request::{get_query_param, parse_query_param}; @@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter { } } -pub async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn prometheus_metrics_handler( + req: Request, + force_metric_collection_on_scrape: bool, +) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); + // HADRON + let requested_use_latest = parse_query_param(&req, "use_latest")?; + + let use_latest = match requested_use_latest { + None => force_metric_collection_on_scrape, + Some(true) => true, + Some(false) => { + if force_metric_collection_on_scrape { + // We don't cache in this case + true + } else { + false + } + } + }; + let started_at = std::time::Instant::now(); let (tx, rx) = mpsc::channel(1); @@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request) -> Result) -> Result { tracing::info!( @@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request) -> Result, #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_generation_large_timeline_threshold: Option, + pub force_metric_collection_on_scrape: bool, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -831,6 +832,7 @@ impl Default for ConfigToml { basebackup_cache_config: None, posthog_config: None, image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024), + force_metric_collection_on_scrape: true, } } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 11f787562c..2b81da017d 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -99,6 +99,8 @@ pub mod elapsed_accum; #[cfg(target_os = "linux")] pub mod linux_socket_ioctl; +pub mod metrics_collector; + // Re-export used in macro. Avoids adding git-version as dep in target crates. 
#[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/metrics_collector.rs b/libs/utils/src/metrics_collector.rs new file mode 100644 index 0000000000..9e57fcd643 --- /dev/null +++ b/libs/utils/src/metrics_collector.rs @@ -0,0 +1,75 @@ +use std::{ + sync::{Arc, RwLock}, + time::{Duration, Instant}, +}; + +use metrics::{IntGauge, proto::MetricFamily, register_int_gauge}; +use once_cell::sync::Lazy; + +pub static METRICS_STALE_MILLIS: Lazy = Lazy::new(|| { + register_int_gauge!( + "metrics_metrics_stale_milliseconds", + "The current metrics stale time in milliseconds" + ) + .expect("failed to define a metric") +}); + +#[derive(Debug)] +pub struct CollectedMetrics { + pub metrics: Vec, + pub collected_at: Instant, +} + +impl CollectedMetrics { + fn new(metrics: Vec) -> Self { + Self { + metrics, + collected_at: Instant::now(), + } + } +} + +#[derive(Debug)] +pub struct MetricsCollector { + last_collected: RwLock>, +} + +impl MetricsCollector { + pub fn new() -> Self { + Self { + last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))), + } + } + + #[tracing::instrument(name = "metrics_collector", skip_all)] + pub fn run_once(&self, cache_metrics: bool) -> Arc { + let started = Instant::now(); + let metrics = metrics::gather(); + let collected = Arc::new(CollectedMetrics::new(metrics)); + if cache_metrics { + let mut guard = self.last_collected.write().unwrap(); + *guard = collected.clone(); + } + tracing::info!( + "Collected {} metric families in {} ms", + collected.metrics.len(), + started.elapsed().as_millis() + ); + collected + } + + pub fn last_collected(&self) -> Arc { + self.last_collected.read().unwrap().clone() + } +} + +impl Default for MetricsCollector { + fn default() -> Self { + Self::new() + } +} + +// Interval for metrics collection. 
Currently hard-coded to be the same as the metrics scrape interval from the obs agent
+pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
+
+pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 78aba25d2e..299fe7e159 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
 };
 use pageserver::tenant::{TenantSharedResources, mgr, secondary};
 use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
-    page_cache, page_service, task_mgr, virtual_file,
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
+    MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::crashsafe::syncfs;
 use utils::logging::TracingErrorLayerEnablement;
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};

@@ -763,6 +764,41 @@ fn start_pageserver(
         (http_task, https_task)
     };

+    /* BEGIN_HADRON */
+    let metrics_collection_task = {
+        let cancel = shutdown_pageserver.child_token();
+        let task = crate::BACKGROUND_RUNTIME.spawn({
+            let cancel = cancel.clone();
+            let background_jobs_barrier = background_jobs_barrier.clone();
+            async move {
+                if conf.force_metric_collection_on_scrape {
+                    return;
+                }
+
+                // first wait until background jobs are cleared to launch.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return; },
+                    _ = background_jobs_barrier.wait() => {}
+                };
+                let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    tokio::select! {
+                        _ = cancel.cancelled() => {
+                            tracing::info!("cancelled metrics collection task, exiting...");
+                            break;
+                        },
+                        _ = interval.tick() => {}
+                    }
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            }
+        });
+        MetricsCollectionTask(CancellableTask { task, cancel })
+    };
+    /* END_HADRON */
+
     let consumption_metrics_tasks = {
         let cancel = shutdown_pageserver.child_token();
         let task = crate::BACKGROUND_RUNTIME.spawn({
@@ -844,6 +880,7 @@ fn start_pageserver(
             https_endpoint_listener,
             page_service,
             page_service_grpc,
+            metrics_collection_task,
             consumption_metrics_tasks,
             disk_usage_eviction_task,
             &tenant_manager,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index f64c5838ff..bb73ae1dd5 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -256,6 +256,10 @@ pub struct PageServerConf {
     /// Defines what is a big tenant for the purpose of image layer generation.
     /// See Timeline::should_check_if_image_layers_required
     pub image_layer_generation_large_timeline_threshold: Option<u64>,
+
+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+ pub force_metric_collection_on_scrape: bool, } /// Token for authentication to safekeepers @@ -437,6 +441,7 @@ impl PageServerConf { timeline_import_config, basebackup_cache_config, image_layer_generation_large_timeline_threshold, + force_metric_collection_on_scrape, } = config_toml; let mut conf = PageServerConf { @@ -496,6 +501,7 @@ impl PageServerConf { timeline_import_config, basebackup_cache_config, image_layer_generation_large_timeline_threshold, + force_metric_collection_on_scrape, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 767bba49e2..ed0a5440cb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3938,9 +3938,14 @@ pub fn make_router( .expect("construct launch timestamp header middleware"), ); + let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape; + + let prometheus_metrics_handler_wrapper = + move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape); + Ok(router .data(state) - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 0dd3c465e0..0864026f6b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask); pub struct HttpsEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); +// HADRON +pub struct MetricsCollectionTask(pub CancellableTask); + impl CancellableTask { pub async fn shutdown(self) { self.cancel.cancel(); @@ -87,6 +90,7 @@ pub async fn shutdown_pageserver( https_listener: Option, page_service: page_service::Listener, grpc_task: Option, + metrics_collection_task: MetricsCollectionTask, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, tenant_manager: &TenantManager, @@ -211,6 +215,14 @@ pub async fn shutdown_pageserver( // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; + // HADRON + timed( + metrics_collection_task.0.shutdown(), + "shutdown metrics collections metrics", + Duration::from_secs(1), + ) + .await; + timed( consumption_metrics_worker.0.shutdown(), "shutdown consumption metrics", diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8fda625817..b2d5976ef4 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -37,6 +37,7 @@ use tracing::*; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; use utils::id::NodeId; use utils::logging::{self, LogFormat, SecretString}; +use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; @@ -243,6 +244,11 @@ struct Args { #[arg(long)] enable_tls_wal_service_api: bool, + /// Controls whether to collect all metrics on each scrape or to return potentially stale + /// results. 
+ #[arg(long, default_value_t = true)] + force_metric_collection_on_scrape: bool, + /// Run in development mode (disables security checks) #[arg(long, help = "Run in development mode (disables security checks)")] dev: bool, @@ -428,6 +434,7 @@ async fn main() -> anyhow::Result<()> { ssl_ca_certs, use_https_safekeeper_api: args.use_https_safekeeper_api, enable_tls_wal_service_api: args.enable_tls_wal_service_api, + force_metric_collection_on_scrape: args.force_metric_collection_on_scrape, }); // initialize sentry if SENTRY_DSN is provided @@ -640,6 +647,26 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .map(|res| ("broker main".to_owned(), res)); tasks_handles.push(Box::pin(broker_task_handle)); + /* BEGIN_HADRON */ + if conf.force_metric_collection_on_scrape { + let metrics_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| BACKGROUND_RUNTIME.handle()) + .spawn(async move { + let mut interval: tokio::time::Interval = + tokio::time::interval(METRICS_COLLECTION_INTERVAL); + loop { + interval.tick().await; + tokio::task::spawn_blocking(|| { + METRICS_COLLECTOR.run_once(true); + }); + } + }) + .map(|res| ("broker main".to_owned(), res)); + tasks_handles.push(Box::pin(metrics_handle)); + } + /* END_HADRON */ + set_build_info_metric(GIT_VERSION, BUILD_TAG); // TODO: update tokio-stream, convert to real async Stream with diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 384c582678..4b061c65d9 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -699,6 +699,11 @@ pub fn make_router( })) } + let force_metric_collection_on_scrape = conf.force_metric_collection_on_scrape; + + let prometheus_metrics_handler_wrapper = + move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape); + // NB: on any changes do not forget to update the OpenAPI spec // located nearby (/safekeeper/src/http/openapi_spec.yaml). 
let auth = conf.http_auth.clone(); @@ -706,7 +711,9 @@ pub fn make_router( .data(conf) .data(global_timelines) .data(auth) - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + request_span(r, prometheus_metrics_handler_wrapper) + }) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| request_span(r, status_handler)) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index c461c071da..c0b5403ebf 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -134,6 +134,7 @@ pub struct SafeKeeperConf { pub ssl_ca_certs: Vec, pub use_https_safekeeper_api: bool, pub enable_tls_wal_service_api: bool, + pub force_metric_collection_on_scrape: bool, } impl SafeKeeperConf { @@ -183,6 +184,7 @@ impl SafeKeeperConf { ssl_ca_certs: Vec::new(), use_https_safekeeper_api: false, enable_tls_wal_service_api: false, + force_metric_collection_on_scrape: true, } } } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 7e10847a1b..0e8dfd64c3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -166,7 +166,7 @@ fn hadron_determine_offloader(mgr: &Manager, state: &StateSnapshot) -> (Option) -> Result<()> { ssl_ca_certs: Vec::new(), use_https_safekeeper_api: false, enable_tls_wal_service_api: false, + force_metric_collection_on_scrape: true, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 79cfba8da6..8e7d957b22 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1002,7 +1002,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def get_metrics_str(self) -> str: """You probably want to use get_metrics() instead.""" - res = self.get(f"http://localhost:{self.port}/metrics") + res = self.get(f"http://localhost:{self.port}/metrics?use_latest=true") self.verbose_error(res) return res.text diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 942b620be6..ceb00c0f90 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -143,7 +143,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_metrics_str(self) -> str: """You probably want to use get_metrics() instead.""" - request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result = self.get(f"http://localhost:{self.port}/metrics?use_latest=true") request_result.raise_for_status() return request_result.text From d33b3c7457e1bbe15f1961ebec749249c6f77f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 10 Jul 2025 16:03:20 +0200 Subject: [PATCH 12/56] Print viability via custom printing impl (#12544) As per https://github.com/neondatabase/neon/pull/12485#issuecomment-3056525882 , we don't want to print the viability error via a debug impl as it prints the backtrace. SafekeeperInfo doesn't have a display impl, so fall back to `Debug` for the `Ok` case. It gives single line output so it's okay to use `Debug` for it. 
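As a quick illustration of why the formatter choice matters (a minimal, self-contained sketch with a stand-in type, not the actual `SafekeeperInfo` or the storage controller code): `{:?}` on an `anyhow::Error` renders the whole context chain plus any captured backtrace, while `{}` prints only the top-level message.

```rust
use anyhow::{Result, anyhow};

/// Stand-in for a type with a `Debug` impl but no `Display` impl,
/// playing the role of `SafekeeperInfo` here (illustrative only).
#[derive(Debug)]
struct Info {
    safekeepers: usize,
}

fn describe(res: &Result<Info>) -> String {
    match res {
        // `Debug` output of the Ok value is a single line, so it reads fine.
        Ok(v) => format!("Ok({v:?})"),
        // `{e}` uses `Display`: message only. `{e:?}` would also dump the
        // context chain and any captured backtrace into the log line.
        Err(e) => format!("Err({e})"),
    }
}

fn main() {
    println!("{}", describe(&Ok(Info { safekeepers: 3 })));
    println!("{}", describe(&Err(anyhow!("not enough safekeepers"))));
}
```
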
Follow up of https://github.com/neondatabase/neon/pull/12485 --- storage_controller/src/service.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d2f7287be9..3844570b47 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1984,11 +1984,14 @@ impl Service { }); // Check that there is enough safekeepers configured that we can create new timelines - let test_sk_res = this.safekeepers_for_new_timeline().await; + let test_sk_res_str = match this.safekeepers_for_new_timeline().await { + Ok(v) => format!("Ok({v:?})"), + Err(v) => format!("Err({v:})"), + }; tracing::info!( timeline_safekeeper_count = config.timeline_safekeeper_count, timelines_onto_safekeepers = config.timelines_onto_safekeepers, - "viability test result (test timeline creation on safekeepers): {test_sk_res:?}", + "viability test result (test timeline creation on safekeepers): {test_sk_res_str}", ); Ok(this) From be5bbaecadda71478638608c469c184aaf124bf5 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:28:58 -0400 Subject: [PATCH 13/56] fix(storcon): correctly handle 404 error in lsn lease (#12537) ## Problem close LKB-253 ## Summary of changes 404 for timeline requests could happen when the tenant is intended to be on a pageserver but not attached yet. This patch adds handling for the lease request. In the future, we should extend this handling to more operations. --------- Signed-off-by: Alex Chi Z --- storage_controller/src/service.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3844570b47..9c1b81d261 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4761,6 +4761,7 @@ impl Service { ) .await; + let mut retry_if_not_attached = false; let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -4777,6 +4778,24 @@ impl Service { .expect("Pageservers may not be deleted while referenced"); targets.push((*tenant_shard_id, node.clone())); + + if let Some(location) = shard.observed.locations.get(node_id) { + if let Some(ref conf) = location.conf { + if conf.mode != LocationConfigMode::AttachedSingle + && conf.mode != LocationConfigMode::AttachedMulti + { + // If the shard is attached as secondary, we need to retry if 404. + retry_if_not_attached = true; + } + // If the shard is attached as primary, we should succeed. + } else { + // Location conf is not available yet, retry if 404. + retry_if_not_attached = true; + } + } else { + // The shard is not attached to the intended pageserver yet, retry if 404. + retry_if_not_attached = true; + } } } targets @@ -4807,6 +4826,18 @@ impl Service { valid_until = Some(lease.valid_until); } } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) + if retry_if_not_attached => + { + // This is expected if the attach is not finished yet. Return 503 so that the client can retry. 
+                    return Err(ApiError::ResourceUnavailable(
+                        format!(
+                            "Timeline is not attached to the pageserver {} yet, please retry",
+                            node.get_id()
+                        )
+                        .into(),
+                    ));
+                }
                 Err(e) => {
                     return Err(passthrough_api_error(&node, e));
                 }

From 2c6b327be6619e66038427e5cbfe42159498a949 Mon Sep 17 00:00:00 2001
From: HaoyuHuang
Date: Thu, 10 Jul 2025 07:39:38 -0700
Subject: [PATCH 14/56] A few PS changes (#12540)

# TLDR
All changes are no-op except some metrics.

## Summary of changes I

### Pageserver
Added a new global counter metric `pageserver_pagestream_handler_results_total` that categorizes pagestream request results according to their outcomes:
1. Success
2. Internal errors
3. Other errors

Internal errors include:
1. Page reconstruction error: This probably indicates a pageserver bug/corruption
2. LSN timeout error: Could indicate overload or bugs with PS's ability to reach other components
3. Misrouted request error: Indicates bugs in the Storage Controller/HCC

Other errors include transient errors that are expected during normal operation, or errors indicating bugs with other parts of the system (e.g., malformed requests, errors due to cancelled operations during PS shutdown, etc.).

## Summary of changes II
This PR adds a pageserver endpoint and its counterpart in the storage controller to list the visible size of all tenant shards. This will be a prerequisite for the tenant rebalance command.

## Problem III
We need a way to download WAL segments/layerfiles from S3 and replay WAL records. We cannot access production S3 from our laptops directly, and we also can't transfer any user data out of production systems for GDPR compliance, so we need tooling that works within those constraints.

## Summary of changes III
This PR adds a couple of tools to support the debugging workflow in production:
1. A new `pagectl download-remote-object` command that can be used to download remote storage objects, assuming the correct access is set up.

## Summary of changes IV
This PR adds a command to list all visible delta and image layers from index_part. This is useful for debugging compaction issues, as index_part often contains a lot of covered layers due to PITR.
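The metric definition itself is not shown in this excerpt. A rough sketch of an outcome-classifying counter, written in the style of the other `metrics` registrations in this series (the label scheme, the `Outcome` type, and the `register_int_counter_vec` re-export are assumptions, not the actual implementation):

```rust
use metrics::{IntCounterVec, register_int_counter_vec};
use once_cell::sync::Lazy;

// Assumed label scheme: a single counter, labelled by outcome.
static PAGESTREAM_HANDLER_RESULTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_pagestream_handler_results_total",
        "Number of pagestream requests, broken down by outcome",
        &["outcome"]
    )
    .expect("failed to define a metric")
});

/// Hypothetical classification mirroring the categories listed above.
enum Outcome {
    /// Request served normally.
    Success,
    /// Reconstruction errors, LSN wait timeouts, misrouted requests.
    InternalError,
    /// Transient or expected errors (malformed requests, shutdown, ...).
    OtherError,
}

fn record(outcome: Outcome) {
    let label = match outcome {
        Outcome::Success => "success",
        Outcome::InternalError => "internal_error",
        Outcome::OtherError => "other_error",
    };
    PAGESTREAM_HANDLER_RESULTS
        .with_label_values(&[label])
        .inc();
}
```

A counter with a single `outcome` label keeps cardinality fixed at three while still letting dashboards alert on the internal-error rate alone.
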
--------- Co-authored-by: William Huang Co-authored-by: Chen Luo Co-authored-by: Vlad Lazar --- Cargo.lock | 1 + pageserver/client/src/mgmt_api.rs | 66 ++++++++++- pageserver/ctl/Cargo.toml | 1 + pageserver/ctl/src/download_remote_object.rs | 85 ++++++++++++++ pageserver/ctl/src/index_part.rs | 110 +++++++++++++++--- pageserver/ctl/src/main.rs | 6 + pageserver/src/http/routes.rs | 29 ++++- pageserver/src/metrics.rs | 18 +++ pageserver/src/page_service.rs | 53 ++++++++- pageserver/src/tenant.rs | 10 ++ .../src/tenant/storage_layer/layer_name.rs | 2 +- safekeeper/src/metrics.rs | 9 ++ safekeeper/src/safekeeper.rs | 19 ++- test_runner/fixtures/pageserver/http.py | 7 ++ test_runner/regress/test_pageserver_api.py | 13 +++ 15 files changed, 404 insertions(+), 25 deletions(-) create mode 100644 pageserver/ctl/src/download_remote_object.rs diff --git a/Cargo.lock b/Cargo.lock index 4150944ad0..85080f8473 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4323,6 +4323,7 @@ dependencies = [ "pageserver_api", "postgres_ffi", "remote_storage", + "serde", "serde_json", "svg_fmt", "thiserror 1.0.69", diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index af4be23b9b..fe1ddc2e7d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::error::Error as _; use std::time::Duration; @@ -251,6 +251,70 @@ impl Client { Ok(()) } + pub async fn tenant_timeline_compact( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + force_image_layer_creation: bool, + must_force_image_layer_creation: bool, + scheduled: bool, + wait_until_done: bool, + ) -> Result<()> { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact", + self.mgmt_api_endpoint + )) + .expect("Cannot build URL"); + + if force_image_layer_creation { + path.query_pairs_mut() + .append_pair("force_image_layer_creation", "true"); + } + + if must_force_image_layer_creation { + path.query_pairs_mut() + .append_pair("must_force_image_layer_creation", "true"); + } + + if scheduled { + path.query_pairs_mut().append_pair("scheduled", "true"); + } + if wait_until_done { + path.query_pairs_mut() + .append_pair("wait_until_scheduled_compaction_done", "true"); + path.query_pairs_mut() + .append_pair("wait_until_uploaded", "true"); + } + self.request(Method::PUT, path, ()).await?; + Ok(()) + } + + /* BEGIN_HADRON */ + pub async fn tenant_timeline_describe( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Result { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + )) + .expect("Cannot build URL"); + path.query_pairs_mut() + .append_pair("include-image-consistent-lsn", "true"); + + let response: reqwest::Response = self.request(Method::GET, path, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + + pub async fn list_tenant_visible_size(&self) -> Result> { + let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + /* END_HADRON */ + pub async fn tenant_scan_remote_storage( &self, tenant_id: TenantId, diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 7b70f0dc87..ba34fa1f69 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -17,6 +17,7 @@ 
pageserver = { path = ".." } pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true +serde.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true diff --git a/pageserver/ctl/src/download_remote_object.rs b/pageserver/ctl/src/download_remote_object.rs new file mode 100644 index 0000000000..aa09774701 --- /dev/null +++ b/pageserver/ctl/src/download_remote_object.rs @@ -0,0 +1,85 @@ +use camino::Utf8PathBuf; +use clap::Parser; +use tokio_util::sync::CancellationToken; + +/// Download a specific object from remote storage to a local file. +/// +/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment +/// variable, in the same TOML format that the pageserver itself understands. This allows the +/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3, +/// Azure Blob Storage and local files), as long as the credentials are available via the +/// standard environment variables expected by the underlying SDKs. +/// +/// Examples for setting the environment variable: +/// +/// ```bash +/// # AWS S3 (region can also be provided via AWS_REGION) +/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }' +/// +/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY) +/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }' +/// ``` +#[derive(Parser)] +pub(crate) struct DownloadRemoteObjectCmd { + /// Key / path of the object to download (relative to the remote storage prefix). + /// + /// Examples: + /// "wal/3aa8f.../00000001000000000000000A" + /// "pageserver/v1/tenants//timelines//layer_12345" + pub remote_path: String, + + /// Path of the local file to create. Existing file will be overwritten. + /// + /// Examples: + /// "./segment" + /// "/tmp/layer_12345.parquet" + pub output_file: Utf8PathBuf, +} + +pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> { + use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig}; + + // Fetch remote storage configuration from the environment + let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| { + anyhow::anyhow!( + "'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config" + ) + })?; + + let config = RemoteStorageConfig::from_toml_str(&config_str)?; + + // Initialise remote storage client + let storage = GenericRemoteStorage::from_config(&config).await?; + + // RemotePath must be relative – leading slashes confuse the parser. + let remote_path_str = cmd.remote_path.trim_start_matches('/'); + let remote_path = RemotePath::from_string(remote_path_str)?; + + let cancel = CancellationToken::new(); + + println!( + "Downloading '{remote_path}' from remote storage bucket {:?} ...", + config.storage.bucket_name() + ); + + // Start the actual download + let download = storage + .download(&remote_path, &DownloadOpts::default(), &cancel) + .await?; + + // Stream to file + let mut reader = tokio_util::io::StreamReader::new(download.download_stream); + let tmp_path = cmd.output_file.with_extension("tmp"); + let mut file = tokio::fs::File::create(&tmp_path).await?; + tokio::io::copy(&mut reader, &mut file).await?; + file.sync_all().await?; + // Atomically move into place + tokio::fs::rename(&tmp_path, &cmd.output_file).await?; + + println!( + "Downloaded to '{}'. 
Last modified: {:?}, etag: {}", + cmd.output_file, download.last_modified, download.etag + ); + + Ok(()) +} diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 838d00e490..9801f3c9dc 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,14 +1,16 @@ use std::str::FromStr; -use anyhow::Context; +use anyhow::{Context, Ok}; use camino::Utf8PathBuf; use pageserver::tenant::{ IndexPart, layer_map::{LayerMap, SearchResult}, - remote_timeline_client::remote_layer_path, - storage_layer::{PersistentLayerDesc, ReadableLayerWeak}, + remote_timeline_client::{index::LayerFileMetadata, remote_layer_path}, + storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak}, }; use pageserver_api::key::Key; +use serde::Serialize; +use std::collections::BTreeMap; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -33,6 +35,31 @@ pub(crate) enum IndexPartCmd { #[arg(long)] lsn: String, }, + /// List all visible delta and image layers at the latest LSN. + ListVisibleLayers { + #[arg(long)] + path: Utf8PathBuf, + }, +} + +fn create_layer_map_from_index_part( + index_part: &IndexPart, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, +) -> LayerMap { + let mut layer_map = LayerMap::default(); + { + let mut updates = layer_map.batch_update(); + for (key, value) in index_part.layer_metadata.iter() { + updates.insert_historic(PersistentLayerDesc::from_filename( + tenant_shard_id, + timeline_id, + key.clone(), + value.file_size, + )); + } + } + layer_map } async fn search_layers( @@ -49,18 +76,7 @@ async fn search_layers( let bytes = tokio::fs::read(path).await?; IndexPart::from_json_bytes(&bytes).unwrap() }; - let mut layer_map = LayerMap::default(); - { - let mut updates = layer_map.batch_update(); - for (key, value) in index_json.layer_metadata.iter() { - updates.insert_historic(PersistentLayerDesc::from_filename( - tenant_shard_id, - timeline_id, - key.clone(), - value.file_size, - )); - } - } + let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id); let key = Key::from_hex(key)?; let lsn = Lsn::from_str(lsn).unwrap(); @@ -98,6 +114,69 @@ async fn search_layers( Ok(()) } +#[derive(Debug, Clone, Serialize)] +struct VisibleLayers { + pub total_images: u64, + pub total_image_bytes: u64, + pub total_deltas: u64, + pub total_delta_bytes: u64, + pub layer_metadata: BTreeMap, +} + +impl VisibleLayers { + pub fn new() -> Self { + Self { + layer_metadata: BTreeMap::new(), + total_images: 0, + total_image_bytes: 0, + total_deltas: 0, + total_delta_bytes: 0, + } + } + + pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) { + match name { + LayerName::Image(_) => { + self.total_images += 1; + self.total_image_bytes += layer.file_size; + } + LayerName::Delta(_) => { + self.total_deltas += 1; + self.total_delta_bytes += layer.file_size; + } + } + self.layer_metadata.insert(name, layer); + } +} + +async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::generate(); + + let bytes = tokio::fs::read(path).await.context("read file")?; + let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?; + let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id); + let mut visible_layers = VisibleLayers::new(); + let (layers, _key_space) = layer_map.get_visibility(Vec::new()); + 
for (layer, visibility) in layers { + if visibility == LayerVisibilityHint::Visible { + visible_layers.add_layer( + layer.layer_name(), + index_part + .layer_metadata + .get(&layer.layer_name()) + .unwrap() + .clone(), + ); + } + } + let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?; + println!("{output}"); + + Ok(()) +} + pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { match cmd { IndexPartCmd::Dump { path } => { @@ -114,5 +193,6 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { key, lsn, } => search_layers(tenant_id, timeline_id, path, key, lsn).await, + IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await, } } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3cd4faaf2e..e84ad2c87f 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -4,6 +4,7 @@ //! //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. +mod download_remote_object; mod draw_timeline_dir; mod index_part; mod key; @@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; +use download_remote_object::DownloadRemoteObjectCmd; use index_part::IndexPartCmd; use layers::LayerCmd; use page_trace::PageTraceCmd; @@ -63,6 +65,7 @@ enum Commands { /// Debug print a hex key found from logs Key(key::DescribeKeyCommand), PageTrace(PageTraceCmd), + DownloadRemoteObject(DownloadRemoteObjectCmd), } /// Read and update pageserver metadata file @@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> { } Commands::Key(dkc) => dkc.execute(), Commands::PageTrace(cmd) => page_trace::main(&cmd)?, + Commands::DownloadRemoteObject(cmd) => { + download_remote_object::main(&cmd).await?; + } }; Ok(()) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ed0a5440cb..7030ac368d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2,7 +2,9 @@ //! Management HTTP API //! 
 use std::cmp::Reverse;
-use std::collections::{BinaryHeap, HashMap};
+use std::collections::BTreeMap;
+use std::collections::BinaryHeap;
+use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
@@ -3214,6 +3216,30 @@ async fn get_utilization(
         .map_err(ApiError::InternalServerError)
 }
 
+/// HADRON
+async fn list_tenant_visible_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let state = get_state(&request);
+
+    let mut map = BTreeMap::new();
+    for (tenant_shard_id, slot) in state.tenant_manager.list() {
+        match slot {
+            TenantSlot::Attached(tenant) => {
+                let visible_size = tenant.get_visible_size();
+                map.insert(tenant_shard_id, visible_size);
+            }
+            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
+                continue;
+            }
+        }
+    }
+
+    json_response(StatusCode::OK, map)
+}
+
 async fn list_aux_files(
     mut request: Request<Body>,
     _cancel: CancellationToken,
@@ -4151,6 +4177,7 @@ pub fn make_router(
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler))
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
             |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index eb89e166b2..1b783326a0 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2847,6 +2847,24 @@ pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(||
     .expect("failed to define a metric")
 });
 
+// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories:
+// - success
+// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors)
+// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.)
+pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_pagestream_handler_results_total",
+        "Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)",
+        &["outcome"]
+    )
+    .expect("failed to define a metric")
+});
+
+// Constants for pageserver_pagestream_handler_results_total's outcome labels
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error";
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 70fdb2e789..ebb1addcdb 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -70,7 +70,7 @@ use crate::context::{
 };
 use crate::metrics::{
     self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
-    MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
+    MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
@@ -1441,20 +1441,57 @@ impl PageServerHandler {
         let (response_msg, ctx) = match handler_result {
             Err(e) => match &e.err {
                 PageStreamError::Shutdown => {
+                    // BEGIN HADRON
+                    PAGESTREAM_HANDLER_RESULTS_TOTAL
+                        .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR])
+                        .inc();
+                    // END HADRON
+
                     // If we fail to fulfil a request during shutdown, which may be _because_ of
                     // shutdown, then do not send the error to the client. Instead just drop the
                     // connection.
                     span.in_scope(|| info!("dropping connection due to shutdown"));
                     return Err(QueryError::Shutdown);
                 }
-                PageStreamError::Reconnect(reason) => {
-                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                PageStreamError::Reconnect(_reason) => {
+                    span.in_scope(|| {
+                        // BEGIN HADRON
+                        // We can get here because the compute node is pointing at the wrong PS. We
+                        // already have a metric to keep track of this so suppressing this log to
+                        // reduce log spam. The information in this log message is not going to be that
+                        // helpful given the volume of logs that can be generated.
+                        // info!("handler requested reconnect: {reason}")
+                        // END HADRON
+                    });
+                    // BEGIN HADRON
+                    PAGESTREAM_HANDLER_RESULTS_TOTAL
+                        .with_label_values(&[
+                            metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                        ])
+                        .inc();
+                    // END HADRON
                     return Err(QueryError::Reconnect);
                 }
                 PageStreamError::Read(_)
                 | PageStreamError::LsnTimeout(_)
                 | PageStreamError::NotFound(_)
                 | PageStreamError::BadRequest(_) => {
+                    // BEGIN HADRON
+                    if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err {
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[
+                                metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                            ])
+                            .inc();
+                    } else {
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[
+                                metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR,
+                            ])
+                            .inc();
+                    }
+                    // END HADRON
+
                     // print the all details to the log with {:#}, but for the client the
                     // error message is enough. Do not log if shutting down, as the anyhow::Error
                     // here includes cancellation which is not an error.
@@ -1472,7 +1509,15 @@ impl PageServerHandler {
                     )
                 }
             },
-            Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)),
+            Ok((response_msg, _op_timer_already_observed, ctx)) => {
+                // BEGIN HADRON
+                PAGESTREAM_HANDLER_RESULTS_TOTAL
+                    .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS])
+                    .inc();
+                // END HADRON
+
+                (response_msg, Some(ctx))
+            }
         };
 
         let ctx = ctx.map(|req_ctx| {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7e2e6d96b8..f67269851a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5719,6 +5719,16 @@ impl TenantShard {
             .unwrap_or(0)
     }
 
+    /// HADRON
+    /// Return the visible size of all timelines in this tenant.
+    pub(crate) fn get_visible_size(&self) -> u64 {
+        let timelines = self.timelines.lock().unwrap();
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .sum()
+    }
+
     /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
     /// manifest in `Self::remote_tenant_manifest`.
     ///
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index 0f7995f87b..973852defc 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -225,7 +225,7 @@ impl fmt::Display for ImageLayerName {
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
 /// and [`crate::tenant::storage_layer::layer::local_layer_path`])
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)]
 pub enum LayerName {
     Image(ImageLayerName),
     Delta(DeltaLayerName),
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 9baa80f73a..1f98651e71 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -59,6 +59,15 @@ pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     .expect("Failed to register safekeeper_flush_wal_seconds histogram")
 });
 /* BEGIN_HADRON */
+// Counter of all ProposerAcceptorMessage requests received
+pub static PROPOSER_ACCEPTOR_MESSAGES_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_proposer_acceptor_messages_total",
+        "Total number of ProposerAcceptorMessage requests received by the Safekeeper.",
+        &["outcome"]
+    )
+    .expect("Failed to register safekeeper_proposer_acceptor_messages_total counter")
+});
 pub static WAL_DISK_IO_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "safekeeper_wal_disk_io_errors",
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 4d15fc9de3..09ca041e22 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -24,7 +24,7 @@ use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use crate::metrics::MISC_OPERATION_SECONDS;
+use crate::metrics::{MISC_OPERATION_SECONDS, PROPOSER_ACCEPTOR_MESSAGES_TOTAL};
 use crate::state::TimelineState;
 use crate::{control_file, wal_storage};
 
@@ -938,7 +938,7 @@ where
         &mut self,
         msg: &ProposerAcceptorMessage,
     ) -> Result<Option<AcceptorProposerMessage>> {
-        let res = match msg {
+        let res = match msg {
             ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
             ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
             ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
@@ -949,7 +949,20 @@ where
             self.handle_append_request(msg, false).await
         }
         ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
-        }
+        };
+
+        // BEGIN HADRON
+        match &res {
+            Ok(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["success"])
+                .inc(),
+            Err(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["error"])
+                .inc(),
+        };
+
+        res
+        // END HADRON
     }
 
     /// Handle initial message from proposer: check its sanity and send my
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 8e7d957b22..23b9d1c8c9 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -333,6 +333,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
         self.verbose_error(res)
 
+    def list_tenant_visible_size(self) -> dict[TenantShardId, int]:
+        res = self.get(f"http://localhost:{self.port}/v1/list_tenant_visible_size")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
     def tenant_list(self) -> list[dict[Any, Any]]:
         res = self.get(f"http://localhost:{self.port}/v1/tenant")
         self.verbose_error(res)
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 7f9207047e..92889e5de3 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
@@ -164,3 +165,15 @@ def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder
         {"rel_size_migration": "legacy"},
     )
     assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
+
+
+def test_pageserver_get_tenant_visible_size(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 1
+    env = neon_env_builder.init_start()
+    env.create_tenant(shard_count=4)
+    env.create_tenant(shard_count=2)
+
+    json = env.pageserver.http_client().list_tenant_visible_size()
+    log.info(f"{json}")
+    # initial tenant (1 shard) + the 4 + 2 shards of the newly created tenants
+    assert len(json) == 7

From 2fc77c836b3c0e88946254fb9235ded1db60dd75 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Thu, 10 Jul 2025 17:46:39 +0200
Subject: [PATCH 15/56] pageserver/client_grpc: add shard map updates (#12480)

## Problem

The communicator gRPC client must support changing the shard map on splits.

Touches #11735.
Requires #12476.

## Summary of changes

* Wrap the shard set in an `ArcSwap` to allow swapping it out.
* Add a new `ShardSpec` parameter struct to pass validated shard info to the client.
* Add `update_shards()` to change the shard set. In-flight requests are allowed to complete using the old shards.
* Restructure `get_page` to use a stable view of the shard map, and retry errors at the top (pre-split) level to pick up shard map changes.
* Also marks `tonic::Status::Internal` as non-retryable, so that we can use it for client-side invariant checks without continually retrying these.
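For illustration, here is a minimal, self-contained sketch of the `ArcSwap` pattern this change relies on (the types and methods below are simplified stand-ins, not the real `PageserverClient` API): readers pin a stable snapshot with `load_full()`, while writers publish a new shard set with `store()` without blocking anyone.

```rust
use std::sync::Arc;

use arc_swap::ArcSwap;

// Simplified stand-in for the real `Shards` set.
struct Shards {
    count: u8,
}

struct Client {
    shards: ArcSwap<Shards>,
}

impl Client {
    fn new(initial: Shards) -> Self {
        Self { shards: ArcSwap::new(Arc::new(initial)) }
    }

    // Readers pin a snapshot; a concurrent `store()` does not affect them.
    fn get_page(&self) -> u8 {
        let shards: Arc<Shards> = self.shards.load_full();
        // ... all routing for this request uses the pinned `shards` ...
        shards.count
    }

    // Writers publish a new set; in-flight readers keep the old Arc alive
    // until they drop their snapshot.
    fn update_shards(&self, next: Shards) {
        self.shards.store(Arc::new(next));
    }
}

fn main() {
    let client = Client::new(Shards { count: 4 });
    assert_eq!(client.get_page(), 4);
    client.update_shards(Shards { count: 8 });
    assert_eq!(client.get_page(), 8);
}
```

The old `Arc` stays alive until the last in-flight request drops its snapshot, which is what allows requests to finish against the pre-split shard map while retries pick up the new one.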
---
 Cargo.lock                           |   1 +
 pageserver/client_grpc/Cargo.toml    |   1 +
 pageserver/client_grpc/src/client.rs | 257 ++++++++++++++++++---------
 pageserver/client_grpc/src/lib.rs    |   2 +-
 pageserver/client_grpc/src/retry.rs  |   5 +-
 pageserver/client_grpc/src/split.rs  |  31 ++--
 6 files changed, 199 insertions(+), 98 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 85080f8473..1d68b8f862 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4500,6 +4500,7 @@ name = "pageserver_client_grpc"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "arc-swap",
  "bytes",
  "compute_api",
  "futures",
diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml
index ca224900ac..e2741ad839 100644
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -9,6 +9,7 @@ testing = ["pageserver_api/testing"]
 
 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 bytes.workspace = true
 compute_api.workspace = true
 futures.workspace = true
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 63852868c3..ee09c1f13c 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -3,6 +3,7 @@
 use std::num::NonZero;
 use std::sync::Arc;
 
 use anyhow::anyhow;
+use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
 use tracing::instrument;
@@ -55,28 +56,74 @@ const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
 /// TODO: this client does not support base backups or LSN leases, as these are only used by
 /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
 pub struct PageserverClient {
-    // TODO: support swapping out the shard map, e.g. via an ArcSwap.
-    shards: Shards,
+    /// The tenant ID.
+    tenant_id: TenantId,
+    /// The timeline ID.
+    timeline_id: TimelineId,
+    /// The JWT auth token for this tenant, if any.
+    auth_token: Option<String>,
+    /// The shards for this tenant.
+    shards: ArcSwap<Shards>,
+    /// The retry configuration.
     retry: Retry,
 }
 
 impl PageserverClient {
     /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
-    /// in the shard map, which must be complete and must use gRPC URLs.
+    /// in the shard spec, which must be complete and must use gRPC URLs.
     pub fn new(
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        shard_map: HashMap<ShardIndex, String>,
-        stripe_size: ShardStripeSize,
+        shard_spec: ShardSpec,
         auth_token: Option<String>,
     ) -> anyhow::Result<Self> {
-        let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;
+        let shards = Shards::new(tenant_id, timeline_id, shard_spec, auth_token.clone())?;
         Ok(Self {
-            shards,
+            tenant_id,
+            timeline_id,
+            auth_token,
+            shards: ArcSwap::new(Arc::new(shards)),
             retry: Retry,
         })
     }
 
+    /// Updates the shards from the given shard spec. In-flight requests will complete using the
+    /// existing shards, but may retry with the new shards if they fail.
+    ///
+    /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
+    /// properly spun down and dropped afterwards.
+    pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
+        // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
+        // with concurrent updates, but that involves creating a new `Shards` on every attempt,
+        // which spins up a bunch of Tokio tasks and such.
These should already be checked elsewhere + // in the stack, and if they're violated then we already have problems elsewhere, so a + // best-effort but possibly-racy check is okay here. + let old = self.shards.load_full(); + if shard_spec.count < old.count { + return Err(anyhow!( + "can't reduce shard count from {} to {}", + old.count, + shard_spec.count + )); + } + if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size { + return Err(anyhow!( + "can't change stripe size from {} to {}", + old.stripe_size, + shard_spec.stripe_size + )); + } + + let shards = Shards::new( + self.tenant_id, + self.timeline_id, + shard_spec, + self.auth_token.clone(), + )?; + self.shards.store(Arc::new(shards)); + Ok(()) + } + /// Returns whether a relation exists. #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] pub async fn check_rel_exists( @@ -86,7 +133,7 @@ impl PageserverClient { self.retry .with(async || { // Relation metadata is only available on shard 0. - let mut client = self.shards.get_zero().client().await?; + let mut client = self.shards.load_full().get_zero().client().await?; client.check_rel_exists(req).await }) .await @@ -101,7 +148,7 @@ impl PageserverClient { self.retry .with(async || { // Relation metadata is only available on shard 0. - let mut client = self.shards.get_zero().client().await?; + let mut client = self.shards.load_full().get_zero().client().await?; client.get_db_size(req).await }) .await @@ -129,28 +176,42 @@ impl PageserverClient { return Err(tonic::Status::invalid_argument("no block number")); } + // The shards may change while we're fetching pages. We execute the request using a stable + // view of the shards (especially important for requests that span shards), but retry the + // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary + // retries and re-splits in some cases where requests span shards, but these are expected to + // be rare. + // + // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this + // once we figure out how to handle these. + self.retry + .with(async || Self::get_page_with_shards(req.clone(), &self.shards.load_full()).await) + .await + } + + /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of + /// concurrent shard updates. Does not retry internally, but is retried by `get_page()`. + async fn get_page_with_shards( + req: page_api::GetPageRequest, + shards: &Shards, + ) -> tonic::Result { // Fast path: request is for a single shard. if let Some(shard_id) = - GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size) + GetPageSplitter::is_single_shard(&req, shards.count, shards.stripe_size) { - return self.get_page_for_shard(shard_id, req).await; + return Self::get_page_with_shard(req, shards.get(shard_id)?).await; } // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and // reassemble the responses. - // - // TODO: when we support shard map updates, we need to detect when it changes and re-split - // the request on errors. - let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size); + let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size); - let mut shard_requests: FuturesUnordered<_> = splitter - .drain_requests() - .map(|(shard_id, shard_req)| { - // NB: each request will retry internally. 
- self.get_page_for_shard(shard_id, shard_req) - .map(move |result| result.map(|resp| (shard_id, resp))) - }) - .collect(); + let mut shard_requests = FuturesUnordered::new(); + for (shard_id, shard_req) in splitter.drain_requests() { + let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?) + .map(move |result| result.map(|resp| (shard_id, resp))); + shard_requests.push(future); + } while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? { splitter.add_response(shard_id, shard_response)?; @@ -159,41 +220,28 @@ impl PageserverClient { splitter.assemble_response() } - /// Fetches pages that belong to the given shard. - #[instrument(skip_all, fields(shard = %shard_id))] - async fn get_page_for_shard( - &self, - shard_id: ShardIndex, + /// Fetches pages on the given shard. Does not retry internally. + async fn get_page_with_shard( req: page_api::GetPageRequest, + shard: &Shard, ) -> tonic::Result { - let resp = self - .retry - .with(async || { - let stream = self - .shards - .get(shard_id)? - .stream(req.request_class.is_bulk()) - .await; - let resp = stream.send(req.clone()).await?; + let expected = req.block_numbers.len(); + let stream = shard.stream(req.request_class.is_bulk()).await; + let resp = stream.send(req).await?; - // Convert per-request errors into a tonic::Status. - if resp.status_code != page_api::GetPageStatusCode::Ok { - return Err(tonic::Status::new( - resp.status_code.into(), - resp.reason.unwrap_or_else(|| String::from("unknown error")), - )); - } + // Convert per-request errors into a tonic::Status. + if resp.status_code != page_api::GetPageStatusCode::Ok { + return Err(tonic::Status::new( + resp.status_code.into(), + resp.reason.unwrap_or_else(|| String::from("unknown error")), + )); + } - Ok(resp) - }) - .await?; - - // Make sure we got the right number of pages. - // NB: check outside of the retry loop, since we don't want to retry this. - let (expected, actual) = (req.block_numbers.len(), resp.page_images.len()); + // Check that we received the expected number of pages. + let actual = resp.page_images.len(); if expected != actual { return Err(tonic::Status::internal(format!( - "expected {expected} pages for shard {shard_id}, got {actual}", + "expected {expected} pages, got {actual}", ))); } @@ -209,7 +257,7 @@ impl PageserverClient { self.retry .with(async || { // Relation metadata is only available on shard 0. - let mut client = self.shards.get_zero().client().await?; + let mut client = self.shards.load_full().get_zero().client().await?; client.get_rel_size(req).await }) .await @@ -224,48 +272,51 @@ impl PageserverClient { self.retry .with(async || { // SLRU segments are only available on shard 0. - let mut client = self.shards.get_zero().client().await?; + let mut client = self.shards.load_full().get_zero().client().await?; client.get_slru_segment(req).await }) .await } } -/// Tracks the tenant's shards. -struct Shards { +/// Shard specification for a PageserverClient. +pub struct ShardSpec { + /// Maps shard indices to gRPC URLs. + /// + /// INVARIANT: every shard 0..count is present, and shard 0 is always present. + /// INVARIANT: every URL is valid and uses grpc:// scheme. + urls: HashMap, /// The shard count. /// /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, - /// The stripe size. Only used for sharded tenants. + /// The stripe size for these shards. stripe_size: ShardStripeSize, - /// Shards by shard index. 
- /// - /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`. - /// - /// INVARIANT: every shard 0..count is present. - /// INVARIANT: shard 0 is always present. - map: HashMap, } -impl Shards { - /// Creates a new set of shards based on a shard map. - fn new( - tenant_id: TenantId, - timeline_id: TimelineId, - shard_map: HashMap, - stripe_size: ShardStripeSize, - auth_token: Option, +impl ShardSpec { + /// Creates a new shard spec with the given URLs and stripe size. All shards must be given. + /// The stripe size may be omitted for unsharded tenants. + pub fn new( + urls: HashMap, + stripe_size: Option, ) -> anyhow::Result { - let count = match shard_map.len() { + // Compute the shard count. + let count = match urls.len() { 0 => return Err(anyhow!("no shards provided")), 1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()` n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")), n => ShardCount::new(n as u8), }; - let mut map = HashMap::new(); - for (shard_id, url) in shard_map { + // Determine the stripe size. It doesn't matter for unsharded tenants. + if stripe_size.is_none() && !count.is_unsharded() { + return Err(anyhow!("stripe size must be given for sharded tenants")); + } + let stripe_size = stripe_size.unwrap_or_default(); + + // Validate the shard spec. + for (shard_id, url) in &urls { // The shard index must match the computed shard count, even for unsharded tenants. if shard_id.shard_count != count { return Err(anyhow!("invalid shard index {shard_id}, expected {count}")); @@ -276,21 +327,64 @@ impl Shards { } // The above conditions guarantee that we have all shards 0..count: len() matches count, // shard number < count, and numbers are unique (via hashmap). - let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?; - map.insert(shard_id, shard); + + // Validate the URL. + if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc { + return Err(anyhow!("invalid shard URL {url}: must use gRPC")); + } } Ok(Self { + urls, count, stripe_size, - map, + }) + } +} + +/// Tracks the tenant's shards. +struct Shards { + /// Shards by shard index. + /// + /// INVARIANT: every shard 0..count is present. + /// INVARIANT: shard 0 is always present. + by_index: HashMap, + /// The shard count. + /// + /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. + count: ShardCount, + /// The stripe size. Only used for sharded tenants. + stripe_size: ShardStripeSize, +} + +impl Shards { + /// Creates a new set of shards based on a shard spec. + fn new( + tenant_id: TenantId, + timeline_id: TimelineId, + shard_spec: ShardSpec, + auth_token: Option, + ) -> anyhow::Result { + // NB: the shard spec has already been validated when constructed. + let mut shards = HashMap::with_capacity(shard_spec.urls.len()); + for (shard_id, url) in shard_spec.urls { + shards.insert( + shard_id, + Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?, + ); + } + + Ok(Self { + by_index: shards, + count: shard_spec.count, + stripe_size: shard_spec.stripe_size, }) } /// Looks up the given shard. 
#[allow(clippy::result_large_err)] // TODO: check perf impact fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> { - self.map + self.by_index .get(&shard_id) .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}"))) } @@ -329,11 +423,6 @@ impl Shard { shard_id: ShardIndex, auth_token: Option, ) -> anyhow::Result { - // Sanity-check that the URL uses gRPC. - if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc { - return Err(anyhow!("invalid shard URL {url}: must use gRPC")); - } - // Common channel pool for unary and stream requests. Bounded by client/stream pools. let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?; diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs index 3fc7178be2..14fb3fbd5a 100644 --- a/pageserver/client_grpc/src/lib.rs +++ b/pageserver/client_grpc/src/lib.rs @@ -3,4 +3,4 @@ mod pool; mod retry; mod split; -pub use client::PageserverClient; +pub use client::{PageserverClient, ShardSpec}; diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs index b0473204d7..a4d4b19870 100644 --- a/pageserver/client_grpc/src/retry.rs +++ b/pageserver/client_grpc/src/retry.rs @@ -131,7 +131,6 @@ impl Retry { tonic::Code::Aborted => true, tonic::Code::Cancelled => true, tonic::Code::DeadlineExceeded => true, // maybe transient slowness - tonic::Code::Internal => true, // maybe transient failure? tonic::Code::ResourceExhausted => true, tonic::Code::Unavailable => true, @@ -139,6 +138,10 @@ impl Retry { tonic::Code::AlreadyExists => false, tonic::Code::DataLoss => false, tonic::Code::FailedPrecondition => false, + // NB: don't retry Internal. It is intended for serious errors such as invariant + // violations, and is also used for client-side invariant checks that would otherwise + // result in retry loops. + tonic::Code::Internal => false, tonic::Code::InvalidArgument => false, tonic::Code::NotFound => false, tonic::Code::OutOfRange => false, diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs index 5bbcaab393..57c9299b96 100644 --- a/pageserver/client_grpc/src/split.rs +++ b/pageserver/client_grpc/src/split.rs @@ -97,7 +97,8 @@ impl GetPageSplitter { self.requests.drain() } - /// Adds a response from the given shard. + /// Adds a response from the given shard. The response must match the request ID and have an OK + /// status code. A response must not already exist for the given shard ID. #[allow(clippy::result_large_err)] pub fn add_response( &mut self, @@ -105,24 +106,30 @@ impl GetPageSplitter { response: page_api::GetPageResponse, ) -> tonic::Result<()> { // The caller should already have converted status codes into tonic::Status. - assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok); + if response.status_code != page_api::GetPageStatusCode::Ok { + return Err(tonic::Status::internal(format!( + "unexpected non-OK response for shard {shard_id}: {:?}", + response.status_code + ))); + } - // Make sure the response matches the request ID. + // The stream pool ensures the response matches the request ID. if response.request_id != self.request_id { return Err(tonic::Status::internal(format!( - "response ID {} does not match request ID {}", - response.request_id, self.request_id + "response ID mismatch for shard {shard_id}: expected {}, got {}", + self.request_id, response.request_id + ))); + } + + // We only dispatch one request per shard. 
+ if self.responses.contains_key(&shard_id) { + return Err(tonic::Status::internal(format!( + "duplicate response for shard {shard_id}" ))); } // Add the response data to the map. - let old = self.responses.insert(shard_id, response.page_images); - - if old.is_some() { - return Err(tonic::Status::internal(format!( - "duplicate response for shard {shard_id}", - ))); - } + self.responses.insert(shard_id, response.page_images); Ok(()) } From 8630d37f5e52a851a48b5936acd409cac5044bb0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 10 Jul 2025 16:53:38 +0100 Subject: [PATCH 16/56] test_runner: manually reuse ports in PortDistributor (#12423) ## Problem Sometimes we run out of free ports in `PortDistributor`. This affects particularly failed tests that we rerun automatically up to 3 times (which makes it use up to 3x more ports) ## Summary of changes - Cycle over the range of ports to reuse freed ports from previous tests Ref: LKB-62 --- test_runner/fixtures/port_distributor.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py index 6a829a9399..e51d08e16e 100644 --- a/test_runner/fixtures/port_distributor.py +++ b/test_runner/fixtures/port_distributor.py @@ -3,6 +3,7 @@ from __future__ import annotations import re import socket from contextlib import closing +from itertools import cycle from fixtures.log_helper import log @@ -34,15 +35,23 @@ def can_bind(host: str, port: int) -> bool: class PortDistributor: def __init__(self, base_port: int, port_number: int): - self.iterator = iter(range(base_port, base_port + port_number)) + self.base_port = base_port + self.port_number = port_number + self.cycle = cycle(range(base_port, base_port + port_number)) self.port_map: dict[int, int] = {} def get_port(self) -> int: - for port in self.iterator: + checked = 0 + for port in self.cycle: if can_bind("localhost", port): return port + elif checked < self.port_number: + checked += 1 + else: + break + raise RuntimeError( - "port range configured for test is exhausted, consider enlarging the range" + f"port range ({self.base_port}..{self.base_port + self.port_number}) configured for test is exhausted, consider enlarging the range" ) def replace_with_new_port(self, value: int | str) -> int | str: From dcdfe80bf015e93b991c0aa86ffbbffbcd18c198 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 10 Jul 2025 19:30:09 +0200 Subject: [PATCH 17/56] pagebench: add support for rich gRPC client (#12477) ## Problem We need to benchmark the rich gRPC client `client_grpc::PageserverClient` against the basic, no-frills `page_api::Client` to determine how much overhead it adds. Touches #11735. Requires #12476. ## Summary of changes Add a `pagebench --rich-client` parameter to use `client_grpc::PageserverClient`. Also adds a compression parameter to the client. 
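As a rough usage sketch, constructing the rich client with zstd compression might look as follows. The `ShardSpec::new` and `PageserverClient::new` signatures are taken from this series; the URL is a placeholder, and this is illustrative rather than the exact pagebench wiring:

```rust
use pageserver_api::shard::ShardIndex;
use pageserver_client_grpc::{PageserverClient, ShardSpec};
use tonic::codec::CompressionEncoding;
use utils::id::{TenantId, TimelineId};

fn build_client(
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> anyhow::Result<PageserverClient> {
    // Unsharded tenant: a single shard index mapped to a grpc:// URL
    // (placeholder address).
    let shard_spec = ShardSpec::new(
        [(ShardIndex::unsharded(), "grpc://localhost:51051".to_string())].into(),
        None, // stripe size may be omitted for unsharded tenants
    )?;
    PageserverClient::new(
        tenant_id,
        timeline_id,
        shard_spec,
        None,                            // no JWT auth token
        Some(CompressionEncoding::Zstd), // compress pages on the wire
    )
}
```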
--- Cargo.lock | 1 + Cargo.toml | 1 + pageserver/client_grpc/src/client.rs | 28 ++++++- pageserver/client_grpc/src/pool.rs | 7 +- pageserver/pagebench/Cargo.toml | 3 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 84 +++++++++++++++++++ 6 files changed, 120 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d68b8f862..c528354053 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4294,6 +4294,7 @@ dependencies = [ "humantime-serde", "pageserver_api", "pageserver_client", + "pageserver_client_grpc", "pageserver_page_api", "rand 0.8.5", "reqwest", diff --git a/Cargo.toml b/Cargo.toml index 14f2cfcb56..0d521ee4d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -262,6 +262,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } +pageserver_client_grpc = { path = "./pageserver/client_grpc" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index ee09c1f13c..e790f4018e 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -6,6 +6,7 @@ use anyhow::anyhow; use arc_swap::ArcSwap; use futures::stream::FuturesUnordered; use futures::{FutureExt as _, StreamExt as _}; +use tonic::codec::CompressionEncoding; use tracing::instrument; use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; @@ -62,6 +63,8 @@ pub struct PageserverClient { timeline_id: TimelineId, /// The JWT auth token for this tenant, if any. auth_token: Option, + /// The compression to use, if any. + compression: Option, /// The shards for this tenant. shards: ArcSwap, /// The retry configuration. @@ -76,12 +79,20 @@ impl PageserverClient { timeline_id: TimelineId, shard_spec: ShardSpec, auth_token: Option, + compression: Option, ) -> anyhow::Result { - let shards = Shards::new(tenant_id, timeline_id, shard_spec, auth_token.clone())?; + let shards = Shards::new( + tenant_id, + timeline_id, + shard_spec, + auth_token.clone(), + compression, + )?; Ok(Self { tenant_id, timeline_id, auth_token, + compression, shards: ArcSwap::new(Arc::new(shards)), retry: Retry, }) @@ -119,6 +130,7 @@ impl PageserverClient { self.timeline_id, shard_spec, self.auth_token.clone(), + self.compression, )?; self.shards.store(Arc::new(shards)); Ok(()) @@ -364,13 +376,21 @@ impl Shards { timeline_id: TimelineId, shard_spec: ShardSpec, auth_token: Option, + compression: Option, ) -> anyhow::Result { // NB: the shard spec has already been validated when constructed. let mut shards = HashMap::with_capacity(shard_spec.urls.len()); for (shard_id, url) in shard_spec.urls { shards.insert( shard_id, - Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?, + Shard::new( + url, + tenant_id, + timeline_id, + shard_id, + auth_token.clone(), + compression, + )?, ); } @@ -422,6 +442,7 @@ impl Shard { timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, + compression: Option, ) -> anyhow::Result { // Common channel pool for unary and stream requests. Bounded by client/stream pools. 
let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?; @@ -433,6 +454,7 @@ impl Shard { timeline_id, shard_id, auth_token.clone(), + compression, Some(MAX_UNARY_CLIENTS), ); @@ -445,6 +467,7 @@ impl Shard { timeline_id, shard_id, auth_token.clone(), + compression, None, // unbounded, limited by stream pool ), Some(MAX_STREAMS), @@ -460,6 +483,7 @@ impl Shard { timeline_id, shard_id, auth_token, + compression, None, // unbounded, limited by stream pool ), Some(MAX_BULK_STREAMS), diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 89b3bd646f..2dde40b5b4 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -40,6 +40,7 @@ use futures::StreamExt as _; use tokio::sync::mpsc::{Receiver, Sender}; use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; use tokio_util::sync::CancellationToken; +use tonic::codec::CompressionEncoding; use tonic::transport::{Channel, Endpoint}; use tracing::{error, warn}; @@ -242,6 +243,8 @@ pub struct ClientPool { shard_id: ShardIndex, /// Authentication token, if any. auth_token: Option, + /// Compression to use. + compression: Option, /// Channel pool to acquire channels from. channel_pool: Arc, /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded. @@ -281,6 +284,7 @@ impl ClientPool { timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, + compression: Option, max_clients: Option>, ) -> Arc { let pool = Arc::new(Self { @@ -288,6 +292,7 @@ impl ClientPool { timeline_id, shard_id, auth_token, + compression, channel_pool, idle: Mutex::default(), idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), @@ -331,7 +336,7 @@ impl ClientPool { self.timeline_id, self.shard_id, self.auth_token.clone(), - None, + self.compression, )?; Ok(ClientGuard { diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index f5dfc0db25..4086213830 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -27,8 +27,9 @@ tokio-util.workspace = true tonic.workspace = true url.workspace = true -pageserver_client.workspace = true pageserver_api.workspace = true +pageserver_client.workspace = true +pageserver_client_grpc.workspace = true pageserver_page_api.workspace = true utils = { path = "../../libs/utils/" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index f14caf548c..42c7e40489 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -10,12 +10,14 @@ use anyhow::Context; use async_trait::async_trait; use bytes::Bytes; use camino::Utf8PathBuf; +use futures::stream::FuturesUnordered; use futures::{Stream, StreamExt as _}; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; +use pageserver_client_grpc::{self as client_grpc, ShardSpec}; use pageserver_page_api as page_api; use rand::prelude::*; use tokio::task::JoinSet; @@ -37,6 +39,10 @@ pub(crate) struct Args { /// Pageserver connection string. Supports postgresql:// and grpc:// protocols. 
#[clap(long, default_value = "postgres://postgres@localhost:64000")] page_service_connstring: String, + /// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic + /// no-frills `page_api::Client`. Only valid with grpc:// connstrings. + #[clap(long)] + rich_client: bool, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] @@ -332,6 +338,7 @@ async fn main_impl( let client: Box = match scheme.as_str() { "postgresql" | "postgres" => { assert!(!args.compression, "libpq does not support compression"); + assert!(!args.rich_client, "rich client requires grpc://"); Box::new( LibpqClient::new(&args.page_service_connstring, worker_id.timeline) .await @@ -339,6 +346,16 @@ async fn main_impl( ) } + "grpc" if args.rich_client => Box::new( + RichGrpcClient::new( + &args.page_service_connstring, + worker_id.timeline, + args.compression, + ) + .await + .unwrap(), + ), + "grpc" => Box::new( GrpcClient::new( &args.page_service_connstring, @@ -680,3 +697,70 @@ impl Client for GrpcClient { Ok((resp.request_id, resp.page_images)) } } + +/// A rich gRPC Pageserver client. +struct RichGrpcClient { + inner: Arc, + requests: FuturesUnordered< + Pin> + Send>>, + >, +} + +impl RichGrpcClient { + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + let inner = Arc::new(client_grpc::PageserverClient::new( + ttid.tenant_id, + ttid.timeline_id, + ShardSpec::new( + [(ShardIndex::unsharded(), connstring.to_string())].into(), + None, + )?, + None, + compression.then_some(tonic::codec::CompressionEncoding::Zstd), + )?); + Ok(Self { + inner, + requests: FuturesUnordered::new(), + }) + } +} + +#[async_trait] +impl Client for RichGrpcClient { + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()> { + let req = page_api::GetPageRequest { + request_id: req_id, + request_class: page_api::GetPageClass::Normal, + read_lsn: page_api::ReadLsn { + request_lsn: req_lsn, + not_modified_since_lsn: Some(mod_lsn), + }, + rel, + block_numbers: blks, + }; + let inner = self.inner.clone(); + self.requests.push(Box::pin(async move { + inner + .get_page(req) + .await + .map_err(|err| anyhow::anyhow!("{err}")) + })); + Ok(()) + } + + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { + let resp = self.requests.next().await.unwrap()?; + Ok((resp.request_id, resp.page_images)) + } +} From 13b5e7b26fe009c711b3e57436433cf8e4d140d6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 10 Jul 2025 14:02:54 -0400 Subject: [PATCH 18/56] fix(compute_ctl): reload config before applying spec (#12551) ## Problem If we have catalog update AND a pageserver migration batched in a single spec, we will not be able to apply the spec (running the SQL) because the compute is not attached to the right pageserver and we are not able to read anything if we don't pick up the latest pageserver connstring. This is not a case for now because cplane always schedules shard split / pageserver migrations with `skip_pg_catalog_updates` (I suppose). Context: https://databricks.slack.com/archives/C09254R641L/p1752163559259399?thread_ts=1752160163.141149&cid=C09254R641L With this fix, backpressure will likely not be able to affect reconfigurations. ## Summary of changes Do `pg_reload_conf` before we apply specs in SQL. 
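The resulting ordering is easiest to see in a stripped-down sketch (the struct and helper names below are stand-ins for the real compute_ctl code, which carries more state and error context):

```rust
struct ComputeSpec {
    skip_pg_catalog_updates: bool,
}

struct ComputeNode;

impl ComputeNode {
    fn pg_reload_conf(&self) -> anyhow::Result<()> {
        Ok(()) // placeholder: the real method signals Postgres to reload its config
    }

    fn apply_spec_sql(&self, _spec: &ComputeSpec) -> anyhow::Result<()> {
        Ok(()) // placeholder: the real method runs the catalog-updating SQL
    }

    fn reconfigure(&self, spec: &ComputeSpec) -> anyhow::Result<()> {
        // Reload first: the SQL phase reads through the pageserver, so the new
        // pageserver connstring must be in effect before any catalog updates run.
        self.pg_reload_conf()?;

        if !spec.skip_pg_catalog_updates {
            self.apply_spec_sql(spec)?;
            // Reload again to pick up anything changed while applying the spec.
            self.pg_reload_conf()?;
        }
        Ok(())
    }
}
```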
--------- Signed-off-by: Alex Chi Z --- compute_tools/src/compute.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0496d38e67..4a29c232ac 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1805,6 +1805,8 @@ impl ComputeNode { tls_config, )?; + self.pg_reload_conf()?; + if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; // Temporarily reset max_cluster_size in config @@ -1824,10 +1826,9 @@ impl ComputeNode { Ok(()) })?; + self.pg_reload_conf()?; } - self.pg_reload_conf()?; - let unknown_op = "unknown".to_string(); let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); info!( From c5aaf1ae21df31233a4bc81eef88d56e95b2a33e Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 10 Jul 2025 19:37:54 +0100 Subject: [PATCH 19/56] Qualify call to neon extension in compute_ctl's prewarming (#12554) https://github.com/neondatabase/cloud/issues/19011 Calls without `neon.` failed on staging. Also fix local tests to work with qualified calls --- compute_tools/src/compute_prewarm.rs | 6 +++--- test_runner/regress/test_lfc_prewarm.py | 14 ++++++++------ test_runner/regress/test_replica_promotes.py | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 3f6f9a7ecc..d014a5bb72 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -70,7 +70,7 @@ impl ComputeNode { } }; let row = match client - .query_one("select * from get_prewarm_info()", &[]) + .query_one("select * from neon.get_prewarm_info()", &[]) .await { Ok(row) => row, @@ -146,7 +146,7 @@ impl ComputeNode { ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")? - .query_one("select prewarm_local_cache($1)", &[&uncompressed]) + .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) .await .context("loading LFC state into postgres") .map(|_| ()) @@ -196,7 +196,7 @@ impl ComputeNode { ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")? - .query_one("select get_local_cache_state()", &[]) + .query_one("select neon.get_local_cache_state()", &[]) .await .context("querying LFC state")? 
.try_get::(0) diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index 22e5bf576f..0f0cf4cc6d 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -40,7 +40,7 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]: def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any: if method == PrewarmMethod.POSTGRES: - cur.execute("select get_local_cache_state()") + cur.execute("select neon.get_local_cache_state()") return cur.fetchall()[0][0] if method == PrewarmMethod.AUTOPREWARM: @@ -72,7 +72,7 @@ def prewarm_endpoint( elif method == PrewarmMethod.COMPUTE_CTL: client.prewarm_lfc() elif method == PrewarmMethod.POSTGRES: - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,)) def check_prewarmed( @@ -116,7 +116,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() - pg_cur.execute("create extension neon") + pg_cur.execute("create schema neon; create extension neon with schema neon") pg_cur.execute("create database lfc") lfc_conn = endpoint.connect(dbname="lfc") @@ -142,10 +142,12 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): lfc_cur = lfc_conn.cursor() prewarm_endpoint(method, client, pg_cur, lfc_state) - pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + pg_cur.execute( + "select lfc_value from neon.neon_lfc_stats where lfc_key='file_cache_used_pages'" + ) lfc_used_pages = pg_cur.fetchall()[0][0] log.info(f"Used LFC size: {lfc_used_pages}") - pg_cur.execute("select * from get_prewarm_info()") + pg_cur.execute("select * from neon.get_prewarm_info()") total, prewarmed, skipped, _ = pg_cur.fetchall()[0] log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}") progress = (prewarmed + skipped) * 100 // total @@ -186,7 +188,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() - pg_cur.execute("create extension neon") + pg_cur.execute("create schema neon; create extension neon with schema neon") pg_cur.execute("CREATE DATABASE lfc") lfc_conn = endpoint.connect(dbname="lfc") diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py index 1f26269f40..8d39ac123a 100644 --- a/test_runner/regress/test_replica_promotes.py +++ b/test_runner/regress/test_replica_promotes.py @@ -60,7 +60,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod): with primary.connect() as primary_conn: primary_cur = primary_conn.cursor() - primary_cur.execute("create extension neon") + primary_cur.execute("create schema neon;create extension neon with schema neon") primary_cur.execute( "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)" ) @@ -172,7 +172,7 @@ def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv): secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") with primary.connect() as conn, conn.cursor() as cur: - cur.execute("create extension neon") + cur.execute("create schema neon;create extension neon with schema neon") cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)") cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") cur.execute("show neon.safekeepers") From 
3593fe195a55441b76874e64bb168acf71f6b4c4 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 10 Jul 2025 20:28:10 +0100 Subject: [PATCH 20/56] split TerminationPending into two values, keeping ComputeStatus stateless (#12506) After https://github.com/neondatabase/neon/pull/12240 we observed issues in our Go code, as `ComputeStatus` is no longer stateless and thus doesn't deserialize as a string. ``` could not check compute activity: json: cannot unmarshal object into Go struct field ComputeState.status of type computeclient.ComputeStatus ``` - Fix this by splitting the status into two. - Update compute OpenApi spec to reflect changes to `/terminate` in the previous PR --- compute_tools/README.md | 9 +++-- compute_tools/src/compute.rs | 17 +++++++--- compute_tools/src/http/openapi_spec.yaml | 39 ++++++++++++++++++++-- compute_tools/src/http/routes/terminate.rs | 14 ++++---- compute_tools/src/monitor.rs | 3 +- control_plane/src/endpoint.rs | 3 +- libs/compute_api/src/responses.rs | 18 ++++++++-- 7 files changed, 81 insertions(+), 22 deletions(-) diff --git a/compute_tools/README.md b/compute_tools/README.md index 8d84031efc..49f1368f0e 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -46,11 +46,14 @@ stateDiagram-v2 Configuration --> Failed : Failed to configure the compute Configuration --> Running : Compute has been configured Empty --> Init : Compute spec is immediately available - Empty --> TerminationPending : Requested termination + Empty --> TerminationPendingFast : Requested termination + Empty --> TerminationPendingImmediate : Requested termination Init --> Failed : Failed to start Postgres Init --> Running : Started Postgres - Running --> TerminationPending : Requested termination - TerminationPending --> Terminated : Terminated compute + Running --> TerminationPendingFast : Requested termination + Running --> TerminationPendingImmediate : Requested termination + TerminationPendingFast --> Terminated : Terminated compute with 30s delay for cplane to inspect status + TerminationPendingImmediate --> Terminated : Terminated compute immediately Failed --> [*] : Compute exited Terminated --> [*] : Compute exited ``` diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 4a29c232ac..c05cc229a2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -956,14 +956,20 @@ impl ComputeNode { None }; - let mut delay_exit = false; let mut state = self.state.lock().unwrap(); state.terminate_flush_lsn = lsn; - if let ComputeStatus::TerminationPending { mode } = state.status { + + let delay_exit = state.status == ComputeStatus::TerminationPendingFast; + if state.status == ComputeStatus::TerminationPendingFast + || state.status == ComputeStatus::TerminationPendingImmediate + { + info!( + "Changing compute status from {} to {}", + state.status, + ComputeStatus::Terminated + ); state.status = ComputeStatus::Terminated; self.state_changed.notify_all(); - // we were asked to terminate gracefully, don't exit to avoid restart - delay_exit = mode == compute_api::responses::TerminateMode::Fast } drop(state); @@ -1901,7 +1907,8 @@ impl ComputeNode { // exit loop ComputeStatus::Failed - | ComputeStatus::TerminationPending { .. 
} + | ComputeStatus::TerminationPendingFast + | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Terminated => break 'cert_update, // wait diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 3c58b284b3..93a357e160 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -371,9 +371,28 @@ paths: summary: Terminate Postgres and wait for it to exit description: "" operationId: terminate + parameters: + - name: mode + in: query + description: "Terminate mode: fast (wait 30s before returning) and immediate" + required: false + schema: + type: string + enum: ["fast", "immediate"] + default: fast responses: 200: description: Result + content: + application/json: + schema: + $ref: "#/components/schemas/TerminateResponse" + 201: + description: Result if compute is already terminated + content: + application/json: + schema: + $ref: "#/components/schemas/TerminateResponse" 412: description: "wrong state" content: @@ -530,11 +549,14 @@ components: type: string enum: - empty - - init - - failed - - running - configuration_pending + - init + - running - configuration + - failed + - termination_pending_fast + - termination_pending_immediate + - terminated example: running ExtensionInstallRequest: @@ -660,6 +682,17 @@ components: description: Role name. example: "neon" + TerminateResponse: + type: object + required: + - lsn + properties: + lsn: + type: string + nullable: true + description: "last WAL flush LSN" + example: "0/028F10D8" + SetRoleGrantsResponse: type: object required: diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 32d90a5990..5b30b020c8 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -3,7 +3,7 @@ use crate::http::JsonResponse; use axum::extract::State; use axum::response::Response; use axum_extra::extract::OptionalQuery; -use compute_api::responses::{ComputeStatus, TerminateResponse}; +use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse}; use http::StatusCode; use serde::Deserialize; use std::sync::Arc; @@ -12,7 +12,7 @@ use tracing::info; #[derive(Deserialize, Default)] pub struct TerminateQuery { - mode: compute_api::responses::TerminateMode, + mode: TerminateMode, } /// Terminate the compute. @@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate( { let mut state = compute.state.lock().unwrap(); if state.status == ComputeStatus::Terminated { - return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn); + let response = TerminateResponse { + lsn: state.terminate_flush_lsn, + }; + return JsonResponse::success(StatusCode::CREATED, response); } if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { return JsonResponse::invalid_status(state.status); } - state.set_status( - ComputeStatus::TerminationPending { mode }, - &compute.state_changed, - ); + state.set_status(mode.into(), &compute.state_changed); } forward_termination_signal(false); diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 8a2f6addad..fa01545856 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -84,7 +84,8 @@ impl ComputeMonitor { if matches!( compute_status, ComputeStatus::Terminated - | ComputeStatus::TerminationPending { .. 
} + | ComputeStatus::TerminationPendingFast + | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Failed ) { info!( diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 74ab15dc97..ad2067e0f2 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -922,7 +922,8 @@ impl Endpoint { ComputeStatus::Empty | ComputeStatus::ConfigurationPending | ComputeStatus::Configuration - | ComputeStatus::TerminationPending { .. } + | ComputeStatus::TerminationPendingFast + | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index e10c381fb4..2fe233214a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -121,6 +121,15 @@ pub enum TerminateMode { Immediate, } +impl From<TerminateMode> for ComputeStatus { + fn from(mode: TerminateMode) -> Self { + match mode { + TerminateMode::Fast => ComputeStatus::TerminationPendingFast, + TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate, + } + } +} + #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { @@ -141,7 +150,9 @@ pub enum ComputeStatus { // control-plane to terminate it. Failed, // Termination requested - TerminationPending { mode: TerminateMode }, + TerminationPendingFast, + // Termination requested, without waiting 30s before returning from /terminate + TerminationPendingImmediate, // Terminated Postgres Terminated, } @@ -160,7 +171,10 @@ impl Display for ComputeStatus { ComputeStatus::Running => f.write_str("running"), ComputeStatus::Configuration => f.write_str("configuration"), ComputeStatus::Failed => f.write_str("failed"), - ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"), + ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"), + ComputeStatus::TerminationPendingImmediate => { + f.write_str("termination-pending-immediate") + } ComputeStatus::Terminated => f.write_str("terminated"), } } From 1b7339b53e2483ab9d8af553007b1af038440c6e Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 10 Jul 2025 15:34:11 -0500 Subject: [PATCH 21/56] PG: add max_wal_rate (#12470) ## Problem One PG tenant may write too fast and overwhelm the PS, leaving the other tenants sharing the same PSs with very little bandwidth. We ran an experiment where two tenants shared the same PSs: one tenant ran a large ingestion that delivered hundreds of MB/s while the other only got < 10 MB/s. ## Summary of changes Rate limit how fast PG can generate WAL. The default is -1 (no limit). We may scale the default value with the CPU count; we need to run some experiments to verify. ## How is this tested? CI. PGBench: run with no limit first, then set the limit to 1 MB/s and watch the tps drop, then revert the setting and watch the tps recover. 
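Mechanically (see the walproposer changes below), the limiter is a fixed one-second window over the bytes broadcast to safekeepers: once the window's byte budget is spent, a shared `should_limit` flag turns on backpressure until the window rolls over. A minimal Rust sketch of that idea (illustrative names only; the actual implementation is the C code in `pgxn/neon/walproposer_pg.c` below, with the flag kept in shared memory behind a pg_atomic):

```rust
use std::time::{Duration, Instant};

/// Illustrative fixed-window WAL rate limiter, mirroring the shape of the
/// C implementation in the patch below (not the actual code).
struct WalRateLimiter {
    /// None mirrors `databricks.max_wal_mb_per_second = -1` (limit disabled).
    max_bytes_per_sec: Option<u64>,
    window_start: Instant,
    sent_bytes: u64,
    /// Backends poll this flag and apply backpressure while it is set.
    should_limit: bool,
}

impl WalRateLimiter {
    fn record_sent(&mut self, bytes: u64) {
        let Some(max) = self.max_bytes_per_sec else { return };
        if self.window_start.elapsed() > Duration::from_secs(1) {
            // New one-second window: reset the counter and lift the limit.
            self.window_start = Instant::now();
            self.sent_bytes = 0;
            self.should_limit = false;
        }
        self.sent_bytes += bytes;
        if self.sent_bytes > max {
            self.should_limit = true;
        }
    }
}
```

The stop-and-go shape in the pgbench trace below (bursts of tps followed by runs of zeroes) is what this window produces at a 1 MB/s limit: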
pgbench -i -s 10 -p 55432 -h 127.0.0.1 -U cloud_admin -d postgres pgbench postgres -c 10 -j 10 -T 6000000 -P 1 -b tpcb-like -h 127.0.0.1 -U cloud_admin -p 55432 progress: 33.0 s, 986.0 tps, lat 10.142 ms stddev 3.856 progress: 34.0 s, 973.0 tps, lat 10.299 ms stddev 3.857 progress: 35.0 s, 1004.0 tps, lat 9.939 ms stddev 3.604 progress: 36.0 s, 984.0 tps, lat 10.183 ms stddev 3.713 progress: 37.0 s, 998.0 tps, lat 10.004 ms stddev 3.668 progress: 38.0 s, 648.9 tps, lat 12.947 ms stddev 24.970 progress: 39.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 40.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 41.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 42.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 43.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 44.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 45.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 46.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 47.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 48.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 49.0 s, 347.3 tps, lat 321.560 ms stddev 1805.633 progress: 50.0 s, 346.8 tps, lat 9.898 ms stddev 3.809 progress: 51.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 52.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 53.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 54.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 55.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 56.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 57.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 58.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 59.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 60.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 61.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 62.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 63.0 s, 494.5 tps, lat 276.504 ms stddev 1853.689 progress: 64.0 s, 488.0 tps, lat 20.530 ms stddev 71.981 progress: 65.0 s, 407.8 tps, lat 9.502 ms stddev 3.329 progress: 66.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 67.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 68.0 s, 504.5 tps, lat 71.627 ms stddev 397.733 progress: 69.0 s, 371.0 tps, lat 24.898 ms stddev 29.007 progress: 70.0 s, 541.0 tps, lat 19.684 ms stddev 24.094 progress: 71.0 s, 342.0 tps, lat 29.542 ms stddev 54.935 Co-authored-by: Haoyu Huang --- libs/walproposer/src/api_bindings.rs | 7 ++++ pgxn/neon/walproposer.h | 17 ++++++++ pgxn/neon/walproposer_pg.c | 62 +++++++++++++++++++++++++++- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 7c6abf252e..5f856a44d4 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { shard_number: 0, }; + let empty_wal_rate_limiter = crate::bindings::WalRateLimiter { + should_limit: crate::bindings::pg_atomic_uint32 { value: 0 }, + sent_bytes: 0, + last_recorded_time_us: 0, + }; + crate::bindings::WalproposerShmemState { propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 }, donor_name: [0; 64], @@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { num_shards: 0, replica_promote: false, min_ps_feedback: empty_feedback, + wal_rate_limiter: empty_wal_rate_limiter, } } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 4b223b6b18..e3a4022664 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -376,6 +376,18 @@ typedef struct 
PageserverFeedback uint32 shard_number; } PageserverFeedback; +/* BEGIN_HADRON */ +typedef struct WalRateLimiter +{ + /* If the value is 1, PG backends will hit backpressure. */ + pg_atomic_uint32 should_limit; + /* The number of bytes sent in the current second. */ + uint64 sent_bytes; + /* The last recorded time in microsecond. */ + TimestampTz last_recorded_time_us; +} WalRateLimiter; +/* END_HADRON */ + typedef struct WalproposerShmemState { pg_atomic_uint64 propEpochStartLsn; @@ -395,6 +407,11 @@ typedef struct WalproposerShmemState /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; + + /* BEGIN_HADRON */ + /* The WAL rate limiter */ + WalRateLimiter wal_rate_limiter; + /* END_HADRON */ } WalproposerShmemState; /* diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 185fc83ace..aaf8f43eeb 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -66,6 +66,9 @@ int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; int safekeeper_proto_version = 3; char *safekeeper_conninfo_options = ""; +/* BEGIN_HADRON */ +int databricks_max_wal_mb_per_second = -1; +/* END_HADRON */ /* Set to true in the walproposer bgw. */ static bool am_walproposer; @@ -252,6 +255,18 @@ nwp_register_gucs(void) PGC_POSTMASTER, 0, NULL, NULL, NULL); + + /* BEGIN_HADRON */ + DefineCustomIntVariable( + "databricks.max_wal_mb_per_second", + "The maximum WAL MB per second allowed. If breached, sending WAL hit the backpressure. Setting to -1 disables the limit.", + NULL, + &databricks_max_wal_mb_per_second, + -1, -1, INT_MAX, + PGC_SUSET, + GUC_UNIT_MB, + NULL, NULL, NULL); + /* END_HADRON */ } @@ -393,6 +408,7 @@ assign_neon_safekeepers(const char *newval, void *extra) static uint64 backpressure_lag_impl(void) { + struct WalproposerShmemState* state = NULL; if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { XLogRecPtr writePtr; @@ -426,6 +442,18 @@ backpressure_lag_impl(void) return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } } + + /* BEGIN_HADRON */ + if (databricks_max_wal_mb_per_second == -1) { + return 0; + } + + state = GetWalpropShmemState(); + if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1) + { + return 1; + } + /* END_HADRON */ return 0; } @@ -472,6 +500,9 @@ WalproposerShmemInit(void) pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); + /* BEGIN_HADRON */ + pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); + /* END_HADRON */ } LWLockRelease(AddinShmemInitLock); @@ -487,6 +518,9 @@ WalproposerShmemInit_SyncSafekeeper(void) pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + /* BEGIN_HADRON */ + pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); + /* END_HADRON */ } #define BACK_PRESSURE_DELAY 10000L // 0.01 sec @@ -521,7 +555,6 @@ backpressure_throttling_impl(void) if (lag == 0) return retry; - old_status = get_ps_display(&len); new_status = (char *) palloc(len + 64 + 1); memcpy(new_status, old_status, len); @@ -1458,6 +1491,8 @@ XLogBroadcastWalProposer(WalProposer *wp) { XLogRecPtr startptr; XLogRecPtr endptr; + struct WalproposerShmemState *state = NULL; + TimestampTz now = 0; /* Start 
from the last sent position */ startptr = sentPtr; @@ -1502,13 +1537,36 @@ XLogBroadcastWalProposer(WalProposer *wp) * that arbitrary LSN is eventually reported as written, flushed and * applied, so that it can measure the elapsed time. */ - LagTrackerWrite(endptr, GetCurrentTimestamp()); + now = GetCurrentTimestamp(); + LagTrackerWrite(endptr, now); /* Do we have any work to do? */ Assert(startptr <= endptr); if (endptr <= startptr) return; + /* BEGIN_HADRON */ + state = GetWalpropShmemState(); + if (databricks_max_wal_mb_per_second != -1 && state != NULL) + { + uint64 max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024; + struct WalRateLimiter *limiter = &state->wal_rate_limiter; + + if (now - limiter->last_recorded_time_us > USECS_PER_SEC) + { + /* Reset the rate limiter */ + limiter->last_recorded_time_us = now; + limiter->sent_bytes = 0; + pg_atomic_exchange_u32(&limiter->should_limit, 0); + } + limiter->sent_bytes += (endptr - startptr); + if (limiter->sent_bytes > max_wal_bytes) + { + pg_atomic_exchange_u32(&limiter->should_limit, 1); + } + } + /* END_HADRON */ + WalProposerBroadcast(wp, startptr, endptr); sentPtr = endptr; From 44ea17b7b24240937fc214a1bc5453da0d840ece Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 10 Jul 2025 22:39:42 +0200 Subject: [PATCH 22/56] pageserver/page_api: add attempt to GetPage request ID (#12536) ## Problem `GetPageRequest::request_id` is supposed to be a unique ID for a request. It's not, because we may retry the request using the same ID. This causes assertion failures and confusion. Touches #11735. Requires #12480. ## Summary of changes Extend the request ID with a retry attempt, and handle it in the gRPC client and server. --- pageserver/client_grpc/src/client.rs | 23 +++++--- pageserver/client_grpc/src/pool.rs | 13 +++++ pageserver/client_grpc/src/retry.rs | 8 +-- pageserver/page_api/proto/page_service.proto | 12 +++- pageserver/page_api/src/model.rs | 58 +++++++++++++++++-- .../pagebench/src/cmd/getpage_latest_lsn.rs | 8 +-- pageserver/src/page_service.rs | 19 +++--- 7 files changed, 110 insertions(+), 31 deletions(-) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index e790f4018e..393f89819a 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -143,7 +143,7 @@ impl PageserverClient { req: page_api::CheckRelExistsRequest, ) -> tonic::Result { self.retry - .with(async || { + .with(async |_| { // Relation metadata is only available on shard 0. let mut client = self.shards.load_full().get_zero().client().await?; client.check_rel_exists(req).await @@ -158,7 +158,7 @@ impl PageserverClient { req: page_api::GetDbSizeRequest, ) -> tonic::Result { self.retry - .with(async || { + .with(async |_| { // Relation metadata is only available on shard 0. let mut client = self.shards.load_full().get_zero().client().await?; client.get_db_size(req).await @@ -166,8 +166,9 @@ impl PageserverClient { .await } - /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically - /// splits requests that straddle shard boundaries, and assembles the responses. + /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the + /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle + /// shard boundaries, and assembles the responses. /// /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status` /// errors. 
All responses will have `GetPageStatusCode::Ok`. @@ -187,6 +188,10 @@ impl PageserverClient { if req.block_numbers.is_empty() { return Err(tonic::Status::invalid_argument("no block number")); } + // The request attempt must be 0. The client will increment it internally. + if req.request_id.attempt != 0 { + return Err(tonic::Status::invalid_argument("request attempt must be 0")); + } // The shards may change while we're fetching pages. We execute the request using a stable // view of the shards (especially important for requests that span shards), but retry the @@ -197,7 +202,11 @@ impl PageserverClient { // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this // once we figure out how to handle these. self.retry - .with(async || Self::get_page_with_shards(req.clone(), &self.shards.load_full()).await) + .with(async |attempt| { + let mut req = req.clone(); + req.request_id.attempt = attempt as u32; + Self::get_page_with_shards(req, &self.shards.load_full()).await + }) .await } @@ -267,7 +276,7 @@ impl PageserverClient { req: page_api::GetRelSizeRequest, ) -> tonic::Result { self.retry - .with(async || { + .with(async |_| { // Relation metadata is only available on shard 0. let mut client = self.shards.load_full().get_zero().client().await?; client.get_rel_size(req).await @@ -282,7 +291,7 @@ impl PageserverClient { req: page_api::GetSlruSegmentRequest, ) -> tonic::Result { self.retry - .with(async || { + .with(async |_| { // SLRU segments are only available on shard 0. let mut client = self.shards.load_full().get_zero().client().await?; client.get_slru_segment(req).await diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 2dde40b5b4..906872e091 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -591,6 +591,10 @@ impl StreamPool { // Track caller response channels by request ID. If the task returns early, these response // channels will be dropped and the waiting callers will receive an error. + // + // NB: this will leak entries if the server doesn't respond to a request (by request ID). + // It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and + // block further use. But we could consider reaping closed channels after some time. let mut callers = HashMap::new(); // Process requests and responses. @@ -695,6 +699,15 @@ impl Drop for StreamGuard { // Release the queue depth reservation on drop. This can prematurely decrement it if dropped // before the response is received, but that's okay. + // + // TODO: actually, it's probably not okay. Queue depth release should be moved into the + // stream task, such that it continues to account for the queue depth slot until the server + // responds. Otherwise, if a slow request times out and keeps blocking the stream, the + // server will keep waiting on it and we can pile on subsequent requests (including the + // timeout retry) in the same stream and get blocked. But we may also want to avoid blocking + // requests on e.g. LSN waits and layer downloads, instead returning early to free up the + // stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line + // blocking. TBD. 
let mut streams = pool.streams.lock().unwrap(); let entry = streams.get_mut(&self.id).expect("unknown stream"); assert!(entry.idle_since.is_none(), "active stream marked idle"); diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs index a4d4b19870..a1e0b8636f 100644 --- a/pageserver/client_grpc/src/retry.rs +++ b/pageserver/client_grpc/src/retry.rs @@ -23,14 +23,14 @@ impl Retry { /// If true, log successful requests. For debugging. const LOG_SUCCESS: bool = false; - /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors, - /// using the current tracing span for context. + /// Runs the given async closure with timeouts and retries (exponential backoff), passing the + /// attempt number starting at 0. Logs errors, using the current tracing span for context. /// /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`]. pub async fn with(&self, mut f: F) -> tonic::Result where - F: FnMut() -> O, + F: FnMut(usize) -> O, // takes attempt number, starting at 0 O: Future>, { let started = Instant::now(); @@ -47,7 +47,7 @@ impl Retry { } let request_started = Instant::now(); - tokio::time::timeout(Self::REQUEST_TIMEOUT, f()) + tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries)) .await .map_err(|_| { tonic::Status::deadline_exceeded(format!( diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 1d6c230916..b1f266d910 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -153,7 +153,7 @@ message GetDbSizeResponse { message GetPageRequest { // A request ID. Will be included in the response. Should be unique for // in-flight requests on the stream. - uint64 request_id = 1; + RequestID request_id = 1; // The request class. GetPageClass request_class = 2; // The LSN to read at. @@ -177,6 +177,14 @@ message GetPageRequest { repeated uint32 block_number = 5; } +// A Request ID. Should be unique for in-flight requests on a stream. Included in the response. +message RequestID { + // The base request ID. + uint64 id = 1; + // The request attempt. Starts at 0, incremented on each retry. + uint32 attempt = 2; +} + // A GetPageRequest class. Primarily intended for observability, but may also be // used for prioritization in the future. enum GetPageClass { @@ -199,7 +207,7 @@ enum GetPageClass { // the entire batch is ready, so no one can make use of the individual pages. message GetPageResponse { // The original request's ID. - uint64 request_id = 1; + RequestID request_id = 1; // The response status code. GetPageStatusCode status_code = 2; // A string describing the status, if any. diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index d0d3517d41..4db8237ad8 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -356,7 +356,10 @@ impl TryFrom for GetPageRequest { return Err(ProtocolError::Missing("block_number")); } Ok(Self { - request_id: pb.request_id, + request_id: pb + .request_id + .ok_or(ProtocolError::Missing("request_id"))? 
+ .into(), request_class: pb.request_class.into(), read_lsn: pb .read_lsn @@ -371,7 +374,7 @@ impl TryFrom for GetPageRequest { impl From for proto::GetPageRequest { fn from(request: GetPageRequest) -> Self { Self { - request_id: request.request_id, + request_id: Some(request.request_id.into()), request_class: request.request_class.into(), read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), @@ -380,8 +383,51 @@ impl From for proto::GetPageRequest { } } -/// A GetPage request ID. -pub type RequestID = u64; +/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct RequestID { + /// The base request ID. + pub id: u64, + // The request attempt. Starts at 0, incremented on each retry. + pub attempt: u32, +} + +impl RequestID { + /// Creates a new RequestID with the given ID and an initial attempt of 0. + pub fn new(id: u64) -> Self { + Self { id, attempt: 0 } + } +} + +impl Display for RequestID { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}.{}", self.id, self.attempt) + } +} + +impl From for RequestID { + fn from(pb: proto::RequestId) -> Self { + Self { + id: pb.id, + attempt: pb.attempt, + } + } +} + +impl From for RequestID { + fn from(id: u64) -> Self { + Self::new(id) + } +} + +impl From for proto::RequestId { + fn from(request_id: RequestID) -> Self { + Self { + id: request_id.id, + attempt: request_id.attempt, + } + } +} /// A GetPage request class. #[derive(Clone, Copy, Debug, strum_macros::Display)] @@ -467,7 +513,7 @@ pub struct GetPageResponse { impl From for GetPageResponse { fn from(pb: proto::GetPageResponse) -> Self { Self { - request_id: pb.request_id, + request_id: pb.request_id.unwrap_or_default().into(), status_code: pb.status_code.into(), reason: Some(pb.reason).filter(|r| !r.is_empty()), page_images: pb.page_image, @@ -478,7 +524,7 @@ impl From for GetPageResponse { impl From for proto::GetPageResponse { fn from(response: GetPageResponse) -> Self { Self { - request_id: response.request_id, + request_id: Some(response.request_id.into()), status_code: response.status_code.into(), reason: response.reason.unwrap_or_default(), page_image: response.page_images, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 42c7e40489..b5c191e29a 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -674,7 +674,7 @@ impl Client for GrpcClient { blks: Vec, ) -> anyhow::Result<()> { let req = page_api::GetPageRequest { - request_id: req_id, + request_id: req_id.into(), request_class: page_api::GetPageClass::Normal, read_lsn: page_api::ReadLsn { request_lsn: req_lsn, @@ -694,7 +694,7 @@ impl Client for GrpcClient { "unexpected status code: {}", resp.status_code, ); - Ok((resp.request_id, resp.page_images)) + Ok((resp.request_id.id, resp.page_images)) } } @@ -740,7 +740,7 @@ impl Client for RichGrpcClient { blks: Vec, ) -> anyhow::Result<()> { let req = page_api::GetPageRequest { - request_id: req_id, + request_id: req_id.into(), request_class: page_api::GetPageClass::Normal, read_lsn: page_api::ReadLsn { request_lsn: req_lsn, @@ -761,6 +761,6 @@ impl Client for RichGrpcClient { async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { let resp = self.requests.next().await.unwrap()?; - Ok((resp.request_id, resp.page_images)) + Ok((resp.request_id.id, 
resp.page_images)) } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ebb1addcdb..b2f6cd465d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3338,9 +3338,12 @@ impl GrpcPageServiceHandler { } /// Generates a PagestreamRequest header from a ReadLsn and request ID. - fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest { + fn make_hdr( + read_lsn: page_api::ReadLsn, + req_id: Option<page_api::RequestID>, + ) -> PagestreamRequest { PagestreamRequest { - reqid: req_id, + reqid: req_id.map(|r| r.id).unwrap_or_default(), request_lsn: read_lsn.request_lsn, not_modified_since: read_lsn .not_modified_since_lsn @@ -3450,7 +3453,7 @@ impl GrpcPageServiceHandler { batch.push(BatchedGetPageRequest { req: PagestreamGetPageRequest { - hdr: Self::make_hdr(req.read_lsn, req.request_id), + hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)), rel: req.rel, blkno, }, @@ -3528,7 +3531,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(rel=%req.rel, lsn=%req.read_lsn); let req = PagestreamExistsRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), rel: req.rel, }; @@ -3678,7 +3681,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn); let req = PagestreamDbSizeRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), dbnode: req.db_oid, }; @@ -3728,7 +3731,7 @@ impl proto::PageService for GrpcPageServiceHandler { .await? .downgrade(); while let Some(req) = reqs.message().await? { - let req_id = req.request_id; + let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default(); let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone()) .instrument(span.clone()) // propagate request span .await; @@ -3767,7 +3770,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(rel=%req.rel, lsn=%req.read_lsn); let req = PagestreamNblocksRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), rel: req.rel, }; @@ -3800,7 +3803,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn); let req = PagestreamGetSlruSegmentRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), kind: req.kind as u8, segno: req.segno, }; From b91f821e8bae4ed8635f2c9380f304fc575eed91 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:49:52 -0400 Subject: [PATCH 23/56] fix(libpagestore): update the default stripe size (#12557) ## Problem Part of LKB-379 The pageserver connstrings are updated in the postmaster, and a hook propagates them to the shared memory of all backends. However, the shard stripe size gets no such treatment. This causes problems during shard splits: * the compute has active reads/writes * a shard split happens and the cplane applies the new config (pageserver connstring + stripe size) * the pageserver connstring is updated immediately once the postmaster receives the SIGHUP, and it is copied over to the shared memory of all other backends. * the stripe size is a normal GUC with no special handling around it, so if any active backend has ongoing txns the new value won't be applied. * now it's possible for backends to issue requests based on the wrong stripe size; what's worse, if a request gets cached in the prefetch buffer, it will get stuck forever. 
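To make the failure mode concrete: the stripe size determines which shard owns which stripe of blocks, so a backend holding a stale value computes the wrong owner. A deliberately simplified Rust sketch (the real `key_to_shard_number` in `pageserver_api` hashes the full page key rather than the raw block number, so the numbers below are illustrative only):

```rust
/// Simplified stand-in for key_to_shard_number: which shard owns a block.
fn shard_for_block(block_number: u32, shard_count: u32, stripe_size_blocks: u32) -> u32 {
    (block_number / stripe_size_blocks) % shard_count
}

fn main() {
    // With 4 shards, the same block routes differently under the old and new defaults:
    assert_eq!(shard_for_block(100_000, 4, 32_768), 3); // old default stripe size
    assert_eq!(shard_for_block(100_000, 4, 2_048), 0); // new default stripe size
}
```

A backend that keeps issuing requests with the old stripe size therefore asks the wrong shard for its pages.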
## Summary of changes Update the default stripe size from 32768 to 2048 blocks so that it aligns with the current default in storcon. Signed-off-by: Alex Chi Z --- pgxn/neon/libpagestore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3b6c4247c3..05ba6da663 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -1410,7 +1410,7 @@ pg_init_libpagestore(void) "sharding stripe size", NULL, &stripe_size, - 32768, 1, INT_MAX, + 2048, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_BLOCKS, NULL, NULL, NULL); From 8aa9540a05cfab2cf870b309665d78c837310acb Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 11 Jul 2025 00:35:14 +0200 Subject: [PATCH 24/56] pageserver/page_api: include block number and rel in gRPC `GetPageResponse` (#12542) ## Problem With gRPC `GetPageRequest` batches, we'll have non-trivial fragmentation/reassembly logic in several places in the stack (concurrent reads, shard splits, LFC hits, etc). If we included the block numbers with the pages in `GetPageResponse` we could have better verification and observability that the final responses are correct. Touches #11735. Requires #12480. ## Summary of changes Add a `Page` struct with `block_number` for `GetPageResponse`, along with the `RelTag` for completeness, and verify them in the rich gRPC client. --- pageserver/client_grpc/src/client.rs | 34 +++- pageserver/client_grpc/src/split.rs | 160 +++++++++++------- pageserver/page_api/proto/page_service.proto | 19 ++- pageserver/page_api/src/client.rs | 3 +- pageserver/page_api/src/model.rs | 59 +++++-- .../pagebench/src/cmd/getpage_latest_lsn.rs | 10 +- pageserver/src/page_service.rs | 8 +- 7 files changed, 201 insertions(+), 92 deletions(-) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 393f89819a..7049fbdb96 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -218,7 +218,7 @@ impl PageserverClient { ) -> tonic::Result<page_api::GetPageResponse> { // Fast path: request is for a single shard. if let Some(shard_id) = - GetPageSplitter::is_single_shard(&req, shards.count, shards.stripe_size) + GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size) { return Self::get_page_with_shard(req, shards.get(shard_id)?).await; } @@ -238,7 +238,7 @@ impl PageserverClient { splitter.add_response(shard_id, shard_response)?; } - splitter.assemble_response() + splitter.get_response() } /// Fetches pages on the given shard. Does not retry internally. @@ -246,9 +246,8 @@ impl PageserverClient { req: page_api::GetPageRequest, shard: &Shard, ) -> tonic::Result<page_api::GetPageResponse> { - let expected = req.block_numbers.len(); let stream = shard.stream(req.request_class.is_bulk()).await; - let resp = stream.send(req).await?; + let resp = stream.send(req.clone()).await?; // Convert per-request errors into a tonic::Status. if resp.status_code != page_api::GetPageStatusCode::Ok { @@ -258,11 +257,27 @@ impl PageserverClient { )); } - // Check that we received the expected number of pages. - let actual = resp.page_images.len(); - if expected != actual { + // Check that we received the expected pages. 
+ if req.rel != resp.rel { return Err(tonic::Status::internal(format!( - "expected {expected} pages, got {actual}", + "shard {} returned wrong relation, expected {} got {}", + shard.id, req.rel, resp.rel + ))); + } + if !req + .block_numbers + .iter() + .copied() + .eq(resp.pages.iter().map(|p| p.block_number)) + { + return Err(tonic::Status::internal(format!( + "shard {} returned wrong pages, expected {:?} got {:?}", + shard.id, + req.block_numbers, + resp.pages + .iter() + .map(|page| page.block_number) + .collect::>() ))); } @@ -435,6 +450,8 @@ impl Shards { /// * Bulk client pool: unbounded. /// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH. struct Shard { + /// The shard ID. + id: ShardIndex, /// Unary gRPC client pool. client_pool: Arc, /// GetPage stream pool. @@ -500,6 +517,7 @@ impl Shard { ); Ok(Self { + id: shard_id, client_pool, stream_pool, bulk_stream_pool, diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs index 57c9299b96..b7539b900c 100644 --- a/pageserver/client_grpc/src/split.rs +++ b/pageserver/client_grpc/src/split.rs @@ -5,27 +5,24 @@ use bytes::Bytes; use pageserver_api::key::rel_block_to_key; use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; use pageserver_page_api as page_api; -use utils::shard::{ShardCount, ShardIndex}; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; /// Splits GetPageRequests that straddle shard boundaries and assembles the responses. /// TODO: add tests for this. pub struct GetPageSplitter { - /// The original request ID. Used for all shard requests. - request_id: page_api::RequestID, /// Split requests by shard index. requests: HashMap, - /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble - /// the response pages in the same order as the original request. + /// The response being assembled. Preallocated with empty pages, to be filled in. + response: page_api::GetPageResponse, + /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used + /// to assemble the response pages in the same order as the original request. block_shards: Vec, - /// Page responses by shard index. Will be assembled into a single response. - responses: HashMap>, } impl GetPageSplitter { /// Checks if the given request only touches a single shard, and returns the shard ID. This is /// the common case, so we check first in order to avoid unnecessary allocations and overhead. - /// The caller must ensure that the request has at least one block number, or this will panic. - pub fn is_single_shard( + pub fn for_single_shard( req: &page_api::GetPageRequest, count: ShardCount, stripe_size: ShardStripeSize, @@ -35,8 +32,12 @@ impl GetPageSplitter { return Some(ShardIndex::unsharded()); } - // Find the base shard index for the first page, and compare with the rest. - let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages")); + // Find the first page's shard, for comparison. If there are no pages, just return the first + // shard (caller likely checked already, otherwise the server will reject it). + let Some(&first_page) = req.block_numbers.first() else { + return Some(ShardIndex::new(ShardNumber(0), count)); + }; + let key = rel_block_to_key(req.rel, first_page); let shard_number = key_to_shard_number(count, stripe_size, &key); req.block_numbers @@ -57,19 +58,19 @@ impl GetPageSplitter { ) -> Self { // The caller should make sure we don't split requests unnecessarily. 
debug_assert!( - Self::is_single_shard(&req, count, stripe_size).is_none(), + Self::for_single_shard(&req, count, stripe_size).is_none(), "unnecessary request split" ); // Split the requests by shard index. let mut requests = HashMap::with_capacity(2); // common case let mut block_shards = Vec::with_capacity(req.block_numbers.len()); - for blkno in req.block_numbers { + for &blkno in &req.block_numbers { let key = rel_block_to_key(req.rel, blkno); let shard_number = key_to_shard_number(count, stripe_size, &key); let shard_id = ShardIndex::new(shard_number, count); - let shard_req = requests + requests .entry(shard_id) .or_insert_with(|| page_api::GetPageRequest { request_id: req.request_id, @@ -77,20 +78,39 @@ impl GetPageSplitter { rel: req.rel, read_lsn: req.read_lsn, block_numbers: Vec::new(), - }); - shard_req.block_numbers.push(blkno); + }) + .block_numbers + .push(blkno); block_shards.push(shard_id); } - Self { + // Construct a response to be populated by shard responses. Preallocate empty page slots + // with the expected block numbers. + let response = page_api::GetPageResponse { request_id: req.request_id, - responses: HashMap::with_capacity(requests.len()), + status_code: page_api::GetPageStatusCode::Ok, + reason: None, + rel: req.rel, + pages: req + .block_numbers + .into_iter() + .map(|block_number| { + page_api::Page { + block_number, + image: Bytes::new(), // empty page slot to be filled in + } + }) + .collect(), + }; + + Self { requests, + response, block_shards, } } - /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations. + /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations. pub fn drain_requests( &mut self, ) -> impl Iterator { @@ -108,72 +128,82 @@ impl GetPageSplitter { // The caller should already have converted status codes into tonic::Status. if response.status_code != page_api::GetPageStatusCode::Ok { return Err(tonic::Status::internal(format!( - "unexpected non-OK response for shard {shard_id}: {:?}", - response.status_code + "unexpected non-OK response for shard {shard_id}: {} {}", + response.status_code, + response.reason.unwrap_or_default() ))); } - // The stream pool ensures the response matches the request ID. - if response.request_id != self.request_id { + if response.request_id != self.response.request_id { return Err(tonic::Status::internal(format!( "response ID mismatch for shard {shard_id}: expected {}, got {}", - self.request_id, response.request_id + self.response.request_id, response.request_id ))); } - // We only dispatch one request per shard. - if self.responses.contains_key(&shard_id) { + // Place the shard response pages into the assembled response, in request order. 
+ let mut pages = response.pages.into_iter(); + + for (i, &s) in self.block_shards.iter().enumerate() { + if shard_id != s { + continue; + } + + let Some(slot) = self.response.pages.get_mut(i) else { + return Err(tonic::Status::internal(format!( + "no block_shards slot {i} for shard {shard_id}" + ))); + }; + let Some(page) = pages.next() else { + return Err(tonic::Status::internal(format!( + "missing page {} in shard {shard_id} response", + slot.block_number + ))); + }; + if page.block_number != slot.block_number { + return Err(tonic::Status::internal(format!( + "shard {shard_id} returned wrong page at index {i}, expected {} got {}", + slot.block_number, page.block_number + ))); + } + if !slot.image.is_empty() { + return Err(tonic::Status::internal(format!( + "shard {shard_id} returned duplicate page {} at index {i}", + slot.block_number + ))); + } + + *slot = page; + } + + // Make sure we've consumed all pages from the shard response. + if let Some(extra_page) = pages.next() { return Err(tonic::Status::internal(format!( - "duplicate response for shard {shard_id}" + "shard {shard_id} returned extra page: {}", + extra_page.block_number ))); } - // Add the response data to the map. - self.responses.insert(shard_id, response.page_images); - Ok(()) } - /// Assembles the shard responses into a single response. Responses must be present for all - /// relevant shards, and the total number of pages must match the original request. + /// Fetches the final, assembled response. #[allow(clippy::result_large_err)] - pub fn assemble_response(self) -> tonic::Result { - let mut response = page_api::GetPageResponse { - request_id: self.request_id, - status_code: page_api::GetPageStatusCode::Ok, - reason: None, - page_images: Vec::with_capacity(self.block_shards.len()), - }; - - // Set up per-shard page iterators we can pull from. - let mut shard_responses = HashMap::with_capacity(self.responses.len()); - for (shard_id, responses) in self.responses { - shard_responses.insert(shard_id, responses.into_iter()); - } - - // Reassemble the responses in the same order as the original request. - for shard_id in &self.block_shards { - let page = shard_responses - .get_mut(shard_id) - .ok_or_else(|| { - tonic::Status::internal(format!("missing response for shard {shard_id}")) - })? - .next() - .ok_or_else(|| { - tonic::Status::internal(format!("missing page from shard {shard_id}")) - })?; - response.page_images.push(page); - } - - // Make sure there are no additional pages. - for (shard_id, mut pages) in shard_responses { - if pages.next().is_some() { + pub fn get_response(self) -> tonic::Result { + // Check that the response is complete. + for (i, page) in self.response.pages.iter().enumerate() { + if page.image.is_empty() { return Err(tonic::Status::internal(format!( - "extra pages returned from shard {shard_id}" + "missing page {} for shard {}", + page.block_number, + self.block_shards + .get(i) + .map(|s| s.to_string()) + .unwrap_or_else(|| "?".to_string()) ))); } } - Ok(response) + Ok(self.response) } } diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index b1f266d910..d113a04a42 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -208,12 +208,25 @@ enum GetPageClass { message GetPageResponse { // The original request's ID. RequestID request_id = 1; - // The response status code. + // The response status code. If not OK, the rel and page fields will be empty. 
GetPageStatusCode status_code = 2; // A string describing the status, if any. string reason = 3; - // The 8KB page images, in the same order as the request. Empty if status_code != OK. - repeated bytes page_image = 4; + // The relation that the pages belong to. + RelTag rel = 4; + // The page(s), in the same order as the request. + repeated Page page = 5; +} + +// A page. +// +// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block +// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway. +message Page { + // The page number. + uint32 block_number = 1; + // The materialized page image, as an 8KB byte vector. + bytes image = 2; } // A GetPageResponse status code. diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 6523d00d3d..f70d0e7b28 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -1,4 +1,5 @@ use anyhow::Context as _; +use futures::future::ready; use futures::{Stream, StreamExt as _, TryStreamExt as _}; use tokio::io::AsyncRead; use tokio_util::io::StreamReader; @@ -110,7 +111,7 @@ impl Client { ) -> tonic::Result> + Send + 'static> { let reqs = reqs.map(proto::GetPageRequest::from); let resps = self.inner.get_pages(reqs).await?.into_inner(); - Ok(resps.map_ok(GetPageResponse::from)) + Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into())))) } /// Returns the size of a relation, as # of blocks. diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 4db8237ad8..a9dd154285 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -502,22 +502,30 @@ impl From for i32 { pub struct GetPageResponse { /// The original request's ID. pub request_id: RequestID, - /// The response status code. + /// The response status code. If not OK, the `rel` and `pages` fields will be empty. pub status_code: GetPageStatusCode, /// A string describing the status, if any. pub reason: Option, - /// The 8KB page images, in the same order as the request. Empty if status != OK. - pub page_images: Vec, + /// The relation that the pages belong to. + pub rel: RelTag, + // The page(s), in the same order as the request. + pub pages: Vec, } -impl From for GetPageResponse { - fn from(pb: proto::GetPageResponse) -> Self { - Self { - request_id: pb.request_id.unwrap_or_default().into(), +impl TryFrom for GetPageResponse { + type Error = ProtocolError; + + fn try_from(pb: proto::GetPageResponse) -> Result { + Ok(Self { + request_id: pb + .request_id + .ok_or(ProtocolError::Missing("request_id"))? + .into(), status_code: pb.status_code.into(), reason: Some(pb.reason).filter(|r| !r.is_empty()), - page_images: pb.page_image, - } + rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, + pages: pb.page.into_iter().map(Page::from).collect(), + }) } } @@ -527,7 +535,8 @@ impl From for proto::GetPageResponse { request_id: Some(response.request_id.into()), status_code: response.status_code.into(), reason: response.reason.unwrap_or_default(), - page_image: response.page_images, + rel: Some(response.rel.into()), + page: response.pages.into_iter().map(proto::Page::from).collect(), } } } @@ -560,11 +569,39 @@ impl GetPageResponse { request_id, status_code, reason: Some(status.message().to_string()), - page_images: Vec::new(), + rel: RelTag::default(), + pages: Vec::new(), }) } } +// A page. +#[derive(Clone, Debug)] +pub struct Page { + /// The page number. 
+ pub block_number: u32, + /// The materialized page image, as an 8KB byte vector. + pub image: Bytes, +} + +impl From for Page { + fn from(pb: proto::Page) -> Self { + Self { + block_number: pb.block_number, + image: pb.image, + } + } +} + +impl From for proto::Page { + fn from(page: Page) -> Self { + Self { + block_number: page.block_number, + image: page.image, + } + } +} + /// A GetPage response status code. /// /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index b5c191e29a..30b30d36f6 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -694,7 +694,10 @@ impl Client for GrpcClient { "unexpected status code: {}", resp.status_code, ); - Ok((resp.request_id.id, resp.page_images)) + Ok(( + resp.request_id.id, + resp.pages.into_iter().map(|p| p.image).collect(), + )) } } @@ -761,6 +764,9 @@ impl Client for RichGrpcClient { async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { let resp = self.requests.next().await.unwrap()?; - Ok((resp.request_id.id, resp.page_images)) + Ok(( + resp.request_id.id, + resp.pages.into_iter().map(|p| p.image).collect(), + )) } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b2f6cd465d..1fc7e4eac7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3483,12 +3483,16 @@ impl GrpcPageServiceHandler { request_id: req.request_id, status_code: page_api::GetPageStatusCode::Ok, reason: None, - page_images: Vec::with_capacity(results.len()), + rel: req.rel, + pages: Vec::with_capacity(results.len()), }; for result in results { match result { - Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page), + Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page { + block_number: r.req.blkno, + image: r.page, + }), Ok((resp, _, _)) => { return Err(tonic::Status::internal(format!( "unexpected response: {resp:?}" From cec0543b5141f24d928d891a49d8832c70161311 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 10 Jul 2025 17:58:54 -0500 Subject: [PATCH 25/56] Add background to compute migration 0002-alter_roles.sql (#11708) On December 8th, 2023, an engineering escalation (INC-110) was opened after it was found that BYPASSRLS was being applied to all roles. PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657 Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072 NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it isn't easy to know if a Postgres cluster is affected by the issue, we need to keep the migration around for a long time, if not indefinitely, so any cluster can be fixed. Branching is the gift that keeps on giving... Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- compute_tools/src/migrations/0002-alter_roles.sql | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql index 6cb49f873f..8fc371eb8f 100644 --- a/compute_tools/src/migrations/0002-alter_roles.sql +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -1,3 +1,16 @@ +-- On December 8th, 2023, an engineering escalation (INC-110) was opened after +-- it was found that BYPASSRLS was being applied to all roles. 
+-- +-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657 +-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072 +-- +-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it +-- isn't easy to know if a Postgres cluster is affected by the issue, we need to +-- keep the migration around for a long time, if not indefinitely, so any +-- cluster can be fixed. +-- +-- Branching is the gift that keeps on giving... + DO $$ DECLARE role_name text; From c34d36d8a270b9a4910d4d26210e7c608288f079 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 11 Jul 2025 14:49:37 +0400 Subject: [PATCH 26/56] storcon_cli: timeline-safekeeper-migrate and timeline-locate subcommands (#12548) ## Problem We have a `safekeeper_migrate` handler, but no subcommand in `storcon_cli`. Same for `/:timeline_id/locate` for identifying current set of safekeepers. - Closes: https://github.com/neondatabase/neon/issues/12395 ## Summary of changes - Add `timeline-safekeeper-migrate` and `timeline-locate` subcommands to `storcon_cli` --- Cargo.lock | 1 + control_plane/src/broker.rs | 2 +- control_plane/src/pageserver.rs | 2 +- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage_controller.rs | 2 +- control_plane/storcon_cli/Cargo.toml | 1 + control_plane/storcon_cli/src/main.rs | 57 ++++++++++++++++++- libs/safekeeper_api/src/models.rs | 11 +++- .../src/service/safekeeper_service.rs | 10 +--- 9 files changed, 73 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c528354053..025f4e4116 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6991,6 +6991,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "reqwest", + "safekeeper_api", "serde_json", "storage_controller_client", "tokio", diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index f43f459636..988b08e875 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -36,7 +36,7 @@ impl StorageBroker { pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { let broker = &self.env.broker; - print!("Starting neon broker at {}", broker.client_url()); + println!("Starting neon broker at {}", broker.client_url()); let mut args = Vec::new(); diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3673d1f4f2..843ead807d 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -303,7 +303,7 @@ impl PageServerNode { async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); - print!( + println!( "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}", self.conf.id, self.pg_connection_config.raw_address(), diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index da9dafd8e9..2ba2f3ebe4 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -127,7 +127,7 @@ impl SafekeeperNode { extra_opts: &[String], retry_timeout: &Duration, ) -> anyhow::Result<()> { - print!( + println!( "Starting safekeeper at '{}' in '{}', retrying for {:?}", self.pg_connection_config.raw_address(), self.datadir_path().display(), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index bb83a6319c..dc6c82f504 100644 --- 
a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -660,7 +660,7 @@ impl StorageController { )); } - println!("Starting storage controller"); + println!("Starting storage controller at {scheme}://{host}:{listen_port}"); background_process::start_process( COMMAND, diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index ce89116691..61d48b2469 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -14,6 +14,7 @@ humantime.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true reqwest.workspace = true +safekeeper_api.workspace=true serde_json = { workspace = true, features = ["raw_value"] } storage_controller_client.workspace = true tokio.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 701c4b3b2e..24fd34a87a 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -11,7 +11,7 @@ use pageserver_api::controller_api::{ PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig, @@ -21,6 +21,7 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardStripeSize, TenantShardId}; use pageserver_client::mgmt_api::{self}; use reqwest::{Certificate, Method, StatusCode, Url}; +use safekeeper_api::models::TimelineLocateResponse; use storage_controller_client::control_api::Client; use utils::id::{NodeId, TenantId, TimelineId}; @@ -279,6 +280,23 @@ enum Command { #[arg(long)] concurrency: Option, }, + /// Locate safekeepers for a timeline from the storcon DB. 
+ TimelineLocate { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + timeline_id: TimelineId, + }, + /// Migrate a timeline to a new set of safekeepers + TimelineSafekeeperMigrate { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + timeline_id: TimelineId, + /// Example: --new-sk-set 1,2,3 + #[arg(long, required = true, value_delimiter = ',')] + new_sk_set: Vec, + }, } #[derive(Parser)] @@ -1324,7 +1342,7 @@ async fn main() -> anyhow::Result<()> { concurrency, } => { let mut path = format!( - "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", ); if let Some(c) = concurrency { @@ -1335,6 +1353,41 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::POST, path, None) .await?; } + Command::TimelineLocate { + tenant_id, + timeline_id, + } => { + let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate"); + + let resp = storcon_client + .dispatch::<(), TimelineLocateResponse>(Method::GET, path, None) + .await?; + + let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::>(); + let new_sk_set = resp + .new_sk_set + .as_ref() + .map(|ids| ids.iter().map(|id| id.0 as i64).collect::>()); + + println!("generation = {}", resp.generation); + println!("sk_set = {sk_set:?}"); + println!("new_sk_set = {new_sk_set:?}"); + } + Command::TimelineSafekeeperMigrate { + tenant_id, + timeline_id, + new_sk_set, + } => { + let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate"); + + storcon_client + .dispatch::<_, ()>( + Method::POST, + path, + Some(TimelineSafekeeperMigrateRequest { new_sk_set }), + ) + .await?; + } } Ok(()) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index e87232474b..59e112654b 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; -use crate::membership::Configuration; +use crate::membership::{Configuration, SafekeeperGeneration}; use crate::{ServerInfo, Term}; #[derive(Debug, Serialize, Deserialize)] @@ -311,3 +311,12 @@ pub struct PullTimelineResponse { pub safekeeper_host: Option, // TODO: add more fields? } + +/// Response to a timeline locate request. +/// Storcon-only API. 
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineLocateResponse {
+ pub generation: SafekeeperGeneration,
+ pub sk_set: Vec<NodeId>,
+ pub new_sk_set: Option<Vec<NodeId>>,
+}
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index d7179372b2..42ddf81e3e 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -25,7 +25,8 @@ use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::PgVersionId;
 use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration};
 use safekeeper_api::models::{
- PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse,
+ PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest,
+ TimelineMembershipSwitchResponse,
 };
 use safekeeper_api::{INITIAL_TERM, Term};
 use safekeeper_client::mgmt_api;
@@ -37,13 +38,6 @@ use utils::lsn::Lsn;
 
 use super::Service;
 
-#[derive(serde::Serialize, serde::Deserialize, Clone)]
-pub struct TimelineLocateResponse {
- pub generation: SafekeeperGeneration,
- pub sk_set: Vec<NodeId>,
- pub new_sk_set: Option<Vec<NodeId>>,
-}
-
 impl Service {
 fn make_member_set(safekeepers: &[Safekeeper]) -> Result {
 let members = safekeepers
From 15f633922aaa62e333ba3b92cd97d646ce56e5ef Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Fri, 11 Jul 2025 12:39:51 +0100
Subject: [PATCH 27/56] pageserver: use image consistent LSN for force image
 layer creation (#12547)

This is a no-op for the neon deployment.

* Introduce the concept of image consistent LSN: the largest LSN below
  which all pages have been redone successfully
* Use the image consistent LSN for forced image layer creations
* Optionally expose the image consistent LSN via the timeline describe
  HTTP endpoint
* Add a sharded timeline describe endpoint to storcon

---------

Co-authored-by: Chen Luo
---
 libs/pageserver_api/src/controller_api.rs | 9 +-
 libs/pageserver_api/src/models.rs | 3 +
 pageserver/src/http/routes.rs | 19 ++
 pageserver/src/tenant.rs | 34 +++
 pageserver/src/tenant/layer_map.rs | 232 +++++++++++++++++-
 pageserver/src/tenant/timeline.rs | 32 ++-
 pageserver/src/tenant/timeline/compaction.rs | 89 +++----
 .../src/tenant/timeline/layer_manager.rs | 1 +
 storage_controller/src/http.rs | 32 +++
 storage_controller/src/pageserver_client.rs | 17 ++
 storage_controller/src/service.rs | 88 ++++++-
 test_runner/fixtures/neon_fixtures.py | 14 ++
 test_runner/regress/test_compaction.py | 77 +++++-
 13 files changed, 567 insertions(+), 80 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a8c7083b17..b02c6a613a 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 
-use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
+use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
 use crate::shard::{ShardStripeSize, TenantShardId};
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -126,6 +126,13 @@ pub struct TenantDescribeResponse {
 pub config: TenantConfig,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantTimelineDescribeResponse {
+ pub shards: Vec<TimelineInfo>,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub image_consistent_lsn: Option<Lsn>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct
NodeShardResponse { pub node_id: NodeId, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 56dd95eab3..11e02a8550 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1622,6 +1622,9 @@ pub struct TimelineInfo { /// Whether the timeline is invisible in synthetic size calculations. pub is_invisible: Option, + // HADRON: the largest LSN below which all page updates have been included in the image layers. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_consistent_lsn: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7030ac368d..d839bac557 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -397,6 +397,7 @@ async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, force_await_initial_logical_size: bool, + include_image_consistent_lsn: bool, ctx: &RequestContext, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -421,6 +422,10 @@ async fn build_timeline_info( .await?, ); } + // HADRON + if include_image_consistent_lsn { + info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?); + } Ok(info) } @@ -510,6 +515,8 @@ async fn build_timeline_info_common( is_invisible: Some(is_invisible), walreceiver_status, + // HADRON + image_consistent_lsn: None, }; Ok(info) } @@ -712,6 +719,8 @@ async fn timeline_list_handler( parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; + let include_image_consistent_lsn: Option = + parse_query_param(&request, "include-image-consistent-lsn")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); @@ -732,6 +741,7 @@ async fn timeline_list_handler( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), + include_image_consistent_lsn.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) @@ -760,6 +770,9 @@ async fn timeline_and_offloaded_list_handler( parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; + let include_image_consistent_lsn: Option = + parse_query_param(&request, "include-image-consistent-lsn")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); @@ -780,6 +793,7 @@ async fn timeline_and_offloaded_list_handler( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), + include_image_consistent_lsn.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) @@ -964,6 +978,9 @@ async fn timeline_detail_handler( parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; + // HADRON + let include_image_consistent_lsn: Option = + parse_query_param(&request, "include-image-consistent-lsn")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; // Logical size calculation needs downloading. 
@@ -984,6 +1001,7 @@ async fn timeline_detail_handler( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), + include_image_consistent_lsn.unwrap_or(false), ctx, ) .await @@ -3643,6 +3661,7 @@ async fn activate_post_import_handler( let timeline_info = build_timeline_info( &timeline, false, // include_non_incremental_logical_size, false, // force_await_initial_logical_size + false, // include_image_consistent_lsn &ctx, ) .await diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f67269851a..f75a03a508 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12816,6 +12816,40 @@ mod tests { }, ] ); + + Ok(()) + } + + #[tokio::test] + async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> { + let tenant_conf = pageserver_api::models::TenantConfig { + pitr_interval: Some(Duration::from_secs(7 * 3600)), + image_layer_force_creation_period: Some(Duration::from_secs(3600)), + ..Default::default() + }; + + let tenant_id = TenantId::generate(); + + let harness = TenantHarness::create_custom( + "test_get_force_image_creation_lsn", + tenant_conf, + tenant_id, + ShardIdentity::unsharded(), + Generation::new(1), + ) + .await?; + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100)); + { + let writer = timeline.writer().await; + writer.finish_write(Lsn(5000)); + } + + let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap(); + assert_eq!(image_creation_lsn, Lsn(4300)); Ok(()) } } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 23052ccee7..ba02602cfe 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -46,10 +46,11 @@ mod historic_layer_coverage; mod layer_coverage; -use std::collections::{HashMap, VecDeque}; +use std::collections::{BTreeMap, HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; +use std::time::Instant; use anyhow::Result; use historic_layer_coverage::BufferedHistoricLayerCoverage; @@ -904,6 +905,103 @@ impl LayerMap { max_stacked_deltas } + /* BEGIN_HADRON */ + /** + * Compute the image consistent LSN, the largest LSN below which all pages have been redone successfully. + * It works by first finding the latest image layers and store them into a map. Then for each delta layer, + * find all overlapping image layers in order to potentially increase the image LSN in case there are gaps + * (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], then we can increase + * image LSN to 150 because there is no WAL record in between). + * Finally, the image consistent LSN is computed by taking the minimum of all image layers. 
+ */
+ pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn {
+ struct ImageLayerInfo {
+ // creation LSN of the image layer
+ image_lsn: Lsn,
+ // the current minimum LSN of newer delta layers with overlapping key ranges
+ min_delta_lsn: Lsn,
+ }
+ let started_at = Instant::now();
+
+ let min_l0_deltas_lsn = {
+ let l0_deltas = self.level0_deltas();
+ l0_deltas
+ .iter()
+ .map(|layer| layer.get_lsn_range().start)
+ .min()
+ .unwrap_or(disk_consistent_lsn)
+ };
+ let global_key_range = Key::MIN..Key::MAX;
+
+ // step 1: collect the most recent image layers into a map
+ // map: end key to image_layer_info
+ let mut image_map: BTreeMap<Key, ImageLayerInfo> = BTreeMap::new();
+ for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) {
+ let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0));
+ image_map.insert(
+ img_range.end,
+ ImageLayerInfo {
+ image_lsn: img_lsn,
+ min_delta_lsn: min_l0_deltas_lsn,
+ },
+ );
+ }
+
+ // step 2: go through all delta layers, and update the image layer info with overlapping
+ // key ranges
+ for layer in self.historic.iter() {
+ if !layer.is_delta {
+ continue;
+ }
+ let delta_key_range = layer.get_key_range();
+ let delta_lsn_range = layer.get_lsn_range();
+ for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) {
+ debug_assert!(img_end_key >= &delta_key_range.start);
+ if delta_lsn_range.end > img_info.image_lsn {
+ // the delta layer includes WAL records after the image
+ // it's possible that the delta layer's start LSN < image LSN, which will be simply ignored by step 3
+ img_info.min_delta_lsn =
+ std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start);
+ }
+ if img_end_key >= &delta_key_range.end {
+ // we have fully processed all overlapping image layers
+ break;
+ }
+ }
+ }
+
+ // step 3: go through all image layers and find the image consistent LSN
+ let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap();
+ let mut prev_key = Key::MIN;
+ for (img_key, img_info) in image_map {
+ tracing::debug!(
+ "Image layer {:?}:{} has min delta lsn {}",
+ Range {
+ start: prev_key,
+ end: img_key,
+ },
+ img_info.image_lsn,
+ img_info.min_delta_lsn,
+ );
+ let image_lsn = std::cmp::max(
+ img_info.image_lsn,
+ img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)),
+ );
+ img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn);
+ prev_key = img_key;
+ }
+ tracing::info!(
+ "computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layers in total.",
+ img_consistent_lsn,
+ disk_consistent_lsn,
+ started_at.elapsed().as_millis(),
+ self.historic.len()
+ );
+ img_consistent_lsn
+ }
+
+ /* END_HADRON */
+
 /// Return all L0 delta layers
 pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
 &self.l0_delta_layers
@@ -1579,6 +1677,138 @@ mod tests {
 LayerVisibilityHint::Visible
 ));
 }
+
+ /* BEGIN_HADRON */
+ #[test]
+ fn test_compute_image_consistent_lsn() {
+ let mut layer_map = LayerMap::default();
+
+ let disk_consistent_lsn = Lsn(1000);
+ // case 1: empty layer map
+ let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+ assert_eq!(
+ disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(),
+ image_consistent_lsn
+ );
+
+ // case 2: only L0 delta layer
+ {
+ let mut updates = layer_map.batch_update();
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(0)..Key::from_i128(100),
+ Lsn(900)..Lsn(990),
+ true,
+ ));
+
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(0)..Key::from_i128(100),
+ Lsn(850)..Lsn(899),
+ true,
+ ));
+ }
+
+ // should use min L0 delta LSN - 1 as image consistent LSN
+ let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+ assert_eq!(Lsn(849), image_consistent_lsn);
+
+ // case 3: 3 images, no L1 delta
+ {
+ let mut updates = layer_map.batch_update();
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(0)..Key::from_i128(40),
+ Lsn(100)..Lsn(100),
+ false,
+ ));
+
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(40)..Key::from_i128(70),
+ Lsn(200)..Lsn(200),
+ false,
+ ));
+
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(70)..Key::from_i128(100),
+ Lsn(150)..Lsn(150),
+ false,
+ ));
+ }
+ // should use min L0 delta LSN - 1 as image consistent LSN
+ let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+ assert_eq!(Lsn(849), image_consistent_lsn);
+
+ // case 4: 3 images with 1 L1 delta
+ {
+ let mut updates = layer_map.batch_update();
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(0)..Key::from_i128(50),
+ Lsn(300)..Lsn(350),
+ true,
+ ));
+ }
+ let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+ assert_eq!(Lsn(299), image_consistent_lsn);
+
+ // case 5: 3 images with 1 more L1 delta with smaller LSN
+ {
+ let mut updates = layer_map.batch_update();
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(50)..Key::from_i128(72),
+ Lsn(200)..Lsn(300),
+ true,
+ ));
+ }
+ let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+ assert_eq!(Lsn(199), image_consistent_lsn);
+
+ // case 6: 3 images with more newer L1 deltas (no impact on final results)
+ {
+ let mut updates = layer_map.batch_update();
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(0)..Key::from_i128(30),
+ Lsn(400)..Lsn(500),
+ true,
+ ));
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(35)..Key::from_i128(100),
+ Lsn(450)..Lsn(600),
+ true,
+ ));
+ }
+ let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+ assert_eq!(Lsn(199), image_consistent_lsn);
+
+ // case 7: 3 images with more older L1 deltas (no impact on final results)
+ {
+ let mut updates = layer_map.batch_update();
+ updates.insert_historic(PersistentLayerDesc::new_test(
+ Key::from_i128(0)..Key::from_i128(40),
+ Lsn(0)..Lsn(50),
+ true,
+ ));
+
+ 
updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(50)..Key::from_i128(100), + Lsn(10)..Lsn(60), + true, + )); + } + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(199), image_consistent_lsn); + + // case 8: 3 images with one more L1 delta with overlapping LSN range + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(50), + Lsn(50)..Lsn(250), + true, + )); + } + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(100), image_consistent_lsn); + } + + /* END_HADRON */ } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a9bc0a060b..718ea925b7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -351,13 +351,6 @@ pub struct Timeline { last_image_layer_creation_check_at: AtomicLsn, last_image_layer_creation_check_instant: std::sync::Mutex>, - // HADRON - /// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation - /// on this key range. - force_image_creation_lsn: AtomicLsn, - /// The last time instant when force_image_creation_lsn is computed. - force_image_creation_lsn_computed_at: std::sync::Mutex>, - /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -2854,7 +2847,7 @@ impl Timeline { } // HADRON - fn get_image_creation_timeout(&self) -> Option { + fn get_image_layer_force_creation_period(&self) -> Option { let tenant_conf = self.tenant_conf.load(); tenant_conf .tenant_conf @@ -3134,9 +3127,6 @@ impl Timeline { repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), last_image_layer_creation_check_instant: Mutex::new(None), - // HADRON - force_image_creation_lsn: AtomicLsn::new(0), - force_image_creation_lsn_computed_at: std::sync::Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_latest_cache: RwLock::new(HashMap::new()), rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)), @@ -5381,13 +5371,16 @@ impl Timeline { } // HADRON + // for child timelines, we consider all pages up to ancestor_LSN are redone successfully by the parent timeline + min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn()); if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 { info!( - "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}", + "forcing image creation for partitioned range {}-{}. 
Min image LSN: {}, force image creation LSN: {}, num deltas: {}", partition.ranges[0].start, partition.ranges[0].end, min_image_lsn, - force_image_creation_lsn.unwrap() + force_image_creation_lsn.unwrap(), + max_deltas ); return true; } @@ -7153,6 +7146,19 @@ impl Timeline { .unwrap() .clone() } + + /* BEGIN_HADRON */ + pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result { + let guard = self + .layers + .read(LayerManagerLockHolder::ComputeImageConsistentLsn) + .await; + let layer_map = guard.layer_map()?; + let disk_consistent_lsn = self.get_disk_consistent_lsn(); + + Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn)) + } + /* END_HADRON */ } impl Timeline { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 171f9d1284..aa1aa937b6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -8,7 +8,7 @@ use std::cmp::min; use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use std::time::{Duration, Instant, SystemTime}; +use std::time::{Duration, Instant}; use super::layer_manager::LayerManagerLockHolder; use super::{ @@ -34,7 +34,6 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange}; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use pageserver_compaction::helpers::{fully_contains, overlaps_with}; use pageserver_compaction::interface::*; -use postgres_ffi::to_pg_timestamp; use serde::Serialize; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; @@ -47,7 +46,6 @@ use wal_decoder::models::value::Value; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::pgdatadir_mapping::LsnForTimestamp; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -1271,10 +1269,7 @@ impl Timeline { // Define partitioning schema if needed // HADRON - let force_image_creation_lsn = self - .get_or_compute_force_image_creation_lsn(cancel, ctx) - .await - .map_err(CompactionError::Other)?; + let force_image_creation_lsn = self.get_force_image_creation_lsn(); // 1. L0 Compact let l0_outcome = { @@ -1484,59 +1479,37 @@ impl Timeline { } /* BEGIN_HADRON */ - // Get the force image creation LSN. Compute it if the last computed LSN is too old. - async fn get_or_compute_force_image_creation_lsn( - self: &Arc, - cancel: &CancellationToken, - ctx: &RequestContext, - ) -> anyhow::Result> { - const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes - let image_layer_force_creation_period = self.get_image_creation_timeout(); - if image_layer_force_creation_period.is_none() { - return Ok(None); + // Get the force image creation LSN based on gc_cutoff_lsn. + // Note that this is an estimation and the workload rate may suddenly change. When that happens, + // the force image creation may be too early or too late, but eventually it should be able to catch up. 
+ pub(crate) fn get_force_image_creation_lsn(self: &Arc) -> Option { + let image_creation_period = self.get_image_layer_force_creation_period()?; + let current_lsn = self.get_last_record_lsn(); + let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?; + let pitr_interval = self.get_pitr_interval(); + if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() { + tracing::warn!( + "pitr LSN/interval not found, skipping force image creation LSN calculation" + ); + return None; } - let image_layer_force_creation_period = image_layer_force_creation_period.unwrap(); - let force_image_creation_lsn_computed_at = - *self.force_image_creation_lsn_computed_at.lock().unwrap(); - if force_image_creation_lsn_computed_at.is_none() - || force_image_creation_lsn_computed_at.unwrap().elapsed() - > FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL - { - let now: SystemTime = SystemTime::now(); - let timestamp = now - .checked_sub(image_layer_force_creation_period) - .ok_or_else(|| { - anyhow::anyhow!( - "image creation timeout is too large: {image_layer_force_creation_period:?}" - ) - })?; - let timestamp = to_pg_timestamp(timestamp); - let force_image_creation_lsn = match self - .find_lsn_for_timestamp(timestamp, cancel, ctx) - .await? - { - LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn, - _ => { - let gc_lsn = *self.get_applied_gc_cutoff_lsn(); - tracing::info!( - "no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}", - gc_lsn - ); - gc_lsn - } - }; - self.force_image_creation_lsn - .store(force_image_creation_lsn); - *self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now()); - tracing::info!( - "computed force image creation LSN: {}", - force_image_creation_lsn - ); - Ok(Some(force_image_creation_lsn)) - } else { - Ok(Some(self.force_image_creation_lsn.load())) - } + let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0 + * image_creation_period.as_secs() + / pitr_interval.as_secs(); + let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0)); + + tracing::info!( + "Tenant shard {} computed force_image_creation_lsn: {}. Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}", + self.tenant_shard_id, + force_image_creation_lsn, + current_lsn, + image_creation_period, + pitr_lsn, + pitr_interval + ); + + Some(force_image_creation_lsn) } /* END_HADRON */ diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 2eccf48579..d8d81a6c91 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -47,6 +47,7 @@ pub(crate) enum LayerManagerLockHolder { ImportPgData, DetachAncestor, Eviction, + ComputeImageConsistentLsn, #[cfg(test)] Testing, } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e5a3a969d4..62fc212e12 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -850,6 +850,31 @@ async fn handle_tenant_describe( json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) 
} +/* BEGIN_HADRON */ +async fn handle_tenant_timeline_describe( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Scrubber)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + + json_response( + StatusCode::OK, + service + .tenant_timeline_describe(tenant_id, timeline_id) + .await?, + ) +} +/* END_HADRON */ + async fn handle_tenant_list( service: Arc, req: Request, @@ -2480,6 +2505,13 @@ pub fn make_router( ) }) // Timeline operations + .get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_describe, + RequestName("v1_tenant_timeline_describe"), + ) + }) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { tenant_service_handler( r, diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index d6fe173eb3..da0687895a 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -86,6 +86,23 @@ impl PageserverClient { ) } + /* BEGIN_HADRON */ + pub(crate) async fn tenant_timeline_describe( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Result { + measured_request!( + "tenant_timeline_describe", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner + .tenant_timeline_describe(tenant_shard_id, timeline_id,) + .await + ) + } + /* END_HADRON */ + pub(crate) async fn tenant_scan_remote_storage( &self, tenant_id: TenantId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9c1b81d261..31d149c5ac 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,7 +32,7 @@ use pageserver_api::controller_api::{ ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, TenantTimelineDescribeResponse, }; use pageserver_api::models::{ self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, @@ -5486,6 +5486,92 @@ impl Service { .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) } + /* BEGIN_HADRON */ + pub(crate) async fn tenant_timeline_describe( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + self.tenant_remote_mutation(tenant_id, |locations| async move { + if locations.0.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + }; + + let locations: Vec<(TenantShardId, Node)> = locations + .0 + .iter() + .map(|t| (*t.0, t.1.latest.node.clone())) + .collect(); + let mut futs = FuturesUnordered::new(); + + for (shard_id, node) in locations { + futs.push({ + async move { + let result = node + .with_client_retries( + |client| async move { + client + .tenant_timeline_describe(&shard_id, &timeline_id) + .await + }, + &self.http_client, + &self.config.pageserver_jwt_token, + 3, + 3, + Duration::from_secs(30), + &self.cancel, + ) + .await; + (result, shard_id, node.get_id()) + } + }); + } + + let 
mut results: Vec = Vec::new(); + while let Some((result, tenant_shard_id, node_id)) = futs.next().await { + match result { + Some(Ok(timeline_info)) => results.push(timeline_info), + Some(Err(e)) => { + tracing::warn!( + "Failed to describe tenant {} timeline {} for pageserver {}: {e}", + tenant_shard_id, + timeline_id, + node_id, + ); + return Err(ApiError::ResourceUnavailable(format!("{e}").into())); + } + None => return Err(ApiError::Cancelled), + } + } + let mut image_consistent_lsn: Option = Some(Lsn::MAX); + for timeline_info in &results { + if let Some(tline_image_consistent_lsn) = timeline_info.image_consistent_lsn { + image_consistent_lsn = Some(std::cmp::min( + image_consistent_lsn.unwrap(), + tline_image_consistent_lsn, + )); + } else { + tracing::warn!( + "Timeline {} on shard {} does not have image consistent lsn", + timeline_info.timeline_id, + timeline_info.tenant_id + ); + image_consistent_lsn = None; + break; + } + } + + Ok(TenantTimelineDescribeResponse { + shards: results, + image_consistent_lsn, + }) + }) + .await? + } + /* END_HADRON */ + /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not /// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 42924f9b83..a7b7f0e74d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2342,6 +2342,20 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return response.json() + # HADRON + def tenant_timeline_describe( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ): + response = self.request( + "GET", + f"{self.api}/control/v1/tenant/{tenant_id}/timeline/{timeline_id}", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + def nodes(self): """ :return: list of {"id": ""} diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index e67161c6b7..ab02314288 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -960,9 +960,9 @@ def get_layer_map(env, tenant_shard_id, timeline_id, ps_id): return image_layer_count, delta_layer_count -def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder): +def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder): """ - Tests that page server can force creating new images if image creation timeout is enabled + Tests that page server can force creating new images if image_layer_force_creation_period is enabled """ # use large knobs to disable L0 compaction/image creation except for the force image creation tenant_conf = { @@ -972,10 +972,10 @@ def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder): "checkpoint_distance": 10 * 1024, "checkpoint_timeout": "1s", "image_layer_force_creation_period": "1s", - # The lsn for forced image layer creations is calculated once every 10 minutes. - # Hence, drive compaction manually such that the test doesn't compute it at the - # wrong time. 
- "compaction_period": "0s", + "pitr_interval": "10s", + "gc_period": "1s", + "compaction_period": "1s", + "lsn_lease_length": "1s", } # consider every tenant large to run the image layer generation check more eagerly @@ -1018,4 +1018,69 @@ def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder): ) +def test_image_consistent_lsn(neon_env_builder: NeonEnvBuilder): + """ + Test the /v1/tenant//timeline/ endpoint and the computation of image_consistent_lsn + """ + # use large knobs to disable L0 compaction/image creation except for the force image creation + tenant_conf = { + "compaction_threshold": "100", + "image_creation_threshold": "100", + "image_layer_creation_check_threshold": "1", + "checkpoint_distance": 10 * 1024, + "checkpoint_timeout": "1s", + "image_layer_force_creation_period": "1s", + "pitr_interval": "10s", + "gc_period": "1s", + "compaction_period": "1s", + "lsn_lease_length": "1s", + } + + neon_env_builder.num_pageservers = 2 + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, + initial_tenant_shard_count=4, + initial_tenant_shard_stripe_size=1, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)") + for v in range(10): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False + ) + + response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id) + shards = response["shards"] + for shard in shards: + assert shard["image_consistent_lsn"] is not None + image_consistent_lsn = response["image_consistent_lsn"] + assert image_consistent_lsn is not None + + # do more writes and wait for image_consistent_lsn to advance + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False + ) + + def check_image_consistent_lsn_advanced(): + response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id) + new_image_consistent_lsn = response["image_consistent_lsn"] + shards = response["shards"] + for shard in shards: + print(f"shard {shard['tenant_id']} image_consistent_lsn{shard['image_consistent_lsn']}") + assert new_image_consistent_lsn != image_consistent_lsn + + wait_until(check_image_consistent_lsn_advanced) + + endpoint.stop_and_destroy() + + for ps in env.pageservers: + ps.allowed_errors.append(".*created delta file of size.*larger than double of target.*") + + # END_HADRON From 154f6dc59cc91ebde58d1a0b4a8b43aa68d1c3a5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 11 Jul 2025 14:25:25 +0100 Subject: [PATCH 28/56] pageserver: log only on final shard resolution failure (#12565) This log is too noisy. Instead of warning on every retry, let's log only on the final failure. --- pageserver/src/tenant/timeline/handle.rs | 10 +++++----- test_runner/fixtures/pageserver/allowed_errors.py | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 33c97287c0..7bca66190f 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -359,14 +359,14 @@ impl Cache { Err(e) => { // Retry on tenant manager error to handle tenant split more gracefully if attempt < GET_MAX_RETRIES { - tracing::warn!( - "Fail to resolve tenant shard in attempt {}: {:?}. 
Retrying...", - attempt, - e - ); tokio::time::sleep(RETRY_BACKOFF).await; continue; } else { + tracing::warn!( + "Failed to resolve tenant shard after {} attempts: {:?}", + GET_MAX_RETRIES, + e + ); return Err(e); } } diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 0e4dd571c0..59249f31ad 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -115,8 +115,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Local data loss suspected.*", # Too many frozen layers error is normal during intensive benchmarks ".*too many frozen layers.*", - # Transient errors when resolving tenant shards by page service - ".*Fail to resolve tenant shard in attempt.*", + ".*Failed to resolve tenant shard after.*", # Expected warnings when pageserver has not refreshed GC info yet ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*", ".*No broker updates received for a while.*", From a8db7ebffb7e9b2a1bc9cc950a03a244d26d34d4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 11 Jul 2025 17:17:44 +0300 Subject: [PATCH 29/56] Minor refactor of the SQL functions to get working set size estimate (#12550) Split the functions into two: one internal function to calculate the estimate, and another (two functions) to expose it as SQL functions. This is in preparation of adding new communicator implementation. With that, the SQL functions will dispatch the call to the old or new implementation depending on which is being used. --- pgxn/neon/file_cache.c | 47 +++++++++++++++--------------------------- pgxn/neon/file_cache.h | 3 ++- pgxn/neon/neon.c | 30 +++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 8cfa09bc87..0e316abd1d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -205,6 +205,8 @@ bool AmPrewarmWorker; #define LFC_ENABLED() (lfc_ctl->limit != 0) +PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); + /* * Close LFC file if opened. * All backends should close their LFC files once LFC is disabled. @@ -2135,40 +2137,25 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } -PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); -Datum -approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +/* + * Internal implementation of the approximate_working_set_size_seconds() + * function. + */ +int32 +lfc_approximate_working_set_size_seconds(time_t duration, bool reset) { - if (lfc_size_limit != 0) - { - int32 dc; - time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); - LWLockAcquire(lfc_lock, LW_SHARED); - dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); - LWLockRelease(lfc_lock); - PG_RETURN_INT32(dc); - } - PG_RETURN_NULL(); -} + int32 dc; -PG_FUNCTION_INFO_V1(approximate_working_set_size); + if (lfc_size_limit == 0) + return -1; -Datum -approximate_working_set_size(PG_FUNCTION_ARGS) -{ - if (lfc_size_limit != 0) - { - int32 dc; - bool reset = PG_GETARG_BOOL(0); - LWLockAcquire(lfc_lock, reset ? 
LW_EXCLUSIVE : LW_SHARED);
- dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
- if (reset)
- memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
- LWLockRelease(lfc_lock);
- PG_RETURN_INT32(dc);
- }
- PG_RETURN_NULL();
+ LWLockAcquire(lfc_lock, LW_SHARED);
+ dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
+ if (reset)
+ memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
+ LWLockRelease(lfc_lock);
+ return dc;
 }
 
 PG_FUNCTION_INFO_V1(get_local_cache_state);
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
index d5ac55d5ba..14e5d4f753 100644
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -47,7 +47,8 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk
 extern FileCacheState* lfc_get_state(size_t max_entries);
 extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
 
-PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
+extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
+
 
 static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 9e0ca16fed..7b749f1080 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -561,6 +561,8 @@ _PG_init(void)
 PG_FUNCTION_INFO_V1(pg_cluster_size);
 PG_FUNCTION_INFO_V1(backpressure_lsns);
 PG_FUNCTION_INFO_V1(backpressure_throttling_time);
+PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
+PG_FUNCTION_INFO_V1(approximate_working_set_size);
 
 Datum
 pg_cluster_size(PG_FUNCTION_ARGS)
@@ -607,6 +609,34 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
 
+Datum
+approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+{
+ time_t duration;
+ int32 dc;
+
+ duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
+
+ dc = lfc_approximate_working_set_size_seconds(duration, false);
+ if (dc < 0)
+ PG_RETURN_NULL();
+ else
+ PG_RETURN_INT32(dc);
+}
+
+Datum
+approximate_working_set_size(PG_FUNCTION_ARGS)
+{
+ bool reset = PG_GETARG_BOOL(0);
+ int32 dc;
+
+ dc = lfc_approximate_working_set_size_seconds(-1, reset);
+ if (dc < 0)
+ PG_RETURN_NULL();
+ else
+ PG_RETURN_INT32(dc);
+}
+
 #if PG_MAJORVERSION_NUM >= 16
 static void
 neon_shmem_startup_hook(void)
From f4245403b36925c3ad0ef39c344ca30b1701b74f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Fri, 11 Jul 2025 16:13:36 +0100
Subject: [PATCH 30/56] [proxy] allow testing query cancellation locally
 (#12568)

## Problem

Cancellation requires Redis, and Redis required the control plane.

## Summary of changes

Make Redis for cancellation not require the control plane. Add
instructions for setting up Redis locally.
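For reference, the decoupled startup order is roughly the following. This is a minimal, self-contained sketch with illustrative names only, not the actual proxy API; the real wiring is in `proxy/src/binary/proxy.rs` in this diff:

```rust
// Stand-ins for the real types; only the control flow is the point here.
#[derive(Clone)]
struct RedisClient;

fn spawn_maintenance(task: &str) {
    // stand-in for spawning a tokio maintenance task
    println!("spawned: {task}");
}

fn spawn_redis_tasks(redis: Option<RedisClient>, has_control_plane: bool) {
    // Query cancellation only needs Redis, so it no longer requires a
    // control-plane auth backend to be configured.
    let Some(_client) = redis else { return };
    spawn_maintenance("redis connection + cancellation notifications");

    if has_control_plane {
        // Project-info cache invalidation still needs the control plane.
        spawn_maintenance("project info cache invalidation");
    }
}

fn main() {
    // Local testing: Redis configured, auth backend is plain postgres.
    spawn_redis_tasks(Some(RedisClient), false);
}
```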
--- proxy/README.md | 10 +++++++++- proxy/src/binary/proxy.rs | 20 +++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index e10ff3d710..ff48f9f323 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -123,6 +123,11 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` +If you want to test query cancellation, redis is also required: +```sh +docker run --detach --name proxy-redis --publish 6379:6379 redis:7.0 +``` + Let's create self-signed certificate by running: ```sh openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build" @@ -130,7 +135,10 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key Then we need to build proxy with 'testing' feature and run, e.g.: ```sh -RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- \ + --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' \ + --redis-auth-type="plain" --redis-plain="redis://127.0.0.1:6379" \ + -c server.crt -k server.key ``` Now from client you can start a new session: diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 691709ce2a..16a7dc7b67 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -522,15 +522,7 @@ pub async fn run() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); } - if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend - && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api - && let Some(client) = redis_client - { - // project info cache and invalidation of that cache. - let cache = api.caches.project_info.clone(); - maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone())); - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - + if let Some(client) = redis_client { // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. // This prevents immediate exit and pod restart, // which can cause hammering of the redis in case of connection issues. @@ -560,6 +552,16 @@ pub async fn run() -> anyhow::Result<()> { } } } + + #[allow(irrefutable_let_patterns)] + if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend + && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api + { + // project info cache and invalidation of that cache. + let cache = api.caches.project_info.clone(); + maintenance_tasks.spawn(notifications::task_main(client, cache.clone())); + maintenance_tasks.spawn(async move { cache.gc_worker().await }); + } } let maintenance = loop { From a0a7733b5aa657553a5b91bb0a3d4f6e3847e38b Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 11 Jul 2025 10:57:50 -0500 Subject: [PATCH 31/56] Use relative paths in submodule URL references (#12559) This is a nifty trick from the hadron repo that seems to help with SSH key dance. 
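For example, with the superproject cloned from https://github.com/neondatabase/neon.git, git resolves `../postgres.git` against the superproject's remote, yielding https://github.com/neondatabase/postgres.git; with an SSH clone it resolves to git@github.com:neondatabase/postgres.git instead, so submodule fetches reuse whatever transport and credentials the parent clone used.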
Signed-off-by: Tristan Partin
---
 .gitmodules | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index d1330bf28c..e381fb079e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,16 +1,16 @@
 [submodule "vendor/postgres-v14"]
 path = vendor/postgres-v14
- url = https://github.com/neondatabase/postgres.git
+ url = ../postgres.git
 branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 path = vendor/postgres-v15
- url = https://github.com/neondatabase/postgres.git
+ url = ../postgres.git
 branch = REL_15_STABLE_neon
 [submodule "vendor/postgres-v16"]
 path = vendor/postgres-v16
- url = https://github.com/neondatabase/postgres.git
+ url = ../postgres.git
 branch = REL_16_STABLE_neon
 [submodule "vendor/postgres-v17"]
 path = vendor/postgres-v17
- url = https://github.com/neondatabase/postgres.git
+ url = ../postgres.git
 branch = REL_17_STABLE_neon
From 3300207523008ab3dd922780c4d164bd4376a007 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Fri, 11 Jul 2025 19:05:22 +0300
Subject: [PATCH 32/56] Update working set size estimate without lock (#12570)

Update the WSS estimate before acquiring the lock, so that we don't need
to hold the lock for so long. That seems safe to me, see added comment.

I was planning to do this with the new rust-based communicator
implementation anyway, but it might help a little with the current C
implementation too. And more importantly, having this as a separate PR
gives us a chance to review this aspect independently.
---
 pgxn/neon/file_cache.c | 77 +++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 28 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 0e316abd1d..2c87f139af 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -162,8 +162,34 @@ typedef struct FileCacheControl
 dlist_head lru; /* double linked list for LRU replacement
 * algorithm */
 dlist_head holes; /* double linked list of punched holes */
- HyperLogLogState wss_estimation; /* estimation of working set size */
+
 ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
+
+ /*
+ * Estimation of working set size.
+ *
+ * This is not guarded by the lock. No locking is needed because all the
+ * writes to the "registers" are simple 64-bit stores, to update a
+ * timestamp. We assume that:
+ *
+ * - 64-bit stores are atomic. We could enforce that by using
+ * pg_atomic_uint64 instead of TimestampTz as the datatype in hll.h, but
+ * for now we just rely on it implicitly.
+ *
+ * - Even if they're not, and there is a race between two stores, it
+ * doesn't matter much which one wins because they're both updating the
+ * register with the current timestamp. Or you have a race between
+ * resetting the register and updating it, in which case it also doesn't
+ * matter much which one wins.
+ *
+ * - If they're not atomic, you might get an occasional "torn write" if
+ * you're really unlucky, but we tolerate that too. It just means that
+ * the estimate will be a little off, until the register is updated
+ * again. 
+ */ + HyperLogLogState wss_estimation; + + /* Prewarmer state */ PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS]; size_t n_prewarm_workers; size_t n_prewarm_entries; @@ -1144,6 +1170,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + /* Update working set size estimate for the blocks */ + for (int i = 0; i < nblocks; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header @@ -1222,14 +1255,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - - /* Approximate working set for the blocks assumed in this entry */ - for (int i = 0; i < blocks_in_chunk; i++) - { - tag.blockNum = blkno + i; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - } - if (entry == NULL) { /* Pages are not cached */ @@ -1506,9 +1531,15 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, return false; CopyNRelFileInfoToBufTag(tag, rinfo); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); tag.forkNum = forknum; - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + /* Update working set size estimate for the blocks */ + if (lfc_prewarm_update_ws_estimation) + { + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); @@ -1526,19 +1557,13 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, if (lwlsn > lsn) { - elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", + elog(DEBUG1, "Skip LFC write for %u because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn)); LWLockRelease(lfc_lock); return false; } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - - if (lfc_prewarm_update_ws_estimation) - { - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - } if (found) { state = GET_STATE(entry, chunk_offs); @@ -1651,9 +1676,15 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return; CopyNRelFileInfoToBufTag(tag, rinfo); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); tag.forkNum = forkNum; - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + /* Update working set size estimate for the blocks */ + for (int i = 0; i < nblocks; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -1694,14 +1725,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, cv = &lfc_ctl->cv[hash % N_COND_VARS]; entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - - /* Approximate working set for the blocks assumed in this entry */ - for (int i = 0; i < blocks_in_chunk; i++) - { - tag.blockNum = blkno + i; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - } - if (found) { /* @@ -2150,11 +2173,9 @@ lfc_approximate_working_set_size_seconds(time_t duration, bool reset) if (lfc_size_limit == 0) return -1; - 
LWLockAcquire(lfc_lock, LW_SHARED); dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); if (reset) memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); - LWLockRelease(lfc_lock); return dc; } From 379259bdd75edae91fad0d180fa513bff3e1f92b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 11 Jul 2025 19:07:14 +0200 Subject: [PATCH 33/56] storcon: don't error log on timeline delete if tenant migration is in progress (#12523) Fixes [LKB-61](https://databricks.atlassian.net/browse/LKB-61): `test_timeline_archival_chaos` being flaky with storcon error `Requested tenant is missing`. When a tenant migration is ongoing, and the attach request has been sent to the new location, but the attach hasn't finished yet, it is possible for the pageserver to return a 412 precondition failed HTTP error on timeline deletion, because it is being sent to the new location already. That one we would previously log via sth like: ``` ERROR request{method=DELETE path=/v1/tenant/1f544a11c90d1afd7af9b26e48985a4e/timeline/32818fb3ebf07cb7f06805429d7dee38 request_id=c493c04b-7f33-46d2-8a65-aac8a5516055}: Error processing HTTP request: InternalServerError(Error deleting timeline 32 818fb3ebf07cb7f06805429d7dee38 on 1f544a11c90d1afd7af9b26e48985a4e on node 2 (localhost): pageserver API: Precondition failed: Requested tenant is missing ``` This patch changes that and makes us return a more reasonable resource unavailable error. Not sure how scalable this is with tenants with a large number of shards, but that's a different discussion (we'd probably need a limited amount of per-storcon retries). example [link](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-12398/15981821532/index.html#/testresult/e7785dfb1238d92f). --- storage_controller/src/service.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 31d149c5ac..0907907edc 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5206,6 +5206,9 @@ impl Service { match res { Ok(ok) => Ok(ok), Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) if msg.contains("Requested tenant is missing") => { + Err(ApiError::ResourceUnavailable("Tenant migration in progress".into())) + }, Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())), Err(e) => { Err( From 63ca084696f4dd226bfea1abae66dcb3234d1051 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 11 Jul 2025 14:37:55 -0400 Subject: [PATCH 34/56] fix(pageserver): downgrade wal apply error during gc-compaction (#12518) ## Problem close LKB-162 close https://github.com/neondatabase/cloud/issues/30665, related to https://github.com/neondatabase/cloud/issues/29434 We see a lot of errors like: ``` 2025-05-22T23:06:14.928959Z ERROR compaction_loop{tenant_id=? shard_id=0304}:run:gc_compact_timeline{timeline_id=?}: error applying 4 WAL records 35/DC0DF0B8..3B/E43188C0 (8119 bytes) to key 000000067F0000400500006027000000B9D0, from base image with LSN 0/0 to reconstruct page image at LSN 61/150B9B20 n_attempts=0: apply_wal_records Caused by: 0: read walredo stdout 1: early eof ``` which is an acceptable form of error and we should downgrade it to warning. 
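For illustration, the intended mapping from redo attempt type to log severity is roughly the following. This is a minimal self-contained sketch; the actual change in `pageserver/src/walredo.rs` (see the diff below) dispatches to `tracing`'s `error!`/`warn!` macros:

```rust
// Sketch of the severity mapping only; not the real walredo code.
#[derive(Debug, Clone, Copy)]
enum RedoAttemptType {
    ReadPage,
    LegacyCompaction,
    GcCompaction,
}

fn redo_failure_level(attempt: RedoAttemptType) -> &'static str {
    match attempt {
        // A failed redo on a user-visible read or legacy compaction is a
        // genuine error.
        RedoAttemptType::ReadPage | RedoAttemptType::LegacyCompaction => "ERROR",
        // gc-compaction can legitimately see an incomplete key history below
        // the gc horizon, so a failed redo there is expected; only warn.
        RedoAttemptType::GcCompaction => "WARN",
    }
}

fn main() {
    assert_eq!(redo_failure_level(RedoAttemptType::GcCompaction), "WARN");
    assert_eq!(redo_failure_level(RedoAttemptType::ReadPage), "ERROR");
}
```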
## Summary of changes

A walredo error during gc-compaction is expected when the data below the GC horizon does not contain a full key history. This can happen in some rare cases where GC is only able to remove data in the middle of the history, but not all of the earlier history, when a full keyspace gets deleted.

Signed-off-by: Alex Chi Z
---
 pageserver/src/walredo.rs | 46 +++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index c6d3cafe9a..f053c9ed37 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -147,6 +147,16 @@ pub enum RedoAttemptType {
     GcCompaction,
 }

+impl std::fmt::Display for RedoAttemptType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RedoAttemptType::ReadPage => write!(f, "read page"),
+            RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"),
+            RedoAttemptType::GcCompaction => write!(f, "gc compaction"),
+        }
+    }
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -199,6 +209,7 @@ impl PostgresRedoManager {
                     self.conf.wal_redo_timeout,
                     pg_version,
                     max_retry_attempts,
+                    redo_attempt_type,
                 )
                 .await
         };
@@ -221,6 +232,7 @@ impl PostgresRedoManager {
                 self.conf.wal_redo_timeout,
                 pg_version,
                 max_retry_attempts,
+                redo_attempt_type,
             )
             .await
         }
@@ -445,6 +457,7 @@ impl PostgresRedoManager {
         wal_redo_timeout: Duration,
         pg_version: PgMajorVersion,
         max_retry_attempts: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, Error> {
         *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());

@@ -485,17 +498,28 @@ impl PostgresRedoManager {
             );

             if let Err(e) = result.as_ref() {
-                error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                    records.len(),
-                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                    nbytes,
-                    base_img_lsn,
-                    lsn,
-                    n_attempts,
-                    e,
-                );
+                macro_rules! message {
+                    ($level:tt) => {
+                        $level!(
+                            "error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                            records.len(),
+                            records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                            records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                            nbytes,
+                            key,
+                            redo_attempt_type,
+                            base_img_lsn,
+                            lsn,
+                            n_attempts,
+                            e,
+                        )
+                    }
+                }
+                match redo_attempt_type {
+                    RedoAttemptType::ReadPage => message!(error),
+                    RedoAttemptType::LegacyCompaction => message!(error),
+                    RedoAttemptType::GcCompaction => message!(warn),
+                }
             }

             result.map_err(Error::Other)

From 4566b12a22876f1110b77da9e7b75615c9963b38 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Fri, 11 Jul 2025 20:56:39 +0200
Subject: [PATCH 35/56] NEON: Finish Zenith->Neon rename (#12566)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Even though we're now part of Databricks, let's at least make this part
consistent.
## Summary of changes - PG14: https://github.com/neondatabase/postgres/pull/669 - PG15: https://github.com/neondatabase/postgres/pull/670 - PG16: https://github.com/neondatabase/postgres/pull/671 - PG17: https://github.com/neondatabase/postgres/pull/672 --------- Co-authored-by: Arpad Müller --- compute_tools/src/compute.rs | 23 +++++++++++++ control_plane/src/endpoint.rs | 3 +- docs/core_changes.md | 7 ++-- pageserver/src/basebackup.rs | 33 +++++++++++-------- pageserver/src/import_datadir.rs | 14 ++++---- pgxn/neon_test_utils/neontest.c | 10 +++--- pgxn/typedefs.list | 22 ++++++------- test_runner/fixtures/neon_fixtures.py | 1 + .../regress/test_timeline_detach_ancestor.py | 8 ++--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++--- 14 files changed, 84 insertions(+), 53 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c05cc229a2..2e0b7d7b2e 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1040,6 +1040,8 @@ impl ComputeNode { PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?, }; + self.fix_zenith_signal_neon_signal()?; + let mut state = self.state.lock().unwrap(); state.metrics.pageserver_connect_micros = connected.duration_since(started).as_micros() as u64; @@ -1049,6 +1051,27 @@ impl ComputeNode { Ok(()) } + /// Move the Zenith signal file to Neon signal file location. + /// This makes Compute compatible with older PageServers that don't yet + /// know about the Zenith->Neon rename. + fn fix_zenith_signal_neon_signal(&self) -> Result<()> { + let datadir = Path::new(&self.params.pgdata); + + let neonsig = datadir.join("neon.signal"); + + if neonsig.is_file() { + return Ok(()); + } + + let zenithsig = datadir.join("zenith.signal"); + + if zenithsig.is_file() { + fs::copy(zenithsig, neonsig)?; + } + + Ok(()) + } + /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when /// the connection was established, and the (compressed) size of the basebackup. fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index ad2067e0f2..91a62b0ca4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -32,7 +32,8 @@ //! config.json - passed to `compute_ctl` //! pgdata/ //! postgresql.conf - copy of postgresql.conf created by `compute_ctl` -//! zenith.signal +//! neon.signal +//! zenith.signal - copy of neon.signal, for backward compatibility //! //! ``` //! diff --git a/docs/core_changes.md b/docs/core_changes.md index 1388317728..abfd20af26 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco changes in xlog.c, to allow starting the compute node without reading the last checkpoint record from WAL. -This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start -at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last -checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo. +This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup +code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN +instead of the last checkpoint's LSN. 
The system is known to be consistent at that LSN, without +any WAL redo. ### How to get rid of the patch diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 36dada1e89..1a44c80e2d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -114,7 +114,7 @@ where // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the - // "zenith.signal" file, so that postgres can read it during startup. + // "neon.signal" file, so that postgres can read it during startup. // // We don't keep full history of record boundaries in the page server, // however, only the predecessor of the latest record on each @@ -751,34 +751,39 @@ where // // Add generated pg_control file and bootstrap WAL segment. - // Also send zenith.signal file with extra bootstrap data. + // Also send neon.signal and zenith.signal file with extra bootstrap data. // async fn add_pgcontrol_file( &mut self, pg_control_bytes: Bytes, system_identifier: u64, ) -> Result<(), BasebackupError> { - // add zenith.signal file - let mut zenith_signal = String::new(); + // add neon.signal file + let mut neon_signal = String::new(); if self.prev_record_lsn == Lsn(0) { if self.timeline.is_ancestor_lsn(self.lsn) { - write!(zenith_signal, "PREV LSN: none") + write!(neon_signal, "PREV LSN: none") .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid") + write!(neon_signal, "PREV LSN: invalid") .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn) .map_err(|e| BasebackupError::Server(e.into()))?; } - self.ar - .append( - &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, - zenith_signal.as_bytes(), - ) - .await - .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; + + // TODO: Remove zenith.signal once all historical computes have been replaced + // ... and thus support the neon.signal file. + for signalfilename in ["neon.signal", "zenith.signal"] { + self.ar + .append( + &new_tar_header(signalfilename, neon_signal.len() as u64)?, + neon_signal.as_bytes(), + ) + .await + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?; + } //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 96fe0c1078..409cc2e3c5 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -610,13 +610,13 @@ async fn import_file( debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { debug!("found wal file in base section. ignore it"); - } else if file_path.starts_with("zenith.signal") { + } else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") { // Parse zenith signal file to set correct previous LSN let bytes = read_all_bytes(reader).await?; - // zenith.signal format is "PREV LSN: prev_lsn" + // neon.signal format is "PREV LSN: prev_lsn" // TODO write serialization and deserialization in the same place. 
-        let zenith_signal = std::str::from_utf8(&bytes)?.trim();
-        let prev_lsn = match zenith_signal {
+        let neon_signal = std::str::from_utf8(&bytes)?.trim();
+        let prev_lsn = match neon_signal {
             "PREV LSN: none" => Lsn(0),
             "PREV LSN: invalid" => Lsn(0),
             other => {
@@ -624,17 +624,17 @@ async fn import_file(
                 split[1]
                     .trim()
                     .parse::<Lsn>()
-                    .context("can't parse zenith.signal")?
+                    .context("can't parse neon.signal")?
             }
         };

-        // zenith.signal is not necessarily the last file, that we handle
+        // neon.signal is not necessarily the last file, that we handle
         // but it is ok to call `finish_write()`, because final `modification.commit()`
         // will update lsn once more to the final one.
         let writer = modification.tline.writer().await;
         writer.finish_write(prev_lsn);

-        debug!("imported zenith signal {}", prev_lsn);
+        debug!("imported neon signal {}", prev_lsn);
     } else if file_path.starts_with("pg_tblspc") {
         // TODO Backups exported from neon won't have pg_tblspc, but we will need
         // this to import arbitrary postgres databases.
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index d37412f674..5f880dfd23 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -236,13 +236,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 	bool		save_neon_test_evict;

 	/*
-	 * Temporarily set the zenith_test_evict GUC, so that when we pin and
+	 * Temporarily set the neon_test_evict GUC, so that when we pin and
 	 * unpin a buffer, the buffer is evicted. We use that hack to evict all
 	 * buffers, as there is no explicit "evict this buffer" function in the
 	 * buffer manager.
 	 */
-	save_neon_test_evict = zenith_test_evict;
-	zenith_test_evict = true;
+	save_neon_test_evict = neon_test_evict;
+	neon_test_evict = true;
 	PG_TRY();
 	{
 		/* Scan through all the buffers */
@@ -273,7 +273,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)

 			/*
 			 * Pin the buffer, and release it again. Because we have
-			 * zenith_test_evict==true, this will evict the page from the
+			 * neon_test_evict==true, this will evict the page from the
 			 * buffer cache if no one else is holding a pin on it.
*/ if (isvalid) @@ -286,7 +286,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) PG_FINALLY(); { /* restore the GUC */ - zenith_test_evict = save_neon_test_evict; + neon_test_evict = save_neon_test_evict; } PG_END_TRY(); diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list index 760f384212..3ea8b3b091 100644 --- a/pgxn/typedefs.list +++ b/pgxn/typedefs.list @@ -2953,17 +2953,17 @@ XmlTableBuilderData YYLTYPE YYSTYPE YY_BUFFER_STATE -ZenithErrorResponse -ZenithExistsRequest -ZenithExistsResponse -ZenithGetPageRequest -ZenithGetPageResponse -ZenithMessage -ZenithMessageTag -ZenithNblocksRequest -ZenithNblocksResponse -ZenithRequest -ZenithResponse +NeonErrorResponse +NeonExistsRequest +NeonExistsResponse +NeonGetPageRequest +NeonGetPageResponse +NeonMessage +NeonMessageTag +NeonNblocksRequest +NeonNblocksResponse +NeonRequest +NeonResponse _SPI_connection _SPI_plan __AssignProcessToJobObject diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a7b7f0e74d..b9fff05c6c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -5409,6 +5409,7 @@ SKIP_FILES = frozenset( ( "pg_internal.init", "pg.log", + "neon.signal", "zenith.signal", "pg_hba.conf", "postgresql.conf", diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index c0f163db32..45b7af719e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -209,9 +209,9 @@ def test_ancestor_detach_branched_from( client.timeline_delete(env.initial_tenant, env.initial_timeline) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) - # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different - # as there is always "PREV_LSN: invalid" for "before" - skip_files = {"zenith.signal"} + # because we do the fullbackup from ancestor at the branch_lsn, the neon.signal and/or zenith.signal is always + # different as there is always "PREV_LSN: invalid" for "before" + skip_files = {"zenith.signal", "neon.signal"} assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files) @@ -767,7 +767,7 @@ def test_compaction_induced_by_detaches_in_history( env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after ) - # we don't need to skip any files, because zenith.signal will be identical + # we don't need to skip any files, because neon.signal will be identical assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 9085654ee8..8ce1f52303 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 9085654ee8022d5cc4ca719380a1dc53e5e3246f +Subproject commit 8ce1f52303aec29e098309347b57c01a1962e221 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8c3249f36c..afd46987f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9 +Subproject commit afd46987f3da50c9146a8aa59380052df0862c06 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 7a4c0eacae..e08c8d5f15 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1 +Subproject commit e08c8d5f1576ca0487d14d154510499c5f12adfb diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index db424d42d7..353c725b0c 160000 --- 
a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f
+Subproject commit 353c725b0c76cc82b15af21d8360d03391dc6814
diff --git a/vendor/revisions.json b/vendor/revisions.json
index b260698c86..992aa405b1 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
     "v17": [
         "17.5",
-        "db424d42d748f8ad91ac00e28db2c7f2efa42f7f"
+        "353c725b0c76cc82b15af21d8360d03391dc6814"
     ],
     "v16": [
         "16.9",
-        "7a4c0eacaeb9b97416542fa19103061c166460b1"
+        "e08c8d5f1576ca0487d14d154510499c5f12adfb"
     ],
     "v15": [
         "15.13",
-        "8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9"
+        "afd46987f3da50c9146a8aa59380052df0862c06"
     ],
     "v14": [
         "14.18",
-        "9085654ee8022d5cc4ca719380a1dc53e5e3246f"
+        "8ce1f52303aec29e098309347b57c01a1962e221"
     ]
 }

From cb991fba421999e390c9debfc39fb39a636fe1e9 Mon Sep 17 00:00:00 2001
From: HaoyuHuang
Date: Fri, 11 Jul 2025 12:27:55 -0700
Subject: [PATCH 36/56] A few more PS changes (#12552)

# TLDR
Problem I is a bug fix. The rest are no-ops.

## Problem I
The page server decides when to check for image layer creation based on elapsed time, but this check depends on the current logical size, which is only computed on shard 0. Thus, for non-0 shards, the check is ineffective and image creation is never done for idle tenants.

## Summary of changes I
This PR fixes the problem by simply removing the dependency on the current logical size (see the sketch below).

## Summary of changes II
This PR adds a timeout when calling the page server to split a shard, to make sure SC does not wait for the API call forever. Currently the PR doesn't add any retry logic, because it's not clear whether a page server shard split can be safely retried if the existing operation is still ongoing or has left the storage in a bad state. Thus it's better to abort the whole operation and restart.

## Problem III
`test_remote_failures` requires the PS to be compiled in testing mode. PS builds in dev/staging are compiled without this mode.

## Summary of changes III
Remove the restriction and also increase the number of total failures allowed.

## Summary of changes IV
Remove the test on the PS getpage HTTP route.
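For reference, a minimal sketch of the decision rule that change I moves to (names here are illustrative, not the actual pageserver API; the real logic lives in `should_check_if_image_layers_required` in the diff below). The point is that the time-based arm no longer requires an exact logical size, which only shard 0 computes:

```rust
use std::time::Duration;

/// Illustrative-only: consider creating image layers when either enough WAL
/// has been ingested since the last check, or enough wall-clock time has
/// elapsed. The time-based arm works without a logical size, so it also
/// fires on non-zero shards.
fn should_check_image_layers(
    distance: u64,
    min_distance: u64,
    elapsed: Duration,
    check_required_after: Duration,
) -> bool {
    distance >= min_distance || elapsed >= check_required_after
}

fn main() {
    // An idle non-zero shard: no new WAL since the last check, but a day has
    // passed, so the time-based arm fires even without a logical size.
    assert!(should_check_image_layers(
        0,
        1024,
        Duration::from_secs(24 * 3600),
        Duration::from_secs(3600),
    ));
}
```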
---------

Co-authored-by: Chen Luo
Co-authored-by: Yecheng Yang
Co-authored-by: Vlad Lazar
---
 control_plane/src/local_env.rs               |  4 +
 control_plane/src/storage_controller.rs      |  7 ++
 libs/remote_storage/src/simulate_failures.rs |  1 +
 libs/utils/src/env.rs                        |  3 +-
 pageserver/src/bin/pageserver.rs             |  5 --
 pageserver/src/http/routes.rs                |  2 +-
 pageserver/src/tenant/mgr.rs                 |  2 +
 pageserver/src/tenant/timeline.rs            | 48 ++++++-----
 storage_controller/src/main.rs               |  7 ++
 storage_controller/src/service.rs            | 27 ++++++-
 test_runner/regress/test_compaction.py       | 62 ++++++++++++++
 test_runner/regress/test_sharding.py         | 85 ++++++++++++++++++++
 12 files changed, 226 insertions(+), 27 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index d0611113e8..d34dd39f61 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
     pub posthog_config: Option<PostHogConfig>,

     pub kick_secondary_downloads: Option<bool>,
+
+    #[serde(with = "humantime_serde")]
+    pub shard_split_request_timeout: Option<Duration>,
 }

 impl NeonStorageControllerConf {
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
             timeline_safekeeper_count: None,
             posthog_config: None,
             kick_secondary_downloads: None,
+            shard_split_request_timeout: None,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index dc6c82f504..f996f39967 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -648,6 +648,13 @@ impl StorageController {
             args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
         }

+        if let Some(duration) = self.config.shard_split_request_timeout {
+            args.push(format!(
+                "--shard-split-request-timeout={}",
+                humantime::Duration::from(duration)
+            ));
+        }
+
         let mut envs = vec![
             ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
             ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 30d116f57c..e895380192 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -31,6 +31,7 @@ pub struct UnreliableWrapper {
     /* BEGIN_HADRON */
     // This the probability of failure for each operation, ranged from [0, 100].
     // The probability is default to 100, which means that all operations will fail.
+    // Storage will fail by probability up to attempts_to_fail times.
     attempt_failure_probability: u64,
     /* END_HADRON */
 }
diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs
index cc1cbf8009..0b3b5e6c4f 100644
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -47,6 +47,7 @@ where

 /* BEGIN_HADRON */
 pub enum DeploymentMode {
+    Local,
     Dev,
     Staging,
     Prod,
@@ -64,7 +65,7 @@ pub fn get_deployment_mode() -> Option<DeploymentMode> {
             }
         },
         Err(_) => {
-            tracing::error!("DEPLOYMENT_MODE not set");
+            // tracing::error!("DEPLOYMENT_MODE not set");
             None
         }
     }
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 299fe7e159..dfb8b437c3 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -917,11 +917,6 @@ async fn create_remote_storage_client(
     // If `test_remote_failures` is non-zero, wrap the client with a
     // wrapper that simulates failures.
if conf.test_remote_failures > 0 { - if !cfg!(feature = "testing") { - anyhow::bail!( - "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature" - ); - } info!( "Simulating remote failures for first {} attempts of each op", conf.test_remote_failures diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d839bac557..0d40c5ecf7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -4183,7 +4183,7 @@ pub fn make_router( }) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", - |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), + |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 15853d3614..52f67abde5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1678,6 +1678,8 @@ impl TenantManager { // Phase 6: Release the InProgress on the parent shard drop(parent_slot_guard); + utils::pausable_failpoint!("shard-split-post-finish-pause"); + Ok(child_shards) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 718ea925b7..fe622713e9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5604,10 +5604,11 @@ impl Timeline { /// Predicate function which indicates whether we should check if new image layers /// are required. Since checking if new image layers are required is expensive in /// terms of CPU, we only do it in the following cases: - /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 1. If the timeline has ingested sufficient WAL to justify the cost or ... /// 2. If enough time has passed since the last check: /// 1. For large tenants, we wish to perform the check more often since they - /// suffer from the lack of image layers + /// suffer from the lack of image layers. Note that we assume sharded tenants + /// to be large since non-zero shards do not track the logical size. /// 2. 
For small tenants (that can mostly fit in RAM), we use a much longer interval
     fn should_check_if_image_layers_required(self: &Arc<Self>, lsn: Lsn) -> bool {
         let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;
@@ -5621,30 +5622,39 @@ impl Timeline {

         let distance_based_decision = distance.0 >= min_distance;

-        let mut time_based_decision = false;
         let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
-        if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
-            let check_required_after =
-                if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
-                    self.get_checkpoint_timeout()
-                } else {
-                    Duration::from_secs(3600 * 48)
-                };
-
-            time_based_decision = match *last_check_instant {
-                Some(last_check) => {
-                    let elapsed = last_check.elapsed();
-                    elapsed >= check_required_after
+        let check_required_after = (|| {
+            if self.shard_identity.is_unsharded() {
+                if let CurrentLogicalSize::Exact(logical_size) =
+                    self.current_logical_size.current_size()
+                {
+                    if Some(Into::<u64>::into(&logical_size)) < large_timeline_threshold {
+                        return Duration::from_secs(3600 * 48);
+                    }
                 }
-                None => true,
-            };
-        }
+            }
+
+            self.get_checkpoint_timeout()
+        })();
+
+        let time_based_decision = match *last_check_instant {
+            Some(last_check) => {
+                let elapsed = last_check.elapsed();
+                elapsed >= check_required_after
+            }
+            None => true,
+        };

         // Do the expensive delta layer counting only if this timeline has ingested sufficient
         // WAL since the last check or a checkpoint timeout interval has elapsed since the last
         // check.
         let decision = distance_based_decision || time_based_decision;
-
+        tracing::info!(
+            "Decided to check image layers: {}. Distance-based decision: {}, time-based decision: {}",
+            decision,
+            distance_based_decision,
+            time_based_decision
+        );
         if decision {
             self.last_image_layer_creation_check_at.store(lsn);
             *last_check_instant = Some(Instant::now());
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 2a851dc25b..5d21feeb10 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -222,6 +222,9 @@ struct Cli {
     /// Primarily useful for testing to reduce test execution time.
     #[arg(long, default_value = "false", action=ArgAction::Set)]
     kick_secondary_downloads: bool,
+
+    #[arg(long)]
+    shard_split_request_timeout: Option<humantime::Duration>,
 }

 enum StrictMode {
@@ -470,6 +473,10 @@ async fn async_main() -> anyhow::Result<()> {
         timeline_safekeeper_count: args.timeline_safekeeper_count,
         posthog_config: posthog_config.clone(),
         kick_secondary_downloads: args.kick_secondary_downloads,
+        shard_split_request_timeout: args
+            .shard_split_request_timeout
+            .map(humantime::Duration::into)
+            .unwrap_or(Duration::MAX),
     };

     // Validate that we can connect to the database
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 0907907edc..638cb410fa 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -60,6 +60,7 @@ use tokio::sync::mpsc::error::TrySendError;
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
 use utils::completion::Barrier;
+use utils::env;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -483,6 +484,9 @@ pub struct Config {

     /// When set, actively checks and initiates heatmap downloads/uploads.
pub kick_secondary_downloads: bool, + + /// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None. + pub shard_split_request_timeout: Duration, } impl From for ApiError { @@ -6406,18 +6410,39 @@ impl Service { // TODO: issue split calls concurrently (this only matters once we're splitting // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). + // HADRON: set a timeout for splitting individual shards on page servers. + // Currently we do not perform any retry because it's not clear if page server can handle + // partially split shards correctly. + let shard_split_timeout = + if let Some(env::DeploymentMode::Local) = env::get_deployment_mode() { + Duration::from_secs(30) + } else { + self.config.shard_split_request_timeout + }; + let mut http_client_builder = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(0) + .timeout(shard_split_timeout); + + for ssl_ca_cert in &self.config.ssl_ca_certs { + http_client_builder = http_client_builder.add_root_certificate(ssl_ca_cert.clone()); + } + let http_client = http_client_builder + .build() + .expect("Failed to construct HTTP client"); for target in &targets { let ShardSplitTarget { parent_id, node, child_ids, } = target; + let client = PageserverClient::new( node.get_id(), - self.http_client.clone(), + http_client.clone(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), ); + let response = client .tenant_shard_split( *parent_id, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index ab02314288..963a19d640 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -7,6 +7,7 @@ import time from enum import StrEnum import pytest +from fixtures.common_types import TenantShardId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -960,6 +961,67 @@ def get_layer_map(env, tenant_shard_id, timeline_id, ps_id): return image_layer_count, delta_layer_count +def test_image_layer_creation_time_threshold(neon_env_builder: NeonEnvBuilder): + """ + Tests that image layers can be created when the time threshold is reached on non-0 shards. + """ + tenant_conf = { + "compaction_threshold": "100", + "image_creation_threshold": "100", + "image_layer_creation_check_threshold": "1", + # disable distance based image layer creation check + "checkpoint_distance": 10 * 1024 * 1024 * 1024, + "checkpoint_timeout": "100ms", + "image_layer_force_creation_period": "1s", + "pitr_interval": "10s", + "gc_period": "1s", + "compaction_period": "1s", + "lsn_lease_length": "1s", + } + + # consider every tenant large to run the image layer generation check more eagerly + neon_env_builder.pageserver_config_override = ( + "image_layer_generation_large_timeline_threshold=0" + ) + + neon_env_builder.num_pageservers = 1 + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, + initial_tenant_shard_count=2, + initial_tenant_shard_stripe_size=1, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)") + + for v in range(10): + endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))") + + tenant_shard_id = TenantShardId(tenant_id, 1, 2) + + # Generate some rows. 
+ for v in range(20): + endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))") + + # restart page server so that logical size on non-0 shards is missing + env.pageserver.restart() + + (old_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0) + log.info(f"old images: {old_images}, old deltas: {old_deltas}") + + def check_image_creation(): + (new_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0) + log.info(f"images: {new_images}, deltas: {old_deltas}") + assert new_images > old_images + + wait_until(check_image_creation) + + endpoint.stop_and_destroy() + + def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder): """ Tests that page server can force creating new images if image_layer_force_creation_period is enabled diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 8ff767eca4..5549105188 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1673,6 +1673,91 @@ def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder): # END_HADRON +# HADRON +@pytest.mark.skip(reason="The backpressure change has not been merged yet.") +def test_back_pressure_per_shard(neon_env_builder: NeonEnvBuilder): + """ + Tests back pressure knobs are enforced on the per shard basis instead of at the tenant level. + """ + init_shard_count = 4 + neon_env_builder.num_pageservers = init_shard_count + stripe_size = 1 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, + initial_tenant_shard_stripe_size=stripe_size, + initial_tenant_conf={ + # disable auto-flush of shards and set max_replication_flush_lag as 15MB. + # The backpressure parameters must be enforced at the shard level to avoid stalling PG. + "checkpoint_distance": 1 * 1024 * 1024 * 1024, + "checkpoint_timeout": "1h", + }, + ) + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "max_replication_write_lag = 0", + "max_replication_apply_lag = 0", + "max_replication_flush_lag = 15MB", + "neon.max_cluster_size = 10GB", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. 
+ endpoint.start() + + # generate 20MB of data + endpoint.safe_psql( + "CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, 20000) s;" + ) + res = endpoint.safe_psql( + "SELECT neon.backpressure_throttling_time() as throttling_time", dbname="databricks_system" + )[0] + assert res[0] == 0, f"throttling_time should be 0, but got {res[0]}" + + endpoint.stop() + + +# HADRON +def test_shard_split_page_server_timeout(neon_env_builder: NeonEnvBuilder): + """ + Tests that shard split can correctly handle page server timeouts and abort the split + """ + init_shard_count = 2 + neon_env_builder.num_pageservers = 1 + stripe_size = 1 + + if neon_env_builder.storage_controller_config is None: + neon_env_builder.storage_controller_config = {"shard_split_request_timeout": "5s"} + else: + neon_env_builder.storage_controller_config["shard_split_request_timeout"] = "5s" + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + env.storage_controller.allowed_errors.extend( + [ + ".*Enqueuing background abort.*", + ".*failpoint.*", + ".*Failed to abort.*", + ".*Exclusive lock by ShardSplit was held.*", + ] + ) + env.pageserver.allowed_errors.extend([".*request was dropped before completing.*"]) + + endpoint1 = env.endpoints.create_start(branch_name="main") + + env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "pause")) + + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=4) + + env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "off")) + endpoint1.stop_and_destroy() + + def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): """ Check a scenario when one of the shards is much slower than others. From 380d167b7ca2c8312fafffef30ae8cbdea7fd8a0 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 11 Jul 2025 21:35:42 +0200 Subject: [PATCH 37/56] proxy: For cancellation data replace HSET+EXPIRE/HGET with SET..EX/GET (#12553) ## Problem To store cancellation data we send two commands to redis because the redis server version doesn't support HSET with EX. Also, HSET is not really needed. ## Summary of changes * Replace the HSET + EXPIRE command pair with one SET .. EX command. * Replace HGET with GET. * Leave a workaround for old keys set with HSET. * Replace some anyhow errors with specific errors to surface the WRONGTYPE error from redis. 
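At the command level, the change looks roughly like this (a condensed sketch using the same redis-rs calls as the wrapper in the diff below; the function names and pipeline plumbing are illustrative):

```rust
use redis::{Cmd, Pipeline, SetExpiry, SetOptions};

// Old scheme: two commands per key. HSET stores the payload under a "data"
// field; a separate EXPIRE attaches the TTL.
fn store_cancel_key_old(pipe: &mut Pipeline, key: &str, value: &str, ttl_secs: i64) {
    pipe.add_command(Cmd::hset(key, "data", value));
    pipe.add_command(Cmd::expire(key, ttl_secs)).ignore();
}

// New scheme: a single SET .. EX carries both the value and the TTL, halving
// the number of commands and making the write atomic.
fn store_cancel_key_new(pipe: &mut Pipeline, key: &str, value: &str, ttl_secs: u64) {
    pipe.add_command(Cmd::set_options(
        key,
        value,
        SetOptions::default().with_expiration(SetExpiry::EX(ttl_secs)),
    ));
}
```

Reads switch from HGET to GET accordingly; keys written by the old scheme still exist, so a GET that fails with WRONGTYPE falls back to HGET (see the workaround in `cancellation.rs` below).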
---
 Cargo.lock                                        |   1 +
 proxy/Cargo.toml                                  |   3 +-
 proxy/src/batch.rs                                |  68 +++++++----
 proxy/src/cancellation.rs                         | 111 ++++++++++++------
 proxy/src/metrics.rs                              |   6 +-
 .../redis/connection_with_credentials_provider.rs |  24 +++-
 proxy/src/redis/elasticache.rs                    |  20 +++-
 proxy/src/redis/kv_ops.rs                         |  16 ++-
 8 files changed, 175 insertions(+), 74 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 025f4e4116..4323254f0a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5289,6 +5289,7 @@ dependencies = [
  "async-trait",
  "atomic-take",
  "aws-config",
+ "aws-credential-types",
  "aws-sdk-iam",
  "aws-sigv4",
  "base64 0.22.1",
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index ce8610be24..0a406d1ca8 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -16,6 +16,7 @@ async-compression.workspace = true
 async-trait.workspace = true
 atomic-take.workspace = true
 aws-config.workspace = true
+aws-credential-types.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
 base64.workspace = true
@@ -127,4 +128,4 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
-tracing-test = "0.2"
\ No newline at end of file
+tracing-test = "0.2"
diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs
index 33e08797f2..cf866ab9a3 100644
--- a/proxy/src/batch.rs
+++ b/proxy/src/batch.rs
@@ -7,13 +7,17 @@ use std::pin::pin;
 use std::sync::Mutex;

 use scopeguard::ScopeGuard;
+use tokio::sync::oneshot;
 use tokio::sync::oneshot::error::TryRecvError;

 use crate::ext::LockExt;

+type ProcResult<P> = Result<<P as QueueProcessing>::Res, <P as QueueProcessing>::Err>;
+
 pub trait QueueProcessing: Send + 'static {
     type Req: Send + 'static;
     type Res: Send;
+    type Err: Send + Clone;

     /// Get the desired batch size.
     fn batch_size(&self, queue_size: usize) -> usize;
@@ -24,7 +28,18 @@ pub trait QueueProcessing: Send + 'static {
     /// If this apply can error, it's expected that errors be forwarded to each Self::Res.
     ///
     /// Batching does not need to happen atomically.
-    fn apply(&mut self, req: Vec<Self::Req>) -> impl Future<Output = Vec<Self::Res>> + Send;
+    fn apply(
+        &mut self,
+        req: Vec<Self::Req>,
+    ) -> impl Future<Output = Result<Vec<Self::Res>, Self::Err>> + Send;
+}
+
+#[derive(thiserror::Error)]
+pub enum BatchQueueError<E, C> {
+    #[error(transparent)]
+    Result(E),
+    #[error(transparent)]
+    Cancelled(C),
 }

 pub struct BatchQueue<P: QueueProcessing> {
@@ -34,7 +49,7 @@ pub struct BatchQueue<P: QueueProcessing> {

 struct BatchJob<P: QueueProcessing> {
     req: P::Req,
-    res: tokio::sync::oneshot::Sender<P::Res>,
+    res: tokio::sync::oneshot::Sender<ProcResult<P>>,
 }

 impl<P: QueueProcessing> BatchQueue<P> {
@@ -55,11 +70,11 @@ impl<P: QueueProcessing> BatchQueue<P> {
         &self,
         req: P::Req,
         cancelled: impl Future<Output = C>,
-    ) -> Result<P::Res, C> {
+    ) -> Result<P::Res, BatchQueueError<P::Err, C>> {
         let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
         let mut cancelled = pin!(cancelled);

-        let resp = loop {
+        let resp: Option<ProcResult<P>> = loop {
             // try become the leader, or try wait for success.
             let mut processor = tokio::select! {
                 // try become leader.
@@ -72,7 +87,7 @@ impl<P: QueueProcessing> BatchQueue<P> {
                     if inner.queue.remove(&id).is_some() {
                         tracing::warn!("batched task cancelled before completion");
                     }
-                    return Err(cancel);
+                    return Err(BatchQueueError::Cancelled(cancel));
                 },
             };

@@ -96,18 +111,30 @@ impl<P: QueueProcessing> BatchQueue<P> {
                 // good: we didn't get cancelled.
                 ScopeGuard::into_inner(cancel_safety);

-                if values.len() != resps.len() {
-                    tracing::error!(
-                        "batch: invalid response size, expected={}, got={}",
-                        resps.len(),
-                        values.len()
-                    );
-                }
+                match values {
+                    Ok(values) => {
+                        if values.len() != resps.len() {
+                            tracing::error!(
+                                "batch: invalid response size, expected={}, got={}",
+                                resps.len(),
+                                values.len()
+                            );
+                        }

-                // send response values.
-                for (tx, value) in std::iter::zip(resps, values) {
-                    if tx.send(value).is_err() {
-                        // receiver hung up but that's fine.
+                        // send response values.
+                        for (tx, value) in std::iter::zip(resps, values) {
+                            if tx.send(Ok(value)).is_err() {
+                                // receiver hung up but that's fine.
+                            }
+                        }
+                    }
+
+                    Err(err) => {
+                        for tx in resps {
+                            if tx.send(Err(err.clone())).is_err() {
+                                // receiver hung up but that's fine.
+                            }
+                        }
                     }
                 }
@@ -129,7 +156,8 @@ impl<P: QueueProcessing> BatchQueue<P> {

         tracing::debug!(id, "batch: job completed");

-        Ok(resp.expect("no response found. batch processer should not panic"))
+        resp.expect("no response found. batch processer should not panic")
+            .map_err(BatchQueueError::Result)
     }
 }

@@ -139,8 +167,8 @@ struct BatchQueueInner<P: QueueProcessing> {
 }

 impl<P: QueueProcessing> BatchQueueInner<P> {
-    fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver<P::Res>) {
-        let (tx, rx) = tokio::sync::oneshot::channel();
+    fn register_job(&mut self, req: P::Req) -> (u64, oneshot::Receiver<ProcResult<P>>) {
+        let (tx, rx) = oneshot::channel();

         let id = self.version;

@@ -158,7 +186,7 @@ impl<P: QueueProcessing> BatchQueueInner<P> {
         (id, rx)
     }

-    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<tokio::sync::oneshot::Sender<P::Res>>) {
+    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<oneshot::Sender<ProcResult<P>>>) {
         let batch_size = p.batch_size(self.queue.len());
         let mut reqs = Vec::with_capacity(batch_size);
         let mut resps = Vec::with_capacity(batch_size);
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 74413f1a7d..4ea4c4ea54 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -4,12 +4,11 @@ use std::pin::pin;
 use std::sync::{Arc, OnceLock};
 use std::time::Duration;

-use anyhow::anyhow;
 use futures::FutureExt;
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::RawCancelToken;
 use postgres_client::tls::MakeTlsConnect;
-use redis::{Cmd, FromRedisValue, Value};
+use redis::{Cmd, FromRedisValue, SetExpiry, SetOptions, Value};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
@@ -18,7 +17,7 @@ use tracing::{debug, error, info};

 use crate::auth::AuthError;
 use crate::auth::backend::ComputeUserInfo;
-use crate::batch::{BatchQueue, QueueProcessing};
+use crate::batch::{BatchQueue, BatchQueueError, QueueProcessing};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::ControlPlaneApi;
@@ -28,7 +27,7 @@ use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind};
 use crate::pqproto::CancelKeyData;
 use crate::rate_limiter::LeakyBucketRateLimiter;
 use crate::redis::keys::KeyPrefix;
-use crate::redis::kv_ops::RedisKVClient;
+use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError};

 type IpSubnetKey = IpNet;

@@ -45,6 +44,17 @@ pub enum CancelKeyOp {
     GetCancelData {
         key: CancelKeyData,
     },
+    GetCancelDataOld {
+        key: CancelKeyData,
+    },
+}
+
+#[derive(thiserror::Error, Debug, Clone)]
+pub enum PipelineError {
+    #[error("could not send cmd to redis: {0}")]
+    RedisKVClient(Arc<RedisKVClientError>),
+    #[error("incorrect number of responses from redis")]
+    IncorrectNumberOfResponses,
 }

 pub struct Pipeline {
@@ -60,7 +70,7 @@ impl Pipeline {
         }
     }

-    async fn execute(self, client: &mut RedisKVClient) -> Vec<anyhow::Result<Value>> {
+    async fn execute(self, client: &mut RedisKVClient) -> Result<Vec<Value>, PipelineError> {
         let responses = self.replies;
         let batch_size = self.inner.len();

@@ -78,30 +88,20 @@ impl Pipeline {
                     batch_size,
                     responses,
                     "successfully completed cancellation jobs",
                 );
-                values.into_iter().map(Ok).collect()
+                Ok(values.into_iter().collect())
             }
             Ok(value) => {
                 error!(batch_size, ?value, "unexpected redis return value");
-                std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis")))
-                    .take(responses)
-                    .collect()
-            }
-            Err(err) => {
-                std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}")))
-                    .take(responses)
-                    .collect()
+                Err(PipelineError::IncorrectNumberOfResponses)
             }
+            Err(err) => Err(PipelineError::RedisKVClient(Arc::new(err))),
         }
     }

-    fn add_command_with_reply(&mut self, cmd: Cmd) {
+    fn add_command(&mut self, cmd: Cmd) {
         self.inner.add_command(cmd);
         self.replies += 1;
     }
-
-    fn add_command_no_reply(&mut self, cmd: Cmd) {
-        self.inner.add_command(cmd).ignore();
-    }
 }

 impl CancelKeyOp {
@@ -109,12 +109,19 @@ impl CancelKeyOp {
         match self {
             CancelKeyOp::StoreCancelKey { key, value, expire } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value));
-                pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64));
+                pipe.add_command(Cmd::set_options(
+                    &key,
+                    &**value,
+                    SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())),
+                ));
+            }
+            CancelKeyOp::GetCancelDataOld { key } => {
+                let key = KeyPrefix::Cancel(*key).build_redis_key();
+                pipe.add_command(Cmd::hget(key, "data"));
+            }
             CancelKeyOp::GetCancelData { key } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hget(key, "data"));
+                pipe.add_command(Cmd::get(key));
             }
         }
     }
@@ -127,13 +134,14 @@ pub struct CancellationProcessor {

 impl QueueProcessing for CancellationProcessor {
     type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp);
-    type Res = anyhow::Result<redis::Value>;
+    type Res = redis::Value;
+    type Err = PipelineError;

     fn batch_size(&self, _queue_size: usize) -> usize {
         self.batch_size
     }

-    async fn apply(&mut self, batch: Vec<Self::Req>) -> Vec<Self::Res> {
+    async fn apply(&mut self, batch: Vec<Self::Req>) -> Result<Vec<Self::Res>, Self::Err> {
         if !self.client.credentials_refreshed() {
             // this will cause a timeout for cancellation operations
             tracing::debug!(
@@ -244,18 +252,18 @@ impl CancellationHandler {
         &self,
         key: CancelKeyData,
     ) -> Result<Option<CancelClosure>, CancelError> {
-        let guard = Metrics::get()
-            .proxy
-            .cancel_channel_size
-            .guard(RedisMsgKind::HGet);
-        let op = CancelKeyOp::GetCancelData { key };
+        const TIMEOUT: Duration = Duration::from_secs(5);

         let Some(tx) = self.tx.get() else {
             tracing::warn!("cancellation handler is not available");
             return Err(CancelError::InternalError);
         };

-        const TIMEOUT: Duration = Duration::from_secs(5);
+        let guard = Metrics::get()
+            .proxy
+            .cancel_channel_size
+            .guard(RedisMsgKind::Get);
+        let op = CancelKeyOp::GetCancelData { key };
         let result = timeout(
             TIMEOUT,
             tx.call((guard, op), std::future::pending::<Infallible>()),
         )
         .await
         .map_err(|_| {
             tracing::warn!("timed out waiting to receive GetCancelData response");
             CancelError::RateLimit
-        })?
-        // cannot be cancelled
-        .unwrap_or_else(|x| match x {})
-        .map_err(|e| {
+        })?;
+
+        // We may still have cancel keys set with HSET "data".
+        // Check error type and retry with HGET.
+        // TODO: remove code after HSET is not used anymore.
+        let result = if let Err(err) = result.as_ref()
+            && let BatchQueueError::Result(err) = err
+            && let PipelineError::RedisKVClient(err) = err
+            && let RedisKVClientError::Redis(err) = &**err
+            && let Some(errcode) = err.code()
+            && errcode == "WRONGTYPE"
+        {
+            let guard = Metrics::get()
+                .proxy
+                .cancel_channel_size
+                .guard(RedisMsgKind::HGet);
+            let op = CancelKeyOp::GetCancelDataOld { key };
+            timeout(
+                TIMEOUT,
+                tx.call((guard, op), std::future::pending::<Infallible>()),
+            )
+            .await
+            .map_err(|_| {
+                tracing::warn!("timed out waiting to receive GetCancelData response");
+                CancelError::RateLimit
+            })?
+        } else {
+            result
+        };
+
+        let result = result.map_err(|e| {
             tracing::warn!("failed to receive GetCancelData response: {e}");
             CancelError::InternalError
         })?;
@@ -442,7 +477,7 @@ impl Session {
             let guard = Metrics::get()
                 .proxy
                 .cancel_channel_size
-                .guard(RedisMsgKind::HSet);
+                .guard(RedisMsgKind::Set);
             let op = CancelKeyOp::StoreCancelKey {
                 key: self.key,
                 value: closure_json.clone(),
@@ -456,7 +491,7 @@ impl Session {
             );

             match tx.call((guard, op), cancel.as_mut()).await {
-                Ok(Ok(_)) => {
+                Ok(_) => {
                     tracing::debug!(
                         src=%self.key,
                         dest=?cancel_closure.cancel_token,
                     );

                     tokio::time::sleep(CANCEL_KEY_REFRESH).await;
                 }
                 // retry immediately.
-                Ok(Err(error)) => {
+                Err(BatchQueueError::Result(error)) => {
                     tracing::warn!(?error, "error registering cancellation key");
                 }
-                Err(Err(_cancelled)) => break,
+                Err(BatchQueueError::Cancelled(Err(_cancelled))) => break,
             }
         }
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 9d1a3d4358..8439082498 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -374,11 +374,9 @@ pub enum Waiting {
 #[label(singleton = "kind")]
 #[allow(clippy::enum_variant_names)]
 pub enum RedisMsgKind {
-    HSet,
-    HSetMultiple,
+    Set,
+    Get,
     HGet,
-    HGetAll,
-    HDel,
 }

 #[derive(Default, Clone)]
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index 35a3fe4334..b0bf332e44 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -4,11 +4,12 @@ use std::time::Duration;

 use futures::FutureExt;
 use redis::aio::{ConnectionLike, MultiplexedConnection};
-use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult};
+use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisError, RedisResult};
 use tokio::task::AbortHandle;
 use tracing::{error, info, warn};

 use super::elasticache::CredentialsProvider;
+use crate::redis::elasticache::CredentialsProviderError;

 enum Credentials {
     Static(ConnectionInfo),
@@ -26,6 +27,14 @@ impl Clone for Credentials {
     }
 }

+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionProviderError {
+    #[error(transparent)]
+    Redis(#[from] RedisError),
+    #[error(transparent)]
+    CredentialsProvider(#[from] CredentialsProviderError),
+}
+
 /// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token.
 /// Provides PubSub connection without credentials refresh.
 pub struct ConnectionWithCredentialsProvider {
@@ -86,15 +95,18 @@ impl ConnectionWithCredentialsProvider {
         }
     }

-    async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> {
-        redis::cmd("PING").query_async(con).await
+    async fn ping(con: &mut MultiplexedConnection) -> Result<(), ConnectionProviderError> {
+        redis::cmd("PING")
+            .query_async(con)
+            .await
+            .map_err(Into::into)
     }

     pub(crate) fn credentials_refreshed(&self) -> bool {
         self.credentials_refreshed.load(Ordering::Relaxed)
     }

-    pub(crate) async fn connect(&mut self) -> anyhow::Result<()> {
+    pub(crate) async fn connect(&mut self) -> Result<(), ConnectionProviderError> {
         let _guard = self.mutex.lock().await;
         if let Some(con) = self.con.as_mut() {
             match Self::ping(con).await {
@@ -141,7 +153,7 @@ impl ConnectionWithCredentialsProvider {
         Ok(())
     }

-    async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
+    async fn get_connection_info(&self) -> Result<ConnectionInfo, ConnectionProviderError> {
         match &self.credentials {
             Credentials::Static(info) => Ok(info.clone()),
             Credentials::Dynamic(provider, addr) => {
@@ -160,7 +172,7 @@ impl ConnectionWithCredentialsProvider {
         }
     }

-    async fn get_client(&self) -> anyhow::Result<redis::Client> {
+    async fn get_client(&self) -> Result<redis::Client, ConnectionProviderError> {
         let client = redis::Client::open(self.get_connection_info().await?)?;
         self.credentials_refreshed.store(true, Ordering::Relaxed);
         Ok(client)
diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs
index 58e3c889a7..6f3b34d381 100644
--- a/proxy/src/redis/elasticache.rs
+++ b/proxy/src/redis/elasticache.rs
@@ -9,10 +9,12 @@ use aws_config::meta::region::RegionProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
 use aws_config::provider_config::ProviderConfig;
 use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
+use aws_credential_types::provider::error::CredentialsError;
 use aws_sdk_iam::config::ProvideCredentials;
 use aws_sigv4::http_request::{
-    self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
+    self, SignableBody, SignableRequest, SignatureLocation, SigningError, SigningSettings,
 };
+use aws_sigv4::sign::v4::signing_params::BuildError;
 use tracing::info;

 #[derive(Debug)]
@@ -40,6 +42,18 @@ impl AWSIRSAConfig {
     }
 }

+#[derive(thiserror::Error, Debug)]
+pub enum CredentialsProviderError {
+    #[error(transparent)]
+    AwsCredentials(#[from] CredentialsError),
+    #[error(transparent)]
+    AwsSigv4Build(#[from] BuildError),
+    #[error(transparent)]
+    AwsSigv4Singing(#[from] SigningError),
+    #[error(transparent)]
+    Http(#[from] http::Error),
+}
+
 /// Credentials provider for AWS elasticache authentication.
 ///
 /// Official documentation:
@@ -92,7 +106,9 @@ impl CredentialsProvider {
         })
     }

-    pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
+    pub(crate) async fn provide_credentials(
+        &self,
+    ) -> Result<(String, String), CredentialsProviderError> {
         let aws_credentials = self
             .credentials_provider
             .provide_credentials()
diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs
index cfdbc21839..d1e97b6b09 100644
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -2,9 +2,18 @@ use std::time::Duration;

 use futures::FutureExt;
 use redis::aio::ConnectionLike;
-use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
+use redis::{Cmd, FromRedisValue, Pipeline, RedisError, RedisResult};

 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
+use crate::redis::connection_with_credentials_provider::ConnectionProviderError;
+
+#[derive(thiserror::Error, Debug)]
+pub enum RedisKVClientError {
+    #[error(transparent)]
+    Redis(#[from] RedisError),
+    #[error(transparent)]
+    ConnectionProvider(#[from] ConnectionProviderError),
+}

 pub struct RedisKVClient {
     client: ConnectionWithCredentialsProvider,
@@ -32,12 +41,13 @@ impl RedisKVClient {
         Self { client }
     }

-    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
+    pub async fn try_connect(&mut self) -> Result<(), RedisKVClientError> {
         self.client
             .connect()
             .boxed()
             .await
             .inspect_err(|e| tracing::error!("failed to connect to redis: {e}"))
+            .map_err(Into::into)
     }

     pub(crate) fn credentials_refreshed(&self) -> bool {
@@ -47,7 +57,7 @@ impl RedisKVClient {
     pub(crate) async fn query<T: FromRedisValue>(
         &mut self,
         q: &impl Queryable,
-    ) -> anyhow::Result<T> {
+    ) -> Result<T, RedisKVClientError> {
         let e = match q.query(&mut self.client).await {
             Ok(t) => return Ok(t),
             Err(e) => e,

From 9bba31bf6805e1c179b75fbb5bcab96c96980c75 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Fri, 11 Jul 2025 20:39:08 +0100
Subject: [PATCH 38/56] proxy: encode json as we parse rows (#11992)

Serialize query row responses directly into JSON. Some of this code
should be using the `json::value_as_object/list` macros, but I've
avoided it for now to minimize the size of the diff.
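The shape of the change, as a minimal sketch (assuming the internal `json` crate's builder API -- `value_to_string!`, `value_as_object!`, `value_as_list!`, `key`, `entry`, `value` -- as it is used in this diff; exact signatures may differ). Instead of accumulating a `serde_json::Value` tree and serializing it at the end, each row is written into the output buffer as soon as it is decoded:

```rust
// Sketch only: assumes the API of libs/proxy/json as seen in the diff below.
// Each (name, value) pair is streamed straight into the output string; no
// intermediate Value tree is built, saving allocations for large result sets.
fn rows_to_json(rows: &[(&str, i64)]) -> String {
    json::value_to_string!(|out| json::value_as_object!(|out| {
        let list = out.key("rows");
        json::value_as_list!(|list| {
            for (name, n) in rows {
                let row = list.entry();
                json::value_as_object!(|row| {
                    row.key("name").value(*name);
                    row.key("value").value(*n);
                });
            }
        });
    }))
}
```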
--- Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/src/serverless/json.rs | 95 +++++++--------- proxy/src/serverless/sql_over_http.rs | 154 +++++++++++++------------- 4 files changed, 122 insertions(+), 129 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4323254f0a..14b460005a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5329,6 +5329,7 @@ dependencies = [ "itoa", "jose-jwa", "jose-jwk", + "json", "lasso", "measured", "metrics", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0a406d1ca8..82fe6818e3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -49,6 +49,7 @@ indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true itoa.workspace = true +json = { path = "../libs/proxy/json" } lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 2e67d07079..ef7c8a4d82 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,6 +1,7 @@ +use json::{ListSer, ObjectSer, ValueSer}; use postgres_client::Row; use postgres_client::types::{Kind, Type}; -use serde_json::{Map, Value}; +use serde_json::Value; // // Convert json non-string types to strings, so that they can be passed to Postgres @@ -74,44 +75,40 @@ pub(crate) enum JsonConversionError { UnbalancedString, } -enum OutputMode { - Array(Vec), - Object(Map), +enum OutputMode<'a> { + Array(ListSer<'a>), + Object(ObjectSer<'a>), } -impl OutputMode { - fn key(&mut self, key: &str) -> &mut Value { +impl OutputMode<'_> { + fn key(&mut self, key: &str) -> ValueSer<'_> { match self { - OutputMode::Array(values) => push_entry(values, Value::Null), - OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null), + OutputMode::Array(values) => values.entry(), + OutputMode::Object(map) => map.key(key), } } - fn finish(self) -> Value { + fn finish(self) { match self { - OutputMode::Array(values) => Value::Array(values), - OutputMode::Object(map) => Value::Object(map), + OutputMode::Array(values) => values.finish(), + OutputMode::Object(map) => map.finish(), } } } -fn push_entry(arr: &mut Vec, t: T) -> &mut T { - arr.push(t); - arr.last_mut().expect("a value was just inserted") -} - // // Convert postgres row with text-encoded values to JSON object // pub(crate) fn pg_text_row_to_json( + output: ValueSer, row: &Row, raw_output: bool, array_mode: bool, -) -> Result { +) -> Result<(), JsonConversionError> { let mut entries = if array_mode { - OutputMode::Array(Vec::with_capacity(row.columns().len())) + OutputMode::Array(output.list()) } else { - OutputMode::Object(Map::with_capacity(row.columns().len())) + OutputMode::Object(output.object()) }; for (i, column) in row.columns().iter().enumerate() { @@ -120,53 +117,48 @@ pub(crate) fn pg_text_row_to_json( let value = entries.key(column.name()); match pg_value { - Some(v) if raw_output => *value = Value::String(v.to_string()), + Some(v) if raw_output => value.value(v), Some(v) => pg_text_to_json(value, v, column.type_())?, - None => *value = Value::Null, + None => value.value(json::Null), } } - Ok(entries.finish()) + entries.finish(); + Ok(()) } // // Convert postgres text-encoded value to JSON value // -fn pg_text_to_json( - output: &mut Value, - val: &str, - pg_type: &Type, -) -> Result<(), JsonConversionError> { +fn pg_text_to_json(output: ValueSer, val: &str, pg_type: &Type) -> Result<(), JsonConversionError> { if let Kind::Array(elem_type) = 
pg_type.kind() { // todo: we should fetch this from postgres. let delimiter = ','; - let mut array = vec![]; - pg_array_parse(&mut array, val, elem_type, delimiter)?; - *output = Value::Array(array); + json::value_as_list!(|output| pg_array_parse(output, val, elem_type, delimiter)?); return Ok(()); } match *pg_type { - Type::BOOL => *output = Value::Bool(val == "t"), + Type::BOOL => output.value(val == "t"), Type::INT2 | Type::INT4 => { let val = val.parse::()?; - *output = Value::Number(serde_json::Number::from(val)); + output.value(val); } Type::FLOAT4 | Type::FLOAT8 => { let fval = val.parse::()?; - let num = serde_json::Number::from_f64(fval); - if let Some(num) = num { - *output = Value::Number(num); + if fval.is_finite() { + output.value(fval); } else { // Pass Nan, Inf, -Inf as strings // JS JSON.stringify() does converts them to null, but we // want to preserve them, so we pass them as strings - *output = Value::String(val.to_string()); + output.value(val); } } - Type::JSON | Type::JSONB => *output = serde_json::from_str(val)?, - _ => *output = Value::String(val.to_string()), + // we assume that the string value is valid json. + Type::JSON | Type::JSONB => output.write_raw_json(val.as_bytes()), + _ => output.value(val), } Ok(()) @@ -192,7 +184,7 @@ fn pg_text_to_json( /// gets its own level of curly braces, and delimiters must be written between adjacent /// curly-braced entities of the same level. fn pg_array_parse( - elements: &mut Vec, + elements: &mut ListSer, mut pg_array: &str, elem: &Type, delim: char, @@ -221,7 +213,7 @@ fn pg_array_parse( /// reads a single array from the `pg_array` string and pushes each values to `elements`. /// returns the rest of the `pg_array` string that was not read. fn pg_array_parse_inner<'a>( - elements: &mut Vec, + elements: &mut ListSer, mut pg_array: &'a str, elem: &Type, delim: char, @@ -234,7 +226,7 @@ fn pg_array_parse_inner<'a>( let mut q = String::new(); loop { - let value = push_entry(elements, Value::Null); + let value = elements.entry(); pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?; // check for separator. @@ -260,7 +252,7 @@ fn pg_array_parse_inner<'a>( /// /// `quoted` is a scratch allocation that has no defined output. fn pg_array_parse_item<'a>( - output: &mut Value, + output: ValueSer, quoted: &mut String, mut pg_array: &'a str, elem: &Type, @@ -276,9 +268,8 @@ fn pg_array_parse_item<'a>( if pg_array.starts_with('{') { // nested array. 
- let mut nested = vec![]; - pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?; - *output = Value::Array(nested); + pg_array = + json::value_as_list!(|output| pg_array_parse_inner(output, pg_array, elem, delim))?; return Ok(pg_array); } @@ -306,7 +297,7 @@ fn pg_array_parse_item<'a>( // we might have an item string: // check for null if item == "NULL" { - *output = Value::Null; + output.value(json::Null); } else { pg_text_to_json(output, item, elem)?; } @@ -440,15 +431,15 @@ mod tests { } fn pg_text_to_json(val: &str, pg_type: &Type) -> Value { - let mut v = Value::Null; - super::pg_text_to_json(&mut v, val, pg_type).unwrap(); - v + let output = json::value_to_string!(|v| super::pg_text_to_json(v, val, pg_type).unwrap()); + serde_json::from_str(&output).unwrap() } fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value { - let mut array = vec![]; - super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap(); - Value::Array(array) + let output = json::value_to_string!(|v| json::value_as_list!(|v| { + super::pg_array_parse(v, pg_array, pg_type, ',').unwrap(); + })); + serde_json::from_str(&output).unwrap() } #[test] diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7a718d0280..8a14f804b6 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -14,10 +14,7 @@ use hyper::http::{HeaderName, HeaderValue}; use hyper::{Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; -use postgres_client::{ - GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction, -}; -use serde::Serialize; +use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use serde_json::Value; use serde_json::value::RawValue; use tokio::time::{self, Instant}; @@ -687,32 +684,21 @@ impl QueryData { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); - match select( + let mut json_buf = vec![]; + + let batch_result = match select( pin!(query_to_json( config, &mut *inner, self, - &mut 0, + json::ValueSer::new(&mut json_buf), parsed_headers )), pin!(cancel.cancelled()), ) .await { - // The query successfully completed. - Either::Left((Ok((status, results)), __not_yet_cancelled)) => { - discard.check_idle(status); - - let json_output = - serde_json::to_string(&results).expect("json serialization should not fail"); - Ok(json_output) - } - // The query failed with an error - Either::Left((Err(e), __not_yet_cancelled)) => { - discard.discard(); - Err(e) - } - // The query was cancelled. + Either::Left((res, __not_yet_cancelled)) => res, Either::Right((_cancelled, query)) => { tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { @@ -721,13 +707,7 @@ impl QueryData { // wait for the query cancellation match time::timeout(time::Duration::from_millis(100), query).await { // query successed before it was cancelled. - Ok(Ok((status, results))) => { - discard.check_idle(status); - - let json_output = serde_json::to_string(&results) - .expect("json serialization should not fail"); - Ok(json_output) - } + Ok(Ok(status)) => Ok(status), // query failed or was cancelled. 
Ok(Err(error)) => { let db_error = match &error { @@ -743,14 +723,29 @@ impl QueryData { discard.discard(); } - Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } Err(_timeout) => { discard.discard(); - Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } } } + }; + + match batch_result { + // The query successfully completed. + Ok(status) => { + discard.check_idle(status); + + let json_output = String::from_utf8(json_buf).expect("json should be valid utf8"); + Ok(json_output) + } + // The query failed with an error + Err(e) => { + discard.discard(); + Err(e) + } } } } @@ -787,7 +782,7 @@ impl BatchQueryData { }) .map_err(SqlOverHttpError::Postgres)?; - let json_output = match query_batch( + let json_output = match query_batch_to_json( config, cancel.child_token(), &mut transaction, @@ -845,24 +840,21 @@ async fn query_batch( transaction: &mut Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, -) -> Result { - let mut results = Vec::with_capacity(queries.queries.len()); - let mut current_size = 0; + results: &mut json::ListSer<'_>, +) -> Result<(), SqlOverHttpError> { for stmt in queries.queries { let query = pin!(query_to_json( config, transaction, stmt, - &mut current_size, + results.entry(), parsed_headers, )); let cancelled = pin!(cancel.cancelled()); let res = select(query, cancelled).await; match res { // TODO: maybe we should check that the transaction bit is set here - Either::Left((Ok((_, values)), _cancelled)) => { - results.push(values); - } + Either::Left((Ok(_), _cancelled)) => {} Either::Left((Err(e), _cancelled)) => { return Err(e); } @@ -872,8 +864,22 @@ async fn query_batch( } } - let results = json!({ "results": results }); - let json_output = serde_json::to_string(&results).expect("json serialization should not fail"); + Ok(()) +} + +async fn query_batch_to_json( + config: &'static HttpConfig, + cancel: CancellationToken, + tx: &mut Transaction<'_>, + queries: BatchQueryData, + headers: HttpHeaders, +) -> Result { + let json_output = json::value_to_string!(|obj| json::value_as_object!(|obj| { + let results = obj.key("results"); + json::value_as_list!(|results| { + query_batch(config, cancel, tx, queries, headers, results).await?; + }); + })); Ok(json_output) } @@ -882,54 +888,54 @@ async fn query_to_json( config: &'static HttpConfig, client: &mut T, data: QueryData, - current_size: &mut usize, + output: json::ValueSer<'_>, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, impl Serialize + use), SqlOverHttpError> { +) -> Result { let query_start = Instant::now(); - let query_params = data.params; + let mut output = json::ObjectSer::new(output); let mut row_stream = client - .query_raw_txt(&data.query, query_params) + .query_raw_txt(&data.query, data.params) .await .map_err(SqlOverHttpError::Postgres)?; let query_acknowledged = Instant::now(); - let columns_len = row_stream.statement.columns().len(); - let mut fields = Vec::with_capacity(columns_len); - + let mut json_fields = output.key("fields").list(); for c in row_stream.statement.columns() { - fields.push(json!({ - "name": c.name().to_owned(), - "dataTypeID": c.type_().oid(), - "tableID": c.table_oid(), - "columnID": c.column_id(), - "dataTypeSize": c.type_size(), - "dataTypeModifier": c.type_modifier(), - "format": "text", - })); + let json_field = json_fields.entry(); + json::value_as_object!(|json_field| { + 
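+            // One metadata object per result column, serialized directly into the output buffer.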
json_field.entry("name", c.name()); + json_field.entry("dataTypeID", c.type_().oid()); + json_field.entry("tableID", c.table_oid()); + json_field.entry("columnID", c.column_id()); + json_field.entry("dataTypeSize", c.type_size()); + json_field.entry("dataTypeModifier", c.type_modifier()); + json_field.entry("format", "text"); + }); } + json_fields.finish(); - let raw_output = parsed_headers.raw_output; let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); + let raw_output = parsed_headers.raw_output; // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - let mut rows = Vec::new(); + let mut rows = 0; + let mut json_rows = output.key("rows").list(); while let Some(row) = row_stream.next().await { let row = row.map_err(SqlOverHttpError::Postgres)?; - *current_size += row.body_len(); // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) - if *current_size > config.max_response_size_bytes { + if json_rows.as_buffer().len() > config.max_response_size_bytes { return Err(SqlOverHttpError::ResponseTooLarge( config.max_response_size_bytes, )); } - let row = pg_text_row_to_json(&row, raw_output, array_mode)?; - rows.push(row); + pg_text_row_to_json(json_rows.entry(), &row, raw_output, array_mode)?; + rows += 1; // assumption: parsing pg text and converting to json takes CPU time. // let's assume it is slightly expensive, so we should consume some cooperative budget. @@ -937,16 +943,14 @@ async fn query_to_json( // of rows and never hit the tokio mpsc for a long time (although unlikely). tokio::task::consume_budget().await; } + json_rows.finish(); let query_resp_end = Instant::now(); - let RowStream { - command_tag, - status: ready, - .. - } = row_stream; + + let ready = row_stream.status; // grab the command tag and number of rows affected - let command_tag = command_tag.unwrap_or_default(); + let command_tag = row_stream.command_tag.unwrap_or_default(); let mut command_tag_split = command_tag.split(' '); let command_tag_name = command_tag_split.next().unwrap_or_default(); let command_tag_count = if command_tag_name == "INSERT" { @@ -959,7 +963,7 @@ async fn query_to_json( .and_then(|s| s.parse::().ok()); info!( - rows = rows.len(), + rows, ?ready, command_tag, acknowledgement = ?(query_acknowledged - query_start), @@ -967,16 +971,12 @@ async fn query_to_json( "finished executing query" ); - // Resulting JSON format is based on the format of node-postgres result. - let results = json!({ - "command": command_tag_name.to_string(), - "rowCount": command_tag_count, - "rows": rows, - "fields": fields, - "rowAsArray": array_mode, - }); + output.entry("command", command_tag_name); + output.entry("rowCount", command_tag_count); + output.entry("rowAsArray", array_mode); - Ok((ready, results)) + output.finish(); + Ok(ready) } enum Client { From ee7bb1a66746e4bbbf1213792b8169e00ce08334 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Sat, 12 Jul 2025 08:57:04 +0400 Subject: [PATCH 39/56] storcon: validate new_sk_set before starting safekeeper migration (#12546) ## Problem We don't validate the validity of the `new_sk_set` before starting the migration. It is validated later, so the migration to an invalid safekeeper set will fail anyway. 
But at this point we might already commited an invalid `new_sk_set` to the database and there is no `abort` command yet (I ran into this issue in neon_local and ruined the timeline :) - Part of https://github.com/neondatabase/neon/issues/11669 ## Summary of changes - Add safekeeper count and safekeeper duplication checks before starting the migration - Test that we validate the `new_sk_set` before starting the migration - Add `force` option to the `TimelineSafekeeperMigrateRequest` to disable not-mandatory checks --- .../src/service/safekeeper_service.rs | 45 +++++++++++++++---- .../regress/test_safekeeper_migration.py | 38 ++++++++++++++++ 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 42ddf81e3e..7521d7bd86 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -39,13 +39,13 @@ use utils::lsn::Lsn; use super::Service; impl Service { - fn make_member_set(safekeepers: &[Safekeeper]) -> Result { + fn make_member_set(safekeepers: &[Safekeeper]) -> Result { let members = safekeepers .iter() .map(|sk| sk.get_safekeeper_id()) .collect::>(); - MemberSet::new(members).map_err(ApiError::InternalServerError) + MemberSet::new(members) } fn get_safekeepers(&self, ids: &[i64]) -> Result, ApiError> { @@ -80,7 +80,7 @@ impl Service { ) -> Result, ApiError> { let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?; - let mset = Self::make_member_set(&safekeepers)?; + let mset = Self::make_member_set(&safekeepers).map_err(ApiError::InternalServerError)?; let mconf = safekeeper_api::membership::Configuration::new(mset); let req = safekeeper_api::models::TimelineCreateRequest { @@ -1105,6 +1105,26 @@ impl Service { } } + if new_sk_set.is_empty() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "new safekeeper set is empty" + ))); + } + + if new_sk_set.len() < self.config.timeline_safekeeper_count { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "new safekeeper set must have at least {} safekeepers", + self.config.timeline_safekeeper_count + ))); + } + + let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::>(); + let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?; + // Construct new member set in advance to validate it. + // E.g. validates that there is no duplicate safekeepers. + let new_sk_member_set = + Self::make_member_set(&new_safekeepers).map_err(ApiError::BadRequest)?; + // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks. let _tenant_lock = trace_shared_lock( &self.tenant_op_locks, @@ -1135,6 +1155,18 @@ impl Service { .map(|&id| NodeId(id as u64)) .collect::>(); + // Validate that we are not migrating to a decomissioned safekeeper. 
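+        // Safekeepers already in the current set are exempt: only new members are checked.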
+ for sk in new_safekeepers.iter() { + if !cur_sk_set.contains(&sk.get_id()) + && sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned + { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "safekeeper {} is decomissioned", + sk.get_id() + ))); + } + } + tracing::info!( ?cur_sk_set, ?new_sk_set, @@ -1177,11 +1209,8 @@ impl Service { } let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; - let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?; - - let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::>(); - let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?; - let new_sk_member_set = Self::make_member_set(&new_safekeepers)?; + let cur_sk_member_set = + Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?; let joint_config = membership::Configuration { generation, diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index b82d7b9bb0..170c1a3650 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -2,6 +2,9 @@ from __future__ import annotations from typing import TYPE_CHECKING +import pytest +from fixtures.neon_fixtures import StorageControllerApiException + if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnvBuilder @@ -75,3 +78,38 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): ep.start(safekeeper_generation=1, safekeepers=[3]) assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] + + +def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper_migrate validates the new_sk_set before starting the migration. + """ + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 2, + } + env = neon_env_builder.init_start() + + def expect_fail(sk_set: list[int], match: str): + with pytest.raises(StorageControllerApiException, match=match): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, sk_set + ) + # Check that we failed before commiting to the database. + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == 1 + + expect_fail([], "safekeeper set is empty") + expect_fail([1], "must have at least 2 safekeepers") + expect_fail([1, 1], "duplicate safekeeper") + expect_fail([1, 100500], "does not exist") + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + sk_set = mconf["sk_set"] + assert len(sk_set) == 2 + + decom_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0] + env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned") + + expect_fail([sk_set[0], decom_sk], "decomissioned") From a5fe67f3616b55135fa3a58c2db89bf30a9eb955 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Sun, 13 Jul 2025 19:27:39 +0200 Subject: [PATCH 40/56] proxy: cancel maintain_cancel_key task immediately (#12586) ## Problem When a connection terminates its maintain_cancel_key task keeps running until the CANCEL_KEY_REFRESH sleep finishes and then it triggers another cancel key TTL refresh before exiting. ## Summary of changes * Check for cancellation while sleeping and interrupt sleep. * If cancelled, break the loop, don't send a refresh cmd. 
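To make the sleep interruptible, the loop races the timer against the cancellation future and exits without a final refresh when cancellation wins. A self-contained sketch of the pattern (tokio + futures; the `run_until` helper mirrors the one added below, and the printed refresh is a stand-in for the real Redis command):

```rust
use std::future::Future;
use std::pin::pin;
use std::time::Duration;

use futures::future::{Either, select};
use tokio_util::sync::CancellationToken;

/// Runs `f` unless `condition` completes first.
async fn run_until<F1: Future, F2: Future>(
    f: F1,
    condition: F2,
) -> Result<F1::Output, F2::Output> {
    match select(pin!(f), pin!(condition)).await {
        Either::Left((out, _)) => Ok(out),
        Either::Right((out, _)) => Err(out),
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let token = cancel.clone();
    let task = tokio::spawn(async move {
        loop {
            println!("refreshing cancel key TTL"); // stand-in for the Redis refresh
            // Sleep until the next refresh is due, but bail out mid-sleep on
            // cancellation without issuing another refresh.
            let sleep = tokio::time::sleep(Duration::from_secs(570));
            if run_until(sleep, token.cancelled()).await.is_err() {
                break;
            }
        }
    });
    cancel.cancel(); // connection terminated: the task exits immediately
    task.await.unwrap();
}
```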
--- proxy/src/cancellation.rs | 10 ++++++++-- proxy/src/util.rs | 14 +++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 4ea4c4ea54..03be9dd4cf 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -28,6 +28,7 @@ use crate::pqproto::CancelKeyData; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError}; +use crate::util::run_until; type IpSubnetKey = IpNet; @@ -498,8 +499,13 @@ impl Session { "registered cancellation key" ); - // wait before continuing. - tokio::time::sleep(CANCEL_KEY_REFRESH).await; + // wait before continuing. break immediately if cancelled. + if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut()) + .await + .is_err() + { + break; + } } // retry immediately. Err(BatchQueueError::Result(error)) => { diff --git a/proxy/src/util.rs b/proxy/src/util.rs index 7fc2d9fbdb..0291216d94 100644 --- a/proxy/src/util.rs +++ b/proxy/src/util.rs @@ -7,8 +7,16 @@ pub async fn run_until_cancelled( f: F, cancellation_token: &CancellationToken, ) -> Option { - match select(pin!(f), pin!(cancellation_token.cancelled())).await { - Either::Left((f, _)) => Some(f), - Either::Right(((), _)) => None, + run_until(f, cancellation_token.cancelled()).await.ok() +} + +/// Runs the future `f` unless interrupted by future `condition`. +pub async fn run_until( + f: F1, + condition: F2, +) -> Result { + match select(pin!(f), pin!(condition)).await { + Either::Left((f1, _)) => Ok(f1), + Either::Right((f2, _)) => Err(f2), } } From 296c9190b2f6e12c571a2b71f070b1c5597738e8 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 14 Jul 2025 00:49:23 +0200 Subject: [PATCH 41/56] proxy: Use EXPIRE command to refresh cancel entries (#12580) ## Problem When refreshing cancellation data we resend the entire value again just to reset the TTL, which causes unnecessary load in proxy, on network and possibly on redis side. ## Summary of changes * Switch from using SET with full value to using EXPIRE to reset TTL. * Add a tiny delay between retries to prevent busy loop. * Shorten CancelKeyOp variants: drop redundant suffix. * Retry SET when EXPIRE failed. 
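For reference, the refresh protocol sketched with the `redis` crate command builders this patch uses (key name and payload are illustrative; error handling elided):

```rust
use redis::{Cmd, SetExpiry, SetOptions, Value};

/// Initial registration: write the full cancellation payload with a TTL.
fn store(key: &str, payload: &str, ttl_secs: u64) -> Cmd {
    Cmd::set_options(
        key,
        payload,
        SetOptions::default().with_expiration(SetExpiry::EX(ttl_secs)),
    )
}

/// Periodic refresh: reset the TTL only, without resending the payload.
fn refresh(key: &str, ttl_secs: u64) -> Cmd {
    Cmd::expire(key, ttl_secs as i64)
}

/// EXPIRE returns 1 if the TTL was updated and 0 if the key is gone (e.g. it
/// already expired); in that case the caller falls back to a full SET.
fn needs_full_set(resp: &Value) -> bool {
    !matches!(resp, Value::Int(1))
}
```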
--- proxy/src/cancellation.rs | 130 +++++++++++++++++++++++++++----------- proxy/src/metrics.rs | 1 + 2 files changed, 95 insertions(+), 36 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 03be9dd4cf..77062d3bb4 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -32,20 +32,24 @@ use crate::util::run_until; type IpSubnetKey = IpNet; -const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600); -const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570); +const CANCEL_KEY_TTL: Duration = Duration::from_secs(600); +const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570); // Message types for sending through mpsc channel pub enum CancelKeyOp { - StoreCancelKey { + Store { key: CancelKeyData, value: Box, - expire: std::time::Duration, + expire: Duration, }, - GetCancelData { + Refresh { + key: CancelKeyData, + expire: Duration, + }, + Get { key: CancelKeyData, }, - GetCancelDataOld { + GetOld { key: CancelKeyData, }, } @@ -108,7 +112,7 @@ impl Pipeline { impl CancelKeyOp { fn register(&self, pipe: &mut Pipeline) { match self { - CancelKeyOp::StoreCancelKey { key, value, expire } => { + CancelKeyOp::Store { key, value, expire } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); pipe.add_command(Cmd::set_options( &key, @@ -116,11 +120,15 @@ impl CancelKeyOp { SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())), )); } - CancelKeyOp::GetCancelDataOld { key } => { + CancelKeyOp::Refresh { key, expire } => { + let key = KeyPrefix::Cancel(*key).build_redis_key(); + pipe.add_command(Cmd::expire(&key, expire.as_secs() as i64)); + } + CancelKeyOp::GetOld { key } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); pipe.add_command(Cmd::hget(key, "data")); } - CancelKeyOp::GetCancelData { key } => { + CancelKeyOp::Get { key } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); pipe.add_command(Cmd::get(key)); } @@ -264,7 +272,7 @@ impl CancellationHandler { .proxy .cancel_channel_size .guard(RedisMsgKind::Get); - let op = CancelKeyOp::GetCancelData { key }; + let op = CancelKeyOp::Get { key }; let result = timeout( TIMEOUT, tx.call((guard, op), std::future::pending::()), @@ -289,7 +297,7 @@ impl CancellationHandler { .proxy .cancel_channel_size .guard(RedisMsgKind::HGet); - let op = CancelKeyOp::GetCancelDataOld { key }; + let op = CancelKeyOp::GetOld { key }; timeout( TIMEOUT, tx.call((guard, op), std::future::pending::()), @@ -474,45 +482,95 @@ impl Session { let mut cancel = pin!(cancel); + enum State { + Set, + Refresh, + } + let mut state = State::Set; + loop { - let guard = Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::Set); - let op = CancelKeyOp::StoreCancelKey { - key: self.key, - value: closure_json.clone(), - expire: CANCEL_KEY_TTL, + let guard_op = match state { + State::Set => { + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::Set); + let op = CancelKeyOp::Store { + key: self.key, + value: closure_json.clone(), + expire: CANCEL_KEY_TTL, + }; + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "registering cancellation key" + ); + (guard, op) + } + + State::Refresh => { + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::Expire); + let op = CancelKeyOp::Refresh { + key: self.key, + expire: CANCEL_KEY_TTL, + }; + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "refreshing cancellation key" + ); + (guard, op) + } 
}; - tracing::debug!( - src=%self.key, - dest=?cancel_closure.cancel_token, - "registering cancellation key" - ); - - match tx.call((guard, op), cancel.as_mut()).await { - Ok(_) => { + match tx.call(guard_op, cancel.as_mut()).await { + // SET returns OK + Ok(Value::Okay) => { tracing::debug!( src=%self.key, dest=?cancel_closure.cancel_token, "registered cancellation key" ); - - // wait before continuing. break immediately if cancelled. - if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut()) - .await - .is_err() - { - break; - } + state = State::Refresh; } + + // EXPIRE returns 1 + Ok(Value::Int(1)) => { + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "refreshed cancellation key" + ); + } + + Ok(_) => { + // Any other response likely means the key expired. + tracing::warn!(src=%self.key, "refreshing cancellation key failed"); + // Re-enter the SET loop to repush full data. + state = State::Set; + } + // retry immediately. Err(BatchQueueError::Result(error)) => { - tracing::warn!(?error, "error registering cancellation key"); + tracing::warn!(?error, "error refreshing cancellation key"); + // Small delay to prevent busy loop with high cpu and logging. + tokio::time::sleep(Duration::from_millis(10)).await; + continue; } + Err(BatchQueueError::Cancelled(Err(_cancelled))) => break, } + + // wait before continuing. break immediately if cancelled. + if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut()) + .await + .is_err() + { + break; + } } if let Err(err) = cancel_closure diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 8439082498..bf4d5a11eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -376,6 +376,7 @@ pub enum Waiting { pub enum RedisMsgKind { Set, Get, + Expire, HGet, } From fecb707b19f6f14942e9cbc624890a0e371bb931 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 14 Jul 2025 11:41:58 +0200 Subject: [PATCH 42/56] pagebench: add `idle-streams` (#12583) ## Problem For the communicator scheduling policy, we need to understand the server-side cost of idle gRPC streams. Touches #11735. ## Summary of changes Add an `idle-streams` benchmark to `pagebench` which opens a large number of idle gRPC GetPage streams. --- pageserver/pagebench/src/cmd/idle_streams.rs | 127 +++++++++++++++++++ pageserver/pagebench/src/main.rs | 3 + 2 files changed, 130 insertions(+) create mode 100644 pageserver/pagebench/src/cmd/idle_streams.rs diff --git a/pageserver/pagebench/src/cmd/idle_streams.rs b/pageserver/pagebench/src/cmd/idle_streams.rs new file mode 100644 index 0000000000..73bc9f3f46 --- /dev/null +++ b/pageserver/pagebench/src/cmd/idle_streams.rs @@ -0,0 +1,127 @@ +use std::sync::Arc; + +use anyhow::anyhow; +use futures::StreamExt; +use tonic::transport::Endpoint; +use tracing::info; + +use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; +use utils::shard::ShardIndex; + +/// Starts a large number of idle gRPC GetPage streams. +#[derive(clap::Parser)] +pub(crate) struct Args { + /// The Pageserver to connect to. Must use grpc://. + #[clap(long, default_value = "grpc://localhost:51051")] + server: String, + /// The Pageserver HTTP API. + #[clap(long, default_value = "http://localhost:9898")] + http_server: String, + /// The number of streams to open. + #[clap(long, default_value = "100000")] + count: usize, + /// Number of streams per connection. 
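+    /// Streams are multiplexed over a shared gRPC channel; a new connection is opened for each batch of this size.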
+ #[clap(long, default_value = "100")] + per_connection: usize, + /// Send a single GetPage request on each stream. + #[clap(long, default_value_t = false)] + send_request: bool, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + rt.block_on(main_impl(args)) +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + // Discover a tenant and timeline to use. + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + reqwest::Client::new(), + args.http_server.clone(), + None, + )); + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: Some(1), + targets: None, + }, + ) + .await?; + let ttid = timelines + .first() + .ok_or_else(|| anyhow!("no timelines found"))?; + + // Set up the initial client. + let endpoint = Endpoint::from_shared(args.server.clone())?; + + let connect = async || { + pageserver_page_api::Client::new( + endpoint.connect().await?, + ttid.tenant_id, + ttid.timeline_id, + ShardIndex::unsharded(), + None, + None, + ) + }; + + let mut client = connect().await?; + let mut streams = Vec::with_capacity(args.count); + + // Create streams. + for i in 0..args.count { + if i % 100 == 0 { + info!("opened {}/{} streams", i, args.count); + } + if i % args.per_connection == 0 && i > 0 { + client = connect().await?; + } + + let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel(); + let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx); + let mut resp_stream = client.get_pages(req_stream).await?; + + // Send request if specified. + if args.send_request { + req_tx.send(GetPageRequest { + request_id: 1.into(), + request_class: GetPageClass::Normal, + read_lsn: ReadLsn { + request_lsn: Lsn::MAX, + not_modified_since_lsn: Some(Lsn(1)), + }, + rel: RelTag { + spcnode: 1664, // pg_global + dbnode: 0, // shared database + relnode: 1262, // pg_authid + forknum: 0, // init + }, + block_numbers: vec![0], + })?; + let resp = resp_stream + .next() + .await + .transpose()? + .ok_or_else(|| anyhow!("no response"))?; + if resp.status_code != GetPageStatusCode::Ok { + return Err(anyhow!("{} response", resp.status_code)); + } + } + + // Hold onto streams to avoid closing them. + streams.push((req_tx, resp_stream)); + } + + info!("opened {} streams, sleeping", args.count); + + // Block forever, to hold the idle streams open for inspection. 
+ futures::future::pending::<()>().await; + + Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 5527557450..6498203de3 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -17,6 +17,7 @@ mod cmd { pub(super) mod aux_files; pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; + pub(super) mod idle_streams; pub(super) mod ondemand_download_churn; pub(super) mod trigger_initial_size_calculation; } @@ -29,6 +30,7 @@ enum Args { TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), OndemandDownloadChurn(cmd::ondemand_download_churn::Args), AuxFiles(cmd::aux_files::Args), + IdleStreams(cmd::idle_streams::Args), } fn main() { @@ -49,6 +51,7 @@ fn main() { } Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), Args::AuxFiles(args) => cmd::aux_files::main(args), + Args::IdleStreams(args) => cmd::idle_streams::main(args), } .unwrap() } From d14d8271b815b57adeab6707b84ee26909f647f7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 14 Jul 2025 12:43:10 +0200 Subject: [PATCH 43/56] pageserver/client_grpc: improve retry logic (#12579) ## Problem gRPC client retries currently include pool acquisition under the per-attempt timeout. If pool acquisition is slow (e.g. full pool), this will cause spurious timeout warnings, and the caller will lose its place in the pool queue. Touches #11735. ## Summary of changes Makes several improvements to retries and related logic: * Don't include pool acquisition time under request timeouts. * Move attempt timeouts out of `Retry` and into the closure. * Make `Retry` configurable, move constants into main module. * Don't backoff on the first retry, and reduce initial/max backoffs to 5ms and 5s respectively. * Add `with_retries` and `with_timeout` helpers. * Add slow logging for pool acquisition, and a `warn_slow` counterpart to `log_slow`. * Add debug logging for requests and responses at the client boundary. --- libs/utils/src/logging.rs | 56 +++++++--- pageserver/client_grpc/src/client.rs | 160 +++++++++++++++++++-------- pageserver/client_grpc/src/retry.rs | 72 ++++++------ 3 files changed, 189 insertions(+), 99 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 5828a400a0..d67c0f123b 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,4 +1,5 @@ use std::future::Future; +use std::pin::Pin; use std::str::FromStr; use std::time::Duration; @@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; use tokio::time::Instant; -use tracing::info; +use tracing::{info, warn}; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString { /// /// TODO: consider upgrading this to a warning, but currently it fires too often. 
#[inline] -pub async fn log_slow(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O -where - F: Future, -{ +pub async fn log_slow( + name: &str, + threshold: Duration, + f: Pin<&mut impl Future>, +) -> O { monitor_slow_future( threshold, threshold, // period = threshold @@ -394,16 +396,42 @@ where if !is_slow { return; } + let elapsed = elapsed_total.as_secs_f64(); if ready { - info!( - "slow {name} completed after {:.3}s", - elapsed_total.as_secs_f64() - ); + info!("slow {name} completed after {elapsed:.3}s"); } else { - info!( - "slow {name} still running after {:.3}s", - elapsed_total.as_secs_f64() - ); + info!("slow {name} still running after {elapsed:.3}s"); + } + }, + ) + .await +} + +/// Logs a periodic warning if a future is slow to complete. +#[inline] +pub async fn warn_slow( + name: &str, + threshold: Duration, + f: Pin<&mut impl Future>, +) -> O { + monitor_slow_future( + threshold, + threshold, // period = threshold + f, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback: _, + }| { + if !is_slow { + return; + } + let elapsed = elapsed_total.as_secs_f64(); + if ready { + warn!("slow {name} completed after {elapsed:.3}s"); + } else { + warn!("slow {name} still running after {elapsed:.3}s"); } }, ) @@ -416,7 +444,7 @@ where pub async fn monitor_slow_future( threshold: Duration, period: Duration, - mut fut: std::pin::Pin<&mut F>, + mut fut: Pin<&mut F>, mut cb: impl FnMut(MonitorSlowFutureCallback), ) -> O where diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 7049fbdb96..7732585f7c 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -1,13 +1,16 @@ use std::collections::HashMap; use std::num::NonZero; +use std::pin::pin; use std::sync::Arc; +use std::time::{Duration, Instant}; use anyhow::anyhow; use arc_swap::ArcSwap; use futures::stream::FuturesUnordered; use futures::{FutureExt as _, StreamExt as _}; use tonic::codec::CompressionEncoding; -use tracing::instrument; +use tracing::{debug, instrument}; +use utils::logging::warn_slow; use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; use crate::retry::Retry; @@ -44,6 +47,23 @@ const MAX_BULK_STREAMS: NonZero = NonZero::new(16).unwrap(); /// get a larger queue depth. const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(4).unwrap(); +/// The overall request call timeout, including retries and pool acquisition. +/// TODO: should we retry forever? Should the caller decide? +const CALL_TIMEOUT: Duration = Duration::from_secs(60); + +/// The per-request (retry attempt) timeout, including any lazy connection establishment. +const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + +/// The initial request retry backoff duration. The first retry does not back off. +/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support. +const BASE_BACKOFF: Duration = Duration::from_millis(5); + +/// The maximum request retry backoff duration. +const MAX_BACKOFF: Duration = Duration::from_secs(5); + +/// Threshold and interval for warning about slow operation. +const SLOW_THRESHOLD: Duration = Duration::from_secs(3); + /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the /// basic `page_api::Client` gRPC client, and supports: /// @@ -67,8 +87,6 @@ pub struct PageserverClient { compression: Option, /// The shards for this tenant. shards: ArcSwap, - /// The retry configuration. 
- retry: Retry, } impl PageserverClient { @@ -94,7 +112,6 @@ impl PageserverClient { auth_token, compression, shards: ArcSwap::new(Arc::new(shards)), - retry: Retry, }) } @@ -142,13 +159,15 @@ impl PageserverClient { &self, req: page_api::CheckRelExistsRequest, ) -> tonic::Result { - self.retry - .with(async |_| { - // Relation metadata is only available on shard 0. - let mut client = self.shards.load_full().get_zero().client().await?; - client.check_rel_exists(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // Relation metadata is only available on shard 0. + let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) } /// Returns the total size of a database, as # of bytes. @@ -157,13 +176,15 @@ impl PageserverClient { &self, req: page_api::GetDbSizeRequest, ) -> tonic::Result { - self.retry - .with(async |_| { - // Relation metadata is only available on shard 0. - let mut client = self.shards.load_full().get_zero().client().await?; - client.get_db_size(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // Relation metadata is only available on shard 0. + let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) } /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the @@ -193,6 +214,8 @@ impl PageserverClient { return Err(tonic::Status::invalid_argument("request attempt must be 0")); } + debug!("sending request: {req:?}"); + // The shards may change while we're fetching pages. We execute the request using a stable // view of the shards (especially important for requests that span shards), but retry the // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary @@ -201,13 +224,16 @@ impl PageserverClient { // // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this // once we figure out how to handle these. - self.retry - .with(async |attempt| { - let mut req = req.clone(); - req.request_id.attempt = attempt as u32; - Self::get_page_with_shards(req, &self.shards.load_full()).await - }) - .await + let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| { + let mut req = req.clone(); + req.request_id.attempt = attempt as u32; + let shards = self.shards.load_full(); + Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await + }) + .await?; + + debug!("received response: {resp:?}"); + Ok(resp) } /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of @@ -290,13 +316,15 @@ impl PageserverClient { &self, req: page_api::GetRelSizeRequest, ) -> tonic::Result { - self.retry - .with(async |_| { - // Relation metadata is only available on shard 0. - let mut client = self.shards.load_full().get_zero().client().await?; - client.get_rel_size(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // Relation metadata is only available on shard 0. 
+ let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) } /// Fetches an SLRU segment. @@ -305,13 +333,45 @@ impl PageserverClient { &self, req: page_api::GetSlruSegmentRequest, ) -> tonic::Result { - self.retry - .with(async |_| { - // SLRU segments are only available on shard 0. - let mut client = self.shards.load_full().get_zero().client().await?; - client.get_slru_segment(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // SLRU segments are only available on shard 0. + let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) + } + + /// Runs the given async closure with retries up to the given timeout. Only certain gRPC status + /// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout. + async fn with_retries(timeout: Duration, f: F) -> tonic::Result + where + F: FnMut(usize) -> O, // pass attempt number, starting at 0 + O: Future>, + { + Retry { + timeout: Some(timeout), + base_backoff: BASE_BACKOFF, + max_backoff: MAX_BACKOFF, + } + .with(f) + .await + } + + /// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout. + async fn with_timeout( + timeout: Duration, + f: impl Future>, + ) -> tonic::Result { + let started = Instant::now(); + tokio::time::timeout(timeout, f).await.map_err(|_| { + tonic::Status::deadline_exceeded(format!( + "request timed out after {:.3}s", + started.elapsed().as_secs_f64() + )) + })? } } @@ -525,19 +585,25 @@ impl Shard { } /// Returns a pooled client for this shard. + #[instrument(skip_all)] async fn client(&self) -> tonic::Result { - self.client_pool - .get() - .await - .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}"))) + warn_slow( + "client pool acquisition", + SLOW_THRESHOLD, + pin!(self.client_pool.get()), + ) + .await + .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}"))) } /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream /// pool (e.g. for prefetches). + #[instrument(skip_all, fields(bulk))] async fn stream(&self, bulk: bool) -> StreamGuard { - match bulk { - false => self.stream_pool.get().await, - true => self.bulk_stream_pool.get().await, - } + let pool = match bulk { + false => &self.stream_pool, + true => &self.bulk_stream_pool, + }; + warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await } } diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs index a1e0b8636f..8a138711e8 100644 --- a/pageserver/client_grpc/src/retry.rs +++ b/pageserver/client_grpc/src/retry.rs @@ -1,5 +1,6 @@ use std::time::Duration; +use futures::future::pending; use tokio::time::Instant; use tracing::{error, info, warn}; @@ -8,60 +9,54 @@ use utils::backoff::exponential_backoff_duration; /// A retry handler for Pageserver gRPC requests. /// /// This is used instead of backoff::retry for better control and observability. -pub struct Retry; +pub struct Retry { + /// Timeout across all retry attempts. If None, retries forever. + pub timeout: Option, + /// The initial backoff duration. The first retry does not use a backoff. + pub base_backoff: Duration, + /// The maximum backoff duration. 
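+    /// Backoff grows exponentially from `base_backoff` and is capped at this value.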
+ pub max_backoff: Duration, +} impl Retry { - /// The per-request timeout. - // TODO: tune these, and/or make them configurable. Should we retry forever? - const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); - /// The total timeout across all attempts - const TOTAL_TIMEOUT: Duration = Duration::from_secs(60); - /// The initial backoff duration. - const BASE_BACKOFF: Duration = Duration::from_millis(10); - /// The maximum backoff duration. - const MAX_BACKOFF: Duration = Duration::from_secs(10); - /// If true, log successful requests. For debugging. - const LOG_SUCCESS: bool = false; - - /// Runs the given async closure with timeouts and retries (exponential backoff), passing the - /// attempt number starting at 0. Logs errors, using the current tracing span for context. + /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors, + /// using the current tracing span for context. /// - /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default - /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`]. + /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. pub async fn with(&self, mut f: F) -> tonic::Result where - F: FnMut(usize) -> O, // takes attempt number, starting at 0 + F: FnMut(usize) -> O, // pass attempt number, starting at 0 O: Future>, { let started = Instant::now(); - let deadline = started + Self::TOTAL_TIMEOUT; + let deadline = self.timeout.map(|timeout| started + timeout); let mut last_error = None; let mut retries = 0; loop { - // Set up a future to wait for the backoff (if any) and run the request with a timeout. + // Set up a future to wait for the backoff, if any, and run the closure. let backoff_and_try = async { // NB: sleep() always sleeps 1ms, even when given a 0 argument. See: // https://github.com/tokio-rs/tokio/issues/6866 - if let Some(backoff) = Self::backoff_duration(retries) { + if let Some(backoff) = self.backoff_duration(retries) { tokio::time::sleep(backoff).await; } - let request_started = Instant::now(); - tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries)) - .await - .map_err(|_| { - tonic::Status::deadline_exceeded(format!( - "request timed out after {:.3}s", - request_started.elapsed().as_secs_f64() - )) - })? + f(retries).await }; - // Wait for the backoff and request, or bail out if the total timeout is exceeded. + // Set up a future for the timeout, if any. + let timeout = async { + match deadline { + Some(deadline) => tokio::time::sleep_until(deadline).await, + None => pending().await, + } + }; + + // Wait for the backoff and request, or bail out if the timeout is exceeded. let result = tokio::select! { result = backoff_and_try => result, - _ = tokio::time::sleep_until(deadline) => { + _ = timeout => { let last_error = last_error.unwrap_or_else(|| { tonic::Status::deadline_exceeded(format!( "request timed out after {:.3}s", @@ -79,7 +74,7 @@ impl Retry { match result { // Success, return the result. Ok(result) => { - if retries > 0 || Self::LOG_SUCCESS { + if retries > 0 { info!( "request succeeded after {retries} retries in {:.3}s", started.elapsed().as_secs_f64(), @@ -112,12 +107,13 @@ impl Retry { } } - /// Returns the backoff duration for the given retry attempt, or None for no backoff. - fn backoff_duration(retry: usize) -> Option { + /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first + /// attempt and first retry never backs off, so this returns None for 0 and 1 retries. 
+ fn backoff_duration(&self, retries: usize) -> Option { let backoff = exponential_backoff_duration( - retry as u32, - Self::BASE_BACKOFF.as_secs_f64(), - Self::MAX_BACKOFF.as_secs_f64(), + (retries as u32).saturating_sub(1), // first retry does not back off + self.base_backoff.as_secs_f64(), + self.max_backoff.as_secs_f64(), ); (!backoff.is_zero()).then_some(backoff) } From f18cc808f09adcc5fd570cdb2a5bddd2c77a0da9 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 14 Jul 2025 12:47:26 +0200 Subject: [PATCH 44/56] pageserver/client_grpc: reap idle channels immediately (#12587) ## Problem It can take 3x the idle timeout to reap a channel. We have to wait for the idle timeout to trigger first for the stream, then the client, then the channel. Touches #11735. ## Summary of changes Reap empty channels immediately, and rely indirectly on the channel/stream timeouts. This can still lead to 2x the idle timeout for streams (first stream then client), but that's okay -- if the stream closes abruptly (e.g. due to timeout or error) we want to keep the client around in the pool for a while. --- pageserver/client_grpc/src/pool.rs | 66 +++++++++--------------------- 1 file changed, 19 insertions(+), 47 deletions(-) diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 906872e091..4a29252cd9 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -9,19 +9,20 @@ //! //! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients //! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a -//! per-channel client limit. Channels may be closed when they are no longer used by any clients. +//! per-channel client limit. Channels are closed immediately when empty, and indirectly rely on +//! client/stream idle timeouts. //! //! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared) //! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a -//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed -//! from the pool after some time, to free up the channel. +//! single caller at a time, and is returned to the pool when dropped. Idle clients are removed +//! from the pool after a while to free up resources. //! //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the //! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it //! returns a guard that can be used to send a single request, to properly enforce queue depth and //! route responses. Internally, the pool will reuse or spin up a suitable stream for the request, //! possibly pipelining multiple requests from multiple callers on the same stream (up to some -//! queue depth). Idle streams may be removed from the pool after a while to free up the client. +//! queue depth). Idle streams are removed from the pool after a while to free up resources. //! //! Each channel corresponds to one TCP connection. Each client unary request and each stream //! corresponds to one HTTP/2 stream and server task. @@ -48,14 +49,12 @@ use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; -/// Reap channels/clients/streams that have been idle for this long. +/// Reap clients/streams that have been idle for this long. 
Channels are reaped immediately when +/// empty, and indirectly rely on the client/stream idle timeouts. /// -/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to -/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle. -/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we -/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to -/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty -/// channels, and/or stream pool clients. +/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but +/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to +/// keep its client around in the pool for a while. const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) { false => Duration::from_secs(180), true => Duration::from_secs(1), // exercise reaping in tests @@ -83,8 +82,6 @@ pub struct ChannelPool { max_clients_per_channel: NonZero, /// Open channels. channels: Mutex>, - /// Reaps idle channels. - idle_reaper: Reaper, /// Channel ID generator. next_channel_id: AtomicUsize, } @@ -96,9 +93,6 @@ struct ChannelEntry { channel: Channel, /// Number of clients using this channel. clients: usize, - /// The channel has been idle (no clients) since this time. None if channel is in use. - /// INVARIANT: Some if clients == 0, otherwise None. - idle_since: Option, } impl ChannelPool { @@ -108,15 +102,12 @@ impl ChannelPool { E: TryInto + Send + Sync + 'static, >::Error: std::error::Error + Send + Sync, { - let pool = Arc::new(Self { + Ok(Arc::new(Self { endpoint: endpoint.try_into()?, max_clients_per_channel, channels: Mutex::default(), - idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), next_channel_id: AtomicUsize::default(), - }); - pool.idle_reaper.spawn(&pool); - Ok(pool) + })) } /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel. @@ -137,22 +128,17 @@ impl ChannelPool { let mut channels = self.channels.lock().unwrap(); // Try to find an existing channel with available capacity. We check entries in BTreeMap - // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients - // with lower-ordered channel IDs first. This will cluster clients in lower-ordered + // order, to fill up the lower-ordered channels first. The client/stream pools also prefer + // clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered // channels, and free up higher-ordered channels such that they can be reaped. for (&id, entry) in channels.iter_mut() { assert!( entry.clients <= self.max_clients_per_channel.get(), "channel overflow" ); - assert_eq!( - entry.idle_since.is_some(), - entry.clients == 0, - "incorrect channel idle state" - ); + assert_ne!(entry.clients, 0, "empty channel not reaped"); if entry.clients < self.max_clients_per_channel.get() { entry.clients += 1; - entry.idle_since = None; return ChannelGuard { pool: Arc::downgrade(self), id, @@ -169,7 +155,6 @@ impl ChannelPool { let entry = ChannelEntry { channel: channel.clone(), clients: 1, // account for the guard below - idle_since: None, }; channels.insert(id, entry); @@ -181,20 +166,6 @@ impl ChannelPool { } } -impl Reapable for ChannelPool { - /// Reaps channels that have been idle since before the cutoff. 
- fn reap_idle(&self, cutoff: Instant) { - self.channels.lock().unwrap().retain(|_, entry| { - let Some(idle_since) = entry.idle_since else { - assert_ne!(entry.clients, 0, "empty channel not marked idle"); - return true; - }; - assert_eq!(entry.clients, 0, "idle channel has clients"); - idle_since >= cutoff - }) - } -} - /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`, /// since the gRPC client requires an owned `Channel`. pub struct ChannelGuard { @@ -211,7 +182,7 @@ impl ChannelGuard { } } -/// Returns the channel to the pool. +/// Returns the channel to the pool. The channel is closed when empty. impl Drop for ChannelGuard { fn drop(&mut self) { let Some(pool) = self.pool.upgrade() else { @@ -220,11 +191,12 @@ impl Drop for ChannelGuard { let mut channels = pool.channels.lock().unwrap(); let entry = channels.get_mut(&self.id).expect("unknown channel"); - assert!(entry.idle_since.is_none(), "active channel marked idle"); assert!(entry.clients > 0, "channel underflow"); entry.clients -= 1; + + // Reap empty channels immediately. if entry.clients == 0 { - entry.idle_since = Some(Instant::now()); // mark channel as idle + channels.remove(&self.id); } } } From 30b877074cda2580c677ec9527b83ab975dee181 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 14 Jul 2025 13:44:53 +0200 Subject: [PATCH 45/56] pagebench: add CPU profiling support (#12478) ## Problem The new communicator gRPC client has significantly worse Pagebench performance than a basic gRPC client. We need to find out why. ## Summary of changes Add a `pagebench --profile` flag which takes a client CPU profile of the benchmark and writes a flamegraph to `profile.svg`. --- Cargo.lock | 1 + pageserver/pagebench/Cargo.toml | 1 + pageserver/pagebench/src/main.rs | 59 +++++++++++++++++++++++++------- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 14b460005a..bea8d3a7fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4296,6 +4296,7 @@ dependencies = [ "pageserver_client", "pageserver_client_grpc", "pageserver_page_api", + "pprof", "rand 0.8.5", "reqwest", "serde", diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index 4086213830..609fef2b4f 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -16,6 +16,7 @@ futures.workspace = true hdrhistogram.workspace = true humantime.workspace = true humantime-serde.workspace = true +pprof.workspace = true rand.workspace = true reqwest.workspace = true serde.workspace = true diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 6498203de3..ceca58e032 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -1,4 +1,7 @@ +use std::fs::File; + use clap::Parser; +use tracing::info; use utils::logging; /// Re-usable pieces of code that aren't CLI-specific. @@ -24,7 +27,18 @@ mod cmd { /// Component-level performance test for pageserver. #[derive(clap::Parser)] -enum Args { +struct Args { + /// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's + /// written, e.g. via --runtime. 
+ #[arg(long)] + profile: bool, + + #[command(subcommand)] + subcommand: Subcommand, +} + +#[derive(clap::Subcommand)] +enum Subcommand { Basebackup(cmd::basebackup::Args), GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), @@ -33,25 +47,46 @@ enum Args { IdleStreams(cmd::idle_streams::Args), } -fn main() { +fn main() -> anyhow::Result<()> { logging::init( logging::LogFormat::Plain, logging::TracingErrorLayerEnablement::Disabled, logging::Output::Stderr, - ) - .unwrap(); + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); let args = Args::parse(); - match args { - Args::Basebackup(args) => cmd::basebackup::main(args), - Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), - Args::TriggerInitialSizeCalculation(args) => { + + // Start a CPU profile if requested. + let mut profiler = None; + if args.profile { + profiler = Some( + pprof::ProfilerGuardBuilder::default() + .frequency(1000) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build()?, + ); + } + + match args.subcommand { + Subcommand::Basebackup(args) => cmd::basebackup::main(args), + Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), + Subcommand::TriggerInitialSizeCalculation(args) => { cmd::trigger_initial_size_calculation::main(args) } - Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), - Args::AuxFiles(args) => cmd::aux_files::main(args), - Args::IdleStreams(args) => cmd::idle_streams::main(args), + Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), + Subcommand::AuxFiles(args) => cmd::aux_files::main(args), + Subcommand::IdleStreams(args) => cmd::idle_streams::main(args), + }?; + + // Generate a CPU flamegraph if requested. + if let Some(profiler) = profiler { + let report = profiler.report().build()?; + drop(profiler); // stop profiling + let file = File::create("profile.svg")?; + report.flamegraph(file)?; + info!("wrote CPU profile flamegraph to profile.svg") } - .unwrap() + + Ok(()) } From 42ab34dc362b1b54dc96c43202b43d5ece558aa7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 14 Jul 2025 14:11:33 +0200 Subject: [PATCH 46/56] pageserver/client_grpc: don't pipeline GetPage requests (#12584) ## Problem The communicator gRPC client currently attempts to pipeline GetPage requests from multiple callers onto the same gRPC stream. This has a number of issues: * Head-of-line blocking: the request may block on e.g. layer download or LSN wait, delaying the next request. * Cancellation: we can't easily cancel in-progress requests (e.g. due to timeout or backend termination), so it may keep blocking the next request (even its own retry). * Complex stream scheduling: picking a stream becomes harder/slower, and additional Tokio tasks and synchronization is needed for stream management. Touches #11735. Requires #12579. ## Summary of changes This patch removes pipelining of gRPC stream requests, and instead prefers to scale out the number of streams to achieve the same throughput. Stream scheduling has been rewritten, and mostly follows the same pattern as the client pool with exclusive acquisition by a single caller. [Benchmarks](https://github.com/neondatabase/neon/pull/12583) show that the cost of an idle server-side GetPage worker task is about 26 KB (2.5 GB for 100,000), so we can afford to scale out. This has a number of advantages: * It (mostly) eliminates head-of-line blocking (except at the TCP level). 
* Cancellation becomes trivial, by closing the stream. * Stream scheduling becomes significantly simpler and cheaper. * Individual callers can still use client-side batching for pipelining. --- Cargo.lock | 1 + Cargo.toml | 2 +- pageserver/client_grpc/src/client.rs | 19 +- pageserver/client_grpc/src/pool.rs | 397 +++++++++++---------------- pageserver/page_api/src/model.rs | 7 +- workspace_hack/Cargo.toml | 2 +- 6 files changed, 165 insertions(+), 263 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bea8d3a7fd..2f36790d30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7564,6 +7564,7 @@ dependencies = [ "futures-core", "pin-project-lite", "tokio", + "tokio-util", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 0d521ee4d9..df2064a4a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -201,7 +201,7 @@ tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]} -tokio-stream = "0.1" +tokio-stream = { version = "0.1", features = ["sync"] } tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] } toml = "0.8" diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 7732585f7c..4b606d6939 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -32,21 +32,13 @@ const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); /// Max number of concurrent unary request clients per shard. const MAX_UNARY_CLIENTS: NonZero = NonZero::new(64).unwrap(); -/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage -/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`. +/// Max number of concurrent GetPage streams per shard. const MAX_STREAMS: NonZero = NonZero::new(64).unwrap(); -/// Max number of pipelined requests per stream. -const MAX_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(2).unwrap(); - /// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these -/// are more throughput-oriented, we have a smaller limit but higher queue depth. +/// are more throughput-oriented, we have a smaller limit. const MAX_BULK_STREAMS: NonZero = NonZero::new(16).unwrap(); -/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus -/// get a larger queue depth. -const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(4).unwrap(); - /// The overall request call timeout, including retries and pool acquisition. /// TODO: should we retry forever? Should the caller decide? const CALL_TIMEOUT: Duration = Duration::from_secs(60); @@ -272,7 +264,7 @@ impl PageserverClient { req: page_api::GetPageRequest, shard: &Shard, ) -> tonic::Result { - let stream = shard.stream(req.request_class.is_bulk()).await; + let mut stream = shard.stream(req.request_class.is_bulk()).await?; let resp = stream.send(req.clone()).await?; // Convert per-request errors into a tonic::Status. @@ -557,7 +549,6 @@ impl Shard { None, // unbounded, limited by stream pool ), Some(MAX_STREAMS), - MAX_STREAM_QUEUE_DEPTH, ); // Bulk GetPage stream pool, e.g. for prefetches. 
Uses dedicated channel/client/stream pools @@ -573,7 +564,6 @@ impl Shard { None, // unbounded, limited by stream pool ), Some(MAX_BULK_STREAMS), - MAX_BULK_STREAM_QUEUE_DEPTH, ); Ok(Self { @@ -593,13 +583,12 @@ impl Shard { pin!(self.client_pool.get()), ) .await - .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}"))) } /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream /// pool (e.g. for prefetches). #[instrument(skip_all, fields(bulk))] - async fn stream(&self, bulk: bool) -> StreamGuard { + async fn stream(&self, bulk: bool) -> tonic::Result { let pool = match bulk { false => &self.stream_pool, true => &self.bulk_stream_pool, diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 4a29252cd9..98a649b4c8 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -18,11 +18,27 @@ //! from the pool after a while to free up resources. //! //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the -//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it -//! returns a guard that can be used to send a single request, to properly enforce queue depth and -//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request, -//! possibly pipelining multiple requests from multiple callers on the same stream (up to some -//! queue depth). Idle streams are removed from the pool after a while to free up resources. +//! ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a +//! time, and is returned to the pool when dropped. Idle streams are removed from the pool after +//! a while to free up resources. +//! +//! The stream only supports sending a single, synchronous request at a time, and does not support +//! pipelining multiple requests from different callers onto the same stream -- instead, we scale +//! out concurrent streams to improve throughput. There are many reasons for this design choice: +//! +//! * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by +//! a single server task, which may block e.g. on layer downloads, LSN waits, etc. +//! +//! * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away +//! (e.g. because of a timeout), the request would still be processed by the server and block +//! requests behind it in the stream. It might even block its own timeout retry. +//! +//! * Stream scheduling becomes significantly simpler and cheaper. +//! +//! * Individual callers can still use client-side batching for pipelining. +//! +//! * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB +//! per stream (2.5 GB for 100,000 streams), so we can afford to scale out. //! //! Each channel corresponds to one TCP connection. Each client unary request and each stream //! corresponds to one HTTP/2 stream and server task. @@ -30,20 +46,20 @@ //! TODO: error handling (including custom error types). //! TODO: observability. 
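+//!
+//! Usage sketch (illustrative only; setup and error handling elided):
+//!
+//! ```ignore
+//! let mut stream = stream_pool.get().await?; // acquire an idle or new stream
+//! let resp = stream.send(req).await?;        // one in-flight request per stream
+//! // Dropping the guard returns the stream to the pool, unless the request failed.
+//! ```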
-use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; use std::num::NonZero; use std::ops::{Deref, DerefMut}; +use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, Weak}; use std::time::{Duration, Instant}; -use futures::StreamExt as _; -use tokio::sync::mpsc::{Receiver, Sender}; -use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use futures::{Stream, StreamExt as _}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch}; +use tokio_stream::wrappers::WatchStream; use tokio_util::sync::CancellationToken; use tonic::codec::CompressionEncoding; use tonic::transport::{Channel, Endpoint}; -use tracing::{error, warn}; use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; @@ -225,8 +241,7 @@ pub struct ClientPool { /// /// The first client in the map will be acquired next. The map is sorted by client ID, which in /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from - /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle - /// clients are reaped. + /// lower-ordered channels. This allows us to free up and reap higher-ordered channels. idle: Mutex>, /// Reaps idle clients. idle_reaper: Reaper, @@ -282,7 +297,7 @@ impl ClientPool { /// This is moderately performance-sensitive. It is called for every unary request, but these /// establish a new gRPC stream per request so they're already expensive. GetPage requests use /// the `StreamPool` instead. - pub async fn get(self: &Arc) -> anyhow::Result { + pub async fn get(self: &Arc) -> tonic::Result { // Acquire a permit if the pool is bounded. let mut permit = None; if let Some(limiter) = self.limiter.clone() { @@ -300,7 +315,7 @@ impl ClientPool { }); } - // Slow path: construct a new client. + // Construct a new client. let mut channel_guard = self.channel_pool.get(); let client = page_api::Client::new( channel_guard.take(), @@ -309,7 +324,8 @@ impl ClientPool { self.shard_id, self.auth_token.clone(), self.compression, - )?; + ) + .map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?; Ok(ClientGuard { pool: Arc::downgrade(self), @@ -379,287 +395,187 @@ impl Drop for ClientGuard { /// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream /// acquires a client from the inner `ClientPool` for the stream's lifetime. /// -/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send -/// a single request and await the response. Internally, requests are multiplexed across streams and -/// channels. This allows proper queue depth enforcement and response routing. +/// Individual streams only send a single request at a time, and do not pipeline multiple callers +/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily +/// to eliminate head-of-line blocking. See the module documentation for more details. /// /// TODO: consider making this generic over request and response types; not currently needed. pub struct StreamPool { /// The client pool to acquire clients from. Must be unbounded. client_pool: Arc, - /// All pooled streams. + /// Idle pooled streams. Acquired streams are removed from here and returned on drop. /// - /// Incoming requests will be sent over an existing stream with available capacity. If all - /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). 
Each - /// stream has an associated Tokio task that processes requests and responses. - streams: Mutex>, - /// The max number of concurrent streams, or None if unbounded. - max_streams: Option>, - /// The max number of concurrent requests per stream. - max_queue_depth: NonZero, - /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`. - /// None if the pool is unbounded. + /// The first stream in the map will be acquired next. The map is sorted by stream ID, which is + /// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer + /// acquiring idle streams from lower-ordered channels, which allows us to free up and reap + /// higher-ordered channels. + idle: Mutex>, + /// Limits the max number of concurrent streams. None if the pool is unbounded. limiter: Option>, /// Reaps idle streams. idle_reaper: Reaper, - /// Stream ID generator. - next_stream_id: AtomicUsize, } -type StreamID = usize; -type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>; -type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>; -type ResponseSender = oneshot::Sender>; +/// The stream ID. Reuses the inner client ID. +type StreamID = ClientID; +/// A pooled stream. struct StreamEntry { - /// Sends caller requests to the stream task. The stream task exits when this is dropped. - sender: RequestSender, - /// Number of in-flight requests on this stream. - queue_depth: usize, - /// The time when this stream went idle (queue_depth == 0). - /// INVARIANT: Some if queue_depth == 0, otherwise None. - idle_since: Option, + /// The bidirectional stream. + stream: BiStream, + /// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`. + idle_since: Instant, +} + +/// A bidirectional GetPage stream and its client. Can send requests and receive responses. +struct BiStream { + /// The owning client. Holds onto the channel slot while the stream is alive. + client: ClientGuard, + /// Stream for sending requests. Uses a watch channel, so it can only send a single request at a + /// time, and the caller must await the response before sending another request. This is + /// enforced by `StreamGuard::send`. + sender: watch::Sender, + /// Stream for receiving responses. + receiver: Pin> + Send>>, } impl StreamPool { - /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth` - /// concurrent requests on each stream, and use up to `max_streams` concurrent streams. + /// Creates a new stream pool, using the given client pool. It will use up to `max_streams` + /// concurrent streams. /// /// The client pool must be unbounded. The stream pool will enforce its own limits, and because /// streams are long-lived they can cause persistent starvation if they exhaust the client pool. /// The stream pool should generally have its own dedicated client pool (but it can share a /// channel pool with others since these are always unbounded). 
- pub fn new( - client_pool: Arc, - max_streams: Option>, - max_queue_depth: NonZero, - ) -> Arc { + pub fn new(client_pool: Arc, max_streams: Option>) -> Arc { assert!(client_pool.limiter.is_none(), "bounded client pool"); let pool = Arc::new(Self { client_pool, - streams: Mutex::default(), - limiter: max_streams.map(|max_streams| { - Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get())) - }), - max_streams, - max_queue_depth, + idle: Mutex::default(), + limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))), idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), - next_stream_id: AtomicUsize::default(), }); pool.idle_reaper.spawn(&pool); pool } - /// Acquires an available stream from the pool, or spins up a new stream async if all streams - /// are full. Returns a guard that can be used to send a single request on the stream and await - /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity - /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight). + /// Acquires an available stream from the pool, or spins up a new stream if all streams are + /// full. Returns a guard that can be used to send requests and await the responses. Blocks if + /// the pool is full. /// /// This is very performance-sensitive, as it is on the GetPage hot path. /// - /// TODO: this must do something more sophisticated for performance. We want: - /// - /// * Cheap, concurrent access in the common case where we can use a pooled stream. - /// * Quick acquisition of pooled streams with available capacity. - /// * Prefer streams that belong to lower-numbered channels, to reap idle channels. - /// * Prefer filling up existing streams' queue depth before spinning up new streams. - /// * Don't hold a lock while spinning up new streams. - /// * Allow concurrent clients to join onto streams while they're spun up. - /// * Allow spinning up multiple streams concurrently, but don't overshoot limits. - /// - /// For now, we just do something simple but inefficient (linear scan under mutex). - pub async fn get(self: &Arc) -> StreamGuard { + /// TODO: is a `Mutex` performant enough? Will it become too contended? We can't + /// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first + /// to free up higher-ordered channels. + pub async fn get(self: &Arc) -> tonic::Result { // Acquire a permit if the pool is bounded. let mut permit = None; if let Some(limiter) = self.limiter.clone() { permit = Some(limiter.acquire_owned().await.expect("never closed")); } - let mut streams = self.streams.lock().unwrap(); - // Look for a pooled stream with available capacity. - for (&id, entry) in streams.iter_mut() { - assert!( - entry.queue_depth <= self.max_queue_depth.get(), - "stream queue overflow" - ); - assert_eq!( - entry.idle_since.is_some(), - entry.queue_depth == 0, - "incorrect stream idle state" - ); - if entry.queue_depth < self.max_queue_depth.get() { - entry.queue_depth += 1; - entry.idle_since = None; - return StreamGuard { - pool: Arc::downgrade(self), - id, - sender: entry.sender.clone(), - permit, - }; - } + // Fast path: acquire an idle stream from the pool. + if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() { + return Ok(StreamGuard { + pool: Arc::downgrade(self), + stream: Some(entry.stream), + can_reuse: true, + permit, + }); } - // No available stream, spin up a new one. 
We install the stream entry in the pool first and - // return the guard, while spinning up the stream task async. This allows other callers to - // join onto this stream and also create additional streams concurrently if this fills up. - let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); - let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get()); - let entry = StreamEntry { - sender: req_tx.clone(), - queue_depth: 1, // reserve quota for this caller - idle_since: None, - }; - streams.insert(id, entry); + // Spin up a new stream. Uses a watch channel to send a single request at a time, since + // `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead. + let mut client = self.client_pool.get().await?; - if let Some(max_streams) = self.max_streams { - assert!(streams.len() <= max_streams.get(), "stream overflow"); - }; + let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default()); + let req_stream = WatchStream::from_changes(req_rx); + let resp_stream = client.get_pages(req_stream).await?; - let client_pool = self.client_pool.clone(); - let pool = Arc::downgrade(self); - - tokio::spawn(async move { - if let Err(err) = Self::run_stream(client_pool, req_rx).await { - error!("stream failed: {err}"); - } - // Remove stream from pool on exit. Weak reference to avoid holding the pool alive. - if let Some(pool) = pool.upgrade() { - let entry = pool.streams.lock().unwrap().remove(&id); - assert!(entry.is_some(), "unknown stream ID: {id}"); - } - }); - - StreamGuard { + Ok(StreamGuard { pool: Arc::downgrade(self), - id, - sender: req_tx, + stream: Some(BiStream { + client, + sender: req_tx, + receiver: Box::pin(resp_stream), + }), + can_reuse: true, permit, - } - } - - /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a - /// bidirectional GetPage stream, then forwards requests and responses between callers and the - /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be - /// atomic with pool stream acquisition. - /// - /// The task exits when the request channel is closed, or on a stream error. The caller is - /// responsible for removing the stream from the pool on exit. - async fn run_stream( - client_pool: Arc, - mut caller_rx: RequestReceiver, - ) -> anyhow::Result<()> { - // Acquire a client from the pool and create a stream. - let mut client = client_pool.get().await?; - - // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could - // theoretically deadlock if both the client and server block on sends (since we're not - // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and - // low queue depths, but it was seen to happen with the libpq protocol so better safe than - // sorry. It should never buffer more than the queue depth anyway, but using an unbounded - // channel guarantees that it will never block. - let (req_tx, req_rx) = mpsc::unbounded_channel(); - let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx); - let mut resp_stream = client.get_pages(req_stream).await?; - - // Track caller response channels by request ID. If the task returns early, these response - // channels will be dropped and the waiting callers will receive an error. - // - // NB: this will leak entries if the server doesn't respond to a request (by request ID). - // It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and - // block further use. 
But we could consider reaping closed channels after some time. - let mut callers = HashMap::new(); - - // Process requests and responses. - loop { - tokio::select! { - // Receive requests from callers and send them to the stream. - req = caller_rx.recv() => { - // Shut down if request channel is closed. - let Some((req, resp_tx)) = req else { - return Ok(()); - }; - - // Store the response channel by request ID. - if callers.contains_key(&req.request_id) { - // Error on request ID duplicates. Ignore callers that went away. - _ = resp_tx.send(Err(tonic::Status::invalid_argument( - format!("duplicate request ID: {}", req.request_id), - ))); - continue; - } - callers.insert(req.request_id, resp_tx); - - // Send the request on the stream. Bail out if the stream is closed. - req_tx.send(req).map_err(|_| { - tonic::Status::unavailable("stream closed") - })?; - } - - // Receive responses from the stream and send them to callers. - resp = resp_stream.next() => { - // Shut down if the stream is closed, and bail out on stream errors. - let Some(resp) = resp.transpose()? else { - return Ok(()) - }; - - // Send the response to the caller. Ignore errors if the caller went away. - let Some(resp_tx) = callers.remove(&resp.request_id) else { - warn!("received response for unknown request ID: {}", resp.request_id); - continue; - }; - _ = resp_tx.send(Ok(resp)); - } - } - } + }) } } impl Reapable for StreamPool { /// Reaps streams that have been idle since before the cutoff. fn reap_idle(&self, cutoff: Instant) { - self.streams.lock().unwrap().retain(|_, entry| { - let Some(idle_since) = entry.idle_since else { - assert_ne!(entry.queue_depth, 0, "empty stream not marked idle"); - return true; - }; - assert_eq!(entry.queue_depth, 0, "idle stream has requests"); - idle_since >= cutoff - }); + self.idle + .lock() + .unwrap() + .retain(|_, entry| entry.idle_since >= cutoff); } } -/// A pooled stream reference. Can be used to send a single request, to properly enforce queue -/// depth. Queue depth is already reserved and will be returned on drop. +/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still +/// in-flight requests on the stream, or the stream failed. pub struct StreamGuard { pool: Weak, - id: StreamID, - sender: RequestSender, + stream: Option, // Some until dropped + can_reuse: bool, // returned to pool if true permit: Option, // None if pool is unbounded } impl StreamGuard { - /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only - /// valid for a single request (to enforce queue depth). This also drops the guard on return and - /// returns the queue depth quota to the pool. + /// Sends a request on the stream and awaits the response. If the future is dropped before it + /// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the + /// request and is not returned to the pool. The same is true if the stream errors, in which + /// case the caller can't send further requests on the stream. /// - /// The `GetPageRequest::request_id` must be unique across in-flight requests. + /// We only support sending a single request at a time, to eliminate head-of-line blocking. See + /// module documentation for details. /// /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status` /// to avoid tearing down the stream for per-request errors. Callers must check this. 
pub async fn send( - self, + &mut self, req: page_api::GetPageRequest, ) -> tonic::Result { - let (resp_tx, resp_rx) = oneshot::channel(); + let req_id = req.request_id; + let stream = self.stream.as_mut().expect("not dropped"); - self.sender - .send((req, resp_tx)) - .await + // Mark the stream as not reusable while the request is in flight. We can't return the + // stream to the pool until we receive the response, to avoid head-of-line blocking and + // stale responses. Failed streams can't be reused either. + if !self.can_reuse { + return Err(tonic::Status::internal("stream can't be reused")); + } + self.can_reuse = false; + + // Send the request and receive the response. + // + // NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests. + stream + .sender + .send(req) .map_err(|_| tonic::Status::unavailable("stream closed"))?; - resp_rx + let resp = stream + .receiver + .next() .await - .map_err(|_| tonic::Status::unavailable("stream closed"))? + .ok_or_else(|| tonic::Status::unavailable("stream closed"))??; + + if resp.request_id != req_id { + return Err(tonic::Status::internal(format!( + "response ID {} does not match request ID {}", + resp.request_id, req_id + ))); + } + + // Success, mark the stream as reusable. + self.can_reuse = true; + + Ok(resp) } } @@ -669,26 +585,21 @@ impl Drop for StreamGuard { return; // pool was dropped }; - // Release the queue depth reservation on drop. This can prematurely decrement it if dropped - // before the response is received, but that's okay. - // - // TODO: actually, it's probably not okay. Queue depth release should be moved into the - // stream task, such that it continues to account for the queue depth slot until the server - // responds. Otherwise, if a slow request times out and keeps blocking the stream, the - // server will keep waiting on it and we can pile on subsequent requests (including the - // timeout retry) in the same stream and get blocked. But we may also want to avoid blocking - // requests on e.g. LSN waits and layer downloads, instead returning early to free up the - // stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line - // blocking. TBD. - let mut streams = pool.streams.lock().unwrap(); - let entry = streams.get_mut(&self.id).expect("unknown stream"); - assert!(entry.idle_since.is_none(), "active stream marked idle"); - assert!(entry.queue_depth > 0, "stream queue underflow"); - entry.queue_depth -= 1; - if entry.queue_depth == 0 { - entry.idle_since = Some(Instant::now()); // mark stream as idle + // If the stream isn't reusable, it can't be returned to the pool. + if !self.can_reuse { + return; } + // Place the idle stream back into the pool. + let entry = StreamEntry { + stream: self.stream.take().expect("dropped once"), + idle_since: Instant::now(), + }; + pool.idle + .lock() + .unwrap() + .insert(entry.stream.client.id, entry); + _ = self.permit; // returned on drop, referenced for visibility } } diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index a9dd154285..76355ae546 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -49,7 +49,7 @@ impl From for tonic::Status { } /// The LSN a request should read at. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Default)] pub struct ReadLsn { /// The request's read LSN. pub request_lsn: Lsn, @@ -329,7 +329,7 @@ impl From for proto::GetDbSizeResponse { } /// Requests one or more pages. 
-#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct GetPageRequest { /// A request ID. Will be included in the response. Should be unique for in-flight requests on /// the stream. @@ -430,12 +430,13 @@ impl From for proto::RequestId { } /// A GetPage request class. -#[derive(Clone, Copy, Debug, strum_macros::Display)] +#[derive(Clone, Copy, Debug, Default, strum_macros::Display)] pub enum GetPageClass { /// Unknown class. For backwards compatibility: used when an older client version sends a class /// that a newer server version has removed. Unknown, /// A normal request. This is the default. + #[default] Normal, /// A prefetch request. NB: can only be classified on pg < 18. Prefetch, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index fc01deb92d..c61598cdf6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -98,7 +98,7 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } -tokio-stream = { version = "0.1", features = ["net"] } +tokio-stream = { version = "0.1", features = ["net", "sync"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] } From a203f9829a87fc47deece609b4a35b6239bd7322 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 14 Jul 2025 14:30:28 +0200 Subject: [PATCH 47/56] pageserver: add timeline_id span when freezing layers (#12572) ## Problem We don't log the timeline ID when rolling ephemeral layers during housekeeping. Resolves [LKB-179](https://databricks.atlassian.net/browse/LKB-179) ## Summary of changes Add a span with timeline ID when calling `maybe_freeze_ephemeral_layer` from the housekeeping loop. We don't instrument the function itself, since future callers may not have a span including the tenant_id already, but we don't want to duplicate the tenant_id for these spans. --- pageserver/src/tenant.rs | 8 +++++++- pageserver/src/tenant/timeline.rs | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f75a03a508..1a3016e7f1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3393,7 +3393,13 @@ impl TenantShard { .collect_vec(); for timeline in timelines { - timeline.maybe_freeze_ephemeral_layer().await; + // Include a span with the timeline ID. The parent span already has the tenant ID. + let span = + info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id); + timeline + .maybe_freeze_ephemeral_layer() + .instrument(span) + .await; } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fe622713e9..f2833674a9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1893,6 +1893,8 @@ impl Timeline { // an ephemeral layer open forever when idle. It also freezes layers if the global limit on // ephemeral layer bytes has been breached. 
pub(super) async fn maybe_freeze_ephemeral_layer(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + let Ok(mut write_guard) = self.write_lock.try_lock() else { // If the write lock is held, there is an active wal receiver: rolling open layers // is their responsibility while they hold this lock. From eb830fa547f61aaaa582d765b440b156b6a780f2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 14 Jul 2025 15:22:38 +0200 Subject: [PATCH 48/56] pageserver/client_grpc: use unbounded pools (#12585) ## Problem The communicator gRPC client currently uses bounded client/stream pools. This can artificially constrain clients, especially after we remove pipelining in #12584. [Benchmarks](https://github.com/neondatabase/neon/pull/12583) show that the cost of an idle server-side GetPage worker task is about 26 KB (2.5 GB for 100,000), so we can afford to scale out. In the worst case, we'll degenerate to the current libpq state with one stream per backend, but without the TCP connection overhead. In the common case we expect significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits, read coalescing, sharding (backends typically only talk to one shard at a time), etc. Currently, Pageservers rarely serve more than 4000 backend connections, so we have at least 2 orders of magnitude of headroom. Touches #11735. Requires #12584. ## Summary of changes Remove the pool limits, and restructure the pools. We still keep a separate bulk pool for Getpage batches of >4 pages (>32 KB), with fewer streams per connection. This reduces TCP-level congestion and head-of-line blocking for non-bulk requests, and concentrates larger window sizes on a smaller set of streams/connections, presumably reducing memory usage. Apart from this, bulk requests don't have any latency penalty compared to other requests. --- pageserver/client_grpc/src/client.rs | 104 ++++++++++++++------------- pageserver/page_api/src/model.rs | 13 ---- 2 files changed, 55 insertions(+), 62 deletions(-) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 4b606d6939..3a9edc7092 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -24,20 +24,23 @@ use utils::shard::{ShardCount, ShardIndex, ShardNumber}; /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up /// when full. /// +/// Normal requests are small, and we don't pipeline them, so we can afford a large number of +/// streams per connection. +/// /// TODO: tune all of these constants, and consider making them configurable. -/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels -/// with only streams. -const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); +const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(64).unwrap(); -/// Max number of concurrent unary request clients per shard. -const MAX_UNARY_CLIENTS: NonZero = NonZero::new(64).unwrap(); +/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a +/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and +/// transmission delays. This also concentrates large window sizes on a smaller set of +/// streams/connections, presumably reducing memory use. +const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); -/// Max number of concurrent GetPage streams per shard. 
-const MAX_STREAMS: NonZero = NonZero::new(64).unwrap(); - -/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these -/// are more throughput-oriented, we have a smaller limit. -const MAX_BULK_STREAMS: NonZero = NonZero::new(16).unwrap(); +/// The batch size threshold at which a GetPage request will use the bulk stream pool. +/// +/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window +/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool. +const BULK_THRESHOLD_BATCH_SIZE: usize = 5; /// The overall request call timeout, including retries and pool acquisition. /// TODO: should we retry forever? Should the caller decide? @@ -62,10 +65,19 @@ const SLOW_THRESHOLD: Duration = Duration::from_secs(3); /// * Sharded tenants across multiple Pageservers. /// * Pooling of connections, clients, and streams for efficient resource use. /// * Concurrent use by many callers. -/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling. +/// * Internal handling of GetPage bidirectional streams. /// * Automatic retries. /// * Observability. /// +/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These +/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all +/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly +/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000 +/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with +/// one stream per backend, but without the TCP connection overhead. In the common case we expect +/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits, +/// read coalescing, sharding (backends typically only talk to one shard at a time), etc. +/// /// TODO: this client does not support base backups or LSN leases, as these are only used by /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards. pub struct PageserverClient { @@ -264,7 +276,7 @@ impl PageserverClient { req: page_api::GetPageRequest, shard: &Shard, ) -> tonic::Result { - let mut stream = shard.stream(req.request_class.is_bulk()).await?; + let mut stream = shard.stream(Self::is_bulk(&req)).await?; let resp = stream.send(req.clone()).await?; // Convert per-request errors into a tonic::Status. @@ -365,6 +377,11 @@ impl PageserverClient { )) })? } + + /// Returns true if the request is considered a bulk request and should use the bulk pool. + fn is_bulk(req: &page_api::GetPageRequest) -> bool { + req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE + } } /// Shard specification for a PageserverClient. @@ -492,15 +509,23 @@ impl Shards { } } -/// A single shard. Uses dedicated resource pools with the following structure: +/// A single shard. Has dedicated resource pools with the following structure: /// -/// * Channel pool: unbounded. -/// * Unary client pool: MAX_UNARY_CLIENTS. -/// * Stream client pool: unbounded. -/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH. -/// * Bulk channel pool: unbounded. +/// * Channel pool: MAX_CLIENTS_PER_CHANNEL. +/// * Client pool: unbounded. +/// * Stream pool: unbounded. +/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL. /// * Bulk client pool: unbounded. -/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH. 
+/// * Bulk stream pool: unbounded. +/// +/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests. +/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a +/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools +/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly +/// similar (except for TCP transmission time). +/// +/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However, +/// the code is fairly trivial, so we may as well keep them around for now in case we need them. struct Shard { /// The shard ID. id: ShardIndex, @@ -508,7 +533,7 @@ struct Shard { client_pool: Arc, /// GetPage stream pool. stream_pool: Arc, - /// GetPage stream pool for bulk requests, e.g. prefetches. + /// GetPage stream pool for bulk requests. bulk_stream_pool: Arc, } @@ -522,48 +547,30 @@ impl Shard { auth_token: Option, compression: Option, ) -> anyhow::Result { - // Common channel pool for unary and stream requests. Bounded by client/stream pools. - let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?; - - // Client pool for unary requests. + // Shard pools for unary requests and non-bulk GetPage requests. let client_pool = ClientPool::new( - channel_pool.clone(), + ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?, tenant_id, timeline_id, shard_id, auth_token.clone(), compression, - Some(MAX_UNARY_CLIENTS), + None, // unbounded ); + let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded - // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients, - // but shares a channel pool with it (as it's unbounded). - let stream_pool = StreamPool::new( - ClientPool::new( - channel_pool.clone(), - tenant_id, - timeline_id, - shard_id, - auth_token.clone(), - compression, - None, // unbounded, limited by stream pool - ), - Some(MAX_STREAMS), - ); - - // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools - // to avoid head-of-line blocking of latency-sensitive requests. + // Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.). let bulk_stream_pool = StreamPool::new( ClientPool::new( - ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?, + ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?, tenant_id, timeline_id, shard_id, auth_token, compression, - None, // unbounded, limited by stream pool + None, // unbounded, ), - Some(MAX_BULK_STREAMS), + None, // unbounded ); Ok(Self { @@ -585,8 +592,7 @@ impl Shard { .await } - /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream - /// pool (e.g. for prefetches). + /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool. #[instrument(skip_all, fields(bulk))] async fn stream(&self, bulk: bool) -> tonic::Result { let pool = match bulk { diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 76355ae546..a3286ecf15 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -444,19 +444,6 @@ pub enum GetPageClass { Background, } -impl GetPageClass { - /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than - /// latency-sensitive). 
-    pub fn is_bulk(&self) -> bool {
-        match self {
-            Self::Unknown => false,
-            Self::Normal => false,
-            Self::Prefetch => true,
-            Self::Background => true,
-        }
-    }
-}
-
 impl From<proto::GetPageClass> for GetPageClass {
     fn from(pb: proto::GetPageClass) -> Self {
         match pb {
From 4fedcbc0ac94d399808384911b92f8417b74c286 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Mon, 14 Jul 2025 15:25:25 +0200
Subject: [PATCH 49/56] Leverage the existing mechanism to retry 404 errors instead of implementing new code. (#12567)

## Problem
In https://github.com/neondatabase/neon/pull/12513, new code was added to retry 404 errors caused by replication lag.
However, this introduced new retry logic and made the script more complicated, while an existing mechanism is already available in `neon_api.py`.

## Summary of changes
The existing mechanism is used to retry 404 errors.

---------

Co-authored-by: Alexey Masterov
---
 test_runner/fixtures/neon_api.py          | 19 +++++++++++++------
 test_runner/random_ops/test_random_ops.py | 22 +++-------------------
 2 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
index 9d85b9a332..e0f16abe77 100644
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -34,7 +34,9 @@ class NeonAPI:
         self.retries524 = 0
         self.retries4xx = 0
 
-    def __request(self, method: str | bytes, endpoint: str, **kwargs: Any) -> requests.Response:
+    def __request(
+        self, method: str | bytes, endpoint: str, retry404: bool = False, **kwargs: Any
+    ) -> requests.Response:
         kwargs["headers"] = kwargs.get("headers", {})
         kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
 
@@ -55,10 +57,12 @@ class NeonAPI:
                 resp.raise_for_status()
                 break
             elif resp.status_code >= 400:
-                if resp.status_code == 422:
-                    if resp.json()["message"] == "branch not ready yet":
-                        retry = True
-                        self.retries4xx += 1
+                if resp.status_code == 404 and retry404:
+                    retry = True
+                    self.retries4xx += 1
+                elif resp.status_code == 422 and resp.json()["message"] == "branch not ready yet":
+                    retry = True
+                    self.retries4xx += 1
                 elif resp.status_code == 423 and resp.json()["message"] in {
                     "endpoint is in some transitive state, could not suspend",
                     "project already has running conflicting operations, scheduling of new ones is prohibited",
                 }:
                     retry = True
                     self.retries4xx += 1
                 elif resp.status_code == 524:
-                    log.info("The request was timed out, trying to get operations")
+                    log.info("The request was timed out")
                     retry = True
                     self.retries524 += 1
                 if retry:
@@ -203,6 +207,9 @@ class NeonAPI:
         resp = self.__request(
             "GET",
             f"/projects/{project_id}/branches/{branch_id}",
+            # XXX Retry get parent details to work around the issue
+            # https://databricks.atlassian.net/browse/LKB-279
+            retry404=True,
             headers={
                 "Accept": "application/json",
             },
diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py
index 5c43b06bc5..b106e9b729 100644
--- a/test_runner/random_ops/test_random_ops.py
+++ b/test_runner/random_ops/test_random_ops.py
@@ -13,7 +13,6 @@ from typing import TYPE_CHECKING, Any
 
 import pytest
 from fixtures.log_helper import log
-from requests import HTTPError
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -153,26 +152,11 @@ class NeonBranch:
             return
         self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"])
         parent_id: str = res["branch"]["parent_id"]
-        # XXX Retry get parent details to work around the issue
-        # 
https://databricks.atlassian.net/browse/LKB-279 - target_time = datetime.now() + timedelta(seconds=30) - while datetime.now() < target_time: - try: - parent_def = self.neon_api.get_branch_details(self.project_id, parent_id) - except HTTPError as he: - if he.response.status_code == 404: - log.info("Branch not found, waiting...") - time.sleep(1) - else: - raise HTTPError(he) from he - else: - break - else: - raise RuntimeError(f"Branch {parent_id} not found") - # Creates an object for the parent branch # After the reset operation a new parent branch is created - parent = NeonBranch(self.project, parent_def, True) + parent = NeonBranch( + self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True + ) self.project.branches[parent_id] = parent self.parent = parent parent.children[self.id] = self From 2288efae662e41fcd2cf7369e3b4b9dc95d25e95 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Mon, 14 Jul 2025 14:41:31 +0100 Subject: [PATCH 50/56] Performance test for LFC prewarm (#12524) https://github.com/neondatabase/cloud/issues/19011 Measure relative performance for prewarmed and non-prewarmed endpoints. Add test that runs on every commit, and one performance test with a remote cluster. --- .github/actionlint.yml | 1 + .github/workflows/benchmarking.yml | 72 +++++++++ test_runner/fixtures/neon_api.py | 4 + test_runner/performance/test_lfc_prewarm.py | 167 ++++++++++++++++++++ 4 files changed, 244 insertions(+) create mode 100644 test_runner/performance/test_lfc_prewarm.py diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 3142a36fa0..25b2fc702a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -31,6 +31,7 @@ config-variables: - NEON_PROD_AWS_ACCOUNT_ID - PGREGRESS_PG16_PROJECT_ID - PGREGRESS_PG17_PROJECT_ID + - PREWARM_PGBENCH_SIZE - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 79371ec704..df80bad579 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -219,6 +219,7 @@ jobs: --ignore test_runner/performance/test_cumulative_statistics_persistence.py --ignore test_runner/performance/test_perf_many_relations.py --ignore test_runner/performance/test_perf_oltp_large_tenant.py + --ignore test_runner/performance/test_lfc_prewarm.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -410,6 +411,77 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + prewarm-test: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 17 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - 
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Run prewarm benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_lfc_prewarm.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index e0f16abe77..bb618325e0 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -314,6 +314,10 @@ class NeonAPI: if endpoint_type: data["endpoint"]["type"] = endpoint_type if settings: + # otherwise we get 400 "settings must not be nil" + # TODO(myrrc): fix on cplane side + if "pg_settings" not in settings: + settings["pg_settings"] = {} data["endpoint"]["settings"] = settings resp = self.__request( diff --git a/test_runner/performance/test_lfc_prewarm.py b/test_runner/performance/test_lfc_prewarm.py new file mode 100644 index 0000000000..ad2c759a63 --- /dev/null +++ b/test_runner/performance/test_lfc_prewarm.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import os +import timeit +import traceback +from concurrent.futures import ThreadPoolExecutor as Exec +from pathlib import Path +from time import sleep +from typing import TYPE_CHECKING, Any, cast + +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker, PgBenchRunResult +from fixtures.log_helper import log +from fixtures.neon_api import NeonAPI, connection_parameters_to_env + +if TYPE_CHECKING: + from fixtures.compare_fixtures import NeonCompare + from fixtures.neon_fixtures import Endpoint, PgBin + from fixtures.pg_version import PgVersion + +from performance.test_perf_pgbench import utc_now_timestamp + +# These tests compare performance for a write-heavy and read-heavy workloads of an ordinary endpoint +# compared to the endpoint which saves its LFC and prewarms using it on startup. 
+
+
+def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    pg_bin = neon_compare.pg_bin
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    for ep in [ep_normal, ep_prewarmed]:
+        connstr: str = ep.connstr()
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", connstr, "-s100"])
+        ep.safe_psql("CREATE EXTENSION neon")
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr])
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        name: str = cast("str", ep.branch_name)
+        neon_compare.zenbenchmark.record_pg_bench_result(name, res)
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(30 * 60)
+def test_compare_prewarmed_pgbench_perf_benchmark(
+    pg_bin: PgBin,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    name = f"Test prewarmed pgbench performance, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
+    project = neon_api.create_project(pg_version, name)
+    project_id = project["project"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+    err = False
+    try:
+        benchmark_impl(pg_bin, neon_api, project, zenbenchmark)
+    except Exception as e:
+        err = True
+        log.error(f"Caught exception: {e}")
+        log.error(traceback.format_exc())
+    finally:
+        # Delete the project before asserting, so cleanup runs even on failure.
+        neon_api.delete_project(project_id)
+        assert not err
+
+
+def benchmark_impl(
+    pg_bin: PgBin, neon_api: NeonAPI, project: dict[str, Any], zenbenchmark: NeonBenchmarker
+):
+    pgbench_size = int(os.getenv("PGBENCH_SIZE") or "3424")  # 50GB
+    offload_secs = 20
+    test_duration_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60}"
+    # prewarm API is not publicly exposed.
In order to test performance of a
+    # fully prewarmed endpoint, wait after it restarts
+    prewarmed_sleep_secs = 30
+
+    branch_id = project["branch"]["id"]
+    project_id = project["project"]["id"]
+    normal_env = connection_parameters_to_env(
+        project["connection_uris"][0]["connection_parameters"]
+    )
+    normal_id = project["endpoints"][0]["id"]
+
+    prewarmed_branch_id = neon_api.create_branch(
+        project_id, "prewarmed", parent_id=branch_id, add_endpoint=False
+    )["branch"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    ep_prewarmed = neon_api.create_endpoint(
+        project_id,
+        prewarmed_branch_id,
+        endpoint_type="read_write",
+        settings={"autoprewarm": True, "offload_lfc_interval_seconds": offload_secs},
+    )
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    prewarmed_env = normal_env.copy()
+    prewarmed_env["PGHOST"] = ep_prewarmed["endpoint"]["host"]
+    prewarmed_id = ep_prewarmed["endpoint"]["id"]
+
+    def bench(endpoint_name, endpoint_id, env):
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{pgbench_size}"], env)
+        sleep(offload_secs * 2)  # ensure LFC is offloaded after pgbench finishes
+        neon_api.restart_endpoint(project_id, endpoint_id)
+        sleep(prewarmed_sleep_secs)
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env)
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        zenbenchmark.record_pg_bench_result(endpoint_name, res)
+
+    with Exec(max_workers=2) as exe:
+        exe.submit(bench, "normal", normal_id, normal_env)
+        exe.submit(bench, "prewarmed", prewarmed_id, prewarmed_env)
+
+
+def test_compare_prewarmed_read_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    sql = [
+        "CREATE EXTENSION neon",
+        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
+        "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
+    ]
+    for ep in [ep_normal, ep_prewarmed]:
+        ep.safe_psql_many(sql)
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+        with neon_compare.record_duration(f"{ep.branch_name}_run_duration"):
+            ep.safe_psql("SELECT count(*) from foo")
From f67a8a173ec889a163f0b89b43dd6957da45b82c Mon Sep 17 00:00:00 2001
From: HaoyuHuang
Date: Mon, 14 Jul 2025 09:37:04 -0700
Subject: [PATCH 51/56] A few SK changes (#12577)

# TLDR
This PR is a no-op.

## Problem
When a SK loses a disk, it must recover all WALs from the very beginning. This may take days or weeks to catch up to the latest WALs for all timelines it owns.

## Summary of changes
When a SK starts up, if it finds that it has 0 timelines,
- it asks the SC for the timelines it owns.
- it then pulls those timelines from its peer safekeepers to restore WAL redundancy right away. Once pulling completes, the safekeeper becomes active and accepts new WALs.

The current impl is a prototype. We can optimize it further, e.g., by pulling timelines in parallel.
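The startup flow looks roughly like the sketch below (`hcc_pull_timelines` in `safekeeper/src/hadron.rs` is the real entry point; `fetch_owned_timelines` and `pull_one_timeline` are hypothetical stand-ins for its internals):

```rust
// Illustrative sketch only, not the actual implementation in hadron.rs.
async fn pull_timelines_on_startup(
    conf: &SafeKeeperConf,
    global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
    // 1. Ask the storage controller which timelines this safekeeper owns.
    let resp: SafekeeperTimelinesResponse = fetch_owned_timelines(conf).await?; // hypothetical helper
    // 2. Restore WAL redundancy by pulling each timeline from its peer safekeepers.
    for timeline in resp.timelines {
        // `resp.safekeeper_peers` maps peer node IDs to reachable HTTP addresses.
        pull_one_timeline(&timeline, &resp.safekeeper_peers, &global_timelines).await?; // hypothetical helper
    }
    Ok(())
}
```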
---------

Co-authored-by: Haoyu Huang
---
 control_plane/storcon_cli/src/main.rs      |   1 +
 libs/pageserver_api/src/controller_api.rs  |  39 ++
 libs/utils/src/ip_address.rs               |  73 ++++
 libs/utils/src/lib.rs                      |   3 +
 pageserver/src/controller_upcall_client.rs |   1 +
 safekeeper/client/src/mgmt_api.rs          |   9 +-
 safekeeper/src/bin/safekeeper.rs           |  24 ++
 safekeeper/src/hadron.rs                   | 388 ++++++++++++++++++
 safekeeper/src/http/routes.rs              |  11 +-
 safekeeper/src/lib.rs                      |  12 +
 safekeeper/src/metrics.rs                  |  37 ++
 safekeeper/src/pull_timeline.rs            | 128 ++++--
 .../tests/walproposer_sim/safekeeper.rs    |   5 +
 test_runner/regress/test_wal_restore.py    | 113 +++++
 14 files changed, 808 insertions(+), 36 deletions(-)
 create mode 100644 libs/utils/src/ip_address.rs
 create mode 100644 safekeeper/src/hadron.rs

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 24fd34a87a..fcc5549beb 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -476,6 +476,7 @@ async fn main() -> anyhow::Result<()> {
                     listen_http_port,
                     listen_https_port,
                     availability_zone_id: AvailabilityZone(availability_zone_id),
+                    node_ip_addr: None,
                 }),
             )
             .await?;
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index b02c6a613a..8f86b03f72 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,5 +1,6 @@
 use std::collections::{HashMap, HashSet};
 use std::fmt::Display;
+use std::net::IpAddr;
 use std::str::FromStr;
 use std::time::{Duration, Instant};
 
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
     pub listen_https_port: Option<u16>,
 
     pub availability_zone_id: AvailabilityZone,
+
+    // Reachable IP address of the PS/SK registering, if known.
+    // Hadron Cluster Coordinator will update the DNS record of the registering node
+    // with this IP address.
+    pub node_ip_addr: Option<IpAddr>,
 }
 
 #[derive(Serialize, Deserialize)]
@@ -545,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
     pub scheduling_policy: SkSchedulingPolicy,
 }
 
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineSafekeeperPeer {
+    pub node_id: NodeId,
+    pub listen_http_addr: String,
+    pub http_port: i32,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimeline {
+    // SC does not know the tenant id.
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimelinesResponse {
+    pub timelines: Vec<SCSafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimeline {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimelinesResponse {
+    pub timelines: Vec<SafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
 #[derive(Serialize, Deserialize, Clone)]
 pub struct SafekeeperSchedulingPolicyRequest {
     pub scheduling_policy: SkSchedulingPolicy,
diff --git a/libs/utils/src/ip_address.rs b/libs/utils/src/ip_address.rs
new file mode 100644
index 0000000000..d0834d0ba5
--- /dev/null
+++ b/libs/utils/src/ip_address.rs
@@ -0,0 +1,73 @@
+use std::env::{VarError, var};
+use std::error::Error;
+use std::net::IpAddr;
+use std::str::FromStr;
+
+/// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this
+/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration.
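+/// For example: `HADRON_NODE_IP_ADDRESS=10.0.0.12` (an illustrative value; any parseable IPv4 or IPv6 literal works).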
+/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod +/// template). +pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS"; + +/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS. +/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template). +pub fn read_node_ip_addr_from_env() -> Result, Box> { + match var(HADRON_NODE_IP_ADDRESS) { + Ok(v) => { + if let Ok(addr) = IpAddr::from_str(&v) { + Ok(Some(addr)) + } else { + Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into()) + } + } + Err(VarError::NotPresent) => Ok(None), + Err(e) => Err(e.into()), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::env; + use std::net::{Ipv4Addr, Ipv6Addr}; + + #[test] + fn test_read_node_ip_addr_from_env() { + // SAFETY: test code + unsafe { + // Test with a valid IPv4 address + env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1"); + let result = read_node_ip_addr_from_env().unwrap(); + assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)))); + + // Test with a valid IPv6 address + env::set_var( + HADRON_NODE_IP_ADDRESS, + "2001:0db8:85a3:0000:0000:8a2e:0370:7334", + ); + } + let result = read_node_ip_addr_from_env().unwrap(); + assert_eq!( + result, + Some(IpAddr::V6( + Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap() + )) + ); + + // Test with an invalid IP address + // SAFETY: test code + unsafe { + env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip"); + } + let result = read_node_ip_addr_from_env(); + assert!(result.is_err()); + + // Test with no environment variable set + // SAFETY: test code + unsafe { + env::remove_var(HADRON_NODE_IP_ADDRESS); + } + let result = read_node_ip_addr_from_env().unwrap(); + assert_eq!(result, None); + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2b81da017d..69771be5dc 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,9 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +// utility functions to obtain reachable IP addresses in PS/SK nodes. 
+pub mod ip_address; + pub mod shard; mod hex; diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index f1f9aaf43c..be1de43d18 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { listen_http_port: m.http_port, listen_https_port: m.https_port, availability_zone_id: az_id.expect("Checked above"), + node_ip_addr: None, }) } Err(e) => { diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index b4bb193a4b..3c8db3029e 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -6,10 +6,10 @@ use std::error::Error as _; use http_utils::error::HttpErrorBody; -use reqwest::{IntoUrl, Method, StatusCode}; +use reqwest::{IntoUrl, Method, Response, StatusCode}; use safekeeper_api::models::{ self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization, - TimelineCreateRequest, TimelineStatus, + TimelineCreateRequest, }; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; @@ -161,13 +161,12 @@ impl Client { &self, tenant_id: TenantId, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id ); - let resp = self.get(&uri).await?; - resp.json().await.map_err(Error::ReceiveBody) + self.get(&uri).await } pub async fn snapshot( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index b2d5976ef4..79cf2f9149 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -23,6 +23,7 @@ use safekeeper::defaults::{ DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; +use safekeeper::hadron; use safekeeper::wal_backup::WalBackup; use safekeeper::{ BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, @@ -252,6 +253,10 @@ struct Args { /// Run in development mode (disables security checks) #[arg(long, help = "Run in development mode (disables security checks)")] dev: bool, + /* BEGIN_HADRON */ + #[arg(long)] + enable_pull_timeline_on_startup: bool, + /* END_HADRON */ } // Like PathBufValueParser, but allows empty string. @@ -435,6 +440,11 @@ async fn main() -> anyhow::Result<()> { use_https_safekeeper_api: args.use_https_safekeeper_api, enable_tls_wal_service_api: args.enable_tls_wal_service_api, force_metric_collection_on_scrape: args.force_metric_collection_on_scrape, + /* BEGIN_HADRON */ + advertise_pg_addr_tenant_only: None, + enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup, + hcc_base_url: None, + /* END_HADRON */ }); // initialize sentry if SENTRY_DSN is provided @@ -529,6 +539,20 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { // Load all timelines from disk to memory. global_timelines.init().await?; + /* BEGIN_HADRON */ + if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 { + match hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await { + Ok(_) => { + info!("Successfully pulled all timelines from peer safekeepers"); + } + Err(e) => { + error!("Failed to pull timelines from peer safekeepers: {:?}", e); + return Err(e); + } + } + } + /* END_HADRON */ + // Run everything in current thread rt, if asked. 
if conf.current_thread_runtime {
         info!("running in current thread runtime");
diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs
new file mode 100644
index 0000000000..b41bf2c3da
--- /dev/null
+++ b/safekeeper/src/hadron.rs
@@ -0,0 +1,388 @@
+use pem::Pem;
+use safekeeper_api::models::PullTimelineRequest;
+use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration};
+use tokio::time::sleep;
+use tokio_util::sync::CancellationToken;
+use url::Url;
+use utils::{backoff, id::TenantTimelineId, ip_address};
+
+use anyhow::Result;
+use pageserver_api::controller_api::{
+    AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse,
+};
+
+use crate::{
+    GlobalTimelines, SafeKeeperConf,
+    metrics::{
+        SK_RECOVERY_PULL_TIMELINE_ERRORS, SK_RECOVERY_PULL_TIMELINE_OKS,
+        SK_RECOVERY_PULL_TIMELINE_SECONDS, SK_RECOVERY_PULL_TIMELINES_SECONDS,
+    },
+    pull_timeline,
+    timelines_global_map::DeleteOrExclude,
+};
+
+// Extract information from the SafeKeeperConf to build a NodeRegisterRequest used to register the safekeeper with the HCC.
+fn build_node_registeration_request(
+    conf: &SafeKeeperConf,
+    node_ip_addr: Option<IpAddr>,
+) -> Result<NodeRegisterRequest> {
+    let advertise_pg_addr_with_port = conf
+        .advertise_pg_addr_tenant_only
+        .as_deref()
+        .expect("advertise_pg_addr_tenant_only is required to register with HCC");
+
+    // Extract host/port from the string.
+    let (advertise_host_addr, pg_port_str) = advertise_pg_addr_with_port.split_at(
+        advertise_pg_addr_with_port
+            .rfind(':')
+            .ok_or(anyhow::anyhow!("Invalid advertise_pg_addr"))?,
+    );
+    // Need the `[1..]` to remove the leading ':'.
+    let pg_port = pg_port_str[1..]
+        .parse::<u16>()
+        .map_err(|e| anyhow::anyhow!("Cannot parse PG port: {}", e))?;
+
+    let (_, http_port_str) = conf.listen_http_addr.split_at(
+        conf.listen_http_addr
+            .rfind(':')
+            .ok_or(anyhow::anyhow!("Invalid listen_http_addr"))?,
+    );
+    let http_port = http_port_str[1..]
+        .parse::<u16>()
+        .map_err(|e| anyhow::anyhow!("Cannot parse HTTP port: {}", e))?;
+
+    Ok(NodeRegisterRequest {
+        node_id: conf.my_id,
+        listen_pg_addr: advertise_host_addr.to_string(),
+        listen_pg_port: pg_port,
+        listen_http_addr: advertise_host_addr.to_string(),
+        listen_http_port: http_port,
+        node_ip_addr,
+        availability_zone_id: AvailabilityZone("todo".to_string()),
+        listen_grpc_addr: None,
+        listen_grpc_port: None,
+        listen_https_port: None,
+    })
+}
+
+// Retrieve the JWT token used for authenticating with HCC from the environment variable.
+// Returns None if the token cannot be retrieved.
+fn get_hcc_auth_token() -> Option<String> {
+    match std::env::var("HCC_AUTH_TOKEN") {
+        Ok(v) => {
+            tracing::info!("Loaded JWT token for authentication with HCC");
+            Some(v)
+        }
+        Err(VarError::NotPresent) => {
+            tracing::info!("No JWT token for authentication with HCC detected");
+            None
+        }
+        Err(_) => {
+            tracing::info!(
+                "Failed to read the HCC_AUTH_TOKEN environment variable"
+            );
+            None
+        }
+    }
+}
+
+async fn send_safekeeper_register_request(
+    request_url: &Url,
+    auth_token: &Option<String>,
+    request: &NodeRegisterRequest,
+) -> Result<()> {
+    let client = reqwest::Client::new();
+    let mut req_builder = client
+        .post(request_url.clone())
+        .header("Content-Type", "application/json");
+    if let Some(token) = auth_token {
+        req_builder = req_builder.bearer_auth(token);
+    }
+    req_builder
+        .json(&request)
+        .send()
+        .await?
+        .error_for_status()?;
+    Ok(())
+}
+
+/// Registers this safekeeper with the HCC.
+pub async fn register(conf: &SafeKeeperConf) -> Result<()> { + match conf.hcc_base_url.as_ref() { + None => { + tracing::info!("HCC base URL is not set, skipping registration"); + Ok(()) + } + Some(hcc_base_url) => { + // The following operations acquiring the auth token and the node IP address both read environment + // variables. It's fine for now as this `register()` function is only called once during startup. + // If we start to talk to HCC more regularly in the safekeeper we should probably consider + // refactoring things into a "HadronClusterCoordinatorClient" struct. + let auth_token = get_hcc_auth_token(); + let node_ip_addr = + ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address."); + + let request = build_node_registeration_request(conf, node_ip_addr)?; + let cancel = CancellationToken::new(); + let request_url = hcc_base_url.clone().join("/hadron-internal/v1/sk")?; + + backoff::retry( + || async { + send_safekeeper_register_request(&request_url, &auth_token, &request).await + }, + |_| false, + 3, + u32::MAX, + "Calling the HCC safekeeper register API", + &cancel, + ) + .await + .ok_or(anyhow::anyhow!( + "Error in forever retry loop. This error should never be surfaced." + ))? + } + } +} + +async fn safekeeper_list_timelines_request( + conf: &SafeKeeperConf, +) -> Result { + if conf.hcc_base_url.is_none() { + tracing::info!("HCC base URL is not set, skipping registration"); + return Err(anyhow::anyhow!("HCC base URL is not set")); + } + + // The following operations acquiring the auth token and the node IP address both read environment + // variables. It's fine for now as this `register()` function is only called once during startup. + // If we start to talk to HCC more regularly in the safekeeper we should probably consider + // refactoring things into a "HadronClusterCoordinatorClient" struct. + let auth_token = get_hcc_auth_token(); + let method = format!("/control/v1/safekeeper/{}/timelines", conf.my_id.0); + let request_url = conf.hcc_base_url.as_ref().unwrap().clone().join(&method)?; + + let client = reqwest::Client::new(); + let mut req_builder = client + .get(request_url.clone()) + .header("Content-Type", "application/json") + .query(&[("id", conf.my_id.0)]); + if let Some(token) = auth_token { + req_builder = req_builder.bearer_auth(token); + } + let response = req_builder + .send() + .await? + .error_for_status()? + .json::() + .await?; + Ok(response) +} + +// Returns true on success, false otherwise. 
+pub async fn hcc_pull_timeline( + timeline: SafekeeperTimeline, + conf: &SafeKeeperConf, + global_timelines: Arc, + nodeid_http: &HashMap, +) -> bool { + let mut request = PullTimelineRequest { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + http_hosts: Vec::new(), + ignore_tombstone: None, + }; + for host in timeline.peers { + if host.0 == conf.my_id.0 { + continue; + } + if let Some(http_host) = nodeid_http.get(&host.0) { + request.http_hosts.push(http_host.clone()); + } + } + + let ca_certs = match conf + .ssl_ca_certs + .iter() + .map(Pem::contents) + .map(reqwest::Certificate::from_der) + .collect::, _>>() + { + Ok(result) => result, + Err(_) => { + return false; + } + }; + match pull_timeline::handle_request( + request, + conf.sk_auth_token.clone(), + ca_certs, + global_timelines.clone(), + true, + ) + .await + { + Ok(resp) => { + tracing::info!( + "Completed pulling tenant {} timeline {} from SK {:?}", + timeline.tenant_id, + timeline.timeline_id, + resp.safekeeper_host + ); + return true; + } + Err(e) => { + tracing::error!( + "Failed to pull tenant {} timeline {} from SK {}", + timeline.tenant_id, + timeline.timeline_id, + e + ); + + let ttid = TenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }; + // Revert the failed timeline pull. + // Notice that not found timeline returns OK also. + match global_timelines + .delete_or_exclude(&ttid, DeleteOrExclude::DeleteLocal) + .await + { + Ok(dr) => { + tracing::info!( + "Deleted tenant {} timeline {} DirExists: {}", + timeline.tenant_id, + timeline.timeline_id, + dr.dir_existed, + ); + } + Err(e) => { + tracing::error!( + "Failed to delete tenant {} timeline {} from global_timelines: {}", + timeline.tenant_id, + timeline.timeline_id, + e + ); + } + } + } + } + false +} + +pub async fn hcc_pull_timeline_till_success( + timeline: SafekeeperTimeline, + conf: &SafeKeeperConf, + global_timelines: Arc, + nodeid_http: &HashMap, +) { + const MAX_PULL_TIMELINE_RETRIES: u64 = 100; + for i in 0..MAX_PULL_TIMELINE_RETRIES { + if hcc_pull_timeline( + timeline.clone(), + conf, + global_timelines.clone(), + nodeid_http, + ) + .await + { + SK_RECOVERY_PULL_TIMELINE_OKS.inc(); + return; + } + tracing::error!( + "Failed to pull timeline {} from SK peers, retrying {}/{}", + timeline.timeline_id, + i + 1, + MAX_PULL_TIMELINE_RETRIES + ); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + } + SK_RECOVERY_PULL_TIMELINE_ERRORS.inc(); +} + +pub async fn hcc_pull_timelines( + conf: &SafeKeeperConf, + global_timelines: Arc, +) -> Result<()> { + let _timer = SK_RECOVERY_PULL_TIMELINES_SECONDS.start_timer(); + tracing::info!("Start pulling timelines from SK peers"); + + let mut response = SafekeeperTimelinesResponse { + timelines: Vec::new(), + safekeeper_peers: Vec::new(), + }; + for i in 0..100 { + match safekeeper_list_timelines_request(conf).await { + Ok(timelines) => { + response = timelines; + } + Err(e) => { + tracing::error!("Failed to list timelines from HCC: {}", e); + if i == 99 { + return Err(e); + } + } + } + sleep(Duration::from_millis(100)).await; + } + + let mut nodeid_http = HashMap::new(); + for sk in response.safekeeper_peers { + nodeid_http.insert( + sk.node_id.0, + format!("http://{}:{}", sk.listen_http_addr, sk.http_port), + ); + } + tracing::info!("Received {} timelines from HCC", response.timelines.len()); + for timeline in response.timelines { + let _timer = SK_RECOVERY_PULL_TIMELINE_SECONDS + .with_label_values(&[ + &timeline.tenant_id.to_string(), + 
&timeline.timeline_id.to_string(), + ]) + .start_timer(); + hcc_pull_timeline_till_success(timeline, conf, global_timelines.clone(), &nodeid_http) + .await; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use utils::id::NodeId; + + #[test] + fn test_build_node_registeration_request() { + // Test that: + // 1. We always extract the host name and port used to register with the HCC from the + // `advertise_pg_addr` if it is set. + // 2. The correct ports are extracted from `advertise_pg_addr` and `listen_http_addr`. + let mut conf = SafeKeeperConf::dummy(); + conf.my_id = NodeId(1); + conf.advertise_pg_addr_tenant_only = + Some("safe-keeper-1.safe-keeper.hadron.svc.cluster.local:5454".to_string()); + // `listen_pg_addr` and `listen_pg_addr_tenant_only` are not used for node registration. Set them to a different + // host and port values and make sure that they don't show up in the node registration request. + conf.listen_pg_addr = "0.0.0.0:5456".to_string(); + conf.listen_pg_addr_tenant_only = Some("0.0.0.0:5456".to_string()); + conf.listen_http_addr = "0.0.0.0:7676".to_string(); + let node_ip_addr: Option = Some("127.0.0.1".parse().unwrap()); + + let request = build_node_registeration_request(&conf, node_ip_addr).unwrap(); + assert_eq!(request.node_id, NodeId(1)); + assert_eq!( + request.listen_pg_addr, + "safe-keeper-1.safe-keeper.hadron.svc.cluster.local" + ); + assert_eq!(request.listen_pg_port, 5454); + assert_eq!( + request.listen_http_addr, + "safe-keeper-1.safe-keeper.hadron.svc.cluster.local" + ); + assert_eq!(request.listen_http_port, 7676); + assert_eq!( + request.node_ip_addr, + Some(IpAddr::V4("127.0.0.1".parse().unwrap())) + ); + } +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 4b061c65d9..a0ee2facb5 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -241,9 +241,14 @@ async fn timeline_pull_handler(mut request: Request) -> Result, pub availability_zone: Option, pub no_sync: bool, + /* BEGIN_HADRON */ + pub advertise_pg_addr_tenant_only: Option, + pub enable_pull_timeline_on_startup: bool, + pub hcc_base_url: Option, + /* END_HADRON */ pub broker_endpoint: Uri, pub broker_keepalive_interval: Duration, pub heartbeat_timeout: Duration, @@ -185,6 +192,11 @@ impl SafeKeeperConf { use_https_safekeeper_api: false, enable_tls_wal_service_api: false, force_metric_collection_on_scrape: true, + /* BEGIN_HADRON */ + advertise_pg_addr_tenant_only: None, + enable_pull_timeline_on_startup: false, + hcc_base_url: None, + /* END_HADRON */ } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 1f98651e71..e1af51c115 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -85,6 +85,43 @@ pub static WAL_STORAGE_LIMIT_ERRORS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_wal_storage_limit_errors counter") }); +pub static SK_RECOVERY_PULL_TIMELINE_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_recovery_pull_timeline_errors", + concat!( + "Number of errors due to pull_timeline errors during SK lost disk recovery.", + "An increase in this metric indicates pull timelines runs into error." 
+ ) + ) + .expect("Failed to register safekeeper_recovery_pull_timeline_errors counter") +}); +pub static SK_RECOVERY_PULL_TIMELINE_OKS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_recovery_pull_timeline_oks", + concat!( + "Number of successful pull_timeline during SK lost disk recovery.", + "An increase in this metric indicates pull timelines is successful." + ) + ) + .expect("Failed to register safekeeper_recovery_pull_timeline_oks counter") +}); +pub static SK_RECOVERY_PULL_TIMELINES_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_recovery_pull_timelines_seconds", + "Seconds to pull timelines", + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_recovery_pull_timelines_seconds histogram") +}); +pub static SK_RECOVERY_PULL_TIMELINE_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_recovery_pull_timeline_seconds", + "Seconds to pull timeline", + &["tenant_id", "timeline_id"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_recovery_pull_timeline_seconds histogram vec") +}); /* END_HADRON */ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 1c9e5bade5..b4c4877b2c 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -8,6 +8,7 @@ use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; +use http::StatusCode; use http_utils::error::ApiError; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::GenericRemoteStorage; @@ -21,10 +22,11 @@ use tokio::fs::OpenOptions; use tokio::io::AsyncWrite; use tokio::sync::mpsc; use tokio::task; +use tokio::time::sleep; use tokio_tar::{Archive, Builder, Header}; use tokio_util::io::{CopyToBytes, SinkWriter}; use tokio_util::sync::PollSender; -use tracing::{error, info, instrument}; +use tracing::{error, info, instrument, warn}; use utils::crashsafe::fsync_async_opt; use utils::id::{NodeId, TenantTimelineId}; use utils::logging::SecretString; @@ -449,6 +451,7 @@ pub async fn handle_request( sk_auth_token: Option, ssl_ca_certs: Vec, global_timelines: Arc, + wait_for_peer_timeline_status: bool, ) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, @@ -472,37 +475,100 @@ pub async fn handle_request( let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. 
- let responses: Vec> = - futures::future::join_all(http_hosts.iter().map(|url| async { - let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); - let info = cclient - .timeline_status(request.tenant_id, request.timeline_id) - .await?; - Ok(info) - })) - .await; - let mut statuses = Vec::new(); - for (i, response) in responses.into_iter().enumerate() { - match response { - Ok(status) => { - statuses.push((status, i)); - } - Err(e) => { - info!("error fetching status from {}: {e}", http_hosts[i]); + if !wait_for_peer_timeline_status { + let responses: Vec> = + futures::future::join_all(http_hosts.iter().map(|url| async { + let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); + let resp = cclient + .timeline_status(request.tenant_id, request.timeline_id) + .await?; + let info: TimelineStatus = resp + .json() + .await + .context("Failed to deserialize timeline status") + .map_err(|e| mgmt_api::Error::ReceiveErrorBody(e.to_string()))?; + Ok(info) + })) + .await; + + for (i, response) in responses.into_iter().enumerate() { + match response { + Ok(status) => { + statuses.push((status, i)); + } + Err(e) => { + info!("error fetching status from {}: {e}", http_hosts[i]); + } } } - } - // Allow missing responses from up to one safekeeper (say due to downtime) - // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes - // offline and C comes online. Then we want a pull on C with A and B as hosts to work. - let min_required_successful = (http_hosts.len() - 1).max(1); - if statuses.len() < min_required_successful { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "only got {} successful status responses. required: {min_required_successful}", - statuses.len() - ))); + // Allow missing responses from up to one safekeeper (say due to downtime) + // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes + // offline and C comes online. Then we want a pull on C with A and B as hosts to work. + let min_required_successful = (http_hosts.len() - 1).max(1); + if statuses.len() < min_required_successful { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "only got {} successful status responses. required: {min_required_successful}", + statuses.len() + ))); + } + } else { + let mut retry = true; + // We must get status from all other peers. + // Otherwise, we may run into split-brain scenario. + while retry { + statuses.clear(); + retry = false; + for (i, url) in http_hosts.iter().enumerate() { + let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); + match cclient + .timeline_status(request.tenant_id, request.timeline_id) + .await + { + Ok(resp) => { + if resp.status() == StatusCode::NOT_FOUND { + warn!( + "Timeline {} not found on peer SK {}, no need to pull it", + TenantTimelineId::new(request.tenant_id, request.timeline_id), + url + ); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); + } + let info: TimelineStatus = resp + .json() + .await + .context("Failed to deserialize timeline status") + .map_err(ApiError::InternalServerError)?; + statuses.push((info, i)); + } + Err(e) => { + match e { + // If we get a 404, it means the timeline doesn't exist on this safekeeper. + // We can ignore this error. 
+ mgmt_api::Error::ApiError(status, _) + if status == StatusCode::NOT_FOUND => + { + warn!( + "Timeline {} not found on peer SK {}, no need to pull it", + TenantTimelineId::new(request.tenant_id, request.timeline_id), + url + ); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); + } + _ => {} + } + retry = true; + error!("Failed to get timeline status from {}: {:#}", url, e); + } + } + } + sleep(std::time::Duration::from_millis(100)).await; + } } // Find the most advanced safekeeper @@ -511,6 +577,12 @@ pub async fn handle_request( .max_by_key(|(status, _)| { ( status.acceptor_state.epoch, + /* BEGIN_HADRON */ + // We need to pull from the SK with the highest term. + // This is because another compute may come online and vote the same highest term again on the other two SKs. + // Then, there will be 2 computes running on the same term. + status.acceptor_state.term, + /* END_HADRON */ status.flush_lsn, status.commit_lsn, ) diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 280cd790a4..393df6228e 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -191,6 +191,11 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { use_https_safekeeper_api: false, enable_tls_wal_service_api: false, force_metric_collection_on_scrape: true, + /* BEGIN_HADRON */ + enable_pull_timeline_on_startup: false, + advertise_pg_addr_tenant_only: None, + hcc_base_url: None, + /* END_HADRON */ }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 0bb63308bb..573016f772 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -3,6 +3,7 @@ from __future__ import annotations import sys import tarfile import tempfile +from pathlib import Path from typing import TYPE_CHECKING import pytest @@ -198,3 +199,115 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool) # the table is back now! restored = env.endpoints.create_start("main") assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] + + +# BEGIN_HADRON +# TODO: re-enable once CM python is integreated. 
+# def clear_directory(directory): +# for item in os.listdir(directory): +# item_path = os.path.join(directory, item) +# if os.path.isdir(item_path): +# log.info(f"removing SK directory: {item_path}") +# shutil.rmtree(item_path) +# else: +# log.info(f"removing SK file: {item_path}") +# os.remove(item_path) + + +# def test_sk_pull_timelines( +# neon_env_builder: NeonEnvBuilder, +# ): +# DBNAME = "regression" +# superuser_name = "databricks_superuser" +# neon_env_builder.num_safekeepers = 3 +# neon_env_builder.num_pageservers = 4 +# neon_env_builder.safekeeper_extra_opts = ["--enable-pull-timeline-on-startup"] +# neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + +# env = neon_env_builder.init_start(initial_tenant_shard_count=4) + +# env.compute_manager.start(base_port=env.compute_manager_port) + +# test_creator = "test_creator" +# test_metastore_id = uuid4() +# test_account_id = uuid4() +# test_workspace_id = 1 +# test_workspace_url = "http://test_workspace_url" +# test_metadata_version = 1 +# test_metadata = { +# "state": "INSTANCE_PROVISIONING", +# "admin_rolename": "admin", +# "admin_password_scram": "abc123456", +# } + +# test_instance_name_1 = "test_instance_1" +# test_instance_read_write_compute_pool_1 = { +# "instance_name": test_instance_name_1, +# "compute_pool_name": "compute_pool_1", +# "creator": test_creator, +# "capacity": 2.0, +# "node_count": 1, +# "metadata_version": 0, +# "metadata": { +# "state": "INSTANCE_PROVISIONING", +# }, +# } + +# test_instance_1_readable_secondaries_enabled = False + +# # Test creation +# create_instance_with_retries( +# env, +# test_instance_name_1, +# test_creator, +# test_metastore_id, +# test_account_id, +# test_workspace_id, +# test_workspace_url, +# test_instance_read_write_compute_pool_1, +# test_metadata_version, +# test_metadata, +# test_instance_1_readable_secondaries_enabled, +# ) +# instance = env.compute_manager.get_instance_by_name(test_instance_name_1, test_workspace_id) +# log.info(f"haoyu Instance created: {instance}") +# assert instance["instance_name"] == test_instance_name_1 +# test_instance_id = instance["instance_id"] +# instance_detail = env.compute_manager.describe_instance(test_instance_id) +# log.info(f"haoyu Instance detail: {instance_detail}") + +# env.initial_tenant = instance_detail[0]["tenant_id"] +# env.initial_timeline = instance_detail[0]["timeline_id"] + +# # Connect to postgres and create a database called "regression". +# endpoint = env.endpoints.create_start("main") +# endpoint.safe_psql(f"CREATE ROLE {superuser_name}") +# endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + +# endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);") +# # Write some data. ~20 MB. +# num_rows = 0 +# for _i in range(0, 20000): +# endpoint.safe_psql( +# "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False +# ) +# num_rows += 1 + +# log.info(f"SKs {env.storage_controller.hcc_sk_node_list()}") + +# env.safekeepers[0].stop(immediate=True) +# clear_directory(env.safekeepers[0].data_dir) +# env.safekeepers[0].start() + +# # PG can still write data. ~20 MB. 
+# for _i in range(0, 20000): +# endpoint.safe_psql( +# "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False +# ) +# num_rows += 1 + +# tuples = endpoint.safe_psql("SELECT COUNT(*) FROM usertable;") +# assert tuples[0][0] == num_rows +# endpoint.stop_and_destroy() + +# END_HADRON From f8d3f86f586c6615e75251f9919c6c66feefa5d6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 14 Jul 2025 17:37:28 +0100 Subject: [PATCH 52/56] pageserver: include records in get page debug handler (#12578) Include records and image in the debug get page handler. This endpoint does not update the metrics and does not support tracing. Note that this now returns individual bytes which need to be encoded properly for debugging. Co-authored-by: Haoyu Huang --- pageserver/src/http/routes.rs | 35 ++++-- pageserver/src/tenant/storage_layer.rs | 28 ++++- pageserver/src/tenant/timeline.rs | 143 +++++++++++++++++++++++++ 3 files changed, 196 insertions(+), 10 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0d40c5ecf7..3e844a375d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result, anyhow}; +use bytes::Bytes; use enumset::EnumSet; use futures::future::join_all; use futures::{StreamExt, TryFutureExt}; @@ -46,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId}; use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; +use serde::{Deserialize, Serialize}; use serde_json::json; use tenant_size_model::svg::SvgBranchKind; use tenant_size_model::{SizeResult, StorageModel}; @@ -57,6 +59,7 @@ use utils::auth::SwappableJwtAuth; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use wal_decoder::models::record::NeonWalRecord; use crate::config::PageServerConf; use crate::context; @@ -77,6 +80,7 @@ use crate::tenant::remote_timeline_client::{ }; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; +use crate::tenant::storage_layer::ValuesReconstructState; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; @@ -2708,6 +2712,16 @@ async fn deletion_queue_flush( } } +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +struct GetPageResponse { + pub page: Bytes, + pub layers_visited: u32, + pub delta_layers_visited: u32, + pub records: Vec<(Lsn, NeonWalRecord)>, + pub img: Option<(Lsn, Bytes)>, +} + async fn getpage_at_lsn_handler( request: Request, cancel: CancellationToken, @@ -2758,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner( // Use last_record_lsn if no lsn is provided let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let page = timeline.get(key.0, lsn, &ctx).await?; if touch { json_response(StatusCode::OK, ()) } else { - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential()); + let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?; + let response = GetPageResponse { + page, + layers_visited: reconstruct_state.get_layers_visited(), + delta_layers_visited: reconstruct_state.get_delta_layers_visited(), + records: reconstruct_state.debug_state.records.clone(), + img: reconstruct_state.debug_state.img.clone(), + }; + + json_response(StatusCode::OK, response) } } - .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9fbb9d2438..43ea8fffa3 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -75,7 +75,7 @@ where /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. 
/// -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub(crate) struct ValueReconstructState { pub(crate) records: Vec<(Lsn, NeonWalRecord)>, pub(crate) img: Option<(Lsn, Bytes)>, @@ -308,6 +308,9 @@ pub struct ValuesReconstructState { layers_visited: u32, delta_layers_visited: u32, + pub(crate) enable_debug: bool, + pub(crate) debug_state: ValueReconstructState, + pub(crate) io_concurrency: IoConcurrency, num_active_ios: Arc, @@ -657,6 +660,23 @@ impl ValuesReconstructState { layers_visited: 0, delta_layers_visited: 0, io_concurrency, + enable_debug: false, + debug_state: ValueReconstructState::default(), + num_active_ios: Arc::new(AtomicUsize::new(0)), + read_path: None, + } + } + + pub(crate) fn new_with_debug(io_concurrency: IoConcurrency) -> Self { + Self { + keys: HashMap::new(), + keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, + layers_visited: 0, + delta_layers_visited: 0, + io_concurrency, + enable_debug: true, + debug_state: ValueReconstructState::default(), num_active_ios: Arc::new(AtomicUsize::new(0)), read_path: None, } @@ -670,6 +690,12 @@ impl ValuesReconstructState { self.io_concurrency.spawn_io(fut).await; } + pub(crate) fn set_debug_state(&mut self, debug_state: &ValueReconstructState) { + if self.enable_debug { + self.debug_state = debug_state.clone(); + } + } + pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { self.layers_visited += 1; if let ReadableLayer::PersistentLayer(layer) = layer { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f2833674a9..73d2d72b59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1253,6 +1253,57 @@ impl Timeline { } } + #[inline(always)] + pub(crate) async fn debug_get( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + reconstruct_state: &mut ValuesReconstructState, + ) -> Result { + if !lsn.is_valid() { + return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); + } + + // This check is debug-only because of the cost of hashing, and because it's a double-check: we + // already checked the key against the shard_identity when looking up the Timeline from + // page_service. + debug_assert!(!self.shard_identity.is_key_disposable(&key)); + + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); + let vectored_res = self + .debug_get_vectored_impl(query, reconstruct_state, ctx) + .await; + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => Err(PageReconstructError::MissingKey(Box::new( + MissingKeyError { + keyspace: KeySpace::single(key..key.next()), + shard: self.shard_identity.get_shard_number(&key), + original_hwm_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + read_path: None, + query: None, + }, + ))), + } + } + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN @@ -1547,6 +1598,98 @@ impl Timeline { Ok(results) } + // A copy of the get_vectored_impl method except that we store the image and wal records into `reconstruct_state`. + // This is only used in the http getpage call for debugging purpose. 
pub(super) async fn debug_get_vectored_impl(
+        &self,
+        query: VersionedKeySpaceQuery,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if query.is_empty() {
+            return Ok(BTreeMap::default());
+        }
+
+        let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
+            Some(ReadPath::new(
+                query.total_keyspace(),
+                query.high_watermark_lsn()?,
+            ))
+        } else {
+            None
+        };
+
+        reconstruct_state.read_path = read_path;
+
+        let traversal_res: Result<(), _> = self
+            .get_vectored_reconstruct_data(query.clone(), reconstruct_state, ctx)
+            .await;
+
+        if let Err(err) = traversal_res {
+            // Wait for all the spawned IOs to complete.
+            // See comments on `spawn_io` inside `storage_layer` for more details.
+            let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
+                .into_values()
+                .map(|state| state.collect_pending_ios())
+                .collect::<FuturesUnordered<_>>();
+            while collect_futs.next().await.is_some() {}
+            return Err(err);
+        };
+
+        let reconstruct_state = Arc::new(Mutex::new(reconstruct_state));
+        let futs = FuturesUnordered::new();
+
+        for (key, state) in std::mem::take(&mut reconstruct_state.lock().unwrap().keys) {
+            let req_lsn_for_key = query.map_key_to_lsn(&key);
+            futs.push({
+                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
+                let rc_clone = Arc::clone(&reconstruct_state);
+
+                async move {
+                    assert_eq!(state.situation, ValueReconstructSituation::Complete);
+
+                    let converted = match state.collect_pending_ios().await {
+                        Ok(ok) => ok,
+                        Err(err) => {
+                            return (key, Err(err));
+                        }
+                    };
+                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);
+
+                    // The walredo module expects the records to be descending in terms of Lsn.
+                    // And we submit the IOs in that order, so there should be no need to sort here.
+                    debug_assert!(
+                        converted
+                            .records
+                            .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)),
+                        "{converted:?}"
+                    );
+                    {
+                        let mut guard = rc_clone.lock().unwrap();
+                        guard.set_debug_state(&converted);
+                    }
+                    (
+                        key,
+                        walredo_self
+                            .reconstruct_value(
+                                key,
+                                req_lsn_for_key,
+                                converted,
+                                RedoAttemptType::ReadPage,
+                            )
+                            .await,
+                    )
+                }
+            });
+        }
+
+        let results = futs
+            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
+            .await;
+
+        Ok(results)
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub(crate) fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last

From 3e6fdb0aa671e876dddddaf167de4a036409019a Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Mon, 14 Jul 2025 18:47:07 +0200
Subject: [PATCH 53/56] Add and use [U]INT64_[HEX_]FORMAT for various [u]int64 needs (#12592)

We didn't apply these format macros consistently, and the underlying
problem wasn't solved consistently either. With this patch we should
have a more uniform approach, and fewer issues porting changes to newer
versions.

This also removes some potentially buggy casts to `long` from `uint64`:
they could have truncated the value on systems where `long` only has 32
bits.
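As a hedged, standalone illustration of the problem and the fix: the
macro definitions in this sketch are stand-ins built on <inttypes.h>,
whereas the PostgreSQL versions are built from INT64_MODIFIER, as the
neon_pgversioncompat.h hunk below shows.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    /* Stand-ins for the PostgreSQL macros; in Postgres these are derived
     * from INT64_MODIFIER so the width always matches int64/uint64. */
    #define UINT64_FORMAT "%" PRIu64
    #define UINT64_HEX_FORMAT "%" PRIx64

    int main(void)
    {
        uint64_t ring_receive = 5000000000ULL; /* does not fit in 32 bits */
        uint64_t reqid = 0xdeadbeefcafeULL;

        /* Buggy pattern removed by this patch: on LLP64 platforms, where
         * long is 32 bits, the cast truncates the value before printing. */
        printf("possibly truncated: %ld\n", (long) ring_receive);

        /* Portable pattern: the format width always matches uint64_t. */
        printf("ring_receive=" UINT64_FORMAT " reqid=" UINT64_HEX_FORMAT "\n",
               ring_receive, reqid);
        return 0;
    }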
--- pgxn/neon/communicator.c | 50 ++++++++++++++++---------------- pgxn/neon/neon_pgversioncompat.h | 4 +++ 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index bd53855eab..158b8940a3 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -421,7 +421,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) { if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse) { - neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld", + neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=" UINT64_FORMAT ", ring_flush=" UINT64_FORMAT ", ring_unused=" UINT64_FORMAT "", resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused); } if (neon_protocol_version >= 3) @@ -438,7 +438,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) getpage_resp->req.blkno != slot->buftag.blockNum) { NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + "Receive unexpected getpage response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum); } @@ -447,7 +447,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) resp->lsn != slot->request_lsns.request_lsn || resp->not_modified_since != slot->request_lsns.not_modified_since) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); } @@ -496,9 +496,9 @@ communicator_prefetch_pump_state(void) slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(slot->shard_no, PANIC, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); + slot->my_ring_index, MyPState->ring_receive); } /* update prefetch state */ MyPState->n_responses_buffered += 1; @@ -789,9 +789,9 @@ prefetch_read(PrefetchRequest *slot) slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(slot->shard_no, PANIC, - "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + "Incorrect prefetch read: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, - (long)slot->my_ring_index, 
(long)MyPState->ring_receive); + slot->my_ring_index, MyPState->ring_receive); } /* @@ -816,9 +816,9 @@ prefetch_read(PrefetchRequest *slot) slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(shard_no, PANIC, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); + slot->my_ring_index, MyPState->ring_receive); } /* update prefetch state */ @@ -852,8 +852,8 @@ prefetch_read(PrefetchRequest *slot) * and the prefetch queue was flushed during the receive call */ neon_shard_log(shard_no, LOG, - "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - (long) my_ring_index, + "No response from reading prefetch entry " UINT64_FORMAT ": %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + my_ring_index, RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), buftag.forkNum, buftag.blockNum); return false; @@ -1844,7 +1844,7 @@ nm_to_string(NeonMessage *msg) NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); - appendStringInfo(&s, ", \"db_size\": %ld}", + appendStringInfo(&s, ", \"db_size\": " INT64_FORMAT "}", msg_resp->db_size); appendStringInfoChar(&s, '}'); @@ -2045,7 +2045,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r exists_resp->req.forknum != request.forknum) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); } @@ -2058,14 +2058,14 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", resp->reqid, RelFileInfoFmt(rinfo), forkNum, @@ -2241,7 +2241,7 @@ Retry: case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG 
"[shard %d, reqid " UINT64_HEX_FORMAT "] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), errdetail("page server returned error: %s", @@ -2294,7 +2294,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * relsize_resp->req.forknum != forknum) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); } @@ -2307,14 +2307,14 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", resp->reqid, RelFileInfoFmt(rinfo), forknum, @@ -2364,7 +2364,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) dbsize_resp->req.dbNode != dbNode) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); } @@ -2377,14 +2377,14 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - 
errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read db size of db %u from page server at lsn %X/%08X", resp->reqid, dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), errdetail("page server returned error: %s", @@ -2455,7 +2455,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re slru_resp->req.segno != segno) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); } @@ -2469,14 +2469,14 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read SLRU %d segment %llu at lsn %X/%08X", resp->reqid, kind, (unsigned long long) segno, diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index 787bd552f8..c7574ef0f9 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -165,4 +165,8 @@ extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags); extern TimeLineID GetWALInsertionTimeLine(void); #endif +/* format codes not present in PG17-; but available in PG18+ */ +#define INT64_HEX_FORMAT "%" INT64_MODIFIER "x" +#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "x" + #endif /* NEON_PGVERSIONCOMPAT_H */ From a456e818afbf7a82be0bf72761d6025c1e17b99a Mon Sep 17 00:00:00 2001 From: Mikhail Date: Mon, 14 Jul 2025 18:37:47 +0100 Subject: [PATCH 54/56] LFC prewarm perftest: increase timeout for initialization job (#12594) Tests on https://github.com/neondatabase/neon/actions/runs/16268609007/job/45930162686 time out due to pgbench init job taking more than 30 minutes to run. Increase test timeout duration to 2 hours. 
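For reference, a minimal sketch of the mechanism being tuned (a
hypothetical test; assumes the pytest-timeout plugin, which provides
`pytest.mark.timeout`). The budget has to cover the slowest phase of the
whole test, here the pgbench initialization, not just the measured run:

    import time

    import pytest


    @pytest.mark.timeout(2 * 60 * 60)  # the whole test must finish within 2 hours
    def test_long_benchmark_sketch():
        time.sleep(1)  # stands in for a pgbench init that can exceed 30 minutes
        assert True  # stands in for the benchmark run and its assertions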
--- test_runner/performance/test_lfc_prewarm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test_runner/performance/test_lfc_prewarm.py b/test_runner/performance/test_lfc_prewarm.py index ad2c759a63..6c0083de95 100644 --- a/test_runner/performance/test_lfc_prewarm.py +++ b/test_runner/performance/test_lfc_prewarm.py @@ -60,7 +60,7 @@ def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare): @pytest.mark.remote_cluster -@pytest.mark.timeout(30 * 60) +@pytest.mark.timeout(2 * 60 * 60) def test_compare_prewarmed_pgbench_perf_benchmark( pg_bin: PgBin, neon_api: NeonAPI, @@ -91,8 +91,9 @@ def benchmark_impl( test_duration_min = 5 pgbench_duration = f"-T{test_duration_min * 60}" # prewarm API is not publicly exposed. In order to test performance of a - # fully prewarmed endpoint, wait after it restarts - prewarmed_sleep_secs = 30 + # fully prewarmed endpoint, wait after it restarts. + # The number here is empirical, based on manual runs on staging + prewarmed_sleep_secs = 180 branch_id = project["branch"]["id"] project_id = project["project"]["id"] From 9a2456bea557b3f140fff9d3b40809b9b853af84 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 14 Jul 2025 21:42:36 +0300 Subject: [PATCH 55/56] Reduce noise from get_installed_extensions during e.g shut down (#12479) All Errors that can occur during get_installed_extensions() come from tokio-postgres functions, e.g. if the database is being shut down ("FATAL: terminating connection due to administrator command"). I'm seeing a lot of such errors in the logs with the regression tests, with very verbose stack traces. The compute_ctl stack trace is pretty useless for errors originating from the Postgres connection, the error message has all the information, so stop printing the stack trace. I changed the result type of the functions to return the originating tokio_postgres Error rather than anyhow::Error, so that if we introduce other error sources to the functions where the stack trace might be useful, we'll be forced to revisit this, probably by introducing a new Error type that separates postgres errors from other errors. But this will do for now. --- compute_tools/src/compute.rs | 2 +- compute_tools/src/installed_extensions.rs | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 2e0b7d7b2e..8f42cf699b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2487,7 +2487,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> { serde_json::to_string(&extensions).expect("failed to serialize extensions list") ); } - Err(err) => error!("could not get installed extensions: {err:?}"), + Err(err) => error!("could not get installed extensions: {err}"), } Ok(()) } diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 411e03b7ec..90e1a17be4 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use anyhow::Result; use compute_api::responses::{InstalledExtension, InstalledExtensions}; +use tokio_postgres::error::Error as PostgresError; use tokio_postgres::{Client, Config, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; @@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS; /// and to make database listing query here more explicit. /// /// Limit the number of databases to 500 to avoid excessive load. 
-async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
+async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
     // `pg_database.datconnlimit = -2` means that the database is in the
     // invalid state
     let databases = client
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Same extension can be installed in multiple databases with different versions,
 /// so we report a separate metric (number of databases where it is installed)
 /// for each extension version.
-pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
+pub async fn get_installed_extensions(
+    mut conf: Config,
+) -> Result<InstalledExtensions, PostgresError> {
     conf.application_name("compute_ctl:get_installed_extensions");
     let databases: Vec<String> = {
         let (mut client, connection) = conf.connect(NoTls).await?;

From ff526a1051b42443ad0cb6e81aff27a314b3482a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?=
Date: Tue, 15 Jul 2025 09:42:48 +0200
Subject: [PATCH 56/56] [proxy] Recognize more cplane errors, use
 retry_delay_ms as TTL (#12543)

## Problem

Not all cplane errors are properly recognized and cached/retried.

## Summary of changes

Add more cplane error reasons. Also, use retry_delay_ms as cache TTL if
present.

Related to https://github.com/neondatabase/cloud/issues/19353
---
 proxy/src/cache/timed_lru.rs                    | 13 ++--
 .../control_plane/client/cplane_proxy_v1.rs     | 75 ++++++++++---------
 proxy/src/control_plane/errors.rs               | 49 ++++++------
 proxy/src/control_plane/messages.rs             | 33 ++++++--
 proxy/src/proxy/mod.rs                          | 10 ++-
 5 files changed, 107 insertions(+), 73 deletions(-)

diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index 183e1ea449..e87cf53ab9 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -14,8 +14,8 @@ use std::time::{Duration, Instant};
 use hashlink::{LruCache, linked_hash_map::RawEntryMut};
 use tracing::debug;
 
+use super::Cache;
 use super::common::Cached;
-use super::{Cache, timed_lru};
 
 /// An implementation of timed LRU cache with fixed capacity.
 /// Key properties:
@@ -30,7 +30,7 @@ use super::{Cache, timed_lru};
 ///
 /// * There's an API for immediate invalidation (removal) of a cache entry;
 ///   It's useful in case we know for sure that the entry is no longer correct.
-///   See [`timed_lru::Cached`] for more information.
+///   See [`Cached`] for more information.
 ///
 /// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
 ///   or by a successful lookup (i.e. the entry hasn't expired yet).
@@ -217,15 +217,18 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
 }
 
 impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
-    /// Retrieve a cached entry in convenient wrapper.
-    pub(crate) fn get<Q>(&self, key: &Q) -> Option<Cached<&Self>>
+    /// Retrieve a cached entry in convenient wrapper, alongside timing information.
+    pub(crate) fn get_with_created_at<Q>(
+        &self,
+        key: &Q,
+    ) -> Option<Cached<&Self, (<Self as Cache>::Value, Instant)>>
     where
         K: Borrow<Q> + Clone,
         Q: Hash + Eq + ?Sized,
     {
         self.get_raw(key, |key, entry| Cached {
             token: Some((self, key.clone())),
-            value: entry.value.clone(),
+            value: (entry.value.clone(), entry.created_at),
         })
     }
 }
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs
index fc263b73b1..bb785b8b0c 100644
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -23,12 +23,13 @@ use crate::control_plane::errors::{
     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
 };
 use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
+use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse};
 use crate::control_plane::{
     AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
     RoleAccessControl,
 };
 use crate::metrics::Metrics;
+use crate::proxy::retry::CouldRetry;
 use crate::rate_limiter::WakeComputeRateLimiter;
 use crate::types::{EndpointCacheKey, EndpointId, RoleName};
 use crate::{compute, http, scram};
@@ -382,16 +383,31 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
 
         macro_rules! check_cache {
             () => {
-                if let Some(cached) = self.caches.node_info.get(&key) {
-                    let (cached, info) = cached.take_value();
-                    let info = info.map_err(|c| {
-                        info!(key = &*key, "found cached wake_compute error");
-                        WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
-                    })?;
+                if let Some(cached) = self.caches.node_info.get_with_created_at(&key) {
+                    let (cached, (info, created_at)) = cached.take_value();
+                    return match info {
+                        Err(mut msg) => {
+                            info!(key = &*key, "found cached wake_compute error");
 
-                    debug!(key = &*key, "found cached compute node info");
-                    ctx.set_project(info.aux.clone());
-                    return Ok(cached.map(|()| info));
+                            // if retry_delay_ms is set, reduce it by the amount of time it spent in cache
+                            if let Some(status) = &mut msg.status {
+                                if let Some(retry_info) = &mut status.details.retry_info {
+                                    retry_info.retry_delay_ms = retry_info
+                                        .retry_delay_ms
+                                        .saturating_sub(created_at.elapsed().as_millis() as u64)
+                                }
+                            }
+
+                            Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
+                                msg,
+                            )))
+                        }
+                        Ok(info) => {
+                            debug!(key = &*key, "found cached compute node info");
+                            ctx.set_project(info.aux.clone());
+                            Ok(cached.map(|()| info))
+                        }
+                    };
                 }
             };
         }
@@ -434,42 +450,29 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
                 Ok(cached.map(|()| node))
             }
             Err(err) => match err {
-                WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
-                    let Some(status) = &err.status else {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
-                    };
+                WakeComputeError::ControlPlane(ControlPlaneError::Message(ref msg)) => {
+                    let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info);
 
-                    let reason = status
-                        .details
-                        .error_info
-                        .map_or(Reason::Unknown, |x| x.reason);
-
-                    // if we can retry this error, do not cache it.
-                    if reason.can_retry() {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
+                    // If we can retry this error, do not cache it,
+                    // unless we were given a retry delay.
+                    if msg.could_retry() && retry_info.is_none() {
+                        return Err(err);
                     }
 
-                    // at this point, we should only have quota errors.
debug!( key = &*key, "created a cache entry for the wake compute error" ); - self.caches.node_info.insert_ttl( - key, - Err(err.clone()), - Duration::from_secs(30), - ); + let ttl = retry_info.map_or(Duration::from_secs(30), |r| { + Duration::from_millis(r.retry_delay_ms) + }); - Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))) + self.caches.node_info.insert_ttl(key, Err(msg.clone()), ttl); + + Err(err) } - err => return Err(err), + err => Err(err), }, } } diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index f640657d90..12843e48c7 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -43,28 +43,35 @@ impl UserFacingError for ControlPlaneError { } impl ReportableError for ControlPlaneError { - fn get_error_kind(&self) -> crate::error::ErrorKind { + fn get_error_kind(&self) -> ErrorKind { match self { ControlPlaneError::Message(e) => match e.get_reason() { - Reason::RoleProtected => ErrorKind::User, - Reason::ResourceNotFound => ErrorKind::User, - Reason::ProjectNotFound => ErrorKind::User, - Reason::EndpointNotFound => ErrorKind::User, - Reason::BranchNotFound => ErrorKind::User, + Reason::RoleProtected + | Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::EndpointDisabled + | Reason::BranchNotFound + | Reason::InvalidEphemeralEndpointOptions => ErrorKind::User, + Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, - Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota, - Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota, - Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota, - Reason::WrittenDataQuotaExceeded => ErrorKind::Quota, - Reason::DataTransferQuotaExceeded => ErrorKind::Quota, - Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota, - Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, - Reason::LockAlreadyTaken => ErrorKind::ControlPlane, - Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane, - Reason::Unknown => ErrorKind::ControlPlane, + + Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded + | Reason::ActiveEndpointsLimitExceeded => ErrorKind::Quota, + + Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken + | Reason::RunningOperations + | Reason::EndpointIdle + | Reason::ProjectUnderMaintenance + | Reason::Unknown => ErrorKind::ControlPlane, }, - ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane, + ControlPlaneError::Transport(_) => ErrorKind::ControlPlane, } } } @@ -120,10 +127,10 @@ impl UserFacingError for GetAuthInfoError { } impl ReportableError for GetAuthInfoError { - fn get_error_kind(&self) -> crate::error::ErrorKind { + fn get_error_kind(&self) -> ErrorKind { match self { - Self::BadSecret => crate::error::ErrorKind::ControlPlane, - Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, + Self::BadSecret => ErrorKind::ControlPlane, + Self::ApiError(_) => ErrorKind::ControlPlane, } } } diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index f0314f91f0..cf193ed268 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -126,10 +126,16 @@ pub(crate) enum Reason { /// or that the subject doesn't have enough permissions to access the requested endpoint. 
#[serde(rename = "ENDPOINT_NOT_FOUND")] EndpointNotFound, + /// EndpointDisabled indicates that the endpoint has been disabled and does not accept connections. + #[serde(rename = "ENDPOINT_DISABLED")] + EndpointDisabled, /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct, /// or that the subject doesn't have enough permissions to access the requested branch. #[serde(rename = "BRANCH_NOT_FOUND")] BranchNotFound, + /// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong. + #[serde(rename = "INVALID_EPHEMERAL_OPTIONS")] + InvalidEphemeralEndpointOptions, /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded. #[serde(rename = "RATE_LIMIT_EXCEEDED")] RateLimitExceeded, @@ -152,6 +158,9 @@ pub(crate) enum Reason { /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded. #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] LogicalSizeQuotaExceeded, + /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded. + #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")] + ActiveEndpointsLimitExceeded, /// RunningOperations indicates that the project already has some running operations /// and scheduling of new ones is prohibited. #[serde(rename = "RUNNING_OPERATIONS")] @@ -162,9 +171,13 @@ pub(crate) enum Reason { /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken. #[serde(rename = "LOCK_ALREADY_TAKEN")] LockAlreadyTaken, - /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded. - #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")] - ActiveEndpointsLimitExceeded, + /// EndpointIdle indicates that the endpoint cannot become active, because it's idle. + #[serde(rename = "ENDPOINT_IDLE")] + EndpointIdle, + /// ProjectUnderMaintenance indicates that the project is currently ongoing maintenance, + /// and thus cannot accept connections. + #[serde(rename = "PROJECT_UNDER_MAINTENANCE")] + ProjectUnderMaintenance, #[default] #[serde(other)] Unknown, @@ -184,13 +197,15 @@ impl Reason { pub(crate) fn can_retry(self) -> bool { match self { // do not retry role protected errors - // not a transitive error + // not a transient error Reason::RoleProtected => false, - // on retry, it will still not be found + // on retry, it will still not be found or valid Reason::ResourceNotFound | Reason::ProjectNotFound | Reason::EndpointNotFound - | Reason::BranchNotFound => false, + | Reason::EndpointDisabled + | Reason::BranchNotFound + | Reason::InvalidEphemeralEndpointOptions => false, // we were asked to go away Reason::RateLimitExceeded | Reason::NonDefaultBranchComputeTimeExceeded @@ -200,11 +215,13 @@ impl Reason { | Reason::DataTransferQuotaExceeded | Reason::LogicalSizeQuotaExceeded | Reason::ActiveEndpointsLimitExceeded => false, - // transitive error. control plane is currently busy + // transient error. control plane is currently busy // but might be ready soon Reason::RunningOperations | Reason::ConcurrencyLimitReached - | Reason::LockAlreadyTaken => true, + | Reason::LockAlreadyTaken + | Reason::EndpointIdle + | Reason::ProjectUnderMaintenance => true, // unknown error. better not retry it. 
Reason::Unknown => false, } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 08c81afa04..02651109e0 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -195,15 +195,18 @@ impl NeonOptions { // proxy options: /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute. - pub const PARAMS_COMPAT: &str = "proxy_params_compat"; + pub const PARAMS_COMPAT: &'static str = "proxy_params_compat"; // cplane options: /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN. - const LSN: &str = "lsn"; + const LSN: &'static str = "lsn"; + + /// `TIMESTAMP` allows provisioning an ephemeral compute with time-travel to the provided timestamp. + const TIMESTAMP: &'static str = "timestamp"; /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write. - const ENDPOINT_TYPE: &str = "endpoint_type"; + const ENDPOINT_TYPE: &'static str = "endpoint_type"; pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params @@ -228,6 +231,7 @@ impl NeonOptions { // This is not a cplane option, we know it does not create ephemeral computes. Self::PARAMS_COMPAT => false, Self::LSN => true, + Self::TIMESTAMP => true, Self::ENDPOINT_TYPE => true, // err on the side of caution. any cplane options we don't know about // might lead to ephemeral computes.
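
For reference, a self-contained sketch of the TTL arithmetic that PATCH 56
applies to cached wake_compute errors: the remaining retry delay is the
control-plane-supplied retry_delay_ms minus the time the entry has already
spent in the cache, saturating at zero. The function and names below are
simplified stand-ins for illustration, not the proxy's actual types:

```rust
use std::time::{Duration, Instant};

/// Remaining retry delay for an error entry that was cached at `created_at`.
/// `saturating_sub` clamps the result to zero once the delay has fully
/// elapsed, mirroring the adjustment made when a cached error is served.
fn remaining_retry_delay(retry_delay_ms: u64, created_at: Instant) -> Duration {
    let elapsed_ms = created_at.elapsed().as_millis() as u64;
    Duration::from_millis(retry_delay_ms.saturating_sub(elapsed_ms))
}

fn main() {
    // A freshly cached entry with a 30s retry delay: nearly the full delay remains.
    let created_at = Instant::now();
    println!("{:?}", remaining_retry_delay(30_000, created_at));
}
```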