fix deps

fix lints
update measured properly
2026-05-18 13:40:37 +00:00 · 2025-07-11 14:46:56 +01:00 · 2025-07-11 14:18:30 +01:00 · 2025-07-11 13:59:21 +01:00 · 2025-07-11 13:59:21 +01:00 · 2025-07-11 13:59:21 +01:00
100 changed files with 2988 additions and 832 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1009,6 +1009,12 @@ dependencies = [
 "generic-array",
 ]

+[[package]]
+name = "boxcar"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26c4925bc979b677330a8c7fe7a8c94af2dbb4a2d37b4a20a80d884400f46baa"
+
 [[package]]
 name = "bstr"
 version = "1.5.0"
@@ -1242,14 +1248,14 @@ checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"

 [[package]]
 name = "clashmap"
-version = "1.0.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d"
+checksum = "e8a055b1f1bf558eae4959f6dd77cf2d7d50ae1483928e60ef21ca5a24fd4321"
 dependencies = [
 "crossbeam-utils",
 "hashbrown 0.15.2",
 "lock_api",
- "parking_lot_core 0.9.8",
+ "parking_lot_core 0.9.10",
 "polonius-the-crab",
 "replace_with",
 ]
@@ -1753,19 +1759,6 @@ dependencies = [
 "syn 2.0.100",
 ]

-[[package]]
-name = "dashmap"
-version = "5.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
-dependencies = [
- "cfg-if",
- "hashbrown 0.14.5",
- "lock_api",
- "once_cell",
- "parking_lot_core 0.9.8",
-]
-
 [[package]]
 name = "dashmap"
 version = "6.1.0"
@@ -1777,7 +1770,7 @@ dependencies = [
 "hashbrown 0.14.5",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.8",
+ "parking_lot_core 0.9.10",
 ]

 [[package]]
@@ -2343,6 +2336,12 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"

+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.2.1"
@@ -2582,7 +2581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
 dependencies = [
 "cfg-if",
- "dashmap 6.1.0",
+ "dashmap",
 "futures-sink",
 "futures-timer",
 "futures-util",
@@ -3288,7 +3287,7 @@ dependencies = [
 "clap",
 "crossbeam-channel",
 "crossbeam-utils",
- "dashmap 6.1.0",
+ "dashmap",
 "env_logger",
 "indexmap 2.9.0",
 "itoa",
@@ -3545,16 +3544,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "lasso"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
-dependencies = [
- "dashmap 5.5.0",
- "hashbrown 0.13.2",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.5.0"
@@ -3674,17 +3663,17 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.22"
+version = "0.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
+checksum = "d22ae866c28b9c59afaeb488ad6e3bd148570cf5a2bacf6c4845def5b9a03470"
 dependencies = [
 "bytes",
 "crossbeam-utils",
 "hashbrown 0.14.5",
 "itoa",
- "lasso",
 "measured-derive",
 "memchr",
+ "paracord",
 "parking_lot 0.12.1",
 "rustc-hash 1.1.0",
 "ryu",
@@ -3692,9 +3681,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.22"
+version = "0.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
+checksum = "16d734ed9dbca22d87d56b54d220f254ce921cb5cce97d4a960075af0131d076"
 dependencies = [
 "heck",
 "proc-macro2",
@@ -3704,15 +3693,26 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.22"
+version = "0.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
+checksum = "f71a318d2b9edcded1e5e0ccf3c9e1e1614217d7f07933631c771daa717743aa"
 dependencies = [
 "libc",
 "measured",
 "procfs",
 ]

+[[package]]
+name = "measured-tokio"
+version = "0.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e79a936051c484268e2d71a5dd01219096c62c8ff09afad95f07c14a3b0c580"
+dependencies = [
+ "itoa",
+ "measured",
+ "tokio",
+]
+
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -4294,6 +4294,7 @@ dependencies = [
 "humantime-serde",
 "pageserver_api",
 "pageserver_client",
+ "pageserver_client_grpc",
 "pageserver_page_api",
 "rand 0.8.5",
 "reqwest",
@@ -4323,6 +4324,7 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "remote_storage",
+ "serde",
 "serde_json",
 "svg_fmt",
 "thiserror 1.0.69",
@@ -4499,6 +4501,7 @@ name = "pageserver_client_grpc"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "arc-swap",
 "bytes",
 "compute_api",
 "futures",
@@ -4506,6 +4509,7 @@ dependencies = [
 "pageserver_page_api",
 "tokio",
 "tokio-stream",
+ "tokio-util",
 "tonic 0.13.1",
 "tracing",
 "utils",
@@ -4567,6 +4571,21 @@ dependencies = [
 "seize",
 ]

+[[package]]
+name = "paracord"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e41e924113f9a05eecd561c6e695648f243a03b3ad8d9b3eff689342b95023b"
+dependencies = [
+ "boxcar",
+ "clashmap",
+ "foldhash",
+ "hashbrown 0.15.2",
+ "serde",
+ "sync_wrapper 1.0.1",
+ "typed-arena",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
@@ -4591,7 +4610,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.8",
+ "parking_lot_core 0.9.10",
 ]

 [[package]]
@@ -4610,15 +4629,15 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.8"
+version = "0.9.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
+checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.3.5",
+ "redox_syscall 0.5.10",
 "smallvec",
- "windows-targets 0.48.0",
+ "windows-targets 0.52.6",
 ]

 [[package]]
@@ -5324,13 +5343,14 @@ dependencies = [
 "itoa",
 "jose-jwa",
 "jose-jwk",
- "lasso",
 "measured",
+ "measured-tokio",
 "metrics",
 "once_cell",
 "opentelemetry",
 "p256 0.13.2",
 "papaya",
+ "paracord",
 "parking_lot 0.12.1",
 "parquet",
 "parquet_derive",
@@ -5633,6 +5653,15 @@ dependencies = [
 "bitflags 1.3.2",
 ]

+[[package]]
+name = "redox_syscall"
+version = "0.5.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b8c0c260b63a8219631167be35e6a988e9554dbd323f8bd08439c8ed1302bd1"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "regex"
 version = "1.10.2"
@@ -6893,12 +6922,12 @@ dependencies = [
 "hyper 0.14.30",
 "itertools 0.10.5",
 "json-structural-diff",
- "lasso",
 "measured",
 "metrics",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
+ "paracord",
 "postgres_connection",
 "posthog_client_lite",
 "rand 0.8.5",
@@ -6987,6 +7016,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "reqwest",
+ "safekeeper_api",
 "serde_json",
 "storage_controller_client",
 "tokio",
@@ -8012,6 +8042,12 @@ dependencies = [
 "static_assertions",
 ]

+[[package]]
+name = "typed-arena"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a"
+
 [[package]]
 name = "typed-json"
 version = "0.1.1"
@@ -8756,6 +8792,7 @@ dependencies = [
 "num-traits",
 "once_cell",
 "p256 0.13.2",
+ "paracord",
 "parquet",
 "prettyplease",
 "proc-macro2",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -128,11 +128,11 @@ itertools = "0.10"
 itoa = "1.0.11"
 jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
 jsonwebtoken = "9"
-lasso = "0.7"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.22", features=["lasso"] }
-measured-process = { version = "0.0.22" }
+measured = { version = "0.0.23", features = ["paracord"] }
+measured-process = { version = "0.0.23" }
+measured-tokio = { version = "0.0.23" }
 memoffset = "0.9"
 nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
 # Do not update to >= 7.0.0, at least. The update will have a significant impact
@@ -145,6 +145,7 @@ opentelemetry = "0.27"
 opentelemetry_sdk = "0.27"
 opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.27"
+paracord = { version = "0.1.0", features = ["serde"] }
 parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
@@ -262,6 +263,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -46,11 +46,14 @@ stateDiagram-v2
  Configuration --> Failed : Failed to configure the compute
  Configuration --> Running : Compute has been configured
  Empty --> Init : Compute spec is immediately available
-  Empty --> TerminationPending : Requested termination
+  Empty --> TerminationPendingFast : Requested termination
+  Empty --> TerminationPendingImmediate : Requested termination
  Init --> Failed : Failed to start Postgres
  Init --> Running : Started Postgres
-  Running --> TerminationPending : Requested termination
-  TerminationPending --> Terminated : Terminated compute
+  Running --> TerminationPendingFast : Requested termination
+  Running --> TerminationPendingImmediate : Requested termination
+  TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
+  TerminationPendingImmediate --> Terminated : Terminated compute immediately
  Failed --> [*] : Compute exited
  Terminated --> [*] : Compute exited
 ```
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -956,14 +956,20 @@ impl ComputeNode {
            None
        };

-        let mut delay_exit = false;
        let mut state = self.state.lock().unwrap();
        state.terminate_flush_lsn = lsn;
-        if let ComputeStatus::TerminationPending { mode } = state.status {
+
+        let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
+        if state.status == ComputeStatus::TerminationPendingFast
+            || state.status == ComputeStatus::TerminationPendingImmediate
+        {
+            info!(
+                "Changing compute status from {} to {}",
+                state.status,
+                ComputeStatus::Terminated
+            );
            state.status = ComputeStatus::Terminated;
            self.state_changed.notify_all();
-            // we were asked to terminate gracefully, don't exit to avoid restart
-            delay_exit = mode == compute_api::responses::TerminateMode::Fast
        }
        drop(state);

@@ -1805,6 +1811,8 @@ impl ComputeNode {
            tls_config,
        )?;

+        self.pg_reload_conf()?;
+
        if !spec.skip_pg_catalog_updates {
            let max_concurrent_connections = spec.reconfigure_concurrency;
            // Temporarily reset max_cluster_size in config
@@ -1824,10 +1832,9 @@ impl ComputeNode {

                Ok(())
            })?;
+            self.pg_reload_conf()?;
        }

-        self.pg_reload_conf()?;
-
        let unknown_op = "unknown".to_string();
        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
        info!(
@@ -1900,7 +1907,8 @@ impl ComputeNode {

                            // exit loop
                            ComputeStatus::Failed
-                            | ComputeStatus::TerminationPending { .. }
+                            | ComputeStatus::TerminationPendingFast
+                            | ComputeStatus::TerminationPendingImmediate
                            | ComputeStatus::Terminated => break 'cert_update,

                            // wait
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -70,7 +70,7 @@ impl ComputeNode {
            }
        };
        let row = match client
-            .query_one("select * from get_prewarm_info()", &[])
+            .query_one("select * from neon.get_prewarm_info()", &[])
            .await
        {
            Ok(row) => row,
@@ -146,7 +146,7 @@ impl ComputeNode {
        ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?
-            .query_one("select prewarm_local_cache($1)", &[&uncompressed])
+            .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
            .await
            .context("loading LFC state into postgres")
            .map(|_| ())
@@ -196,7 +196,7 @@ impl ComputeNode {
        ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?
-            .query_one("select get_local_cache_state()", &[])
+            .query_one("select neon.get_local_cache_state()", &[])
            .await
            .context("querying LFC state")?
            .try_get::<usize, &[u8]>(0)
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -371,9 +371,28 @@ paths:
      summary: Terminate Postgres and wait for it to exit
      description: ""
      operationId: terminate
+      parameters:
+        - name: mode
+          in: query
+          description: "Terminate mode: fast (wait 30s before returning) and immediate"
+          required: false
+          schema:
+            type: string
+            enum: ["fast", "immediate"]
+            default: fast
      responses:
        200:
          description: Result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
+        201:
+          description: Result if compute is already terminated
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
        412:
          description: "wrong state"
          content:
@@ -530,11 +549,14 @@ components:
      type: string
      enum:
        - empty
-        - init
-        - failed
-        - running
        - configuration_pending
+        - init
+        - running
        - configuration
+        - failed
+        - termination_pending_fast
+        - termination_pending_immediate
+        - terminated
      example: running

    ExtensionInstallRequest:
@@ -660,6 +682,17 @@ components:
          description: Role name.
          example: "neon"

+    TerminateResponse:
+      type: object
+      required:
+        - lsn
+      properties:
+        lsn:
+          type: string
+          nullable: true
+          description: "last WAL flush LSN"
+          example: "0/028F10D8"
+
    SetRoleGrantsResponse:
      type: object
      required:
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
 use axum::extract::State;
 use axum::response::Response;
 use axum_extra::extract::OptionalQuery;
-use compute_api::responses::{ComputeStatus, TerminateResponse};
+use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
 use http::StatusCode;
 use serde::Deserialize;
 use std::sync::Arc;
@@ -12,7 +12,7 @@ use tracing::info;

 #[derive(Deserialize, Default)]
 pub struct TerminateQuery {
-    mode: compute_api::responses::TerminateMode,
+    mode: TerminateMode,
 }

 /// Terminate the compute.
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
    {
        let mut state = compute.state.lock().unwrap();
        if state.status == ComputeStatus::Terminated {
-            return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
+            let response = TerminateResponse {
+                lsn: state.terminate_flush_lsn,
+            };
+            return JsonResponse::success(StatusCode::CREATED, response);
        }

        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
            return JsonResponse::invalid_status(state.status);
        }
-        state.set_status(
-            ComputeStatus::TerminationPending { mode },
-            &compute.state_changed,
-        );
+        state.set_status(mode.into(), &compute.state_changed);
    }

    forward_termination_signal(false);
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "compute_ctl_lfc_prewarm_errors_total",
-        "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option",
+        "Total number of LFC prewarm errors",
    )
    .expect("failed to define a metric")
 });
@@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "compute_ctl_lfc_offload_errors_total",
-        "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option",
+        "Total number of LFC offload errors",
    )
    .expect("failed to define a metric")
 });
--- a/compute_tools/src/migrations/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/0002-alter_roles.sql
@@ -1,3 +1,16 @@
+-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
+-- it was found that BYPASSRLS was being applied to all roles.
+--
+-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
+-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
+--
+-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
+-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
+-- keep the migration around for a long time, if not indefinitely, so any
+-- cluster can be fixed.
+--
+-- Branching is the gift that keeps on giving...
+
 DO $$
 DECLARE
    role_name text;
--- a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
--- a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1,23 @@
+DO $$
+DECLARE
+    signal_backend record;
+BEGIN
+    SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
+            admin_option AS admin
+        INTO signal_backend
+        FROM pg_auth_members
+        WHERE roleid = 'pg_signal_backend'::regrole
+            AND member = 'neon_superuser'::regrole;
+
+    IF signal_backend IS NULL THEN
+        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
+    END IF;
+
+    IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
+        RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
+    END IF;
+
+    IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
+        RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
+    END IF;
+END $$;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -84,7 +84,8 @@ impl ComputeMonitor {
        if matches!(
            compute_status,
            ComputeStatus::Terminated
-                | ComputeStatus::TerminationPending { .. }
+                | ComputeStatus::TerminationPendingFast
+                | ComputeStatus::TerminationPendingImmediate
                | ComputeStatus::Failed
        ) {
            info!(
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
        include_str!(
            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
        ),
+        include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
    ];

    MigrationRunner::new(client, &migrations)
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -36,7 +36,7 @@ impl StorageBroker {
    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        let broker = &self.env.broker;

-        print!("Starting neon broker at {}", broker.client_url());
+        println!("Starting neon broker at {}", broker.client_url());

        let mut args = Vec::new();

--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -922,7 +922,8 @@ impl Endpoint {
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
                        | ComputeStatus::Configuration
-                        | ComputeStatus::TerminationPending { .. }
+                        | ComputeStatus::TerminationPendingFast
+                        | ComputeStatus::TerminationPendingImmediate
                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -303,7 +303,7 @@ impl PageServerNode {
    async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
-        print!(
+        println!(
            "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
            self.conf.id,
            self.pg_connection_config.raw_address(),
@@ -452,6 +452,12 @@ impl PageServerNode {
                .map(|x| x.parse::<usize>())
                .transpose()
                .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+            // HADRON
+            image_layer_force_creation_period: settings
+                .remove("image_layer_force_creation_period")
+                .map(humantime::parse_duration)
+                .transpose()
+                .context("Failed to parse 'image_layer_force_creation_period' as duration")?,
            image_layer_creation_check_threshold: settings
                .remove("image_layer_creation_check_threshold")
                .map(|x| x.parse::<u8>())
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -127,7 +127,7 @@ impl SafekeeperNode {
        extra_opts: &[String],
        retry_timeout: &Duration,
    ) -> anyhow::Result<()> {
-        print!(
+        println!(
            "Starting safekeeper at '{}' in '{}', retrying for {:?}",
            self.pg_connection_config.raw_address(),
            self.datadir_path().display(),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -660,7 +660,7 @@ impl StorageController {
            ));
        }

-        println!("Starting storage controller");
+        println!("Starting storage controller at {scheme}://{host}:{listen_port}");

        background_process::start_process(
            COMMAND,
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -14,6 +14,7 @@ humantime.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
+safekeeper_api.workspace=true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
 tokio.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
    PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
    ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
    SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
 };
 use pageserver_api::models::{
    EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
@@ -21,6 +21,7 @@ use pageserver_api::models::{
 use pageserver_api::shard::{ShardStripeSize, TenantShardId};
 use pageserver_client::mgmt_api::{self};
 use reqwest::{Certificate, Method, StatusCode, Url};
+use safekeeper_api::models::TimelineLocateResponse;
 use storage_controller_client::control_api::Client;
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -279,6 +280,23 @@ enum Command {
        #[arg(long)]
        concurrency: Option<usize>,
    },
+    /// Locate safekeepers for a timeline from the storcon DB.
+    TimelineLocate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+    },
+    /// Migrate a timeline to a new set of safekeepers
+    TimelineSafekeeperMigrate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+        /// Example: --new-sk-set 1,2,3
+        #[arg(long, required = true, value_delimiter = ',')]
+        new_sk_set: Vec<NodeId>,
+    },
 }

 #[derive(Parser)]
@@ -1324,7 +1342,7 @@ async fn main() -> anyhow::Result<()> {
            concurrency,
        } => {
            let mut path = format!(
-                "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
+                "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
            );

            if let Some(c) = concurrency {
@@ -1335,6 +1353,41 @@ async fn main() -> anyhow::Result<()> {
                .dispatch::<(), ()>(Method::POST, path, None)
                .await?;
        }
+        Command::TimelineLocate {
+            tenant_id,
+            timeline_id,
+        } => {
+            let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
+
+            let resp = storcon_client
+                .dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
+                .await?;
+
+            let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
+            let new_sk_set = resp
+                .new_sk_set
+                .as_ref()
+                .map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
+
+            println!("generation = {}", resp.generation);
+            println!("sk_set = {sk_set:?}");
+            println!("new_sk_set = {new_sk_set:?}");
+        }
+        Command::TimelineSafekeeperMigrate {
+            tenant_id,
+            timeline_id,
+            new_sk_set,
+        } => {
+            let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
+
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::POST,
+                    path,
+                    Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
+                )
+                .await?;
+        }
    }

    Ok(())
--- a/deny.toml
+++ b/deny.toml
@@ -52,6 +52,7 @@ exceptions = [
    # Zlib license has some restrictions if we decide to change sth
    { allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
    { allow = ["Zlib"], name = "const_format", version = "*" },
+    { allow = ["Zlib"], name = "foldhash", version = "*" },
 ]

 [licenses.private]
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -121,6 +121,15 @@ pub enum TerminateMode {
    Immediate,
 }

+impl From<TerminateMode> for ComputeStatus {
+    fn from(mode: TerminateMode) -> Self {
+        match mode {
+            TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
+            TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
+        }
+    }
+}
+
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -141,7 +150,9 @@ pub enum ComputeStatus {
    // control-plane to terminate it.
    Failed,
    // Termination requested
-    TerminationPending { mode: TerminateMode },
+    TerminationPendingFast,
+    // Termination requested, without waiting 30s before returning from /terminate
+    TerminationPendingImmediate,
    // Terminated Postgres
    Terminated,
 }
@@ -160,7 +171,10 @@ impl Display for ComputeStatus {
            ComputeStatus::Running => f.write_str("running"),
            ComputeStatus::Configuration => f.write_str("configuration"),
            ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
+            ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
+            ComputeStatus::TerminationPendingImmediate => {
+                f.write_str("termination-pending-immediate")
+            }
            ComputeStatus::Terminated => f.write_str("terminated"),
        }
    }
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{Instrument, debug, info, info_span, warn};
 use utils::auth::{AuthError, Claims, SwappableJwtAuth};
+use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};

 use crate::error::{ApiError, api_error_handler, route_error_handler};
 use crate::request::{get_query_param, parse_query_param};
@@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter {
    }
 }

-pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(
+    req: Request<Body>,
+    force_metric_collection_on_scrape: bool,
+) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

+    // HADRON
+    let requested_use_latest = parse_query_param(&req, "use_latest")?;
+
+    let use_latest = match requested_use_latest {
+        None => force_metric_collection_on_scrape,
+        Some(true) => true,
+        Some(false) => {
+            if force_metric_collection_on_scrape {
+                // We don't cache in this case
+                true
+            } else {
+                false
+            }
+        }
+    };
+
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
@@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<

        let _span = span.entered();

-        let metrics = metrics::gather();
+        // HADRON
+        let collected = if use_latest {
+            // Skip caching the results if we always force metric collection on scrape.
+            METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
+        } else {
+            METRICS_COLLECTOR.last_collected()
+        };

        let gathered_at = std::time::Instant::now();

        let res = encoder
-            .encode(&metrics, &mut writer)
+            .encode(&collected.metrics, &mut writer)
            .and_then(|_| writer.flush().map_err(|e| e.into()));

        // this instant is not when we finally got the full response sent, sending is done by hyper
@@ -295,6 +321,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
        let encoded_in = encoded_at - gathered_at - writer.wait_time();
        let total = encoded_at - started_at;

+        // HADRON
+        let staleness_ms = (encoded_at - collected.collected_at).as_millis();
+        METRICS_STALE_MILLIS.set(staleness_ms as i64);
+
        match res {
            Ok(()) => {
                tracing::info!(
@@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
                    spawning_ms = spawned_in.as_millis(),
                    collection_ms = collected_in.as_millis(),
                    encoding_ms = encoded_in.as_millis(),
+                    stalenss_ms = staleness_ms,
                    "responded /metrics"
                );
            }
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -272,6 +272,9 @@ pub struct ConfigToml {
    pub timeline_import_config: TimelineImportConfig,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
+    pub force_metric_collection_on_scrape: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -561,6 +564,11 @@ pub struct TenantConfigToml {
    pub gc_period: Duration,
    // Delta layer churn threshold to create L1 image layers.
    pub image_creation_threshold: usize,
+    // HADRON
+    // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
+    // (2) create image layers if there are any L1 deltas.
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is time.
@@ -823,6 +831,8 @@ impl Default for ConfigToml {
            },
            basebackup_cache_config: None,
            posthog_config: None,
+            image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
+            force_metric_collection_on_scrape: true,
        }
    }
 }
@@ -916,6 +926,7 @@ impl Default for TenantConfigToml {
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
+            image_layer_force_creation_period: None,
            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
                .expect("cannot parse default PITR interval"),
            walreceiver_connect_timeout: humantime::parse_duration(
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;

-use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
+use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
 use crate::shard::{ShardStripeSize, TenantShardId};

 #[derive(Serialize, Deserialize, Debug)]
@@ -126,6 +126,13 @@ pub struct TenantDescribeResponse {
    pub config: TenantConfig,
 }

+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantTimelineDescribeResponse {
+    pub shards: Vec<TimelineInfo>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct NodeShardResponse {
    pub node_id: NodeId,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -597,6 +597,9 @@ pub struct TenantConfigPatch {
    pub gc_period: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_creation_threshold: FieldPatch<usize>,
+    // HADRON
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_force_creation_period: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub pitr_interval: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
@@ -700,6 +703,11 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_creation_threshold: Option<usize>,

+    // HADRON
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Option<Duration>,
@@ -798,6 +806,7 @@ impl TenantConfig {
            mut gc_horizon,
            mut gc_period,
            mut image_creation_threshold,
+            mut image_layer_force_creation_period,
            mut pitr_interval,
            mut walreceiver_connect_timeout,
            mut lagging_wal_timeout,
@@ -861,6 +870,11 @@ impl TenantConfig {
        patch
            .image_creation_threshold
            .apply(&mut image_creation_threshold);
+        // HADRON
+        patch
+            .image_layer_force_creation_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut image_layer_force_creation_period);
        patch
            .pitr_interval
            .map(|v| humantime::parse_duration(&v))?
@@ -942,6 +956,7 @@ impl TenantConfig {
            gc_horizon,
            gc_period,
            image_creation_threshold,
+            image_layer_force_creation_period,
            pitr_interval,
            walreceiver_connect_timeout,
            lagging_wal_timeout,
@@ -1016,6 +1031,9 @@ impl TenantConfig {
            image_creation_threshold: self
                .image_creation_threshold
                .unwrap_or(global_conf.image_creation_threshold),
+            image_layer_force_creation_period: self
+                .image_layer_force_creation_period
+                .or(global_conf.image_layer_force_creation_period),
            pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
            walreceiver_connect_timeout: self
                .walreceiver_connect_timeout
@@ -1604,6 +1622,9 @@ pub struct TimelineInfo {

    /// Whether the timeline is invisible in synthetic size calculations.
    pub is_invisible: Option<bool>,
+    // HADRON: the largest LSN below which all page updates have been included in the image layers.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;

-use crate::membership::Configuration;
+use crate::membership::{Configuration, SafekeeperGeneration};
 use crate::{ServerInfo, Term};

 #[derive(Debug, Serialize, Deserialize)]
@@ -311,3 +311,12 @@ pub struct PullTimelineResponse {
    pub safekeeper_host: Option<String>,
    // TODO: add more fields?
 }
+
+/// Response to a timeline locate request.
+/// Storcon-only API.
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineLocateResponse {
+    pub generation: SafekeeperGeneration,
+    pub sk_set: Vec<NodeId>,
+    pub new_sk_set: Option<Vec<NodeId>>,
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -99,6 +99,8 @@ pub mod elapsed_accum;
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;

+pub mod metrics_collector;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/metrics_collector.rs
+++ b/libs/utils/src/metrics_collector.rs
@@ -0,0 +1,75 @@
+use std::{
+    sync::{Arc, RwLock},
+    time::{Duration, Instant},
+};
+
+use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
+use once_cell::sync::Lazy;
+
+pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "metrics_metrics_stale_milliseconds",
+        "The current metrics stale time in milliseconds"
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(Debug)]
+pub struct CollectedMetrics {
+    pub metrics: Vec<MetricFamily>,
+    pub collected_at: Instant,
+}
+
+impl CollectedMetrics {
+    fn new(metrics: Vec<MetricFamily>) -> Self {
+        Self {
+            metrics,
+            collected_at: Instant::now(),
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct MetricsCollector {
+    last_collected: RwLock<Arc<CollectedMetrics>>,
+}
+
+impl MetricsCollector {
+    pub fn new() -> Self {
+        Self {
+            last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
+        }
+    }
+
+    #[tracing::instrument(name = "metrics_collector", skip_all)]
+    pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
+        let started = Instant::now();
+        let metrics = metrics::gather();
+        let collected = Arc::new(CollectedMetrics::new(metrics));
+        if cache_metrics {
+            let mut guard = self.last_collected.write().unwrap();
+            *guard = collected.clone();
+        }
+        tracing::info!(
+            "Collected {} metric families in {} ms",
+            collected.metrics.len(),
+            started.elapsed().as_millis()
+        );
+        collected
+    }
+
+    pub fn last_collected(&self) -> Arc<CollectedMetrics> {
+        self.last_collected.read().unwrap().clone()
+    }
+}
+
+impl Default for MetricsCollector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+// Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent
+pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
+
+pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        shard_number: 0,
    };

+    let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
+        should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
+        sent_bytes: 0,
+        last_recorded_time_us: 0,
+    };
+
    crate::bindings::WalproposerShmemState {
        propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
        donor_name: [0; 64],
@@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        num_shards: 0,
        replica_promote: false,
        min_ps_feedback: empty_feedback,
+        wal_rate_limiter: empty_wal_rate_limiter,
    }
 }

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::error::Error as _;
 use std::time::Duration;

@@ -251,6 +251,70 @@ impl Client {
        Ok(())
    }

+    pub async fn tenant_timeline_compact(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        force_image_layer_creation: bool,
+        must_force_image_layer_creation: bool,
+        scheduled: bool,
+        wait_until_done: bool,
+    ) -> Result<()> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+
+        if force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("force_image_layer_creation", "true");
+        }
+
+        if must_force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("must_force_image_layer_creation", "true");
+        }
+
+        if scheduled {
+            path.query_pairs_mut().append_pair("scheduled", "true");
+        }
+        if wait_until_done {
+            path.query_pairs_mut()
+                .append_pair("wait_until_scheduled_compaction_done", "true");
+            path.query_pairs_mut()
+                .append_pair("wait_until_uploaded", "true");
+        }
+        self.request(Method::PUT, path, ()).await?;
+        Ok(())
+    }
+
+    /* BEGIN_HADRON */
+    pub async fn tenant_timeline_describe(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Result<TimelineInfo> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+        path.query_pairs_mut()
+            .append_pair("include-image-consistent-lsn", "true");
+
+        let response: reqwest::Response = self.request(Method::GET, path, ()).await?;
+        let body = response.json().await.map_err(Error::ReceiveBody)?;
+        Ok(body)
+    }
+
+    pub async fn list_tenant_visible_size(&self) -> Result<BTreeMap<TenantShardId, u64>> {
+        let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint);
+        let resp = self.get(&uri).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+    /* END_HADRON */
+
    pub async fn tenant_scan_remote_storage(
        &self,
        tenant_id: TenantId,
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -4,8 +4,12 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

+[features]
+testing = ["pageserver_api/testing"]
+
 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 bytes.workspace = true
 compute_api.workspace = true
 futures.workspace = true
@@ -13,6 +17,7 @@ pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
+tokio-util.workspace = true
 tonic.workspace = true
 tracing.workspace = true
 utils.workspace = true
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -3,8 +3,10 @@ use std::num::NonZero;
 use std::sync::Arc;

 use anyhow::anyhow;
+use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
+use tonic::codec::CompressionEncoding;
 use tracing::instrument;

 use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
@@ -55,28 +57,85 @@ const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
 /// TODO: this client does not support base backups or LSN leases, as these are only used by
 /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
 pub struct PageserverClient {
-    // TODO: support swapping out the shard map, e.g. via an ArcSwap.
-    shards: Shards,
+    /// The tenant ID.
+    tenant_id: TenantId,
+    /// The timeline ID.
+    timeline_id: TimelineId,
+    /// The JWT auth token for this tenant, if any.
+    auth_token: Option<String>,
+    /// The compression to use, if any.
+    compression: Option<CompressionEncoding>,
+    /// The shards for this tenant.
+    shards: ArcSwap<Shards>,
+    /// The retry configuration.
    retry: Retry,
 }

 impl PageserverClient {
    /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
-    /// in the shard map, which must be complete and must use gRPC URLs.
+    /// in the shard spec, which must be complete and must use gRPC URLs.
    pub fn new(
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        shard_map: HashMap<ShardIndex, String>,
-        stripe_size: ShardStripeSize,
+        shard_spec: ShardSpec,
        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
    ) -> anyhow::Result<Self> {
-        let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;
+        let shards = Shards::new(
+            tenant_id,
+            timeline_id,
+            shard_spec,
+            auth_token.clone(),
+            compression,
+        )?;
        Ok(Self {
-            shards,
+            tenant_id,
+            timeline_id,
+            auth_token,
+            compression,
+            shards: ArcSwap::new(Arc::new(shards)),
            retry: Retry,
        })
    }

+    /// Updates the shards from the given shard spec. In-flight requests will complete using the
+    /// existing shards, but may retry with the new shards if they fail.
+    ///
+    /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
+    /// properly spun down and dropped afterwards.
+    pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
+        // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
+        // with concurrent updates, but that involves creating a new `Shards` on every attempt,
+        // which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere
+        // in the stack, and if they're violated then we already have problems elsewhere, so a
+        // best-effort but possibly-racy check is okay here.
+        let old = self.shards.load_full();
+        if shard_spec.count < old.count {
+            return Err(anyhow!(
+                "can't reduce shard count from {} to {}",
+                old.count,
+                shard_spec.count
+            ));
+        }
+        if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
+            return Err(anyhow!(
+                "can't change stripe size from {} to {}",
+                old.stripe_size,
+                shard_spec.stripe_size
+            ));
+        }
+
+        let shards = Shards::new(
+            self.tenant_id,
+            self.timeline_id,
+            shard_spec,
+            self.auth_token.clone(),
+            self.compression,
+        )?;
+        self.shards.store(Arc::new(shards));
+        Ok(())
+    }
+
    /// Returns whether a relation exists.
    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
    pub async fn check_rel_exists(
@@ -84,9 +143,9 @@ impl PageserverClient {
        req: page_api::CheckRelExistsRequest,
    ) -> tonic::Result<page_api::CheckRelExistsResponse> {
        self.retry
-            .with(async || {
+            .with(async |_| {
                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                client.check_rel_exists(req).await
            })
            .await
@@ -99,16 +158,17 @@ impl PageserverClient {
        req: page_api::GetDbSizeRequest,
    ) -> tonic::Result<page_api::GetDbSizeResponse> {
        self.retry
-            .with(async || {
+            .with(async |_| {
                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                client.get_db_size(req).await
            })
            .await
    }

-    /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
-    /// splits requests that straddle shard boundaries, and assembles the responses.
+    /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
+    /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
+    /// shard boundaries, and assembles the responses.
    ///
    /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
    /// errors. All responses will have `GetPageStatusCode::Ok`.
@@ -128,72 +188,96 @@ impl PageserverClient {
        if req.block_numbers.is_empty() {
            return Err(tonic::Status::invalid_argument("no block number"));
        }
+        // The request attempt must be 0. The client will increment it internally.
+        if req.request_id.attempt != 0 {
+            return Err(tonic::Status::invalid_argument("request attempt must be 0"));
+        }

+        // The shards may change while we're fetching pages. We execute the request using a stable
+        // view of the shards (especially important for requests that span shards), but retry the
+        // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
+        // retries and re-splits in some cases where requests span shards, but these are expected to
+        // be rare.
+        //
+        // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
+        // once we figure out how to handle these.
+        self.retry
+            .with(async |attempt| {
+                let mut req = req.clone();
+                req.request_id.attempt = attempt as u32;
+                Self::get_page_with_shards(req, &self.shards.load_full()).await
+            })
+            .await
+    }
+
+    /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
+    /// concurrent shard updates. Does not retry internally, but is retried by `get_page()`.
+    async fn get_page_with_shards(
+        req: page_api::GetPageRequest,
+        shards: &Shards,
+    ) -> tonic::Result<page_api::GetPageResponse> {
        // Fast path: request is for a single shard.
        if let Some(shard_id) =
-            GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size)
+            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
        {
-            return self.get_page_for_shard(shard_id, req).await;
+            return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
        }

        // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
        // reassemble the responses.
-        //
-        // TODO: when we support shard map updates, we need to detect when it changes and re-split
-        // the request on errors.
-        let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size);
+        let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size);

-        let mut shard_requests: FuturesUnordered<_> = splitter
-            .drain_requests()
-            .map(|(shard_id, shard_req)| {
-                // NB: each request will retry internally.
-                self.get_page_for_shard(shard_id, shard_req)
-                    .map(move |result| result.map(|resp| (shard_id, resp)))
-            })
-            .collect();
+        let mut shard_requests = FuturesUnordered::new();
+        for (shard_id, shard_req) in splitter.drain_requests() {
+            let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?)
+                .map(move |result| result.map(|resp| (shard_id, resp)));
+            shard_requests.push(future);
+        }

        while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
            splitter.add_response(shard_id, shard_response)?;
        }

-        splitter.assemble_response()
+        splitter.get_response()
    }

-    /// Fetches pages that belong to the given shard.
-    #[instrument(skip_all, fields(shard = %shard_id))]
-    async fn get_page_for_shard(
-        &self,
-        shard_id: ShardIndex,
+    /// Fetches pages on the given shard. Does not retry internally.
+    async fn get_page_with_shard(
        req: page_api::GetPageRequest,
+        shard: &Shard,
    ) -> tonic::Result<page_api::GetPageResponse> {
-        let resp = self
-            .retry
-            .with(async || {
-                let stream = self
-                    .shards
-                    .get(shard_id)?
-                    .stream(req.request_class.is_bulk())
-                    .await;
-                let resp = stream.send(req.clone()).await?;
+        let stream = shard.stream(req.request_class.is_bulk()).await;
+        let resp = stream.send(req.clone()).await?;

-                // Convert per-request errors into a tonic::Status.
-                if resp.status_code != page_api::GetPageStatusCode::Ok {
-                    return Err(tonic::Status::new(
-                        resp.status_code.into(),
-                        resp.reason.unwrap_or_else(|| String::from("unknown error")),
-                    ));
-                }
+        // Convert per-request errors into a tonic::Status.
+        if resp.status_code != page_api::GetPageStatusCode::Ok {
+            return Err(tonic::Status::new(
+                resp.status_code.into(),
+                resp.reason.unwrap_or_else(|| String::from("unknown error")),
+            ));
+        }

-                Ok(resp)
-            })
-            .await?;
-
-        // Make sure we got the right number of pages.
-        // NB: check outside of the retry loop, since we don't want to retry this.
-        let (expected, actual) = (req.block_numbers.len(), resp.page_images.len());
-        if expected != actual {
+        // Check that we received the expected pages.
+        if req.rel != resp.rel {
            return Err(tonic::Status::internal(format!(
-                "expected {expected} pages for shard {shard_id}, got {actual}",
+                "shard {} returned wrong relation, expected {} got {}",
+                shard.id, req.rel, resp.rel
+            )));
+        }
+        if !req
+            .block_numbers
+            .iter()
+            .copied()
+            .eq(resp.pages.iter().map(|p| p.block_number))
+        {
+            return Err(tonic::Status::internal(format!(
+                "shard {} returned wrong pages, expected {:?} got {:?}",
+                shard.id,
+                req.block_numbers,
+                resp.pages
+                    .iter()
+                    .map(|page| page.block_number)
+                    .collect::<Vec<_>>()
            )));
        }

@@ -207,9 +291,9 @@ impl PageserverClient {
        req: page_api::GetRelSizeRequest,
    ) -> tonic::Result<page_api::GetRelSizeResponse> {
        self.retry
-            .with(async || {
+            .with(async |_| {
                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                client.get_rel_size(req).await
            })
            .await
@@ -222,50 +306,53 @@ impl PageserverClient {
        req: page_api::GetSlruSegmentRequest,
    ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
        self.retry
-            .with(async || {
+            .with(async |_| {
                // SLRU segments are only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                client.get_slru_segment(req).await
            })
            .await
    }
 }

-/// Tracks the tenant's shards.
-struct Shards {
+/// Shard specification for a PageserverClient.
+pub struct ShardSpec {
+    /// Maps shard indices to gRPC URLs.
+    ///
+    /// INVARIANT: every shard 0..count is present, and shard 0 is always present.
+    /// INVARIANT: every URL is valid and uses grpc:// scheme.
+    urls: HashMap<ShardIndex, String>,
    /// The shard count.
    ///
    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
    count: ShardCount,
-    /// The stripe size. Only used for sharded tenants.
+    /// The stripe size for these shards.
    stripe_size: ShardStripeSize,
-    /// Shards by shard index.
-    ///
-    /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`.
-    ///
-    /// INVARIANT: every shard 0..count is present.
-    /// INVARIANT: shard 0 is always present.
-    map: HashMap<ShardIndex, Shard>,
 }

-impl Shards {
-    /// Creates a new set of shards based on a shard map.
-    fn new(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        shard_map: HashMap<ShardIndex, String>,
-        stripe_size: ShardStripeSize,
-        auth_token: Option<String>,
+impl ShardSpec {
+    /// Creates a new shard spec with the given URLs and stripe size. All shards must be given.
+    /// The stripe size may be omitted for unsharded tenants.
+    pub fn new(
+        urls: HashMap<ShardIndex, String>,
+        stripe_size: Option<ShardStripeSize>,
    ) -> anyhow::Result<Self> {
-        let count = match shard_map.len() {
+        // Compute the shard count.
+        let count = match urls.len() {
            0 => return Err(anyhow!("no shards provided")),
            1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
            n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
            n => ShardCount::new(n as u8),
        };

-        let mut map = HashMap::new();
-        for (shard_id, url) in shard_map {
+        // Determine the stripe size. It doesn't matter for unsharded tenants.
+        if stripe_size.is_none() && !count.is_unsharded() {
+            return Err(anyhow!("stripe size must be given for sharded tenants"));
+        }
+        let stripe_size = stripe_size.unwrap_or_default();
+
+        // Validate the shard spec.
+        for (shard_id, url) in &urls {
            // The shard index must match the computed shard count, even for unsharded tenants.
            if shard_id.shard_count != count {
                return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
@@ -276,21 +363,72 @@ impl Shards {
            }
            // The above conditions guarantee that we have all shards 0..count: len() matches count,
            // shard number < count, and numbers are unique (via hashmap).
-            let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?;
-            map.insert(shard_id, shard);
+
+            // Validate the URL.
+            if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc {
+                return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
+            }
        }

        Ok(Self {
+            urls,
            count,
            stripe_size,
-            map,
+        })
+    }
+}
+
+/// Tracks the tenant's shards.
+struct Shards {
+    /// Shards by shard index.
+    ///
+    /// INVARIANT: every shard 0..count is present.
+    /// INVARIANT: shard 0 is always present.
+    by_index: HashMap<ShardIndex, Shard>,
+    /// The shard count.
+    ///
+    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
+    count: ShardCount,
+    /// The stripe size. Only used for sharded tenants.
+    stripe_size: ShardStripeSize,
+}
+
+impl Shards {
+    /// Creates a new set of shards based on a shard spec.
+    fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_spec: ShardSpec,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        // NB: the shard spec has already been validated when constructed.
+        let mut shards = HashMap::with_capacity(shard_spec.urls.len());
+        for (shard_id, url) in shard_spec.urls {
+            shards.insert(
+                shard_id,
+                Shard::new(
+                    url,
+                    tenant_id,
+                    timeline_id,
+                    shard_id,
+                    auth_token.clone(),
+                    compression,
+                )?,
+            );
+        }
+
+        Ok(Self {
+            by_index: shards,
+            count: shard_spec.count,
+            stripe_size: shard_spec.stripe_size,
        })
    }

    /// Looks up the given shard.
    #[allow(clippy::result_large_err)] // TODO: check perf impact
    fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
-        self.map
+        self.by_index
            .get(&shard_id)
            .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
    }
@@ -312,6 +450,8 @@ impl Shards {
 ///   * Bulk client pool: unbounded.
 ///     * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
 struct Shard {
+    /// The shard ID.
+    id: ShardIndex,
    /// Unary gRPC client pool.
    client_pool: Arc<ClientPool>,
    /// GetPage stream pool.
@@ -328,12 +468,8 @@ impl Shard {
        timeline_id: TimelineId,
        shard_id: ShardIndex,
        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
    ) -> anyhow::Result<Self> {
-        // Sanity-check that the URL uses gRPC.
-        if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc {
-            return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
-        }
-
        // Common channel pool for unary and stream requests. Bounded by client/stream pools.
        let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;

@@ -344,6 +480,7 @@ impl Shard {
            timeline_id,
            shard_id,
            auth_token.clone(),
+            compression,
            Some(MAX_UNARY_CLIENTS),
        );

@@ -356,6 +493,7 @@ impl Shard {
                timeline_id,
                shard_id,
                auth_token.clone(),
+                compression,
                None, // unbounded, limited by stream pool
            ),
            Some(MAX_STREAMS),
@@ -371,6 +509,7 @@ impl Shard {
                timeline_id,
                shard_id,
                auth_token,
+                compression,
                None, // unbounded, limited by stream pool
            ),
            Some(MAX_BULK_STREAMS),
@@ -378,6 +517,7 @@ impl Shard {
        );

        Ok(Self {
+            id: shard_id,
            client_pool,
            stream_pool,
            bulk_stream_pool,
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -3,4 +3,4 @@ mod pool;
 mod retry;
 mod split;

-pub use client::PageserverClient;
+pub use client::{PageserverClient, ShardSpec};
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -34,10 +34,13 @@ use std::num::NonZero;
 use std::ops::{Deref, DerefMut};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, Weak};
+use std::time::{Duration, Instant};

 use futures::StreamExt as _;
 use tokio::sync::mpsc::{Receiver, Sender};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use tokio_util::sync::CancellationToken;
+use tonic::codec::CompressionEncoding;
 use tonic::transport::{Channel, Endpoint};
 use tracing::{error, warn};

@@ -45,6 +48,25 @@ use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;

+/// Reap channels/clients/streams that have been idle for this long.
+///
+/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
+/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
+/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
+/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
+/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
+/// channels, and/or stream pool clients.
+const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(180),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};
+
+/// Reap idle resources with this interval.
+const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(10),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};
+
 /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
 /// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
 /// The pool does not limit the number of channels, and instead relies on `ClientPool` or
@@ -52,7 +74,6 @@ use utils::shard::ShardIndex;
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
 ///
-/// TODO: reap idle channels.
 /// TODO: consider prewarming a set of channels, to avoid initial connection latency.
 /// TODO: consider adding a circuit breaker for errors and fail fast.
 pub struct ChannelPool {
@@ -62,6 +83,8 @@ pub struct ChannelPool {
    max_clients_per_channel: NonZero<usize>,
    /// Open channels.
    channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
+    /// Reaps idle channels.
+    idle_reaper: Reaper,
    /// Channel ID generator.
    next_channel_id: AtomicUsize,
 }
@@ -73,6 +96,9 @@ struct ChannelEntry {
    channel: Channel,
    /// Number of clients using this channel.
    clients: usize,
+    /// The channel has been idle (no clients) since this time. None if channel is in use.
+    /// INVARIANT: Some if clients == 0, otherwise None.
+    idle_since: Option<Instant>,
 }

 impl ChannelPool {
@@ -82,12 +108,15 @@ impl ChannelPool {
        E: TryInto<Endpoint> + Send + Sync + 'static,
        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
    {
-        Ok(Arc::new(Self {
+        let pool = Arc::new(Self {
            endpoint: endpoint.try_into()?,
            max_clients_per_channel,
            channels: Mutex::default(),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
            next_channel_id: AtomicUsize::default(),
-        }))
+        });
+        pool.idle_reaper.spawn(&pool);
+        Ok(pool)
    }

    /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
@@ -116,8 +145,14 @@ impl ChannelPool {
                entry.clients <= self.max_clients_per_channel.get(),
                "channel overflow"
            );
+            assert_eq!(
+                entry.idle_since.is_some(),
+                entry.clients == 0,
+                "incorrect channel idle state"
+            );
            if entry.clients < self.max_clients_per_channel.get() {
                entry.clients += 1;
+                entry.idle_since = None;
                return ChannelGuard {
                    pool: Arc::downgrade(self),
                    id,
@@ -134,6 +169,7 @@ impl ChannelPool {
        let entry = ChannelEntry {
            channel: channel.clone(),
            clients: 1, // account for the guard below
+            idle_since: None,
        };
        channels.insert(id, entry);

@@ -145,6 +181,20 @@ impl ChannelPool {
    }
 }

+impl Reapable for ChannelPool {
+    /// Reaps channels that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.channels.lock().unwrap().retain(|_, entry| {
+            let Some(idle_since) = entry.idle_since else {
+                assert_ne!(entry.clients, 0, "empty channel not marked idle");
+                return true;
+            };
+            assert_eq!(entry.clients, 0, "idle channel has clients");
+            idle_since >= cutoff
+        })
+    }
+}
+
 /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
 /// since the gRPC client requires an owned `Channel`.
 pub struct ChannelGuard {
@@ -167,10 +217,15 @@ impl Drop for ChannelGuard {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };
+
        let mut channels = pool.channels.lock().unwrap();
        let entry = channels.get_mut(&self.id).expect("unknown channel");
+        assert!(entry.idle_since.is_none(), "active channel marked idle");
        assert!(entry.clients > 0, "channel underflow");
        entry.clients -= 1;
+        if entry.clients == 0 {
+            entry.idle_since = Some(Instant::now()); // mark channel as idle
+        }
    }
 }

@@ -179,8 +234,6 @@ impl Drop for ChannelGuard {
 /// number of concurrent clients to `max_clients` via semaphore.
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
-///
-/// TODO: reap idle clients.
 pub struct ClientPool {
    /// Tenant ID.
    tenant_id: TenantId,
@@ -190,6 +243,8 @@ pub struct ClientPool {
    shard_id: ShardIndex,
    /// Authentication token, if any.
    auth_token: Option<String>,
+    /// Compression to use.
+    compression: Option<CompressionEncoding>,
    /// Channel pool to acquire channels from.
    channel_pool: Arc<ChannelPool>,
    /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
@@ -201,6 +256,8 @@ pub struct ClientPool {
    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
    /// clients are reaped.
    idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
+    /// Reaps idle clients.
+    idle_reaper: Reaper,
    /// Unique client ID generator.
    next_client_id: AtomicUsize,
 }
@@ -212,6 +269,9 @@ struct ClientEntry {
    client: page_api::Client,
    /// The channel guard for the channel used by the client.
    channel_guard: ChannelGuard,
+    /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by
+    /// definition, so this is the time when it was added back to the pool.
+    idle_since: Instant,
 }

 impl ClientPool {
@@ -224,18 +284,23 @@ impl ClientPool {
        timeline_id: TimelineId,
        shard_id: ShardIndex,
        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
        max_clients: Option<NonZero<usize>>,
    ) -> Arc<Self> {
-        Arc::new(Self {
+        let pool = Arc::new(Self {
            tenant_id,
            timeline_id,
            shard_id,
            auth_token,
+            compression,
            channel_pool,
            idle: Mutex::default(),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
            limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
            next_client_id: AtomicUsize::default(),
-        })
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
    }

    /// Gets a client from the pool, or creates a new one if necessary. Connections are established
@@ -271,7 +336,7 @@ impl ClientPool {
            self.timeline_id,
            self.shard_id,
            self.auth_token.clone(),
-            None,
+            self.compression,
        )?;

        Ok(ClientGuard {
@@ -287,6 +352,16 @@ impl ClientPool {
    }
 }

+impl Reapable for ClientPool {
+    /// Reaps clients that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff)
+    }
+}
+
 /// A client acquired from the pool. The inner client can be accessed via Deref. The client is
 /// returned to the pool when dropped.
 pub struct ClientGuard {
@@ -317,9 +392,11 @@ impl Drop for ClientGuard {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };
+
        let entry = ClientEntry {
            client: self.client.take().expect("dropped once"),
            channel_guard: self.channel_guard.take().expect("dropped once"),
+            idle_since: Instant::now(),
        };
        pool.idle.lock().unwrap().insert(self.id, entry);

@@ -334,7 +411,6 @@ impl Drop for ClientGuard {
 /// a single request and await the response. Internally, requests are multiplexed across streams and
 /// channels. This allows proper queue depth enforcement and response routing.
 ///
-/// TODO: reap idle streams.
 /// TODO: consider making this generic over request and response types; not currently needed.
 pub struct StreamPool {
    /// The client pool to acquire clients from. Must be unbounded.
@@ -344,7 +420,7 @@ pub struct StreamPool {
    /// Incoming requests will be sent over an existing stream with available capacity. If all
    /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
    /// stream has an associated Tokio task that processes requests and responses.
-    streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
+    streams: Mutex<HashMap<StreamID, StreamEntry>>,
    /// The max number of concurrent streams, or None if unbounded.
    max_streams: Option<NonZero<usize>>,
    /// The max number of concurrent requests per stream.
@@ -352,6 +428,8 @@ pub struct StreamPool {
    /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
    /// None if the pool is unbounded.
    limiter: Option<Arc<Semaphore>>,
+    /// Reaps idle streams.
+    idle_reaper: Reaper,
    /// Stream ID generator.
    next_stream_id: AtomicUsize,
 }
@@ -364,9 +442,11 @@ type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
 struct StreamEntry {
    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
    sender: RequestSender,
-    /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
-    /// completion without acquiring the `StreamPool::streams` lock.
-    queue_depth: Arc<AtomicUsize>,
+    /// Number of in-flight requests on this stream.
+    queue_depth: usize,
+    /// The time when this stream went idle (queue_depth == 0).
+    /// INVARIANT: Some if queue_depth == 0, otherwise None.
+    idle_since: Option<Instant>,
 }

 impl StreamPool {
@@ -383,16 +463,19 @@ impl StreamPool {
        max_queue_depth: NonZero<usize>,
    ) -> Arc<Self> {
        assert!(client_pool.limiter.is_none(), "bounded client pool");
-        Arc::new(Self {
+        let pool = Arc::new(Self {
            client_pool,
-            streams: Arc::default(),
+            streams: Mutex::default(),
            limiter: max_streams.map(|max_streams| {
                Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
            }),
            max_streams,
            max_queue_depth,
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
            next_stream_id: AtomicUsize::default(),
-        })
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
    }

    /// Acquires an available stream from the pool, or spins up a new stream async if all streams
@@ -412,8 +495,8 @@ impl StreamPool {
    /// * Allow concurrent clients to join onto streams while they're spun up.
    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
    ///
-    /// For now, we just do something simple and functional, but very inefficient (linear scan).
-    pub async fn get(&self) -> StreamGuard {
+    /// For now, we just do something simple but inefficient (linear scan under mutex).
+    pub async fn get(self: &Arc<Self>) -> StreamGuard {
        // Acquire a permit if the pool is bounded.
        let mut permit = None;
        if let Some(limiter) = self.limiter.clone() {
@@ -422,23 +505,23 @@ impl StreamPool {
        let mut streams = self.streams.lock().unwrap();

        // Look for a pooled stream with available capacity.
-        for entry in streams.values() {
+        for (&id, entry) in streams.iter_mut() {
            assert!(
-                entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(),
+                entry.queue_depth <= self.max_queue_depth.get(),
                "stream queue overflow"
            );
-            if entry
-                .queue_depth
-                .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
-                    // Increment the queue depth via compare-and-swap.
-                    // TODO: review ordering.
-                    (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1)
-                })
-                .is_ok()
-            {
+            assert_eq!(
+                entry.idle_since.is_some(),
+                entry.queue_depth == 0,
+                "incorrect stream idle state"
+            );
+            if entry.queue_depth < self.max_queue_depth.get() {
+                entry.queue_depth += 1;
+                entry.idle_since = None;
                return StreamGuard {
+                    pool: Arc::downgrade(self),
+                    id,
                    sender: entry.sender.clone(),
-                    queue_depth: entry.queue_depth.clone(),
                    permit,
                };
            }
@@ -448,11 +531,11 @@ impl StreamPool {
        // return the guard, while spinning up the stream task async. This allows other callers to
        // join onto this stream and also create additional streams concurrently if this fills up.
        let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
-        let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
        let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
        let entry = StreamEntry {
            sender: req_tx.clone(),
-            queue_depth: queue_depth.clone(),
+            queue_depth: 1, // reserve quota for this caller
+            idle_since: None,
        };
        streams.insert(id, entry);

@@ -461,20 +544,23 @@ impl StreamPool {
        };

        let client_pool = self.client_pool.clone();
-        let streams = self.streams.clone();
+        let pool = Arc::downgrade(self);

        tokio::spawn(async move {
            if let Err(err) = Self::run_stream(client_pool, req_rx).await {
                error!("stream failed: {err}");
            }
-            // Remove stream from pool on exit.
-            let entry = streams.lock().unwrap().remove(&id);
-            assert!(entry.is_some(), "unknown stream ID: {id}");
+            // Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
+            if let Some(pool) = pool.upgrade() {
+                let entry = pool.streams.lock().unwrap().remove(&id);
+                assert!(entry.is_some(), "unknown stream ID: {id}");
+            }
        });

        StreamGuard {
+            pool: Arc::downgrade(self),
+            id,
            sender: req_tx,
-            queue_depth,
            permit,
        }
    }
@@ -505,6 +591,10 @@ impl StreamPool {

        // Track caller response channels by request ID. If the task returns early, these response
        // channels will be dropped and the waiting callers will receive an error.
+        //
+        // NB: this will leak entries if the server doesn't respond to a request (by request ID).
+        // It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and
+        // block further use. But we could consider reaping closed channels after some time.
        let mut callers = HashMap::new();

        // Process requests and responses.
@@ -552,11 +642,26 @@ impl StreamPool {
    }
 }

+impl Reapable for StreamPool {
+    /// Reaps streams that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.streams.lock().unwrap().retain(|_, entry| {
+            let Some(idle_since) = entry.idle_since else {
+                assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
+                return true;
+            };
+            assert_eq!(entry.queue_depth, 0, "idle stream has requests");
+            idle_since >= cutoff
+        });
+    }
+}
+
 /// A pooled stream reference. Can be used to send a single request, to properly enforce queue
 /// depth. Queue depth is already reserved and will be returned on drop.
 pub struct StreamGuard {
+    pool: Weak<StreamPool>,
+    id: StreamID,
    sender: RequestSender,
-    queue_depth: Arc<AtomicUsize>,
    permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }

@@ -588,11 +693,87 @@ impl StreamGuard {

 impl Drop for StreamGuard {
    fn drop(&mut self) {
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+
        // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
        // before the response is received, but that's okay.
-        let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst);
-        assert!(prev_queue_depth > 0, "stream queue underflow");
+        //
+        // TODO: actually, it's probably not okay. Queue depth release should be moved into the
+        // stream task, such that it continues to account for the queue depth slot until the server
+        // responds. Otherwise, if a slow request times out and keeps blocking the stream, the
+        // server will keep waiting on it and we can pile on subsequent requests (including the
+        // timeout retry) in the same stream and get blocked. But we may also want to avoid blocking
+        // requests on e.g. LSN waits and layer downloads, instead returning early to free up the
+        // stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line
+        // blocking. TBD.
+        let mut streams = pool.streams.lock().unwrap();
+        let entry = streams.get_mut(&self.id).expect("unknown stream");
+        assert!(entry.idle_since.is_none(), "active stream marked idle");
+        assert!(entry.queue_depth > 0, "stream queue underflow");
+        entry.queue_depth -= 1;
+        if entry.queue_depth == 0 {
+            entry.idle_since = Some(Instant::now()); // mark stream as idle
+        }

        _ = self.permit; // returned on drop, referenced for visibility
    }
 }
+
+/// Periodically reaps idle resources from a pool.
+struct Reaper {
+    /// The task check interval.
+    interval: Duration,
+    /// The threshold for reaping idle resources.
+    threshold: Duration,
+    /// Cancels the reaper task. Cancelled when the reaper is dropped.
+    cancel: CancellationToken,
+}
+
+impl Reaper {
+    /// Creates a new reaper.
+    pub fn new(threshold: Duration, interval: Duration) -> Self {
+        Self {
+            cancel: CancellationToken::new(),
+            threshold,
+            interval,
+        }
+    }
+
+    /// Spawns a task to periodically reap idle resources from the given task pool. The task is
+    /// cancelled when the reaper is dropped.
+    pub fn spawn(&self, pool: &Arc<impl Reapable>) {
+        // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
+        let pool = Arc::downgrade(pool);
+        let cancel = self.cancel.clone();
+        let (interval, threshold) = (self.interval, self.threshold);
+
+        tokio::spawn(async move {
+            loop {
+                tokio::select! {
+                    _ = tokio::time::sleep(interval) => {
+                        let Some(pool) = pool.upgrade() else {
+                            return; // pool was dropped
+                        };
+                        pool.reap_idle(Instant::now() - threshold);
+                    }
+
+                    _ = cancel.cancelled() => return,
+                }
+            }
+        });
+    }
+}
+
+impl Drop for Reaper {
+    fn drop(&mut self) {
+        self.cancel.cancel(); // cancel reaper task
+    }
+}
+
+/// A reapable resource pool.
+trait Reapable: Send + Sync + 'static {
+    /// Reaps resources that have been idle since before the given cutoff.
+    fn reap_idle(&self, cutoff: Instant);
+}
--- a/pageserver/client_grpc/src/retry.rs
+++ b/pageserver/client_grpc/src/retry.rs
@@ -23,14 +23,14 @@ impl Retry {
    /// If true, log successful requests. For debugging.
    const LOG_SUCCESS: bool = false;

-    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
-    /// using the current tracing span for context.
+    /// Runs the given async closure with timeouts and retries (exponential backoff), passing the
+    /// attempt number starting at 0. Logs errors, using the current tracing span for context.
    ///
    /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
    /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
    pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
    where
-        F: FnMut() -> O,
+        F: FnMut(usize) -> O, // takes attempt number, starting at 0
        O: Future<Output = tonic::Result<T>>,
    {
        let started = Instant::now();
@@ -47,7 +47,7 @@ impl Retry {
                }

                let request_started = Instant::now();
-                tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
+                tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries))
                    .await
                    .map_err(|_| {
                        tonic::Status::deadline_exceeded(format!(
@@ -131,7 +131,6 @@ impl Retry {
            tonic::Code::Aborted => true,
            tonic::Code::Cancelled => true,
            tonic::Code::DeadlineExceeded => true, // maybe transient slowness
-            tonic::Code::Internal => true,         // maybe transient failure?
            tonic::Code::ResourceExhausted => true,
            tonic::Code::Unavailable => true,

@@ -139,6 +138,10 @@ impl Retry {
            tonic::Code::AlreadyExists => false,
            tonic::Code::DataLoss => false,
            tonic::Code::FailedPrecondition => false,
+            // NB: don't retry Internal. It is intended for serious errors such as invariant
+            // violations, and is also used for client-side invariant checks that would otherwise
+            // result in retry loops.
+            tonic::Code::Internal => false,
            tonic::Code::InvalidArgument => false,
            tonic::Code::NotFound => false,
            tonic::Code::OutOfRange => false,
--- a/pageserver/client_grpc/src/split.rs
+++ b/pageserver/client_grpc/src/split.rs
@@ -5,27 +5,24 @@ use bytes::Bytes;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
 use pageserver_page_api as page_api;
-use utils::shard::{ShardCount, ShardIndex};
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};

 /// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
 /// TODO: add tests for this.
 pub struct GetPageSplitter {
-    /// The original request ID. Used for all shard requests.
-    request_id: page_api::RequestID,
    /// Split requests by shard index.
    requests: HashMap<ShardIndex, page_api::GetPageRequest>,
-    /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
-    /// the response pages in the same order as the original request.
+    /// The response being assembled. Preallocated with empty pages, to be filled in.
+    response: page_api::GetPageResponse,
+    /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
+    /// to assemble the response pages in the same order as the original request.
    block_shards: Vec<ShardIndex>,
-    /// Page responses by shard index. Will be assembled into a single response.
-    responses: HashMap<ShardIndex, Vec<Bytes>>,
 }

 impl GetPageSplitter {
    /// Checks if the given request only touches a single shard, and returns the shard ID. This is
    /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
-    /// The caller must ensure that the request has at least one block number, or this will panic.
-    pub fn is_single_shard(
+    pub fn for_single_shard(
        req: &page_api::GetPageRequest,
        count: ShardCount,
        stripe_size: ShardStripeSize,
@@ -35,8 +32,12 @@ impl GetPageSplitter {
            return Some(ShardIndex::unsharded());
        }

-        // Find the base shard index for the first page, and compare with the rest.
-        let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
+        // Find the first page's shard, for comparison. If there are no pages, just return the first
+        // shard (caller likely checked already, otherwise the server will reject it).
+        let Some(&first_page) = req.block_numbers.first() else {
+            return Some(ShardIndex::new(ShardNumber(0), count));
+        };
+        let key = rel_block_to_key(req.rel, first_page);
        let shard_number = key_to_shard_number(count, stripe_size, &key);

        req.block_numbers
@@ -57,19 +58,19 @@ impl GetPageSplitter {
    ) -> Self {
        // The caller should make sure we don't split requests unnecessarily.
        debug_assert!(
-            Self::is_single_shard(&req, count, stripe_size).is_none(),
+            Self::for_single_shard(&req, count, stripe_size).is_none(),
            "unnecessary request split"
        );

        // Split the requests by shard index.
        let mut requests = HashMap::with_capacity(2); // common case
        let mut block_shards = Vec::with_capacity(req.block_numbers.len());
-        for blkno in req.block_numbers {
+        for &blkno in &req.block_numbers {
            let key = rel_block_to_key(req.rel, blkno);
            let shard_number = key_to_shard_number(count, stripe_size, &key);
            let shard_id = ShardIndex::new(shard_number, count);

-            let shard_req = requests
+            requests
                .entry(shard_id)
                .or_insert_with(|| page_api::GetPageRequest {
                    request_id: req.request_id,
@@ -77,27 +78,47 @@ impl GetPageSplitter {
                    rel: req.rel,
                    read_lsn: req.read_lsn,
                    block_numbers: Vec::new(),
-                });
-            shard_req.block_numbers.push(blkno);
+                })
+                .block_numbers
+                .push(blkno);
            block_shards.push(shard_id);
        }

-        Self {
+        // Construct a response to be populated by shard responses. Preallocate empty page slots
+        // with the expected block numbers.
+        let response = page_api::GetPageResponse {
            request_id: req.request_id,
-            responses: HashMap::with_capacity(requests.len()),
+            status_code: page_api::GetPageStatusCode::Ok,
+            reason: None,
+            rel: req.rel,
+            pages: req
+                .block_numbers
+                .into_iter()
+                .map(|block_number| {
+                    page_api::Page {
+                        block_number,
+                        image: Bytes::new(), // empty page slot to be filled in
+                    }
+                })
+                .collect(),
+        };
+
+        Self {
            requests,
+            response,
            block_shards,
        }
    }

-    /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
+    /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
    pub fn drain_requests(
        &mut self,
    ) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
        self.requests.drain()
    }

-    /// Adds a response from the given shard.
+    /// Adds a response from the given shard. The response must match the request ID and have an OK
+    /// status code. A response must not already exist for the given shard ID.
    #[allow(clippy::result_large_err)]
    pub fn add_response(
        &mut self,
@@ -105,68 +126,84 @@ impl GetPageSplitter {
        response: page_api::GetPageResponse,
    ) -> tonic::Result<()> {
        // The caller should already have converted status codes into tonic::Status.
-        assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok);
-
-        // Make sure the response matches the request ID.
-        if response.request_id != self.request_id {
+        if response.status_code != page_api::GetPageStatusCode::Ok {
            return Err(tonic::Status::internal(format!(
-                "response ID {} does not match request ID {}",
-                response.request_id, self.request_id
+                "unexpected non-OK response for shard {shard_id}: {} {}",
+                response.status_code,
+                response.reason.unwrap_or_default()
            )));
        }

-        // Add the response data to the map.
-        let old = self.responses.insert(shard_id, response.page_images);
-
-        if old.is_some() {
+        if response.request_id != self.response.request_id {
            return Err(tonic::Status::internal(format!(
-                "duplicate response for shard {shard_id}",
+                "response ID mismatch for shard {shard_id}: expected {}, got {}",
+                self.response.request_id, response.request_id
+            )));
+        }
+
+        // Place the shard response pages into the assembled response, in request order.
+        let mut pages = response.pages.into_iter();
+
+        for (i, &s) in self.block_shards.iter().enumerate() {
+            if shard_id != s {
+                continue;
+            }
+
+            let Some(slot) = self.response.pages.get_mut(i) else {
+                return Err(tonic::Status::internal(format!(
+                    "no block_shards slot {i} for shard {shard_id}"
+                )));
+            };
+            let Some(page) = pages.next() else {
+                return Err(tonic::Status::internal(format!(
+                    "missing page {} in shard {shard_id} response",
+                    slot.block_number
+                )));
+            };
+            if page.block_number != slot.block_number {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned wrong page at index {i}, expected {} got {}",
+                    slot.block_number, page.block_number
+                )));
+            }
+            if !slot.image.is_empty() {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned duplicate page {} at index {i}",
+                    slot.block_number
+                )));
+            }
+
+            *slot = page;
+        }
+
+        // Make sure we've consumed all pages from the shard response.
+        if let Some(extra_page) = pages.next() {
+            return Err(tonic::Status::internal(format!(
+                "shard {shard_id} returned extra page: {}",
+                extra_page.block_number
            )));
        }

        Ok(())
    }

-    /// Assembles the shard responses into a single response. Responses must be present for all
-    /// relevant shards, and the total number of pages must match the original request.
+    /// Fetches the final, assembled response.
    #[allow(clippy::result_large_err)]
-    pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
-        let mut response = page_api::GetPageResponse {
-            request_id: self.request_id,
-            status_code: page_api::GetPageStatusCode::Ok,
-            reason: None,
-            page_images: Vec::with_capacity(self.block_shards.len()),
-        };
-
-        // Set up per-shard page iterators we can pull from.
-        let mut shard_responses = HashMap::with_capacity(self.responses.len());
-        for (shard_id, responses) in self.responses {
-            shard_responses.insert(shard_id, responses.into_iter());
-        }
-
-        // Reassemble the responses in the same order as the original request.
-        for shard_id in &self.block_shards {
-            let page = shard_responses
-                .get_mut(shard_id)
-                .ok_or_else(|| {
-                    tonic::Status::internal(format!("missing response for shard {shard_id}"))
-                })?
-                .next()
-                .ok_or_else(|| {
-                    tonic::Status::internal(format!("missing page from shard {shard_id}"))
-                })?;
-            response.page_images.push(page);
-        }
-
-        // Make sure there are no additional pages.
-        for (shard_id, mut pages) in shard_responses {
-            if pages.next().is_some() {
+    pub fn get_response(self) -> tonic::Result<page_api::GetPageResponse> {
+        // Check that the response is complete.
+        for (i, page) in self.response.pages.iter().enumerate() {
+            if page.image.is_empty() {
                return Err(tonic::Status::internal(format!(
-                    "extra pages returned from shard {shard_id}"
+                    "missing page {} for shard {}",
+                    page.block_number,
+                    self.block_shards
+                        .get(i)
+                        .map(|s| s.to_string())
+                        .unwrap_or_else(|| "?".to_string())
                )));
            }
        }

-        Ok(response)
+        Ok(self.response)
    }
 }
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
 pageserver_api.workspace = true
 remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
+serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
--- a/pageserver/ctl/src/download_remote_object.rs
+++ b/pageserver/ctl/src/download_remote_object.rs
@@ -0,0 +1,85 @@
+use camino::Utf8PathBuf;
+use clap::Parser;
+use tokio_util::sync::CancellationToken;
+
+/// Download a specific object from remote storage to a local file.
+///
+/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment
+/// variable, in the same TOML format that the pageserver itself understands. This allows the
+/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3,
+/// Azure Blob Storage and local files), as long as the credentials are available via the
+/// standard environment variables expected by the underlying SDKs.
+///
+/// Examples for setting the environment variable:
+///
+/// ```bash
+/// # AWS S3 (region can also be provided via AWS_REGION)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }'
+///
+/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }'
+/// ```
+#[derive(Parser)]
+pub(crate) struct DownloadRemoteObjectCmd {
+    /// Key / path of the object to download (relative to the remote storage prefix).
+    ///
+    /// Examples:
+    ///   "wal/3aa8f.../00000001000000000000000A"
+    ///   "pageserver/v1/tenants/<tenant_id>/timelines/<timeline_id>/layer_12345"
+    pub remote_path: String,
+
+    /// Path of the local file to create. Existing file will be overwritten.
+    ///
+    /// Examples:
+    ///   "./segment"
+    ///   "/tmp/layer_12345.parquet"
+    pub output_file: Utf8PathBuf,
+}
+
+pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> {
+    use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig};
+
+    // Fetch remote storage configuration from the environment
+    let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| {
+        anyhow::anyhow!(
+            "'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config"
+        )
+    })?;
+
+    let config = RemoteStorageConfig::from_toml_str(&config_str)?;
+
+    // Initialise remote storage client
+    let storage = GenericRemoteStorage::from_config(&config).await?;
+
+    // RemotePath must be relative – leading slashes confuse the parser.
+    let remote_path_str = cmd.remote_path.trim_start_matches('/');
+    let remote_path = RemotePath::from_string(remote_path_str)?;
+
+    let cancel = CancellationToken::new();
+
+    println!(
+        "Downloading '{remote_path}' from remote storage bucket {:?} ...",
+        config.storage.bucket_name()
+    );
+
+    // Start the actual download
+    let download = storage
+        .download(&remote_path, &DownloadOpts::default(), &cancel)
+        .await?;
+
+    // Stream to file
+    let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+    let tmp_path = cmd.output_file.with_extension("tmp");
+    let mut file = tokio::fs::File::create(&tmp_path).await?;
+    tokio::io::copy(&mut reader, &mut file).await?;
+    file.sync_all().await?;
+    // Atomically move into place
+    tokio::fs::rename(&tmp_path, &cmd.output_file).await?;
+
+    println!(
+        "Downloaded to '{}'. Last modified: {:?}, etag: {}",
+        cmd.output_file, download.last_modified, download.etag
+    );
+
+    Ok(())
+}
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,14 +1,16 @@
 use std::str::FromStr;

-use anyhow::Context;
+use anyhow::{Context, Ok};
 use camino::Utf8PathBuf;
 use pageserver::tenant::{
    IndexPart,
    layer_map::{LayerMap, SearchResult},
-    remote_timeline_client::remote_layer_path,
-    storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
+    remote_timeline_client::{index::LayerFileMetadata, remote_layer_path},
+    storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak},
 };
 use pageserver_api::key::Key;
+use serde::Serialize;
+use std::collections::BTreeMap;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -33,6 +35,31 @@ pub(crate) enum IndexPartCmd {
        #[arg(long)]
        lsn: String,
    },
+    /// List all visible delta and image layers at the latest LSN.
+    ListVisibleLayers {
+        #[arg(long)]
+        path: Utf8PathBuf,
+    },
+}
+
+fn create_layer_map_from_index_part(
+    index_part: &IndexPart,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+) -> LayerMap {
+    let mut layer_map = LayerMap::default();
+    {
+        let mut updates = layer_map.batch_update();
+        for (key, value) in index_part.layer_metadata.iter() {
+            updates.insert_historic(PersistentLayerDesc::from_filename(
+                tenant_shard_id,
+                timeline_id,
+                key.clone(),
+                value.file_size,
+            ));
+        }
+    }
+    layer_map
 }

 async fn search_layers(
@@ -49,18 +76,7 @@ async fn search_layers(
        let bytes = tokio::fs::read(path).await?;
        IndexPart::from_json_bytes(&bytes).unwrap()
    };
-    let mut layer_map = LayerMap::default();
-    {
-        let mut updates = layer_map.batch_update();
-        for (key, value) in index_json.layer_metadata.iter() {
-            updates.insert_historic(PersistentLayerDesc::from_filename(
-                tenant_shard_id,
-                timeline_id,
-                key.clone(),
-                value.file_size,
-            ));
-        }
-    }
+    let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id);
    let key = Key::from_hex(key)?;

    let lsn = Lsn::from_str(lsn).unwrap();
@@ -98,6 +114,69 @@ async fn search_layers(
    Ok(())
 }

+#[derive(Debug, Clone, Serialize)]
+struct VisibleLayers {
+    pub total_images: u64,
+    pub total_image_bytes: u64,
+    pub total_deltas: u64,
+    pub total_delta_bytes: u64,
+    pub layer_metadata: BTreeMap<LayerName, LayerFileMetadata>,
+}
+
+impl VisibleLayers {
+    pub fn new() -> Self {
+        Self {
+            layer_metadata: BTreeMap::new(),
+            total_images: 0,
+            total_image_bytes: 0,
+            total_deltas: 0,
+            total_delta_bytes: 0,
+        }
+    }
+
+    pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) {
+        match name {
+            LayerName::Image(_) => {
+                self.total_images += 1;
+                self.total_image_bytes += layer.file_size;
+            }
+            LayerName::Delta(_) => {
+                self.total_deltas += 1;
+                self.total_delta_bytes += layer.file_size;
+            }
+        }
+        self.layer_metadata.insert(name, layer);
+    }
+}
+
+async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> {
+    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let timeline_id = TimelineId::generate();
+
+    let bytes = tokio::fs::read(path).await.context("read file")?;
+    let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
+    let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id);
+    let mut visible_layers = VisibleLayers::new();
+    let (layers, _key_space) = layer_map.get_visibility(Vec::new());
+    for (layer, visibility) in layers {
+        if visibility == LayerVisibilityHint::Visible {
+            visible_layers.add_layer(
+                layer.layer_name(),
+                index_part
+                    .layer_metadata
+                    .get(&layer.layer_name())
+                    .unwrap()
+                    .clone(),
+            );
+        }
+    }
+    let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?;
+    println!("{output}");
+
+    Ok(())
+}
+
 pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
    match cmd {
        IndexPartCmd::Dump { path } => {
@@ -114,5 +193,6 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            key,
            lsn,
        } => search_layers(tenant_id, timeline_id, path, key, lsn).await,
+        IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await,
    }
 }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -4,6 +4,7 @@
 //!
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.

+mod download_remote_object;
 mod draw_timeline_dir;
 mod index_part;
 mod key;
@@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime};

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use download_remote_object::DownloadRemoteObjectCmd;
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use page_trace::PageTraceCmd;
@@ -63,6 +65,7 @@ enum Commands {
    /// Debug print a hex key found from logs
    Key(key::DescribeKeyCommand),
    PageTrace(PageTraceCmd),
+    DownloadRemoteObject(DownloadRemoteObjectCmd),
 }

 /// Read and update pageserver metadata file
@@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> {
        }
        Commands::Key(dkc) => dkc.execute(),
        Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
+        Commands::DownloadRemoteObject(cmd) => {
+            download_remote_object::main(&cmd).await?;
+        }
    };
    Ok(())
 }
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -153,7 +153,7 @@ message GetDbSizeResponse {
 message GetPageRequest {
  // A request ID. Will be included in the response. Should be unique for
  // in-flight requests on the stream.
-  uint64 request_id = 1;
+  RequestID request_id = 1;
  // The request class.
  GetPageClass request_class = 2;
  // The LSN to read at.
@@ -177,6 +177,14 @@ message GetPageRequest {
  repeated uint32 block_number = 5;
 }

+// A Request ID. Should be unique for in-flight requests on a stream. Included in the response.
+message RequestID {
+  // The base request ID.
+  uint64 id = 1;
+  // The request attempt. Starts at 0, incremented on each retry.
+  uint32 attempt = 2;
+}
+
 // A GetPageRequest class. Primarily intended for observability, but may also be
 // used for prioritization in the future.
 enum GetPageClass {
@@ -199,13 +207,26 @@ enum GetPageClass {
 // the entire batch is ready, so no one can make use of the individual pages.
 message GetPageResponse {
  // The original request's ID.
-  uint64 request_id = 1;
-  // The response status code.
+  RequestID request_id = 1;
+  // The response status code. If not OK, the rel and page fields will be empty.
  GetPageStatusCode status_code = 2;
  // A string describing the status, if any.
  string reason = 3;
-  // The 8KB page images, in the same order as the request. Empty if status_code != OK.
-  repeated bytes page_image = 4;
+  // The relation that the pages belong to.
+  RelTag rel = 4;
+  // The page(s), in the same order as the request.
+  repeated Page page = 5;
+}
+
+// A page.
+//
+// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block
+// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway.
+message Page {
+  // The page number.
+  uint32 block_number = 1;
+  // The materialized page image, as an 8KB byte vector.
+  bytes image = 2;
 }

 // A GetPageResponse status code.
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -1,4 +1,5 @@
 use anyhow::Context as _;
+use futures::future::ready;
 use futures::{Stream, StreamExt as _, TryStreamExt as _};
 use tokio::io::AsyncRead;
 use tokio_util::io::StreamReader;
@@ -110,7 +111,7 @@ impl Client {
    ) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
        let reqs = reqs.map(proto::GetPageRequest::from);
        let resps = self.inner.get_pages(reqs).await?.into_inner();
-        Ok(resps.map_ok(GetPageResponse::from))
+        Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into()))))
    }

    /// Returns the size of a relation, as # of blocks.
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -356,7 +356,10 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
            return Err(ProtocolError::Missing("block_number"));
        }
        Ok(Self {
-            request_id: pb.request_id,
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
            request_class: pb.request_class.into(),
            read_lsn: pb
                .read_lsn
@@ -371,7 +374,7 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
 impl From<GetPageRequest> for proto::GetPageRequest {
    fn from(request: GetPageRequest) -> Self {
        Self {
-            request_id: request.request_id,
+            request_id: Some(request.request_id.into()),
            request_class: request.request_class.into(),
            read_lsn: Some(request.read_lsn.into()),
            rel: Some(request.rel.into()),
@@ -380,8 +383,51 @@ impl From<GetPageRequest> for proto::GetPageRequest {
    }
 }

-/// A GetPage request ID.
-pub type RequestID = u64;
+/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct RequestID {
+    /// The base request ID.
+    pub id: u64,
+    // The request attempt. Starts at 0, incremented on each retry.
+    pub attempt: u32,
+}
+
+impl RequestID {
+    /// Creates a new RequestID with the given ID and an initial attempt of 0.
+    pub fn new(id: u64) -> Self {
+        Self { id, attempt: 0 }
+    }
+}
+
+impl Display for RequestID {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}.{}", self.id, self.attempt)
+    }
+}
+
+impl From<proto::RequestId> for RequestID {
+    fn from(pb: proto::RequestId) -> Self {
+        Self {
+            id: pb.id,
+            attempt: pb.attempt,
+        }
+    }
+}
+
+impl From<u64> for RequestID {
+    fn from(id: u64) -> Self {
+        Self::new(id)
+    }
+}
+
+impl From<RequestID> for proto::RequestId {
+    fn from(request_id: RequestID) -> Self {
+        Self {
+            id: request_id.id,
+            attempt: request_id.attempt,
+        }
+    }
+}

 /// A GetPage request class.
 #[derive(Clone, Copy, Debug, strum_macros::Display)]
@@ -456,32 +502,41 @@ impl From<GetPageClass> for i32 {
 pub struct GetPageResponse {
    /// The original request's ID.
    pub request_id: RequestID,
-    /// The response status code.
+    /// The response status code. If not OK, the `rel` and `pages` fields will be empty.
    pub status_code: GetPageStatusCode,
    /// A string describing the status, if any.
    pub reason: Option<String>,
-    /// The 8KB page images, in the same order as the request. Empty if status != OK.
-    pub page_images: Vec<Bytes>,
+    /// The relation that the pages belong to.
+    pub rel: RelTag,
+    // The page(s), in the same order as the request.
+    pub pages: Vec<Page>,
 }

-impl From<proto::GetPageResponse> for GetPageResponse {
-    fn from(pb: proto::GetPageResponse) -> Self {
-        Self {
-            request_id: pb.request_id,
+impl TryFrom<proto::GetPageResponse> for GetPageResponse {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetPageResponse) -> Result<Self, ProtocolError> {
+        Ok(Self {
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
            status_code: pb.status_code.into(),
            reason: Some(pb.reason).filter(|r| !r.is_empty()),
-            page_images: pb.page_image,
-        }
+            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
+            pages: pb.page.into_iter().map(Page::from).collect(),
+        })
    }
 }

 impl From<GetPageResponse> for proto::GetPageResponse {
    fn from(response: GetPageResponse) -> Self {
        Self {
-            request_id: response.request_id,
+            request_id: Some(response.request_id.into()),
            status_code: response.status_code.into(),
            reason: response.reason.unwrap_or_default(),
-            page_image: response.page_images,
+            rel: Some(response.rel.into()),
+            page: response.pages.into_iter().map(proto::Page::from).collect(),
        }
    }
 }
@@ -514,11 +569,39 @@ impl GetPageResponse {
            request_id,
            status_code,
            reason: Some(status.message().to_string()),
-            page_images: Vec::new(),
+            rel: RelTag::default(),
+            pages: Vec::new(),
        })
    }
 }

+// A page.
+#[derive(Clone, Debug)]
+pub struct Page {
+    /// The page number.
+    pub block_number: u32,
+    /// The materialized page image, as an 8KB byte vector.
+    pub image: Bytes,
+}
+
+impl From<proto::Page> for Page {
+    fn from(pb: proto::Page) -> Self {
+        Self {
+            block_number: pb.block_number,
+            image: pb.image,
+        }
+    }
+}
+
+impl From<Page> for proto::Page {
+    fn from(page: Page) -> Self {
+        Self {
+            block_number: page.block_number,
+            image: page.image,
+        }
+    }
+}
+
 /// A GetPage response status code.
 ///
 /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -27,8 +27,9 @@ tokio-util.workspace = true
 tonic.workspace = true
 url.workspace = true

-pageserver_client.workspace = true
 pageserver_api.workspace = true
+pageserver_client.workspace = true
+pageserver_client_grpc.workspace = true
 pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -10,12 +10,14 @@ use anyhow::Context;
 use async_trait::async_trait;
 use bytes::Bytes;
 use camino::Utf8PathBuf;
+use futures::stream::FuturesUnordered;
 use futures::{Stream, StreamExt as _};
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
+use pageserver_client_grpc::{self as client_grpc, ShardSpec};
 use pageserver_page_api as page_api;
 use rand::prelude::*;
 use tokio::task::JoinSet;
@@ -37,6 +39,10 @@ pub(crate) struct Args {
    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
    page_service_connstring: String,
+    /// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic
+    /// no-frills `page_api::Client`. Only valid with grpc:// connstrings.
+    #[clap(long)]
+    rich_client: bool,
    #[clap(long)]
    pageserver_jwt: Option<String>,
    #[clap(long, default_value = "1")]
@@ -332,6 +338,7 @@ async fn main_impl(
            let client: Box<dyn Client> = match scheme.as_str() {
                "postgresql" | "postgres" => {
                    assert!(!args.compression, "libpq does not support compression");
+                    assert!(!args.rich_client, "rich client requires grpc://");
                    Box::new(
                        LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
                            .await
@@ -339,6 +346,16 @@ async fn main_impl(
                    )
                }

+                "grpc" if args.rich_client => Box::new(
+                    RichGrpcClient::new(
+                        &args.page_service_connstring,
+                        worker_id.timeline,
+                        args.compression,
+                    )
+                    .await
+                    .unwrap(),
+                ),
+
                "grpc" => Box::new(
                    GrpcClient::new(
                        &args.page_service_connstring,
@@ -657,7 +674,7 @@ impl Client for GrpcClient {
        blks: Vec<u32>,
    ) -> anyhow::Result<()> {
        let req = page_api::GetPageRequest {
-            request_id: req_id,
+            request_id: req_id.into(),
            request_class: page_api::GetPageClass::Normal,
            read_lsn: page_api::ReadLsn {
                request_lsn: req_lsn,
@@ -677,6 +694,79 @@ impl Client for GrpcClient {
            "unexpected status code: {}",
            resp.status_code,
        );
-        Ok((resp.request_id, resp.page_images))
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
+    }
+}
+
+/// A rich gRPC Pageserver client.
+struct RichGrpcClient {
+    inner: Arc<client_grpc::PageserverClient>,
+    requests: FuturesUnordered<
+        Pin<Box<dyn Future<Output = anyhow::Result<page_api::GetPageResponse>> + Send>>,
+    >,
+}
+
+impl RichGrpcClient {
+    async fn new(
+        connstring: &str,
+        ttid: TenantTimelineId,
+        compression: bool,
+    ) -> anyhow::Result<Self> {
+        let inner = Arc::new(client_grpc::PageserverClient::new(
+            ttid.tenant_id,
+            ttid.timeline_id,
+            ShardSpec::new(
+                [(ShardIndex::unsharded(), connstring.to_string())].into(),
+                None,
+            )?,
+            None,
+            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
+        )?);
+        Ok(Self {
+            inner,
+            requests: FuturesUnordered::new(),
+        })
+    }
+}
+
+#[async_trait]
+impl Client for RichGrpcClient {
+    async fn send_get_page(
+        &mut self,
+        req_id: u64,
+        req_lsn: Lsn,
+        mod_lsn: Lsn,
+        rel: RelTag,
+        blks: Vec<u32>,
+    ) -> anyhow::Result<()> {
+        let req = page_api::GetPageRequest {
+            request_id: req_id.into(),
+            request_class: page_api::GetPageClass::Normal,
+            read_lsn: page_api::ReadLsn {
+                request_lsn: req_lsn,
+                not_modified_since_lsn: Some(mod_lsn),
+            },
+            rel,
+            block_numbers: blks,
+        };
+        let inner = self.inner.clone();
+        self.requests.push(Box::pin(async move {
+            inner
+                .get_page(req)
+                .await
+                .map_err(|err| anyhow::anyhow!("{err}"))
+        }));
+        Ok(())
+    }
+
+    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
+        let resp = self.requests.next().await.unwrap()?;
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
 };
 use pageserver::tenant::{TenantSharedResources, mgr, secondary};
 use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
-    page_cache, page_service, task_mgr, virtual_file,
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
+    MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::crashsafe::syncfs;
 use utils::logging::TracingErrorLayerEnablement;
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};

@@ -763,6 +764,41 @@ fn start_pageserver(
        (http_task, https_task)
    };

+    /* BEGIN_HADRON */
+    let metrics_collection_task = {
+        let cancel = shutdown_pageserver.child_token();
+        let task = crate::BACKGROUND_RUNTIME.spawn({
+            let cancel = cancel.clone();
+            let background_jobs_barrier = background_jobs_barrier.clone();
+            async move {
+                if conf.force_metric_collection_on_scrape {
+                    return;
+                }
+
+                // first wait until background jobs are cleared to launch.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return; },
+                    _ = background_jobs_barrier.wait() => {}
+                };
+                let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    tokio::select! {
+                        _ = cancel.cancelled() => {
+                            tracing::info!("cancelled metrics collection task, exiting...");
+                             break;
+                        },
+                        _ = interval.tick() => {}
+                    }
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            }
+        });
+        MetricsCollectionTask(CancellableTask { task, cancel })
+    };
+    /* END_HADRON */
+
    let consumption_metrics_tasks = {
        let cancel = shutdown_pageserver.child_token();
        let task = crate::BACKGROUND_RUNTIME.spawn({
@@ -844,6 +880,7 @@ fn start_pageserver(
            https_endpoint_listener,
            page_service,
            page_service_grpc,
+            metrics_collection_task,
            consumption_metrics_tasks,
            disk_usage_eviction_task,
            &tenant_manager,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -252,6 +252,14 @@ pub struct PageServerConf {
    pub timeline_import_config: pageserver_api::config::TimelineImportConfig,

    pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
+
+    /// Defines what is a big tenant for the purpose of image layer generation.
+    /// See Timeline::should_check_if_image_layers_required
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
+
+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+    pub force_metric_collection_on_scrape: bool,
 }

 /// Token for authentication to safekeepers
@@ -432,6 +440,8 @@ impl PageServerConf {
            posthog_config,
            timeline_import_config,
            basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -490,6 +500,8 @@ impl PageServerConf {
            dev_mode,
            timeline_import_config,
            basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2,7 +2,9 @@
 //! Management HTTP API
 //!
 use std::cmp::Reverse;
-use std::collections::{BinaryHeap, HashMap};
+use std::collections::BTreeMap;
+use std::collections::BinaryHeap;
+use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
@@ -79,8 +81,8 @@ use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerNa
 use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
-    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
-    WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
+    CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout,
+    WaitLsnWaiter, import_pgdata,
 };
 use crate::tenant::{
    GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
@@ -395,6 +397,7 @@ async fn build_timeline_info(
    timeline: &Arc<Timeline>,
    include_non_incremental_logical_size: bool,
    force_await_initial_logical_size: bool,
+    include_image_consistent_lsn: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -419,6 +422,10 @@ async fn build_timeline_info(
                .await?,
        );
    }
+    // HADRON
+    if include_image_consistent_lsn {
+        info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?);
+    }
    Ok(info)
 }

@@ -508,6 +515,8 @@ async fn build_timeline_info_common(
        is_invisible: Some(is_invisible),

        walreceiver_status,
+        // HADRON
+        image_consistent_lsn: None,
    };
    Ok(info)
 }
@@ -710,6 +719,8 @@ async fn timeline_list_handler(
        parse_query_param(&request, "include-non-incremental-logical-size")?;
    let force_await_initial_logical_size: Option<bool> =
        parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);
@@ -730,6 +741,7 @@ async fn timeline_list_handler(
                &timeline,
                include_non_incremental_logical_size.unwrap_or(false),
                force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                &ctx,
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -758,6 +770,9 @@ async fn timeline_and_offloaded_list_handler(
        parse_query_param(&request, "include-non-incremental-logical-size")?;
    let force_await_initial_logical_size: Option<bool> =
        parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
+
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);
@@ -778,6 +793,7 @@ async fn timeline_and_offloaded_list_handler(
                &timeline,
                include_non_incremental_logical_size.unwrap_or(false),
                force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                &ctx,
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -962,6 +978,9 @@ async fn timeline_detail_handler(
        parse_query_param(&request, "include-non-incremental-logical-size")?;
    let force_await_initial_logical_size: Option<bool> =
        parse_query_param(&request, "force-await-initial-logical-size")?;
+    // HADRON
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    // Logical size calculation needs downloading.
@@ -982,6 +1001,7 @@ async fn timeline_detail_handler(
            &timeline,
            include_non_incremental_logical_size.unwrap_or(false),
            force_await_initial_logical_size.unwrap_or(false),
+            include_image_consistent_lsn.unwrap_or(false),
            ctx,
        )
        .await
@@ -2500,9 +2520,10 @@ async fn timeline_checkpoint_handler(
                .compact(&cancel, flags, &ctx)
                .await
                .map_err(|e|
-                    match e {
-                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::Other(e) => ApiError::InternalServerError(e),
+                    if e.is_cancel() {
+                        ApiError::ShuttingDown
+                    } else {
+                        ApiError::InternalServerError(e.into_anyhow())
                    }
                )?;
        }
@@ -3213,6 +3234,30 @@ async fn get_utilization(
        .map_err(ApiError::InternalServerError)
 }

+/// HADRON
+async fn list_tenant_visible_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let state = get_state(&request);
+
+    let mut map = BTreeMap::new();
+    for (tenant_shard_id, slot) in state.tenant_manager.list() {
+        match slot {
+            TenantSlot::Attached(tenant) => {
+                let visible_size = tenant.get_visible_size();
+                map.insert(tenant_shard_id, visible_size);
+            }
+            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
+                continue;
+            }
+        }
+    }
+
+    json_response(StatusCode::OK, map)
+}
+
 async fn list_aux_files(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -3616,6 +3661,7 @@ async fn activate_post_import_handler(
        let timeline_info = build_timeline_info(
            &timeline, false, // include_non_incremental_logical_size,
            false, // force_await_initial_logical_size
+            false, // include_image_consistent_lsn
            &ctx,
        )
        .await
@@ -3937,9 +3983,14 @@ pub fn make_router(
        .expect("construct launch timestamp header middleware"),
    );

+    let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape;
+
+    let prometheus_metrics_handler_wrapper =
+        move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
+
    Ok(router
        .data(state)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper))
        .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
        .get("/profile/heap", |r| request_span(r, profile_heap_handler))
        .get("/v1/status", |r| api_handler(r, status_handler))
@@ -4145,6 +4196,7 @@ pub fn make_router(
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
        .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
        .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler))
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
            |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask);
 pub struct HttpsEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
+// HADRON
+pub struct MetricsCollectionTask(pub CancellableTask);
+
 impl CancellableTask {
    pub async fn shutdown(self) {
        self.cancel.cancel();
@@ -87,6 +90,7 @@ pub async fn shutdown_pageserver(
    https_listener: Option<HttpsEndpointListener>,
    page_service: page_service::Listener,
    grpc_task: Option<CancellableTask>,
+    metrics_collection_task: MetricsCollectionTask,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -211,6 +215,14 @@ pub async fn shutdown_pageserver(
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    deletion_queue.shutdown(Duration::from_secs(5)).await;

+    // HADRON
+    timed(
+        metrics_collection_task.0.shutdown(),
+        "shutdown metrics collections metrics",
+        Duration::from_secs(1),
+    )
+    .await;
+
    timed(
        consumption_metrics_worker.0.shutdown(),
        "shutdown consumption metrics",
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2847,6 +2847,24 @@ pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(||
    .expect("failed to define a metric")
 });

+// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories:
+// - success
+// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors)
+// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.)
+pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_pagestream_handler_results_total",
+        "Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)",
+        &["outcome"]
+    )
+    .expect("failed to define a metric")
+});
+
+// Constants for pageserver_pagestream_handler_results_total's outcome labels
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error";
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -70,7 +70,7 @@ use crate::context::{
 };
 use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
-    MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
+    MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
@@ -1441,20 +1441,57 @@ impl PageServerHandler {
            let (response_msg, ctx) = match handler_result {
                Err(e) => match &e.err {
                    PageStreamError::Shutdown => {
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR])
+                            .inc();
+                        // END HADRON
+
                        // If we fail to fulfil a request during shutdown, which may be _because_ of
                        // shutdown, then do not send the error to the client.  Instead just drop the
                        // connection.
                        span.in_scope(|| info!("dropping connection due to shutdown"));
                        return Err(QueryError::Shutdown);
                    }
-                    PageStreamError::Reconnect(reason) => {
-                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                    PageStreamError::Reconnect(_reason) => {
+                        span.in_scope(|| {
+                            // BEGIN HADRON
+                            // We can get here because the compute node is pointing at the wrong PS. We
+                            // already have a metric to keep track of this so suppressing this log to
+                            // reduce log spam. The information in this log message is not going to be that
+                            // helpful given the volume of logs that can be generated.
+                            // info!("handler requested reconnect: {reason}")
+                            // END HADRON
+                        });
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[
+                                metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                            ])
+                            .inc();
+                        // END HADRON
                        return Err(QueryError::Reconnect);
                    }
                    PageStreamError::Read(_)
                    | PageStreamError::LsnTimeout(_)
                    | PageStreamError::NotFound(_)
                    | PageStreamError::BadRequest(_) => {
+                        // BEGIN HADRON
+                        if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                                ])
+                                .inc();
+                        } else {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR,
+                                ])
+                                .inc();
+                        }
+                        // END HADRON
+
                        // print the all details to the log with {:#}, but for the client the
                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
                        // here includes cancellation which is not an error.
@@ -1472,7 +1509,15 @@ impl PageServerHandler {
                        )
                    }
                },
-                Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)),
+                Ok((response_msg, _op_timer_already_observed, ctx)) => {
+                    // BEGIN HADRON
+                    PAGESTREAM_HANDLER_RESULTS_TOTAL
+                        .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS])
+                        .inc();
+                    // END HADRON
+
+                    (response_msg, Some(ctx))
+                }
            };

            let ctx = ctx.map(|req_ctx| {
@@ -3293,9 +3338,12 @@ impl GrpcPageServiceHandler {
    }

    /// Generates a PagestreamRequest header from a ReadLsn and request ID.
-    fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest {
+    fn make_hdr(
+        read_lsn: page_api::ReadLsn,
+        req_id: Option<page_api::RequestID>,
+    ) -> PagestreamRequest {
        PagestreamRequest {
-            reqid: req_id,
+            reqid: req_id.map(|r| r.id).unwrap_or_default(),
            request_lsn: read_lsn.request_lsn,
            not_modified_since: read_lsn
                .not_modified_since_lsn
@@ -3405,7 +3453,7 @@ impl GrpcPageServiceHandler {

            batch.push(BatchedGetPageRequest {
                req: PagestreamGetPageRequest {
-                    hdr: Self::make_hdr(req.read_lsn, req.request_id),
+                    hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)),
                    rel: req.rel,
                    blkno,
                },
@@ -3435,12 +3483,16 @@ impl GrpcPageServiceHandler {
            request_id: req.request_id,
            status_code: page_api::GetPageStatusCode::Ok,
            reason: None,
-            page_images: Vec::with_capacity(results.len()),
+            rel: req.rel,
+            pages: Vec::with_capacity(results.len()),
        };

        for result in results {
            match result {
-                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page),
+                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page {
+                    block_number: r.req.blkno,
+                    image: r.page,
+                }),
                Ok((resp, _, _)) => {
                    return Err(tonic::Status::internal(format!(
                        "unexpected response: {resp:?}"
@@ -3483,7 +3535,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(rel=%req.rel, lsn=%req.read_lsn);

        let req = PagestreamExistsRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            rel: req.rel,
        };

@@ -3633,7 +3685,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);

        let req = PagestreamDbSizeRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            dbnode: req.db_oid,
        };

@@ -3683,7 +3735,7 @@ impl proto::PageService for GrpcPageServiceHandler {
                .await?
                .downgrade();
            while let Some(req) = reqs.message().await? {
-                let req_id = req.request_id;
+                let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
                let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                    .instrument(span.clone()) // propagate request span
                    .await;
@@ -3722,7 +3774,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(rel=%req.rel, lsn=%req.read_lsn);

        let req = PagestreamNblocksRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            rel: req.rel,
        };

@@ -3755,7 +3807,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);

        let req = PagestreamGetSlruSegmentRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            kind: req.kind as u8,
            segno: req.segno,
        };
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3291,7 +3291,7 @@ impl TenantShard {
                        // Ignore this, we likely raced with unarchival.
                        OffloadError::NotArchived => Ok(()),
                        OffloadError::AlreadyInProgress => Ok(()),
-                        OffloadError::Cancelled => Err(CompactionError::ShuttingDown),
+                        OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
                        // don't break the anyhow chain
                        OffloadError::Other(err) => Err(CompactionError::Other(err)),
                    })?;
@@ -3321,16 +3321,13 @@ impl TenantShard {

    /// Trips the compaction circuit breaker if appropriate.
    pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
-        match err {
-            err if err.is_cancel() => {}
-            CompactionError::ShuttingDown => (),
-            CompactionError::Other(err) => {
-                self.compaction_circuit_breaker
-                    .lock()
-                    .unwrap()
-                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
-            }
+        if err.is_cancel() {
+            return;
        }
+        self.compaction_circuit_breaker
+            .lock()
+            .unwrap()
+            .fail(&CIRCUIT_BREAKERS_BROKEN, err);
    }

    /// Cancel scheduled compaction tasks
@@ -4174,6 +4171,15 @@ impl TenantShard {
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

+    // HADRON
+    pub fn get_image_creation_timeout(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf.image_layer_force_creation_period.or(self
+            .conf
+            .default_tenant_conf
+            .image_layer_force_creation_period)
+    }
+
    pub fn get_pitr_interval(&self) -> Duration {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -5713,6 +5719,16 @@ impl TenantShard {
            .unwrap_or(0)
    }

+    /// HADRON
+    /// Return the visible size of all timelines in this tenant.
+    pub(crate) fn get_visible_size(&self) -> u64 {
+        let timelines = self.timelines.lock().unwrap();
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .sum()
+    }
+
    /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
    /// manifest in `Self::remote_tenant_manifest`.
    ///
@@ -12800,6 +12816,40 @@ mod tests {
                },
            ]
        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
+        let tenant_conf = pageserver_api::models::TenantConfig {
+            pitr_interval: Some(Duration::from_secs(7 * 3600)),
+            image_layer_force_creation_period: Some(Duration::from_secs(3600)),
+            ..Default::default()
+        };
+
+        let tenant_id = TenantId::generate();
+
+        let harness = TenantHarness::create_custom(
+            "test_get_force_image_creation_lsn",
+            tenant_conf,
+            tenant_id,
+            ShardIdentity::unsharded(),
+            Generation::new(1),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
+        {
+            let writer = timeline.writer().await;
+            writer.finish_write(Lsn(5000));
+        }
+
+        let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
+        assert_eq!(image_creation_lsn, Lsn(4300));
        Ok(())
    }
 }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -46,10 +46,11 @@
 mod historic_layer_coverage;
 mod layer_coverage;

-use std::collections::{HashMap, VecDeque};
+use std::collections::{BTreeMap, HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
+use std::time::Instant;

 use anyhow::Result;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
@@ -904,6 +905,103 @@ impl LayerMap {
        max_stacked_deltas
    }

+    /* BEGIN_HADRON */
+    /**
+     * Compute the image consistent LSN, the largest LSN below which all pages have been redone successfully.
+     * It works by first finding the latest image layers and store them into a map. Then for each delta layer,
+     * find all overlapping image layers in order to potentially increase the image LSN in case there are gaps
+     * (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], then we can increase
+     * image LSN to 150 because there is no WAL record in between).
+     * Finally, the image consistent LSN is computed by taking the minimum of all image layers.
+     */
+    pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn {
+        struct ImageLayerInfo {
+            // creation LSN of the image layer
+            image_lsn: Lsn,
+            // the current minimum LSN of newer delta layers with overlapping key ranges
+            min_delta_lsn: Lsn,
+        }
+        let started_at = Instant::now();
+
+        let min_l0_deltas_lsn = {
+            let l0_deltas = self.level0_deltas();
+            l0_deltas
+                .iter()
+                .map(|layer| layer.get_lsn_range().start)
+                .min()
+                .unwrap_or(disk_consistent_lsn)
+        };
+        let global_key_range = Key::MIN..Key::MAX;
+
+        // step 1: collect all most recent image layers into a map
+        // map: end key to image_layer_info
+        let mut image_map: BTreeMap<Key, ImageLayerInfo> = BTreeMap::new();
+        for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) {
+            let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0));
+            image_map.insert(
+                img_range.end,
+                ImageLayerInfo {
+                    image_lsn: img_lsn,
+                    min_delta_lsn: min_l0_deltas_lsn,
+                },
+            );
+        }
+
+        // step 2: go through all delta layers, and update the image layer info with overlapping
+        // key ranges
+        for layer in self.historic.iter() {
+            if !layer.is_delta {
+                continue;
+            }
+            let delta_key_range = layer.get_key_range();
+            let delta_lsn_range = layer.get_lsn_range();
+            for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) {
+                debug_assert!(img_end_key >= &delta_key_range.start);
+                if delta_lsn_range.end > img_info.image_lsn {
+                    // the delta layer includes WAL records after the image
+                    // it's possibel that the delta layer's start LSN < image LSN, which will be simply ignored by step 3
+                    img_info.min_delta_lsn =
+                        std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start);
+                }
+                if img_end_key >= &delta_key_range.end {
+                    // we have fully processed all overlapping image layers
+                    break;
+                }
+            }
+        }
+
+        // step 3, go through all image layers and find the image consistent LSN
+        let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap();
+        let mut prev_key = Key::MIN;
+        for (img_key, img_info) in image_map {
+            tracing::debug!(
+                "Image layer {:?}:{} has min delta lsn {}",
+                Range {
+                    start: prev_key,
+                    end: img_key,
+                },
+                img_info.image_lsn,
+                img_info.min_delta_lsn,
+            );
+            let image_lsn = std::cmp::max(
+                img_info.image_lsn,
+                img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)),
+            );
+            img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn);
+            prev_key = img_key;
+        }
+        tracing::info!(
+            "computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layrs in total.",
+            img_consistent_lsn,
+            disk_consistent_lsn,
+            started_at.elapsed().as_millis(),
+            self.historic.len()
+        );
+        img_consistent_lsn
+    }
+
+    /* END_HADRON */
+
    /// Return all L0 delta layers
    pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
        &self.l0_delta_layers
@@ -1579,6 +1677,138 @@ mod tests {
            LayerVisibilityHint::Visible
        ));
    }
+
+    /* BEGIN_HADRON */
+    #[test]
+    fn test_compute_image_consistent_lsn() {
+        let mut layer_map = LayerMap::default();
+
+        let disk_consistent_lsn = Lsn(1000);
+        // case 1: empty layer map
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(
+            disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(),
+            image_consistent_lsn
+        );
+
+        // case 2: only L0 delta layer
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(100),
+                Lsn(900)..Lsn(990),
+                true,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(100),
+                Lsn(850)..Lsn(899),
+                true,
+            ));
+        }
+
+        // should use min L0 delta LSN - 1 as image consistent LSN
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(849), image_consistent_lsn);
+
+        // case 3: 3 images, no L1 delta
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(40),
+                Lsn(100)..Lsn(100),
+                false,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(40)..Key::from_i128(70),
+                Lsn(200)..Lsn(200),
+                false,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(70)..Key::from_i128(100),
+                Lsn(150)..Lsn(150),
+                false,
+            ));
+        }
+        // should use min L0 delta LSN - 1 as image consistent LSN
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(849), image_consistent_lsn);
+
+        // case 4: 3 images with 1 L1 delta
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(50),
+                Lsn(300)..Lsn(350),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(299), image_consistent_lsn);
+
+        // case 5: 3 images with 1 more L1 delta with smaller LSN
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(50)..Key::from_i128(72),
+                Lsn(200)..Lsn(300),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 6: 3 images with more newer L1 deltas (no impact on final results)
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(30),
+                Lsn(400)..Lsn(500),
+                true,
+            ));
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(35)..Key::from_i128(100),
+                Lsn(450)..Lsn(600),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 7: 3 images with more older L1 deltas (no impact on final results)
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(40),
+                Lsn(0)..Lsn(50),
+                true,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(50)..Key::from_i128(100),
+                Lsn(10)..Lsn(60),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 8: 3 images with one more L1 delta with overlapping LSN range
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(50),
+                Lsn(50)..Lsn(250),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(100), image_consistent_lsn);
+    }
+
+    /* END_HADRON */
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -225,7 +225,7 @@ impl fmt::Display for ImageLayerName {
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
 /// and [`crate::tenant::storage_layer::layer::local_layer_path`])
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)]
 pub enum LayerName {
    Image(ImageLayerName),
    Delta(DeltaLayerName),
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -17,17 +17,14 @@ use tracing::*;
 use utils::backoff::exponential_backoff_duration;
 use utils::completion::Barrier;
 use utils::pausable_failpoint;
-use utils::sync::gate::GateError;

 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
-use crate::tenant::blob_io::WriteBlobError;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::compaction::CompactionOutcome;
 use crate::tenant::{TenantShard, TenantState};
-use crate::virtual_file::owned_buffers_io::write::FlushTaskError;

 /// Semaphore limiting concurrent background tasks (across all tenants).
 ///
@@ -310,45 +307,12 @@ pub(crate) fn log_compaction_error(
    task_cancelled: bool,
    degrade_to_warning: bool,
 ) {
-    use CompactionError::*;
+    let is_cancel = err.is_cancel();

-    use crate::tenant::PageReconstructError;
-    use crate::tenant::upload_queue::NotInitialized;
-
-    let level = match err {
-        e if e.is_cancel() => return,
-        ShuttingDown => return,
-        _ if task_cancelled => Level::INFO,
-        Other(err) => {
-            let root_cause = err.root_cause();
-
-            let upload_queue = root_cause
-                .downcast_ref::<NotInitialized>()
-                .is_some_and(|e| e.is_stopping());
-            let timeline = root_cause
-                .downcast_ref::<PageReconstructError>()
-                .is_some_and(|e| e.is_cancel());
-            let buffered_writer_flush_task_canelled = root_cause
-                .downcast_ref::<FlushTaskError>()
-                .is_some_and(|e| e.is_cancel());
-            let write_blob_cancelled = root_cause
-                .downcast_ref::<WriteBlobError>()
-                .is_some_and(|e| e.is_cancel());
-            let gate_closed = root_cause
-                .downcast_ref::<GateError>()
-                .is_some_and(|e| e.is_cancel());
-            let is_stopping = upload_queue
-                || timeline
-                || buffered_writer_flush_task_canelled
-                || write_blob_cancelled
-                || gate_closed;
-
-            if is_stopping {
-                Level::INFO
-            } else {
-                Level::ERROR
-            }
-        }
+    let level = if is_cancel || task_cancelled {
+        Level::INFO
+    } else {
+        Level::ERROR
    };

    if let Some((error_count, sleep_duration)) = retry_info {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1002,7 +1002,7 @@ impl From<WaitLsnError> for tonic::Status {
 impl From<CreateImageLayersError> for CompactionError {
    fn from(e: CreateImageLayersError) -> Self {
        match e {
-            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Cancelled => CompactionError::new_cancelled(),
            CreateImageLayersError::Other(e) => {
                CompactionError::Other(e.context("create image layers"))
            }
@@ -2117,12 +2117,7 @@ impl Timeline {
        match &result {
            Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
            Err(e) if e.is_cancel() => {}
-            Err(CompactionError::ShuttingDown) => {
-                // Covered by the `Err(e) if e.is_cancel()` branch.
-            }
-            Err(CompactionError::Other(_)) => {
-                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
-            }
+            Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed),
        };

        result
@@ -2851,6 +2846,18 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

+    // HADRON
+    fn get_image_layer_force_creation_period(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .image_layer_force_creation_period
+            .or(self
+                .conf
+                .default_tenant_conf
+                .image_layer_force_creation_period)
+    }
+
    fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
        let tenant_conf = &self.tenant_conf.load();
        tenant_conf
@@ -3120,7 +3127,6 @@ impl Timeline {
                repartition_threshold: 0,
                last_image_layer_creation_check_at: AtomicLsn::new(0),
                last_image_layer_creation_check_instant: Mutex::new(None),
-
                last_received_wal: Mutex::new(None),
                rel_size_latest_cache: RwLock::new(HashMap::new()),
                rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
@@ -5041,6 +5047,7 @@ impl Timeline {
                .create_image_layers(
                    &partitions,
                    self.initdb_lsn,
+                    None,
                    ImageLayerCreationMode::Initial,
                    ctx,
                    LastImageLayerCreationStatus::Initial,
@@ -5312,14 +5319,19 @@ impl Timeline {
    }

    // Is it time to create a new image layer for the given partition? True if we want to generate.
-    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
+    async fn time_for_new_image_layer(
+        &self,
+        partition: &KeySpace,
+        lsn: Lsn,
+        force_image_creation_lsn: Option<Lsn>,
+    ) -> bool {
        let threshold = self.get_image_creation_threshold();

        let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
        let Ok(layers) = guard.layer_map() else {
            return false;
        };
-
+        let mut min_image_lsn: Lsn = Lsn::MAX;
        let mut max_deltas = 0;
        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn);
@@ -5354,9 +5366,25 @@ impl Timeline {
                        return true;
                    }
                }
+                min_image_lsn = min(min_image_lsn, img_lsn);
            }
        }

+        // HADRON
+        // for child timelines, we consider all pages up to ancestor_LSN are redone successfully by the parent timeline
+        min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn());
+        if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
+            info!(
+                "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}, num deltas: {}",
+                partition.ranges[0].start,
+                partition.ranges[0].end,
+                min_image_lsn,
+                force_image_creation_lsn.unwrap(),
+                max_deltas
+            );
+            return true;
+        }
+
        debug!(
            max_deltas,
            "none of the partitioned ranges had >= {threshold} deltas"
@@ -5582,7 +5610,7 @@ impl Timeline {
    ///        suffer from the lack of image layers
    ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
    fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
-        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
+        let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;

        let last_checks_at = self.last_image_layer_creation_check_at.load();
        let distance = lsn
@@ -5596,12 +5624,12 @@ impl Timeline {
        let mut time_based_decision = false;
        let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
        if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
-            let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
-            {
-                self.get_checkpoint_timeout()
-            } else {
-                Duration::from_secs(3600 * 48)
-            };
+            let check_required_after =
+                if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
+                    self.get_checkpoint_timeout()
+                } else {
+                    Duration::from_secs(3600 * 48)
+                };

            time_based_decision = match *last_check_instant {
                Some(last_check) => {
@@ -5629,10 +5657,12 @@ impl Timeline {
    /// true = we have generate all image layers, false = we preempt the process for L0 compaction.
    ///
    /// `partition_mode` is only for logging purpose and is not used anywhere in this function.
+    #[allow(clippy::too_many_arguments)]
    async fn create_image_layers(
        self: &Arc<Timeline>,
        partitioning: &KeyPartitioning,
        lsn: Lsn,
+        force_image_creation_lsn: Option<Lsn>,
        mode: ImageLayerCreationMode,
        ctx: &RequestContext,
        last_status: LastImageLayerCreationStatus,
@@ -5736,7 +5766,11 @@ impl Timeline {
            } else if let ImageLayerCreationMode::Try = mode {
                // check_for_image_layers = false -> skip
                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
-                if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
+                if !check_for_image_layers
+                    || !self
+                        .time_for_new_image_layer(partition, lsn, force_image_creation_lsn)
+                        .await
+                {
                    start = img_range.end;
                    continue;
                }
@@ -6057,26 +6091,88 @@ impl Drop for Timeline {
    }
 }

-/// Top-level failure to compact.
-#[derive(Debug, thiserror::Error)]
-pub(crate) enum CompactionError {
-    #[error("The timeline or pageserver is shutting down")]
-    ShuttingDown,
-    #[error(transparent)]
-    Other(anyhow::Error),
-}
+pub(crate) use compaction_error::CompactionError;
+/// In a private mod to enforce that [`CompactionError::is_cancel`] is used
+/// instead of `match`ing on [`CompactionError::ShuttingDown`].
+mod compaction_error {
+    use utils::sync::gate::GateError;

-impl CompactionError {
-    /// Errors that can be ignored, i.e., cancel and shutdown.
-    pub fn is_cancel(&self) -> bool {
-        matches!(self, Self::ShuttingDown)
+    use crate::{
+        pgdatadir_mapping::CollectKeySpaceError,
+        tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized},
+        virtual_file::owned_buffers_io::write::FlushTaskError,
+    };
+
+    /// Top-level failure to compact. Use [`Self::is_cancel`].
+    #[derive(Debug, thiserror::Error)]
+    pub(crate) enum CompactionError {
+        /// Use [`Self::is_cancel`] instead of checking for this variant.
+        #[error("The timeline or pageserver is shutting down")]
+        #[allow(private_interfaces)]
+        ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`].
+        #[error(transparent)]
+        Other(anyhow::Error),
    }

-    pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
-        if err.is_cancel() {
-            Self::ShuttingDown
-        } else {
-            Self::Other(err.into_anyhow())
+    #[derive(Debug)]
+    struct ForbidMatching;
+
+    impl CompactionError {
+        pub fn new_cancelled() -> Self {
+            Self::ShuttingDown(ForbidMatching)
+        }
+        /// Errors that can be ignored, i.e., cancel and shutdown.
+        pub fn is_cancel(&self) -> bool {
+            let other = match self {
+                CompactionError::ShuttingDown(_) => return true,
+                CompactionError::Other(other) => other,
+            };
+
+            // The write path of compaction in particular often lacks differentiated
+            // handling errors stemming from cancellation from other errors.
+            // So, if requested, we also check the ::Other variant by downcasting.
+            // The list below has been found empirically from flaky tests and production logs.
+            // The process is simple: on ::Other(), compaction will print the enclosed
+            // anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the
+            // line where the write path / compaction code does undifferentiated error handling
+            // from a non-anyhow type to an anyhow type. Add the type to the list of downcasts
+            // below, following the same is_cancel() pattern.
+
+            let root_cause = other.root_cause();
+
+            let upload_queue = root_cause
+                .downcast_ref::<NotInitialized>()
+                .is_some_and(|e| e.is_stopping());
+            let timeline = root_cause
+                .downcast_ref::<PageReconstructError>()
+                .is_some_and(|e| e.is_cancel());
+            let buffered_writer_flush_task_canelled = root_cause
+                .downcast_ref::<FlushTaskError>()
+                .is_some_and(|e| e.is_cancel());
+            let write_blob_cancelled = root_cause
+                .downcast_ref::<WriteBlobError>()
+                .is_some_and(|e| e.is_cancel());
+            let gate_closed = root_cause
+                .downcast_ref::<GateError>()
+                .is_some_and(|e| e.is_cancel());
+            upload_queue
+                || timeline
+                || buffered_writer_flush_task_canelled
+                || write_blob_cancelled
+                || gate_closed
+        }
+        pub fn into_anyhow(self) -> anyhow::Error {
+            match self {
+                CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self),
+                CompactionError::Other(e) => e,
+            }
+        }
+        pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
+            if err.is_cancel() {
+                Self::new_cancelled()
+            } else {
+                Self::Other(err.into_anyhow())
+            }
        }
    }
 }
@@ -6088,7 +6184,7 @@ impl From<super::upload_queue::NotInitialized> for CompactionError {
                CompactionError::Other(anyhow::anyhow!(value))
            }
            super::upload_queue::NotInitialized::ShuttingDown
-            | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown,
+            | super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(),
        }
    }
 }
@@ -6098,7 +6194,7 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
        match e {
            super::storage_layer::layer::DownloadError::TimelineShutdown
            | super::storage_layer::layer::DownloadError::DownloadCancelled => {
-                CompactionError::ShuttingDown
+                CompactionError::new_cancelled()
            }
            super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
            | super::storage_layer::layer::DownloadError::DownloadRequired
@@ -6117,14 +6213,14 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {

 impl From<layer_manager::Shutdown> for CompactionError {
    fn from(_: layer_manager::Shutdown) -> Self {
-        CompactionError::ShuttingDown
+        CompactionError::new_cancelled()
    }
 }

 impl From<super::storage_layer::errors::PutError> for CompactionError {
    fn from(e: super::storage_layer::errors::PutError) -> Self {
        if e.is_cancel() {
-            CompactionError::ShuttingDown
+            CompactionError::new_cancelled()
        } else {
            CompactionError::Other(e.into_anyhow())
        }
@@ -6223,7 +6319,7 @@ impl Timeline {
        let mut guard = tokio::select! {
            guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
            _ = self.cancel.cancelled() => {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
            }
        };

@@ -7050,6 +7146,19 @@ impl Timeline {
            .unwrap()
            .clone()
    }
+
+    /* BEGIN_HADRON */
+    pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result<Lsn> {
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::ComputeImageConsistentLsn)
+            .await;
+        let layer_map = guard.layer_map()?;
+        let disk_consistent_lsn = self.get_disk_consistent_lsn();
+
+        Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn))
+    }
+    /* END_HADRON */
 }

 impl Timeline {
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,6 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.

+use std::cmp::min;
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
@@ -572,8 +573,8 @@ impl GcCompactionQueue {
        }
        match res {
            Ok(res) => Ok(res),
-            Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
-            Err(CompactionError::Other(_)) => {
+            Err(e) if e.is_cancel() => Err(e),
+            Err(_) => {
                // There are some cases where traditional gc might collect some layer
                // files causing gc-compaction cannot read the full history of the key.
                // This needs to be resolved in the long-term by improving the compaction
@@ -1260,13 +1261,16 @@ impl Timeline {
        // Is the timeline being deleted?
        if self.is_stopping() {
            trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
        }

        let target_file_size = self.get_checkpoint_distance();

        // Define partitioning schema if needed

+        // HADRON
+        let force_image_creation_lsn = self.get_force_image_creation_lsn();
+
        // 1. L0 Compact
        let l0_outcome = {
            let timer = self.metrics.compact_time_histo.start_timer();
@@ -1274,6 +1278,7 @@ impl Timeline {
                .compact_level0(
                    target_file_size,
                    options.flags.contains(CompactFlags::ForceL0Compaction),
+                    force_image_creation_lsn,
                    ctx,
                )
                .await?;
@@ -1376,6 +1381,7 @@ impl Timeline {
                    .create_image_layers(
                        &partitioning,
                        lsn,
+                        force_image_creation_lsn,
                        mode,
                        &image_ctx,
                        self.last_image_layer_creation_status
@@ -1472,6 +1478,41 @@ impl Timeline {
        Ok(CompactionOutcome::Done)
    }

+    /* BEGIN_HADRON */
+    // Get the force image creation LSN based on gc_cutoff_lsn.
+    // Note that this is an estimation and the workload rate may suddenly change. When that happens,
+    // the force image creation may be too early or too late, but eventually it should be able to catch up.
+    pub(crate) fn get_force_image_creation_lsn(self: &Arc<Self>) -> Option<Lsn> {
+        let image_creation_period = self.get_image_layer_force_creation_period()?;
+        let current_lsn = self.get_last_record_lsn();
+        let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?;
+        let pitr_interval = self.get_pitr_interval();
+        if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() {
+            tracing::warn!(
+                "pitr LSN/interval not found, skipping force image creation LSN calculation"
+            );
+            return None;
+        }
+
+        let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0
+            * image_creation_period.as_secs()
+            / pitr_interval.as_secs();
+        let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0));
+
+        tracing::info!(
+            "Tenant shard {} computed force_image_creation_lsn: {}. Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}",
+            self.tenant_shard_id,
+            force_image_creation_lsn,
+            current_lsn,
+            image_creation_period,
+            pitr_lsn,
+            pitr_interval
+        );
+
+        Some(force_image_creation_lsn)
+    }
+    /* END_HADRON */
+
    /// Check for layers that are elegible to be rewritten:
    /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
    ///   we don't indefinitely retain keys in this shard that aren't needed.
@@ -1624,7 +1665,7 @@ impl Timeline {

        for (i, layer) in layers_to_rewrite.into_iter().enumerate() {
            if self.cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
            }

            info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total);
@@ -1722,7 +1763,7 @@ impl Timeline {
                    Ok(()) => {},
                    Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
                    Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                        return Err(CompactionError::ShuttingDown);
+                        return Err(CompactionError::new_cancelled());
                    }
                },
                // Don't wait if there's L0 compaction to do. We don't need to update the outcome
@@ -1801,6 +1842,7 @@ impl Timeline {
        self: &Arc<Self>,
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> Result<CompactionOutcome, CompactionError> {
        let CompactLevel0Phase1Result {
@@ -1821,6 +1863,7 @@ impl Timeline {
                stats,
                target_file_size,
                force_compaction_ignore_threshold,
+                force_compaction_lsn,
                &ctx,
            )
            .instrument(phase1_span)
@@ -1843,6 +1886,7 @@ impl Timeline {
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
        let begin = tokio::time::Instant::now();
@@ -1872,11 +1916,28 @@ impl Timeline {
                    return Ok(CompactLevel0Phase1Result::default());
                }
            } else {
-                debug!(
-                    level0_deltas = level0_deltas.len(),
-                    threshold, "too few deltas to compact"
-                );
-                return Ok(CompactLevel0Phase1Result::default());
+                // HADRON
+                let min_lsn = level0_deltas
+                    .iter()
+                    .map(|a| a.get_lsn_range().start)
+                    .reduce(min);
+                if force_compaction_lsn.is_some()
+                    && min_lsn.is_some()
+                    && min_lsn.unwrap() < force_compaction_lsn.unwrap()
+                {
+                    info!(
+                        "forcing L0 compaction of {} L0 deltas. Min lsn: {}, force compaction lsn: {}",
+                        level0_deltas.len(),
+                        min_lsn.unwrap(),
+                        force_compaction_lsn.unwrap()
+                    );
+                } else {
+                    debug!(
+                        level0_deltas = level0_deltas.len(),
+                        threshold, "too few deltas to compact"
+                    );
+                    return Ok(CompactLevel0Phase1Result::default());
+                }
            }
        }

@@ -1985,7 +2046,7 @@ impl Timeline {
            let mut all_keys = Vec::new();
            for l in deltas_to_compact.iter() {
                if self.cancel.is_cancelled() {
-                    return Err(CompactionError::ShuttingDown);
+                    return Err(CompactionError::new_cancelled());
                }
                let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
                let keys = delta
@@ -2078,7 +2139,7 @@ impl Timeline {
        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();

        if self.cancel.is_cancelled() {
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
        }

        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
@@ -2186,7 +2247,7 @@ impl Timeline {
                // avoid hitting the cancellation token on every key. in benches, we end up
                // shuffling an order of million keys per layer, this means we'll check it
                // around tens of times per layer.
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
            }

            let same_key = prev_key == Some(key);
@@ -2271,7 +2332,7 @@ impl Timeline {
                if writer.is_none() {
                    if self.cancel.is_cancelled() {
                        // to be somewhat responsive to cancellation, check for each new layer
-                        return Err(CompactionError::ShuttingDown);
+                        return Err(CompactionError::new_cancelled());
                    }
                    // Create writer if not initiaized yet
                    writer = Some(
@@ -2527,7 +2588,7 @@ impl Timeline {
        // Is the timeline being deleted?
        if self.is_stopping() {
            trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
        }

        let (dense_ks, _sparse_ks) = self
@@ -3189,7 +3250,7 @@ impl Timeline {
        let gc_lock = async {
            tokio::select! {
                guard = self.gc_lock.lock() => Ok(guard),
-                _ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
+                _ = cancel.cancelled() => Err(CompactionError::new_cancelled()),
            }
        };

@@ -3462,7 +3523,7 @@ impl Timeline {
            }
            total_layer_size += layer.layer_desc().file_size;
            if cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
            }
            let should_yield = yield_for_l0
                && self
@@ -3609,7 +3670,7 @@ impl Timeline {
            }

            if cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
            }

            let should_yield = yield_for_l0
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -47,6 +47,7 @@ pub(crate) enum LayerManagerLockHolder {
    ImportPgData,
    DetachAncestor,
    Eviction,
+    ComputeImageConsistentLsn,
    #[cfg(test)]
    Testing,
 }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -1410,7 +1410,7 @@ pg_init_libpagestore(void)
 							"sharding stripe size",
 							NULL,
 							&stripe_size,
-							32768, 1, INT_MAX,
+							2048, 1, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_BLOCKS,
 							NULL, NULL, NULL);
--- a/pgxn/neon/neon_ddl_handler.c
+++ b/pgxn/neon/neon_ddl_handler.c
@@ -953,7 +953,9 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)

 	/*
 	 * Fire Event Trigger if both function owner and current user are
-	 * superuser, or none of them are.
+	 * superuser. Allow executing Event Trigger function that belongs to a
+	 * superuser when connected as a non-superuser, even when the function is
+	 * SECURITY DEFINER.
 	 */
    else if (event == FHET_START
 		/* still enable it to pass pg_regress tests */
@@ -976,32 +978,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 		function_is_owned_by_super = superuser_arg(function_owner);

 		/*
-		 * 1. Refuse to run SECURITY DEFINER function that belongs to a
-		 * superuser when the current user is not a superuser itself.
-		 */
-		if (!role_is_super
-			&& function_is_owned_by_super
-			&& function_is_secdef)
-		{
-			char *func_name = get_func_name(flinfo->fn_oid);
-
-			ereport(WARNING,
-					(errmsg("Skipping Event Trigger"),
-					 errdetail("Event Trigger function \"%s\" is owned by \"%s\" "
-							   "and is SECURITY DEFINER",
-							   func_name,
-							   GetUserNameFromId(function_owner, false))));
-
-			/*
-			 * we can't skip execution directly inside the fmgr_hook so
-			 * instead we change the event trigger function to a noop
-			 * function.
-			 */
-			force_noop(flinfo);
-		}
-
-		/*
-		 * 2. Refuse to run functions that belongs to a non-superuser when the
+		 * Refuse to run functions that belongs to a non-superuser when the
 		 * current user is a superuser.
 		 *
 		 * We could run a SECURITY DEFINER user-function here and be safe with
@@ -1009,7 +986,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 		 * infrastructure maintenance operations, where we prefer to skip
 		 * running user-defined code.
 		 */
-		else if (role_is_super && !function_is_owned_by_super)
+		if (role_is_super && !function_is_owned_by_super)
 		{
 			char *func_name = get_func_name(flinfo->fn_oid);

--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -376,6 +376,18 @@ typedef struct PageserverFeedback
 	uint32		shard_number;
 } PageserverFeedback;

+/* BEGIN_HADRON */
+typedef struct WalRateLimiter
+{
+	/* If the value is 1, PG backends will hit backpressure. */
+	pg_atomic_uint32 should_limit;
+	/* The number of bytes sent in the current second. */
+	uint64		sent_bytes;
+	/* The last recorded time in microsecond. */
+	TimestampTz last_recorded_time_us;
+} WalRateLimiter;
+/* END_HADRON */
+
 typedef struct WalproposerShmemState
 {
 	pg_atomic_uint64 propEpochStartLsn;
@@ -395,6 +407,11 @@ typedef struct WalproposerShmemState

 	/* aggregated feedback with min LSNs across shards */
 	PageserverFeedback min_ps_feedback;
+
+	/* BEGIN_HADRON */
+	/* The WAL rate limiter */
+	WalRateLimiter wal_rate_limiter;
+	/* END_HADRON */
 } WalproposerShmemState;

 /*
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -66,6 +66,9 @@ int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;
 int			safekeeper_proto_version = 3;
 char	   *safekeeper_conninfo_options = "";
+/* BEGIN_HADRON */
+int         databricks_max_wal_mb_per_second = -1;
+/* END_HADRON */

 /* Set to true in the walproposer bgw. */
 static bool am_walproposer;
@@ -252,6 +255,18 @@ nwp_register_gucs(void)
 							PGC_POSTMASTER,
 							0,
 							NULL, NULL, NULL);
+
+    /* BEGIN_HADRON */
+    DefineCustomIntVariable(
+                            "databricks.max_wal_mb_per_second",
+                            "The maximum WAL MB per second allowed. If breached, sending WAL hit the backpressure. Setting to -1 disables the limit.",
+                            NULL,
+                            &databricks_max_wal_mb_per_second,
+                            -1, -1, INT_MAX,
+                            PGC_SUSET,
+                            GUC_UNIT_MB,
+                            NULL, NULL, NULL);
+    /* END_HADRON */
 }


@@ -393,6 +408,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
 static uint64
 backpressure_lag_impl(void)
 {
+	struct WalproposerShmemState* state = NULL;
 	if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0)
 	{
 		XLogRecPtr	writePtr;
@@ -426,6 +442,18 @@ backpressure_lag_impl(void)
 			return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
 		}
 	}
+
+	/* BEGIN_HADRON */
+	if (databricks_max_wal_mb_per_second == -1) {
+		return 0;
+	}
+
+	state = GetWalpropShmemState();
+	if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1)
+	{
+		return 1;
+	}
+	/* END_HADRON */
 	return 0;
 }

@@ -472,6 +500,9 @@ WalproposerShmemInit(void)
 		pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
 		pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
 		pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
+		/* BEGIN_HADRON */
+		pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+		/* END_HADRON */
 	}
 	LWLockRelease(AddinShmemInitLock);

@@ -487,6 +518,9 @@ WalproposerShmemInit_SyncSafekeeper(void)
 	pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
 	pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
 	pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
+	/* BEGIN_HADRON */
+	pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+	/* END_HADRON */
 }

 #define BACK_PRESSURE_DELAY 10000L // 0.01 sec
@@ -521,7 +555,6 @@ backpressure_throttling_impl(void)
 	if (lag == 0)
 		return retry;

-
 	old_status = get_ps_display(&len);
 	new_status = (char *) palloc(len + 64 + 1);
 	memcpy(new_status, old_status, len);
@@ -1458,6 +1491,8 @@ XLogBroadcastWalProposer(WalProposer *wp)
 {
 	XLogRecPtr	startptr;
 	XLogRecPtr	endptr;
+	struct WalproposerShmemState *state = NULL;
+	TimestampTz now = 0;

 	/* Start from the last sent position */
 	startptr = sentPtr;
@@ -1502,13 +1537,36 @@ XLogBroadcastWalProposer(WalProposer *wp)
 	 * that arbitrary LSN is eventually reported as written, flushed and
 	 * applied, so that it can measure the elapsed time.
 	 */
-	LagTrackerWrite(endptr, GetCurrentTimestamp());
+	now = GetCurrentTimestamp();
+	LagTrackerWrite(endptr, now);

 	/* Do we have any work to do? */
 	Assert(startptr <= endptr);
 	if (endptr <= startptr)
 		return;

+	/* BEGIN_HADRON */
+	state = GetWalpropShmemState();
+	if (databricks_max_wal_mb_per_second != -1 && state != NULL)
+	{
+		uint64 max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024;
+		struct WalRateLimiter *limiter = &state->wal_rate_limiter;
+
+		if (now - limiter->last_recorded_time_us > USECS_PER_SEC)
+		{
+			/* Reset the rate limiter */
+			limiter->last_recorded_time_us = now;
+			limiter->sent_bytes = 0;
+			pg_atomic_exchange_u32(&limiter->should_limit, 0);
+		}
+		limiter->sent_bytes += (endptr - startptr);
+		if (limiter->sent_bytes > max_wal_bytes)
+		{
+			pg_atomic_exchange_u32(&limiter->should_limit, 1);
+		}
+	}
+	/* END_HADRON */
+
 	WalProposerBroadcast(wp, startptr, endptr);
 	sentPtr = endptr;

--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -48,12 +48,13 @@ indexmap = { workspace = true, features = ["serde"] }
 ipnet.workspace = true
 itertools.workspace = true
 itoa.workspace = true
-lasso = { workspace = true, features = ["multi-threaded"] }
-measured = { workspace = true, features = ["lasso"] }
+measured = { workspace = true, features = ["paracord"] }
+measured-tokio.workspace = true
 metrics.workspace = true
 once_cell.workspace = true
 opentelemetry = { workspace = true, features = ["trace"] }
 papaya = "0.2.0"
+paracord.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
@@ -127,4 +128,4 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
-tracing-test = "0.2"
+tracing-test = "0.2"
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -87,7 +87,12 @@ impl JwkCacheEntry {
        self.key_sets
            .values()
            // make sure our requested role has access to the key set
-            .filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name))
+            .filter(|key_set| {
+                key_set
+                    .role_names
+                    .iter()
+                    .any(|role| *role.as_str() == **role_name)
+            })
            // try and find the requested key-id in the key set
            .find_map(|key_set| {
                key_set
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -12,7 +12,7 @@ use crate::context::RequestContext;
 use crate::control_plane::NodeInfo;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo};
 use crate::http;
-use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag};
+use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 use crate::types::EndpointId;
 use crate::url::ApiUrl;

@@ -38,9 +38,9 @@ impl LocalBackend {
                },
                // TODO(conrad): make this better reflect compute info rather than endpoint info.
                aux: MetricsAuxInfo {
-                    endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
-                    project_id: ProjectIdTag::get_interner().get_or_intern("local"),
-                    branch_id: BranchIdTag::get_interner().get_or_intern("local"),
+                    endpoint_id: EndpointIdInt::new("local"),
+                    project_id: ProjectIdInt::new("local"),
+                    branch_id: BranchIdInt::new("local"),
                    compute_id: "local".into(),
                    cold_start_info: ColdStartInfo::WarmCached,
                },
--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -192,6 +192,7 @@ pub async fn run() -> anyhow::Result<()> {
            jemalloc,
            neon_metrics,
            proxy: crate::metrics::Metrics::get(),
+            tokio: measured_tokio::RuntimeCollector::current(),
        },
    ));

--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -513,6 +513,7 @@ pub async fn run() -> anyhow::Result<()> {
            jemalloc,
            neon_metrics,
            proxy: crate::metrics::Metrics::get(),
+            tokio: measured_tokio::RuntimeCollector::current(),
        },
    ));
    maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener));
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -194,7 +194,7 @@ impl ProjectInfoCacheImpl {
        &self,
        endpoint_id: &EndpointId,
    ) -> Option<Ref<'_, EndpointIdInt, EndpointInfo>> {
-        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
+        let endpoint_id = EndpointIdInt::try_new_existing(endpoint_id)?;
        self.cache.get(&endpoint_id)
    }

@@ -204,7 +204,7 @@ impl ProjectInfoCacheImpl {
        role_name: &RoleName,
    ) -> Option<RoleAccessControl> {
        let valid_since = self.get_cache_times();
-        let role_name = RoleNameInt::get(role_name)?;
+        let role_name = RoleNameInt::try_new_existing(role_name)?;
        let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
        endpoint_info.get_role_secret(role_name, valid_since)
    }
@@ -297,10 +297,10 @@ impl ProjectInfoCacheImpl {
    }

    pub fn maybe_invalidate_role_secret(&self, endpoint_id: &EndpointId, role_name: &RoleName) {
-        let Some(endpoint_id) = EndpointIdInt::get(endpoint_id) else {
+        let Some(endpoint_id) = EndpointIdInt::try_new_existing(endpoint_id) else {
            return;
        };
-        let Some(role_name) = RoleNameInt::get(role_name) else {
+        let Some(role_name) = RoleNameInt::try_new_existing(role_name) else {
            return;
        };

--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -71,6 +71,8 @@ pub struct AppMetrics {
    pub neon_metrics: NeonMetrics,
    #[metric(flatten)]
    pub proxy: &'static crate::metrics::Metrics,
+    #[metric(namespace = "tokio")]
+    pub tokio: measured_tokio::RuntimeCollector,
 }

 async fn prometheus_metrics_handler(
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -1,272 +1,59 @@
-use std::hash::BuildHasherDefault;
-use std::marker::PhantomData;
-use std::num::NonZeroUsize;
-use std::ops::Index;
-use std::sync::OnceLock;
-
-use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
-use rustc_hash::FxHasher;
+use paracord::custom_key;

 use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName};

-pub trait InternId: Sized + 'static {
-    fn get_interner() -> &'static StringInterner<Self>;
-}
+custom_key!(pub struct RoleNameInt);
+custom_key!(pub struct EndpointIdInt);
+custom_key!(pub struct BranchIdInt);
+custom_key!(pub struct ProjectIdInt);
+custom_key!(pub struct AccountIdInt);

-pub struct StringInterner<Id> {
-    inner: ThreadedRodeo<Spur, BuildHasherDefault<FxHasher>>,
-    _id: PhantomData<Id>,
-}
-
-#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)]
-pub struct InternedString<Id> {
-    inner: Spur,
-    _id: PhantomData<Id>,
-}
-
-impl<Id: InternId> std::fmt::Display for InternedString<Id> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.as_str().fmt(f)
-    }
-}
-
-impl<Id: InternId> InternedString<Id> {
-    pub(crate) fn as_str(&self) -> &'static str {
-        Id::get_interner().inner.resolve(&self.inner)
-    }
-    pub(crate) fn get(s: &str) -> Option<Self> {
-        Id::get_interner().get(s)
-    }
-}
-
-impl<Id: InternId> AsRef<str> for InternedString<Id> {
-    fn as_ref(&self) -> &str {
-        self.as_str()
-    }
-}
-
-impl<Id: InternId> std::ops::Deref for InternedString<Id> {
-    type Target = str;
-    fn deref(&self) -> &str {
-        self.as_str()
-    }
-}
-
-impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
-    fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
-        struct Visitor<Id>(PhantomData<Id>);
-        impl<Id: InternId> serde::de::Visitor<'_> for Visitor<Id> {
-            type Value = InternedString<Id>;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("a string")
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Ok(Id::get_interner().get_or_intern(v))
-            }
-        }
-        d.deserialize_str(Visitor::<Id>(PhantomData))
-    }
-}
-
-impl<Id: InternId> serde::Serialize for InternedString<Id> {
-    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
-        self.as_str().serialize(s)
-    }
-}
-
-impl<Id: InternId> StringInterner<Id> {
-    pub(crate) fn new() -> Self {
-        StringInterner {
-            inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher(
-                Capacity::new(2500, NonZeroUsize::new(1 << 16).expect("value is nonzero")),
-                // unbounded
-                MemoryLimits::for_memory_usage(usize::MAX),
-                BuildHasherDefault::<FxHasher>::default(),
-            ),
-            _id: PhantomData,
-        }
-    }
-
-    #[cfg(test)]
-    fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    #[cfg(test)]
-    fn current_memory_usage(&self) -> usize {
-        self.inner.current_memory_usage()
-    }
-
-    pub(crate) fn get_or_intern(&self, s: &str) -> InternedString<Id> {
-        InternedString {
-            inner: self.inner.get_or_intern(s),
-            _id: PhantomData,
-        }
-    }
-
-    pub(crate) fn get(&self, s: &str) -> Option<InternedString<Id>> {
-        Some(InternedString {
-            inner: self.inner.get(s)?,
-            _id: PhantomData,
-        })
-    }
-}
-
-impl<Id: InternId> Index<InternedString<Id>> for StringInterner<Id> {
-    type Output = str;
-
-    fn index(&self, index: InternedString<Id>) -> &Self::Output {
-        self.inner.resolve(&index.inner)
-    }
-}
-
-impl<Id: InternId> Default for StringInterner<Id> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct RoleNameTag;
-impl InternId for RoleNameTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type RoleNameInt = InternedString<RoleNameTag>;
 impl From<&RoleName> for RoleNameInt {
    fn from(value: &RoleName) -> Self {
-        RoleNameTag::get_interner().get_or_intern(value)
+        RoleNameInt::new(value)
    }
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct EndpointIdTag;
-impl InternId for EndpointIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type EndpointIdInt = InternedString<EndpointIdTag>;
 impl From<&EndpointId> for EndpointIdInt {
    fn from(value: &EndpointId) -> Self {
-        EndpointIdTag::get_interner().get_or_intern(value)
+        EndpointIdInt::new(value)
    }
 }
 impl From<EndpointId> for EndpointIdInt {
    fn from(value: EndpointId) -> Self {
-        EndpointIdTag::get_interner().get_or_intern(&value)
+        EndpointIdInt::new(&value)
    }
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct BranchIdTag;
-impl InternId for BranchIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type BranchIdInt = InternedString<BranchIdTag>;
 impl From<&BranchId> for BranchIdInt {
    fn from(value: &BranchId) -> Self {
-        BranchIdTag::get_interner().get_or_intern(value)
+        BranchIdInt::new(value)
    }
 }
 impl From<BranchId> for BranchIdInt {
    fn from(value: BranchId) -> Self {
-        BranchIdTag::get_interner().get_or_intern(&value)
+        BranchIdInt::new(&value)
    }
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct ProjectIdTag;
-impl InternId for ProjectIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type ProjectIdInt = InternedString<ProjectIdTag>;
 impl From<&ProjectId> for ProjectIdInt {
    fn from(value: &ProjectId) -> Self {
-        ProjectIdTag::get_interner().get_or_intern(value)
+        ProjectIdInt::new(value)
    }
 }
 impl From<ProjectId> for ProjectIdInt {
    fn from(value: ProjectId) -> Self {
-        ProjectIdTag::get_interner().get_or_intern(&value)
+        ProjectIdInt::new(&value)
    }
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct AccountIdTag;
-impl InternId for AccountIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        static ROLE_NAMES: OnceLock<StringInterner<AccountIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type AccountIdInt = InternedString<AccountIdTag>;
 impl From<&AccountId> for AccountIdInt {
    fn from(value: &AccountId) -> Self {
-        AccountIdTag::get_interner().get_or_intern(value)
+        AccountIdInt::new(value)
    }
 }
 impl From<AccountId> for AccountIdInt {
    fn from(value: AccountId) -> Self {
-        AccountIdTag::get_interner().get_or_intern(&value)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::OnceLock;
-
-    use super::InternId;
-    use crate::intern::StringInterner;
-
-    struct MyId;
-    impl InternId for MyId {
-        fn get_interner() -> &'static StringInterner<Self> {
-            pub(crate) static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
-            ROLE_NAMES.get_or_init(Default::default)
-        }
-    }
-
-    #[test]
-    fn push_many_strings() {
-        use rand::rngs::StdRng;
-        use rand::{Rng, SeedableRng};
-        use rand_distr::Zipf;
-
-        let endpoint_dist = Zipf::new(500000, 0.8).unwrap();
-        let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist);
-
-        let interner = MyId::get_interner();
-
-        const N: usize = 100_000;
-        let mut verify = Vec::with_capacity(N);
-        for endpoint in endpoints.take(N) {
-            let endpoint = format!("ep-string-interning-{endpoint}");
-            let key = interner.get_or_intern(&endpoint);
-            verify.push((endpoint, key));
-        }
-
-        for (s, key) in verify {
-            assert_eq!(interner[key], s);
-        }
-
-        // 2031616/59861 = 34 bytes per string
-        assert_eq!(interner.len(), 59_861);
-        // will have other overhead for the internal hashmaps that are not accounted for.
-        assert_eq!(interner.current_memory_usage(), 2_031_616);
+        AccountIdInt::new(&value)
    }
 }
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,6 +1,5 @@
 use std::sync::{Arc, OnceLock};

-use lasso::ThreadedRodeo;
 use measured::label::{
    FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet,
 };
@@ -11,6 +10,7 @@ use measured::{
    MetricGroup,
 };
 use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec};
+use paracord::ParaCord;
 use tokio::time::{self, Instant};

 use crate::control_plane::messages::ColdStartInfo;
@@ -222,7 +222,7 @@ pub enum CacheOutcome {
 #[derive(LabelGroup)]
 #[label(set = ConsoleRequestSet)]
 pub struct ConsoleRequest<'a> {
-    #[label(dynamic_with = ThreadedRodeo, default)]
+    #[label(dynamic_with = ParaCord, default)]
    pub request: &'a str,
 }

@@ -345,7 +345,7 @@ pub struct ConnectionFailuresBreakdownGroup {
 #[derive(LabelGroup, Copy, Clone)]
 #[label(set = RedisErrorsSet)]
 pub struct RedisErrors<'a> {
-    #[label(dynamic_with = ThreadedRodeo, default)]
+    #[label(dynamic_with = ParaCord, default)]
    pub channel: &'a str,
 }

--- a/proxy/src/types.rs
+++ b/proxy/src/types.rs
@@ -1,4 +1,4 @@
-use crate::intern::{EndpointIdInt, EndpointIdTag, InternId};
+use crate::intern::EndpointIdInt;

 macro_rules! smol_str_wrapper {
    ($name:ident) => {
@@ -85,7 +85,7 @@ impl EndpointId {

    #[must_use]
    pub fn normalize_intern(&self) -> EndpointIdInt {
-        EndpointIdTag::get_interner().get_or_intern(self.normalize_str())
+        EndpointIdInt::new(self.normalize_str())
    }
 }

--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -37,6 +37,7 @@ use tracing::*;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
 use utils::id::NodeId;
 use utils::logging::{self, LogFormat, SecretString};
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{pid_file, project_build_tag, project_git_version, tcp_listener};

@@ -243,6 +244,11 @@ struct Args {
    #[arg(long)]
    enable_tls_wal_service_api: bool,

+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+    #[arg(long, default_value_t = true)]
+    force_metric_collection_on_scrape: bool,
+
    /// Run in development mode (disables security checks)
    #[arg(long, help = "Run in development mode (disables security checks)")]
    dev: bool,
@@ -428,6 +434,7 @@ async fn main() -> anyhow::Result<()> {
        ssl_ca_certs,
        use_https_safekeeper_api: args.use_https_safekeeper_api,
        enable_tls_wal_service_api: args.enable_tls_wal_service_api,
+        force_metric_collection_on_scrape: args.force_metric_collection_on_scrape,
    });

    // initialize sentry if SENTRY_DSN is provided
@@ -640,6 +647,26 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
        .map(|res| ("broker main".to_owned(), res));
    tasks_handles.push(Box::pin(broker_task_handle));

+    /* BEGIN_HADRON */
+    if conf.force_metric_collection_on_scrape {
+        let metrics_handle = current_thread_rt
+            .as_ref()
+            .unwrap_or_else(|| BACKGROUND_RUNTIME.handle())
+            .spawn(async move {
+                let mut interval: tokio::time::Interval =
+                    tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    interval.tick().await;
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            })
+            .map(|res| ("broker main".to_owned(), res));
+        tasks_handles.push(Box::pin(metrics_handle));
+    }
+    /* END_HADRON */
+
    set_build_info_metric(GIT_VERSION, BUILD_TAG);

    // TODO: update tokio-stream, convert to real async Stream with
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -699,6 +699,11 @@ pub fn make_router(
        }))
    }

+    let force_metric_collection_on_scrape = conf.force_metric_collection_on_scrape;
+
+    let prometheus_metrics_handler_wrapper =
+        move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
+
    // NB: on any changes do not forget to update the OpenAPI spec
    // located nearby (/safekeeper/src/http/openapi_spec.yaml).
    let auth = conf.http_auth.clone();
@@ -706,7 +711,9 @@ pub fn make_router(
        .data(conf)
        .data(global_timelines)
        .data(auth)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| {
+            request_span(r, prometheus_metrics_handler_wrapper)
+        })
        .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
        .get("/profile/heap", |r| request_span(r, profile_heap_handler))
        .get("/v1/status", |r| request_span(r, status_handler))
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -134,6 +134,7 @@ pub struct SafeKeeperConf {
    pub ssl_ca_certs: Vec<Pem>,
    pub use_https_safekeeper_api: bool,
    pub enable_tls_wal_service_api: bool,
+    pub force_metric_collection_on_scrape: bool,
 }

 impl SafeKeeperConf {
@@ -183,6 +184,7 @@ impl SafeKeeperConf {
            ssl_ca_certs: Vec::new(),
            use_https_safekeeper_api: false,
            enable_tls_wal_service_api: false,
+            force_metric_collection_on_scrape: true,
        }
    }
 }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -59,6 +59,15 @@ pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    .expect("Failed to register safekeeper_flush_wal_seconds histogram")
 });
 /* BEGIN_HADRON */
+// Counter of all ProposerAcceptorMessage requests received
+pub static PROPOSER_ACCEPTOR_MESSAGES_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_proposer_acceptor_messages_total",
+        "Total number of ProposerAcceptorMessage requests received by the Safekeeper.",
+        &["outcome"]
+    )
+    .expect("Failed to register safekeeper_proposer_acceptor_messages_total counter")
+});
 pub static WAL_DISK_IO_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "safekeeper_wal_disk_io_errors",
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -24,7 +24,7 @@ use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;

-use crate::metrics::MISC_OPERATION_SECONDS;
+use crate::metrics::{MISC_OPERATION_SECONDS, PROPOSER_ACCEPTOR_MESSAGES_TOTAL};
 use crate::state::TimelineState;
 use crate::{control_file, wal_storage};

@@ -938,7 +938,7 @@ where
        &mut self,
        msg: &ProposerAcceptorMessage,
    ) -> Result<Option<AcceptorProposerMessage>> {
-        match msg {
+        let res = match msg {
            ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
            ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
            ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
@@ -949,7 +949,20 @@ where
                self.handle_append_request(msg, false).await
            }
            ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
-        }
+        };
+
+        // BEGIN HADRON
+        match &res {
+            Ok(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["success"])
+                .inc(),
+            Err(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["error"])
+                .inc(),
+        };
+
+        res
+        // END HADRON
    }

    /// Handle initial message from proposer: check its sanity and send my
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -166,7 +166,7 @@ fn hadron_determine_offloader(mgr: &Manager, state: &StateSnapshot) -> (Option<N

    let backup_lag = state.commit_lsn.checked_sub(state.backup_lsn);
    if backup_lag.is_none() {
-        info!("Backup lag is None. Skipping re-election.");
+        debug!("Backup lag is None. Skipping re-election.");
        return (offloader, election_dbg_str);
    }

--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -190,6 +190,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        ssl_ca_certs: Vec::new(),
        use_https_safekeeper_api: false,
        enable_tls_wal_service_api: false,
+        force_metric_collection_on_scrape: true,
    };

    let mut global = GlobalMap::new(disk, conf.clone())?;
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -31,10 +31,10 @@ humantime.workspace = true
 humantime-serde.workspace = true
 itertools.workspace = true
 json-structural-diff.workspace = true
-lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
+paracord.workspace = true
 postgres_connection.workspace = true
 posthog_client_lite.workspace = true
 rand.workspace = true
@@ -72,4 +72,4 @@ http-utils = { path = "../libs/http-utils/" }
 utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
-workspace_hack = { version = "0.1", path = "../workspace_hack" }
+workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -850,6 +850,31 @@ async fn handle_tenant_describe(
    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }

+/* BEGIN_HADRON */
+async fn handle_tenant_timeline_describe(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Scrubber)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
+    json_response(
+        StatusCode::OK,
+        service
+            .tenant_timeline_describe(tenant_id, timeline_id)
+            .await?,
+    )
+}
+/* END_HADRON */
+
 async fn handle_tenant_list(
    service: Arc<Service>,
    req: Request<Body>,
@@ -2480,6 +2505,13 @@ pub fn make_router(
            )
        })
        // Timeline operations
+        .get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_describe,
+                RequestName("v1_tenant_timeline_describe"),
+            )
+        })
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            tenant_service_handler(
                r,
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -183,9 +183,9 @@ impl Default for StorageControllerMetrics {
 #[derive(measured::LabelGroup, Clone)]
 #[label(set = NodeLabelGroupSet)]
 pub(crate) struct NodeLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) az: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) node_id: &'a str,
 }

@@ -198,7 +198,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
 #[derive(measured::LabelGroup)]
 #[label(set = HttpRequestStatusLabelGroupSet)]
 pub(crate) struct HttpRequestStatusLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) path: &'a str,
    pub(crate) method: Method,
    pub(crate) status: StatusCode,
@@ -207,7 +207,7 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
 #[derive(measured::LabelGroup)]
 #[label(set = HttpRequestLatencyLabelGroupSet)]
 pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) path: &'a str,
    pub(crate) method: Method,
 }
@@ -215,9 +215,9 @@ pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
 #[derive(measured::LabelGroup, Clone)]
 #[label(set = PageserverRequestLabelGroupSet)]
 pub(crate) struct PageserverRequestLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) pageserver_id: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) path: &'a str,
    pub(crate) method: Method,
 }
@@ -225,9 +225,9 @@ pub(crate) struct PageserverRequestLabelGroup<'a> {
 #[derive(measured::LabelGroup, Clone)]
 #[label(set = SafekeeperRequestLabelGroupSet)]
 pub(crate) struct SafekeeperRequestLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) safekeeper_id: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) path: &'a str,
    pub(crate) method: Method,
 }
@@ -254,11 +254,11 @@ pub(crate) struct LeadershipStatusGroup {
 #[derive(measured::LabelGroup, Clone)]
 #[label(set = ReconcileLongRunningLabelGroupSet)]
 pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) tenant_id: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) shard_number: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) sequence: &'a str,
 }

@@ -282,11 +282,11 @@ pub(crate) enum Method {
 #[derive(measured::LabelGroup, Clone)]
 #[label(set = SafekeeperReconcilerLabelGroupSet)]
 pub(crate) struct SafekeeperReconcilerLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) sk_az: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) sk_node_id: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = paracord::ParaCord, default)]
    pub(crate) sk_hostname: &'a str,
 }

--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -86,6 +86,23 @@ impl PageserverClient {
        )
    }

+    /* BEGIN_HADRON */
+    pub(crate) async fn tenant_timeline_describe(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Result<TimelineInfo> {
+        measured_request!(
+            "tenant_timeline_describe",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner
+                .tenant_timeline_describe(tenant_shard_id, timeline_id,)
+                .await
+        )
+    }
+    /* END_HADRON */
+
    pub(crate) async fn tenant_scan_remote_storage(
        &self,
        tenant_id: TenantId,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -32,7 +32,7 @@ use pageserver_api::controller_api::{
    ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
    SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
    TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    TenantShardMigrateRequest, TenantShardMigrateResponse, TenantTimelineDescribeResponse,
 };
 use pageserver_api::models::{
    self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
@@ -1984,11 +1984,14 @@ impl Service {
        });

        // Check that there is enough safekeepers configured that we can create new timelines
-        let test_sk_res = this.safekeepers_for_new_timeline().await;
+        let test_sk_res_str = match this.safekeepers_for_new_timeline().await {
+            Ok(v) => format!("Ok({v:?})"),
+            Err(v) => format!("Err({v:})"),
+        };
        tracing::info!(
            timeline_safekeeper_count = config.timeline_safekeeper_count,
            timelines_onto_safekeepers = config.timelines_onto_safekeepers,
-            "viability test result (test timeline creation on safekeepers): {test_sk_res:?}",
+            "viability test result (test timeline creation on safekeepers): {test_sk_res_str}",
        );

        Ok(this)
@@ -4758,6 +4761,7 @@ impl Service {
        )
        .await;

+        let mut retry_if_not_attached = false;
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -4774,6 +4778,24 @@ impl Service {
                        .expect("Pageservers may not be deleted while referenced");

                    targets.push((*tenant_shard_id, node.clone()));
+
+                    if let Some(location) = shard.observed.locations.get(node_id) {
+                        if let Some(ref conf) = location.conf {
+                            if conf.mode != LocationConfigMode::AttachedSingle
+                                && conf.mode != LocationConfigMode::AttachedMulti
+                            {
+                                // If the shard is attached as secondary, we need to retry if 404.
+                                retry_if_not_attached = true;
+                            }
+                            // If the shard is attached as primary, we should succeed.
+                        } else {
+                            // Location conf is not available yet, retry if 404.
+                            retry_if_not_attached = true;
+                        }
+                    } else {
+                        // The shard is not attached to the intended pageserver yet, retry if 404.
+                        retry_if_not_attached = true;
+                    }
                }
            }
            targets
@@ -4804,6 +4826,18 @@ impl Service {
                        valid_until = Some(lease.valid_until);
                    }
                }
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _))
+                    if retry_if_not_attached =>
+                {
+                    // This is expected if the attach is not finished yet. Return 503 so that the client can retry.
+                    return Err(ApiError::ResourceUnavailable(
+                        format!(
+                            "Timeline is not attached to the pageserver {} yet, please retry",
+                            node.get_id()
+                        )
+                        .into(),
+                    ));
+                }
                Err(e) => {
                    return Err(passthrough_api_error(&node, e));
                }
@@ -5452,6 +5486,92 @@ impl Service {
        .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
    }

+    /* BEGIN_HADRON */
+    pub(crate) async fn tenant_timeline_describe(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<TenantTimelineDescribeResponse, ApiError> {
+        self.tenant_remote_mutation(tenant_id, |locations| async move {
+            if locations.0.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            };
+
+            let locations: Vec<(TenantShardId, Node)> = locations
+                .0
+                .iter()
+                .map(|t| (*t.0, t.1.latest.node.clone()))
+                .collect();
+            let mut futs = FuturesUnordered::new();
+
+            for (shard_id, node) in locations {
+                futs.push({
+                    async move {
+                        let result = node
+                            .with_client_retries(
+                                |client| async move {
+                                    client
+                                        .tenant_timeline_describe(&shard_id, &timeline_id)
+                                        .await
+                                },
+                                &self.http_client,
+                                &self.config.pageserver_jwt_token,
+                                3,
+                                3,
+                                Duration::from_secs(30),
+                                &self.cancel,
+                            )
+                            .await;
+                        (result, shard_id, node.get_id())
+                    }
+                });
+            }
+
+            let mut results: Vec<TimelineInfo> = Vec::new();
+            while let Some((result, tenant_shard_id, node_id)) = futs.next().await {
+                match result {
+                    Some(Ok(timeline_info)) => results.push(timeline_info),
+                    Some(Err(e)) => {
+                        tracing::warn!(
+                            "Failed to describe tenant {} timeline {} for pageserver {}: {e}",
+                            tenant_shard_id,
+                            timeline_id,
+                            node_id,
+                        );
+                        return Err(ApiError::ResourceUnavailable(format!("{e}").into()));
+                    }
+                    None => return Err(ApiError::Cancelled),
+                }
+            }
+            let mut image_consistent_lsn: Option<Lsn> = Some(Lsn::MAX);
+            for timeline_info in &results {
+                if let Some(tline_image_consistent_lsn) = timeline_info.image_consistent_lsn {
+                    image_consistent_lsn = Some(std::cmp::min(
+                        image_consistent_lsn.unwrap(),
+                        tline_image_consistent_lsn,
+                    ));
+                } else {
+                    tracing::warn!(
+                        "Timeline {} on shard {} does not have image consistent lsn",
+                        timeline_info.timeline_id,
+                        timeline_info.tenant_id
+                    );
+                    image_consistent_lsn = None;
+                    break;
+                }
+            }
+
+            Ok(TenantTimelineDescribeResponse {
+                shards: results,
+                image_consistent_lsn,
+            })
+        })
+        .await?
+    }
+    /* END_HADRON */
+
    /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
    /// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory
    /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -25,7 +25,8 @@ use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::PgVersionId;
 use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration};
 use safekeeper_api::models::{
-    PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse,
+    PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest,
+    TimelineMembershipSwitchResponse,
 };
 use safekeeper_api::{INITIAL_TERM, Term};
 use safekeeper_client::mgmt_api;
@@ -37,13 +38,6 @@ use utils::lsn::Lsn;

 use super::Service;

-#[derive(serde::Serialize, serde::Deserialize, Clone)]
-pub struct TimelineLocateResponse {
-    pub generation: SafekeeperGeneration,
-    pub sk_set: Vec<NodeId>,
-    pub new_sk_set: Option<Vec<NodeId>>,
-}
-
 impl Service {
    fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, ApiError> {
        let members = safekeepers
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1795,6 +1795,33 @@ def neon_env_builder(
        record_property("preserve_database_files", builder.preserve_database_files)


+@pytest.fixture(scope="function")
+def neon_env_builder_local(
+    neon_env_builder: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_distrib_dir: Path,
+) -> NeonEnvBuilder:
+    """
+    Fixture to create a Neon environment for test with its own pg_install copy.
+
+    This allows the test to edit the list of available extensions in the
+    local instance of Postgres used for the test, and install extensions via
+    downloading them when a remote extension is tested, for instance, or
+    copying files around for local extension testing.
+    """
+    test_local_pginstall = test_output_dir / "pg_install"
+    log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}")
+
+    # We can't copy only the version that we are currently testing because other
+    # binaries like the storage controller need specific Postgres versions.
+    shutil.copytree(pg_distrib_dir, test_local_pginstall)
+
+    neon_env_builder.pg_distrib_dir = test_local_pginstall
+    log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}")
+
+    return neon_env_builder
+
+
@dataclass
 class PageserverPort:
    pg: int
@@ -2315,6 +2342,20 @@ class NeonStorageController(MetricsGetter, LogUtils):
        response.raise_for_status()
        return response.json()

+    # HADRON
+    def tenant_timeline_describe(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
    def nodes(self):
        """
        :return: list of {"id": ""}
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -333,6 +333,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
        self.verbose_error(res)

+    def list_tenant_visible_size(self) -> dict[TenantShardId, int]:
+        res = self.get(f"http://localhost:{self.port}/v1/list_tenant_visible_size")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
    def tenant_list(self) -> list[dict[Any, Any]]:
        res = self.get(f"http://localhost:{self.port}/v1/tenant")
        self.verbose_error(res)
@@ -1002,7 +1009,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):

    def get_metrics_str(self) -> str:
        """You probably want to use get_metrics() instead."""
-        res = self.get(f"http://localhost:{self.port}/metrics")
+        res = self.get(f"http://localhost:{self.port}/metrics?use_latest=true")
        self.verbose_error(res)
        return res.text

--- a/test_runner/fixtures/port_distributor.py
+++ b/test_runner/fixtures/port_distributor.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import re
 import socket
 from contextlib import closing
+from itertools import cycle

 from fixtures.log_helper import log

@@ -34,15 +35,23 @@ def can_bind(host: str, port: int) -> bool:

 class PortDistributor:
    def __init__(self, base_port: int, port_number: int):
-        self.iterator = iter(range(base_port, base_port + port_number))
+        self.base_port = base_port
+        self.port_number = port_number
+        self.cycle = cycle(range(base_port, base_port + port_number))
        self.port_map: dict[int, int] = {}

    def get_port(self) -> int:
-        for port in self.iterator:
+        checked = 0
+        for port in self.cycle:
            if can_bind("localhost", port):
                return port
+            elif checked < self.port_number:
+                checked += 1
+            else:
+                break
+
        raise RuntimeError(
-            "port range configured for test is exhausted, consider enlarging the range"
+            f"port range ({self.base_port}..{self.base_port + self.port_number}) configured for test is exhausted, consider enlarging the range"
        )

    def replace_with_new_port(self, value: int | str) -> int | str:
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -143,7 +143,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):

    def get_metrics_str(self) -> str:
        """You probably want to use get_metrics() instead."""
-        request_result = self.get(f"http://localhost:{self.port}/metrics")
+        request_result = self.get(f"http://localhost:{self.port}/metrics?use_latest=true")
        request_result.raise_for_status()
        return request_result.text

--- a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql
+++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql
@@ -0,0 +1,32 @@
+\echo Use "CREATE EXTENSION test_event_trigger_extension" to load this file. \quit
+
+CREATE SCHEMA event_trigger;
+
+create sequence if not exists event_trigger.seq_schema_version as int cycle;
+
+create or replace function event_trigger.increment_schema_version()
+    returns event_trigger
+    security definer
+    language plpgsql
+as $$
+begin
+    perform pg_catalog.nextval('event_trigger.seq_schema_version');
+end;
+$$;
+
+create or replace function event_trigger.get_schema_version()
+    returns int
+    security definer
+    language sql
+as $$
+    select last_value from event_trigger.seq_schema_version;
+$$;
+
+-- On DDL event, increment the schema version number
+create event trigger event_trigger_watch_ddl
+    on ddl_command_end
+    execute procedure event_trigger.increment_schema_version();
+
+create event trigger event_trigger_watch_drop
+    on sql_drop
+    execute procedure event_trigger.increment_schema_version();
--- a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control
+++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control
@@ -0,0 +1,8 @@
+default_version = '1.0'
+comment = 'Test extension with Event Trigger'
+
+# make sure the extension objects are owned by the bootstrap user
+# to check that the SECURITY DEFINER event trigger function is still
+# called during non-superuser DDL events.
+superuser = true
+trusted = true
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -165,6 +165,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "gc_horizon": 23 * (1024 * 1024),
        "gc_period": "2h 13m",
        "image_creation_threshold": 7,
+        "image_layer_force_creation_period": "1m",
        "pitr_interval": "1m",
        "lagging_wal_timeout": "23m",
        "lazy_slru_download": True,
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -944,3 +944,143 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool
                f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
            )
            assert res[0][0] == 1
+
+
+# BEGIN_HADRON
+def get_layer_map(env, tenant_shard_id, timeline_id, ps_id):
+    client = env.pageservers[ps_id].http_client()
+    layer_map = client.layer_map_info(tenant_shard_id, timeline_id)
+    image_layer_count = 0
+    delta_layer_count = 0
+    for layer in layer_map.historic_layers:
+        if layer.kind == "Image":
+            image_layer_count += 1
+        elif layer.kind == "Delta":
+            delta_layer_count += 1
+    return image_layer_count, delta_layer_count
+
+
+def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that page server can force creating new images if image_layer_force_creation_period is enabled
+    """
+    # use large knobs to disable L0 compaction/image creation except for the force image creation
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        "checkpoint_distance": 10 * 1024,
+        "checkpoint_timeout": "1s",
+        "image_layer_force_creation_period": "1s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
+    }
+
+    # consider every tenant large to run the image layer generation check more eagerly
+    neon_env_builder.pageserver_config_override = (
+        "image_layer_generation_large_timeline_threshold=0"
+    )
+
+    neon_env_builder.num_pageservers = 1
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+    # Generate some rows.
+    for v in range(10):
+        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")
+
+    # Sleep a bit such that the inserts are considered when calculating the forced image layer creation LSN.
+    time.sleep(2)
+
+    def check_force_image_creation():
+        ps_http = env.pageserver.http_client()
+        ps_http.timeline_compact(tenant_id, timeline_id)
+        image, delta = get_layer_map(env, tenant_id, timeline_id, 0)
+        log.info(f"images: {image}, deltas: {delta}")
+        assert image > 0
+
+        env.pageserver.assert_log_contains("forcing L0 compaction of")
+        env.pageserver.assert_log_contains("forcing image creation for partitioned range")
+
+    wait_until(check_force_image_creation)
+
+    endpoint.stop_and_destroy()
+
+    env.pageserver.allowed_errors.append(
+        ".*created delta file of size.*larger than double of target.*"
+    )
+
+
+def test_image_consistent_lsn(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the /v1/tenant/<tenant_id>/timeline/<timeline_id> endpoint and the computation of image_consistent_lsn
+    """
+    # use large knobs to disable L0 compaction/image creation except for the force image creation
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        "checkpoint_distance": 10 * 1024,
+        "checkpoint_timeout": "1s",
+        "image_layer_force_creation_period": "1s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
+    }
+
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=tenant_conf,
+        initial_tenant_shard_count=4,
+        initial_tenant_shard_stripe_size=1,
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)")
+    for v in range(10):
+        endpoint.safe_psql(
+            f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False
+        )
+
+    response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id)
+    shards = response["shards"]
+    for shard in shards:
+        assert shard["image_consistent_lsn"] is not None
+    image_consistent_lsn = response["image_consistent_lsn"]
+    assert image_consistent_lsn is not None
+
+    # do more writes and wait for image_consistent_lsn to advance
+    for v in range(100):
+        endpoint.safe_psql(
+            f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False
+        )
+
+    def check_image_consistent_lsn_advanced():
+        response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id)
+        new_image_consistent_lsn = response["image_consistent_lsn"]
+        shards = response["shards"]
+        for shard in shards:
+            print(f"shard {shard['tenant_id']} image_consistent_lsn{shard['image_consistent_lsn']}")
+        assert new_image_consistent_lsn != image_consistent_lsn
+
+    wait_until(check_image_consistent_lsn_advanced)
+
+    endpoint.stop_and_destroy()
+
+    for ps in env.pageservers:
+        ps.allowed_errors.append(".*created delta file of size.*larger than double of target.*")
+
+
+# END_HADRON
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -2,7 +2,6 @@ from __future__ import annotations

 import os
 import platform
-import shutil
 import tarfile
 from enum import StrEnum
 from pathlib import Path
@@ -31,27 +30,6 @@ if TYPE_CHECKING:
    from werkzeug.wrappers.request import Request


-# use neon_env_builder_local fixture to override the default neon_env_builder fixture
-# and use a test-specific pg_install instead of shared one
-@pytest.fixture(scope="function")
-def neon_env_builder_local(
-    neon_env_builder: NeonEnvBuilder,
-    test_output_dir: Path,
-    pg_distrib_dir: Path,
-) -> NeonEnvBuilder:
-    test_local_pginstall = test_output_dir / "pg_install"
-    log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}")
-
-    # We can't copy only the version that we are currently testing because other
-    # binaries like the storage controller need specific Postgres versions.
-    shutil.copytree(pg_distrib_dir, test_local_pginstall)
-
-    neon_env_builder.pg_distrib_dir = test_local_pginstall
-    log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}")
-
-    return neon_env_builder
-
-
@final
 class RemoteExtension(StrEnum):
    SQL_ONLY = "test_extension_sql_only"
--- a/test_runner/regress/test_event_trigger_extension.py
+++ b/test_runner/regress/test_event_trigger_extension.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+import shutil
+from pathlib import Path
+from typing import TYPE_CHECKING, cast
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.paths import BASE_DIR
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from fixtures.neon_fixtures import (
+        NeonEnvBuilder,
+    )
+    from fixtures.pg_version import PgVersion
+
+
+# use neon_env_builder_local fixture to override the default neon_env_builder fixture
+# and use a test-specific pg_install instead of shared one
+@pytest.fixture(scope="function")
+def neon_env_builder_event_trigger_extension(
+    neon_env_builder_local: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_version: PgVersion,
+) -> NeonEnvBuilder:
+    test_local_pginstall = test_output_dir / "pg_install"
+
+    # Now copy the SQL only extension test_event_trigger_extension in the local
+    # pginstall extension directory on-disk
+    test_event_trigger_extension_dir = (
+        BASE_DIR / "test_runner" / "regress" / "data" / "test_event_trigger_extension"
+    )
+
+    test_local_extension_dir = (
+        test_local_pginstall / f"v{pg_version}" / "share" / "postgresql" / "extension"
+    )
+
+    log.info(f"copy {test_event_trigger_extension_dir} to {test_local_extension_dir}")
+
+    for f in [
+        test_event_trigger_extension_dir / "test_event_trigger_extension.control",
+        test_event_trigger_extension_dir / "test_event_trigger_extension--1.0.sql",
+    ]:
+        shutil.copy(f, test_local_extension_dir)
+
+    return neon_env_builder_local
+
+
+def test_event_trigger_extension(neon_env_builder_event_trigger_extension: NeonEnvBuilder):
+    """
+    Test installing an extension that contains an Event Trigger.
+
+    The Event Trigger function is owned by the extension owner, which at
+    CREATE EXTENSION is going to be the Postgres bootstrap user, per the
+    extension control file where both superuser = true and trusted = true.
+
+    Also this function is SECURTY DEFINER, to allow for making changes to
+    the extension SQL objects, in our case a sequence.
+
+    This test makes sure that the event trigger function is fired correctly
+    by non-privileged user DDL actions such as CREATE TABLE.
+    """
+    env = neon_env_builder_event_trigger_extension.init_start()
+    env.create_branch("test_event_trigger_extension")
+
+    endpoint = env.endpoints.create_start("test_event_trigger_extension")
+    extension = "test_event_trigger_extension"
+    database = "test_event_trigger_extension"
+
+    endpoint.safe_psql(f"CREATE DATABASE {database}")
+    endpoint.safe_psql(f"CREATE EXTENSION {extension}", dbname=database)
+
+    # check that the extension is owned by the bootstrap superuser (cloud_admin)
+    pg_bootstrap_superuser_name = "cloud_admin"
+    with endpoint.connect(dbname=database) as pg_conn:
+        with pg_conn.cursor() as cur:
+            cur.execute(
+                f"select rolname from pg_roles r join pg_extension e on r.oid = e.extowner where extname = '{extension}'"
+            )
+            owner = cast("tuple[str]", cur.fetchone())[0]
+            assert owner == pg_bootstrap_superuser_name, (
+                f"extension {extension} is not owned by bootstrap user '{pg_bootstrap_superuser_name}'"
+            )
+
+    # test that the SQL-only Event Trigger (SECURITY DEFINER function) runs
+    # correctly now that the extension has been installed
+    #
+    # create table to trigger the event trigger, twice, check sequence count
+    with endpoint.connect(dbname=database) as pg_conn:
+        log.info("creating SQL objects (tables)")
+        with pg_conn.cursor() as cur:
+            cur.execute("CREATE TABLE foo1(id int primary key)")
+            cur.execute("CREATE TABLE foo2(id int)")
+
+            cur.execute("SELECT event_trigger.get_schema_version()")
+            res = cast("tuple[int]", cur.fetchone())
+            ver = res[0]
+
+            log.info(f"schema version is now {ver}")
+            assert ver == 2, "schema version is not 2"
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,6 +1,7 @@
 import random
 import threading
 from enum import StrEnum
+from time import sleep
 from typing import Any

 import pytest
@@ -24,18 +25,7 @@ OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
 OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
 METHOD_VALUES = [e for e in PrewarmMethod]
 METHOD_IDS = [e.value for e in PrewarmMethod]
-
-
-def check_pinned_entries(cur: Cursor):
-    """
-    Wait till none of LFC buffers are pinned
-    """
-
-    def none_pinned():
-        cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
-        assert cur.fetchall()[0][0] == 0
-
-    wait_until(none_pinned)
+AUTOOFFLOAD_INTERVAL_SECS = 2


 def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
@@ -49,9 +39,18 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:


 def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
+    if method == PrewarmMethod.POSTGRES:
+        cur.execute("select neon.get_local_cache_state()")
+        return cur.fetchall()[0][0]
+
    if method == PrewarmMethod.AUTOPREWARM:
+        # With autoprewarm, we need to be sure LFC was offloaded after all writes
+        # finish, so we sleep. Otherwise we'll have less prewarmed pages than we want
+        sleep(AUTOOFFLOAD_INTERVAL_SECS)
        client.offload_lfc_wait()
-    elif method == PrewarmMethod.COMPUTE_CTL:
+        return
+
+    if method == PrewarmMethod.COMPUTE_CTL:
        status = client.prewarm_lfc_status()
        assert status["status"] == "not_prewarmed"
        assert "error" not in status
@@ -60,11 +59,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
        parsed = prom_parse(client)
        desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
        assert parsed == desired, f"{parsed=} != {desired=}"
-    elif method == PrewarmMethod.POSTGRES:
-        cur.execute("select get_local_cache_state()")
-        return cur.fetchall()[0][0]
-    else:
-        raise AssertionError(f"{method} not in PrewarmMethod")
+        return
+
+    raise AssertionError(f"{method} not in PrewarmMethod")


 def prewarm_endpoint(
@@ -75,7 +72,7 @@ def prewarm_endpoint(
    elif method == PrewarmMethod.COMPUTE_CTL:
        client.prewarm_lfc()
    elif method == PrewarmMethod.POSTGRES:
-        cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
+        cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,))


 def check_prewarmed(
@@ -106,21 +103,20 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
        "neon.file_cache_size_limit=1GB",
        "neon.file_cache_prewarm_limit=1000",
    ]
-    offload_secs = 2

    if method == PrewarmMethod.AUTOPREWARM:
        endpoint = env.endpoints.create_start(
            branch_name="main",
            config_lines=cfg,
            autoprewarm=True,
-            offload_lfc_interval_seconds=offload_secs,
+            offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS,
        )
    else:
        endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()
-    pg_cur.execute("create extension neon")
+    pg_cur.execute("create schema neon; create extension neon with schema neon")
    pg_cur.execute("create database lfc")

    lfc_conn = endpoint.connect(dbname="lfc")
@@ -135,7 +131,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):

    endpoint.stop()
    if method == PrewarmMethod.AUTOPREWARM:
-        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
+        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS)
    else:
        endpoint.start()

@@ -146,10 +142,12 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    lfc_cur = lfc_conn.cursor()
    prewarm_endpoint(method, client, pg_cur, lfc_state)

-    pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
+    pg_cur.execute(
+        "select lfc_value from neon.neon_lfc_stats where lfc_key='file_cache_used_pages'"
+    )
    lfc_used_pages = pg_cur.fetchall()[0][0]
    log.info(f"Used LFC size: {lfc_used_pages}")
-    pg_cur.execute("select * from get_prewarm_info()")
+    pg_cur.execute("select * from neon.get_prewarm_info()")
    total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
    log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
    progress = (prewarmed + skipped) * 100 // total
@@ -162,7 +160,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    lfc_cur.execute("select sum(pk) from t")
    assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2

-    check_pinned_entries(pg_cur)
    desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
    check_prewarmed(method, client, desired)

@@ -191,7 +188,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()
-    pg_cur.execute("create extension neon")
+    pg_cur.execute("create schema neon; create extension neon with schema neon")
    pg_cur.execute("CREATE DATABASE lfc")

    lfc_conn = endpoint.connect(dbname="lfc")
@@ -243,9 +240,9 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
    prewarm_thread.start()

    def prewarmed():
-        assert n_prewarms > 5
+        assert n_prewarms > 3

-    wait_until(prewarmed)
+    wait_until(prewarmed, timeout=40)  # debug builds don't finish in 20s

    running = False
    for t in workload_threads:
@@ -256,7 +253,6 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
    total_balance = lfc_cur.fetchall()[0][0]
    assert total_balance == 0

-    check_pinned_entries(pg_cur)
    if method == PrewarmMethod.POSTGRES:
        return
    desired = {
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING

 from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    DEFAULT_BRANCH_NAME,
    NeonEnv,
@@ -164,3 +165,15 @@ def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder
            {"rel_size_migration": "legacy"},
        )
        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
+
+
+def test_pageserver_get_tenant_visible_size(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 1
+    env = neon_env_builder.init_start()
+    env.create_tenant(shard_count=4)
+    env.create_tenant(shard_count=2)
+
+    json = env.pageserver.http_client().list_tenant_visible_size()
+    log.info(f"{json}")
+    # initial tennat + 2 newly created tenants
+    assert len(json) == 7
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -60,7 +60,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):

    with primary.connect() as primary_conn:
        primary_cur = primary_conn.cursor()
-        primary_cur.execute("create extension neon")
+        primary_cur.execute("create schema neon;create extension neon with schema neon")
        primary_cur.execute(
            "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
        )
@@ -172,7 +172,7 @@ def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")

    with primary.connect() as conn, conn.cursor() as cur:
-        cur.execute("create extension neon")
+        cur.execute("create schema neon;create extension neon with schema neon")
        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
        cur.execute("show neon.safekeepers")
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -72,6 +72,7 @@ num-rational = { version = "0.4", default-features = false, features = ["num-big
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 p256 = { version = "0.13", features = ["jwk"] }
+paracord = { version = "0.1", features = ["serde"] }
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
 rand = { version = "0.8", features = ["small_rng"] }
Author	SHA1	Message	Date
Conrad Ludgate	0e952cee8e	fix deps	2025-07-11 14:46:56 +01:00
Conrad Ludgate	6d7ab67401	fix lints	2025-07-11 14:18:30 +01:00
Conrad Ludgate	3ff8afa32c	update measured properly	2025-07-11 13:59:21 +01:00
Conrad Ludgate	682a54fa9e	replace lasso completely	2025-07-11 13:59:21 +01:00
Conrad Ludgate	d5c17559ce	bump	2025-07-11 13:59:21 +01:00
Conrad Ludgate	e8ccb4a4d1	fmt	2025-07-11 13:58:53 +01:00
Conrad Ludgate	bdd68bb069	bump	2025-07-11 13:58:53 +01:00
Conrad Ludgate	783260b88a	switch to paracord	2025-07-11 13:57:59 +01:00
Vlad Lazar	15f633922a	pageserver: use image consistent LSN for force image layer creation (#12547 ) This is a no-op for the neon deployment * Introduce the concept image consistent lsn: of the largest LSN below which all pages have been redone successfully * Use the image consistent LSN for forced image layer creations * Optionally expose the image consistent LSN via the timeline describe HTTP endpoint * Add a sharded timeline describe endpoint to storcon --------- Co-authored-by: Chen Luo <chen.luo@databricks.com>	2025-07-11 11:39:51 +00:00
Dmitrii Kovalkov	c34d36d8a2	storcon_cli: timeline-safekeeper-migrate and timeline-locate subcommands (#12548 ) ## Problem We have a `safekeeper_migrate` handler, but no subcommand in `storcon_cli`. Same for `/:timeline_id/locate` for identifying current set of safekeepers. - Closes: https://github.com/neondatabase/neon/issues/12395 ## Summary of changes - Add `timeline-safekeeper-migrate` and `timeline-locate` subcommands to `storcon_cli`	2025-07-11 10:49:37 +00:00
Tristan Partin	cec0543b51	Add background to compute migration 0002-alter_roles.sql (#11708 ) On December 8th, 2023, an engineering escalation (INC-110) was opened after it was found that BYPASSRLS was being applied to all roles. PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657 Subsequent commit on main: `ad99fa5f03` NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it isn't easy to know if a Postgres cluster is affected by the issue, we need to keep the migration around for a long time, if not indefinitely, so any cluster can be fixed. Branching is the gift that keeps on giving... Signed-off-by: Tristan Partin <tristan.partin@databricks.com> Signed-off-by: Tristan Partin <tristan.partin@databricks.com>	2025-07-10 22:58:54 +00:00
Erik Grinaker	8aa9540a05	pageserver/page_api: include block number and rel in gRPC `GetPageResponse` (#12542 ) ## Problem With gRPC `GetPageRequest` batches, we'll have non-trivial fragmentation/reassembly logic in several places of the stack (concurrent reads, shard splits, LFC hits, etc). If we included the block numbers with the pages in `GetPageResponse` we could have better verification and observability that the final responses are correct. Touches #11735. Requires #12480. ## Summary of changes Add a `Page` struct with`block_number` for `GetPageResponse`, along with the `RelTag` for completeness, and verify them in the rich gRPC client.	2025-07-10 22:35:14 +00:00
Alex Chi Z.	b91f821e8b	fix(libpagestore): update the default stripe size (#12557 ) ## Problem Part of LKB-379 The pageserver connstrings are updated in the postmaster and then there's a hook to propagate it to the shared memory of all backends. However, the shard stripe doesn't. This would cause problems during shard splits: * the compute has active reads/writes * shard split happens and the cplane applies the new config (pageserver connstring + stripe size) * pageserver connstring will be updated immediately once the postmaster receives the SIGHUP, and it will be copied over the the shared memory of all other backends. * stripe size is a normal GUC and we don't have special handling around that, so if any active backend has ongoing txns the value won't be applied. * now it's possible for backends to issue requests based on the wrong stripe size; what's worse, if a request gets cached in the prefetch buffer, it will get stuck forever. ## Summary of changes To make sure it aligns with the current default in storcon. Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-07-10 21:49:52 +00:00
Erik Grinaker	44ea17b7b2	pageserver/page_api: add attempt to GetPage request ID (#12536 ) ## Problem `GetPageRequest::request_id` is supposed to be a unique ID for a request. It's not, because we may retry the request using the same ID. This causes assertion failures and confusion. Touches #11735. Requires #12480. ## Summary of changes Extend the request ID with a retry attempt, and handle it in the gRPC client and server.	2025-07-10 20:39:42 +00:00
Tristan Partin	1b7339b53e	PG: add max_wal_rate (#12470 ) ## Problem One PG tenant may write too fast and overwhelm the PS. The other tenants sharing the same PSs will get very little bandwidth. We had one experiment that two tenants sharing the same PSs. One tenant runs a large ingestion that delivers hundreds of MB/s while the other only get < 10 MB/s. ## Summary of changes Rate limit how fast PG can generate WALs. The default is -1. We may scale the default value with the CPU count. Need to run some experiments to verify. ## How is this tested? CI. PGBench. No limit first. Then set to 1 MB/s and you can see the tps drop. Then reverted the change and tps increased again. pgbench -i -s 10 -p 55432 -h 127.0.0.1 -U cloud_admin -d postgres pgbench postgres -c 10 -j 10 -T 6000000 -P 1 -b tpcb-like -h 127.0.0.1 -U cloud_admin -p 55432 progress: 33.0 s, 986.0 tps, lat 10.142 ms stddev 3.856 progress: 34.0 s, 973.0 tps, lat 10.299 ms stddev 3.857 progress: 35.0 s, 1004.0 tps, lat 9.939 ms stddev 3.604 progress: 36.0 s, 984.0 tps, lat 10.183 ms stddev 3.713 progress: 37.0 s, 998.0 tps, lat 10.004 ms stddev 3.668 progress: 38.0 s, 648.9 tps, lat 12.947 ms stddev 24.970 progress: 39.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 40.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 41.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 42.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 43.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 44.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 45.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 46.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 47.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 48.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 49.0 s, 347.3 tps, lat 321.560 ms stddev 1805.633 progress: 50.0 s, 346.8 tps, lat 9.898 ms stddev 3.809 progress: 51.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 52.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 53.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 54.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 55.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 56.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 57.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 58.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 59.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 60.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 61.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 62.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 63.0 s, 494.5 tps, lat 276.504 ms stddev 1853.689 progress: 64.0 s, 488.0 tps, lat 20.530 ms stddev 71.981 progress: 65.0 s, 407.8 tps, lat 9.502 ms stddev 3.329 progress: 66.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 67.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 68.0 s, 504.5 tps, lat 71.627 ms stddev 397.733 progress: 69.0 s, 371.0 tps, lat 24.898 ms stddev 29.007 progress: 70.0 s, 541.0 tps, lat 19.684 ms stddev 24.094 progress: 71.0 s, 342.0 tps, lat 29.542 ms stddev 54.935 Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>	2025-07-10 20:34:11 +00:00
Mikhail	3593fe195a	split TerminationPending into two values, keeping ComputeStatus stateless (#12506 ) After https://github.com/neondatabase/neon/pull/12240 we observed issues in our go code as `ComputeStatus` is not stateless, thus doesn't deserialize as string. ``` could not check compute activity: json: cannot unmarshal object into Go struct field ComputeState.status of type computeclient.ComputeStatus ``` - Fix this by splitting this status into two. - Update compute OpenApi spec to reflect changes to `/terminate` in previous PR	2025-07-10 19:28:10 +00:00
Mikhail	c5aaf1ae21	Qualify call to neon extension in compute_ctl's prewarming (#12554 ) https://github.com/neondatabase/cloud/issues/19011 Calls without `neon.` failed on staging. Also fix local tests to work with qualified calls	2025-07-10 18:37:54 +00:00
Alex Chi Z.	13b5e7b26f	fix(compute_ctl): reload config before applying spec (#12551 ) ## Problem If we have catalog update AND a pageserver migration batched in a single spec, we will not be able to apply the spec (running the SQL) because the compute is not attached to the right pageserver and we are not able to read anything if we don't pick up the latest pageserver connstring. This is not a case for now because cplane always schedules shard split / pageserver migrations with `skip_pg_catalog_updates` (I suppose). Context: https://databricks.slack.com/archives/C09254R641L/p1752163559259399?thread_ts=1752160163.141149&cid=C09254R641L With this fix, backpressure will likely not be able to affect reconfigurations. ## Summary of changes Do `pg_reload_conf` before we apply specs in SQL. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-07-10 18:02:54 +00:00
Erik Grinaker	dcdfe80bf0	pagebench: add support for rich gRPC client (#12477 ) ## Problem We need to benchmark the rich gRPC client `client_grpc::PageserverClient` against the basic, no-frills `page_api::Client` to determine how much overhead it adds. Touches #11735. Requires #12476. ## Summary of changes Add a `pagebench --rich-client` parameter to use `client_grpc::PageserverClient`. Also adds a compression parameter to the client.	2025-07-10 17:30:09 +00:00
Alexander Bayandin	8630d37f5e	test_runner: manually reuse ports in PortDistributor (#12423 ) ## Problem Sometimes we run out of free ports in `PortDistributor`. This affects particularly failed tests that we rerun automatically up to 3 times (which makes it use up to 3x more ports) ## Summary of changes - Cycle over the range of ports to reuse freed ports from previous tests Ref: LKB-62	2025-07-10 15:53:38 +00:00
Erik Grinaker	2fc77c836b	pageserver/client_grpc: add shard map updates (#12480 ) ## Problem The communicator gRPC client must support changing the shard map on splits. Touches #11735. Requires #12476. ## Summary of changes * Wrap the shard set in a `ArcSwap` to allow swapping it out. * Add a new `ShardSpec` parameter struct to pass validated shard info to the client. * Add `update_shards()` to change the shard set. In-flight requests are allowed to complete using the old shards. * Restructure `get_page` to use a stable view of the shard map, and retry errors at the top (pre-split) level to pick up shard map changes. * Also marks `tonic::Status::Internal` as non-retryable, so that we can use it for client-side invariant checks without continually retrying these.	2025-07-10 15:46:39 +00:00
HaoyuHuang	2c6b327be6	A few PS changes (#12540 ) # TLDR All changes are no-op except some metrics. ## Summary of changes I ### Pageserver Added a new global counter metric `pageserver_pagestream_handler_results_total` that categorizes pagestream request results according to their outcomes: 1. Success 2. Internal errors 3. Other errors Internal errors include: 1. Page reconstruction error: This probably indicates a pageserver bug/corruption 2. LSN timeout error: Could indicate overload or bugs with PS's ability to reach other components 3. Misrouted request error: Indicates bugs in the Storage Controller/HCC Other errors include transient errors that are expected during normal operation or errors indicating bugs with other parts of the system (e.g., malformed requests, errors due to cancelled operations during PS shutdown, etc.) ## Summary of changes II This PR adds a pageserver endpoint and its counterpart in storage controller to list visible size of all tenant shards. This will be a prerequisite of the tenant rebalance command. ## Problem III We need a way to download WAL segments/layerfiles from S3 and replay WAL records. We cannot access production S3 from our laptops directly, and we also can't transfer any user data out of production systems for GDPR compliance, so we need solutions. ## Summary of changes III This PR adds a couple of tools to support the debugging workflow in production: 1. A new `pagectl download-remote-object` command that can be used to download remote storage objects assuming the correct access is set up. ## Summary of changes IV This PR adds a command to list all visible delta and image layers from index_part. This is useful to debug compaction issues as index_part often contain a lot of covered layers due to PITR. --------- Co-authored-by: William Huang <william.huang@databricks.com> Co-authored-by: Chen Luo <chen.luo@databricks.com> Co-authored-by: Vlad Lazar <vlad@neon.tech>	2025-07-10 14:39:38 +00:00
Alex Chi Z.	be5bbaecad	fix(storcon): correctly handle 404 error in lsn lease (#12537 ) ## Problem close LKB-253 ## Summary of changes 404 for timeline requests could happen when the tenant is intended to be on a pageserver but not attached yet. This patch adds handling for the lease request. In the future, we should extend this handling to more operations. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-07-10 14:28:58 +00:00
Arpad Müller	d33b3c7457	Print viability via custom printing impl (#12544 ) As per https://github.com/neondatabase/neon/pull/12485#issuecomment-3056525882 , we don't want to print the viability error via a debug impl as it prints the backtrace. SafekeeperInfo doesn't have a display impl, so fall back to `Debug` for the `Ok` case. It gives single line output so it's okay to use `Debug` for it. Follow up of https://github.com/neondatabase/neon/pull/12485	2025-07-10 14:03:20 +00:00
Vlad Lazar	ffeede085e	libs: move metric collection for pageserver and safekeeper in a background task (#12525 ) ## Problem Safekeeper and pageserver metrics collection might time out. We've seen this in both hadron and neon. ## Summary of changes This PR moves metrics collection in PS/SK to the background so that we will always get some metrics, despite there may be some delays. Will leave it to the future work to reduce metrics collection time. --------- Co-authored-by: Chen Luo <chen.luo@databricks.com>	2025-07-10 11:58:22 +00:00
Mikhail	bdca5b500b	Fix test_lfc_prewarm: reduce number of prewarms, sleep before LFC offloading (#12515 ) Fixes: - Sleep before LFC offloading in `test_lfc_prewarm[autoprewarm]` to ensure offloaded LFC is the one exported after all writes finish - Reduce number of prewarms and increase timeout in `test_lfc_prewarm_under_workload` as debug builds were failing due to timeout. Additional changes: - Remove `check_pinned_entries`: https://github.com/neondatabase/neon/pull/12447#discussion_r2185946210 - Fix LFC error metrics description: https://github.com/neondatabase/neon/pull/12486#discussion_r2190763107	2025-07-10 11:11:53 +00:00
Erik Grinaker	f4b03ddd7b	pageserver/client_grpc: reap idle pool resources (#12476 ) ## Problem The gRPC client pools don't reap idle resources. Touches #11735. Requires #12475. ## Summary of changes Reap idle pool resources (channels/clients/streams) after 3 minutes of inactivity. Also restructure the `StreamPool` to use a mutex rather than atomics for synchronization, for simplicity. This will be optimized later.	2025-07-10 10:18:37 +00:00
Vlad Lazar	08b19f001c	pageserver: optionally force image layer creation on timeout (#12529 ) This PR introduces a `image_creation_timeout` to page servers so that we can force the image creation after a certain period. This is set to 1 day on dev/staging for now, and will rollout to production 1/2 weeks later. Majority of the PR are boilerplate code to add the new knob. Specific changes of the PR are: 1. During L0 compaction, check if we should force a compaction if min(LSN) of all delta layers < force_image_creation LSN. 2. During image creation, check if we should force a compaction if the image's LSN < force_image_creation LSN and there are newer deltas with overlapping key ranges. 3. Also tweaked the check image creation interval to make sure we honor image_creation_timeout. Vlad's note: This should be a no-op. I added an extra PS config for the large timeline threshold to enable this. --------- Co-authored-by: Chen Luo <chen.luo@databricks.com>	2025-07-10 10:07:21 +00:00
Dimitri Fontaine	1a45b2ec90	Review security model for executing Event Trigger code. (#12463 ) When a function is owned by a superuser (bootstrap user or otherwise), we consider it safe to run it. Only a superuser could have installed it, typically from CREATE EXTENSION script: we trust the code to execute. ## Problem This is intended to solve running pg_graphql Event Triggers graphql_watch_ddl and graphql_watch_drop which are executing the secdef function graphql.increment_schema_version(). ## Summary of changes Allow executing Event Trigger function owned by a superuser and with SECURITY DEFINER properties. The Event Trigger code runs with superuser privileges, and we consider that it's fine. --------- Co-authored-by: Tristan Partin <tristan.partin@databricks.com>	2025-07-10 08:06:33 +00:00
Tristan Partin	13e38a58a1	Grant pg_signal_backend to neon_superuser (#12533 ) Allow neon_superuser to cancel backends from non-neon_superusers, excluding Postgres superusers. Signed-off-by: Tristan Partin <tristan.partin@databricks.com> Co-authored-by: Vikas Jain <vikas.jain@databricks.com>	2025-07-09 21:35:39 +00:00
Christian Schwarz	2edd59aefb	impr(compaction): unify checking of `CompactionError` for cancellation reason (#12392 ) There are a couple of places that call `CompactionError::is_cancel` but don't check the `::Other` variant via downcasting for root cause being cancellation. The only place that does it is `log_compaction_error`. It's sad we have to do it, but, until we get around cleaning up all the culprits, a step forward is to unify the behavior so that all places that inspect a `CompactionError` for cancellation reason follow the same behavior. Thus, this PR ... - moves the downcasting checks against the `::Other` variant from `log_compaction_error` into `is_cancel()` and - enforces via type system that `.is_cancel()` is used to check whether a CompactionError is due to cancellation (matching on the `CompactionError::ShuttingDown` will cause a compile-time error). I don't think there's a _serious_ case right now where matching instead of using `is_cancel` causes problems. The worst case I could find is the circuit breaker and `compaction_failed`, which don't really matter if we're shutting down the timeline anyway. But it's unaesthetic and might cause log/alert noise down the line, so, this PR fixes that at least. Refs - https://databricks.atlassian.net/browse/LKB-182 - slack conversation about this PR: https://databricks.slack.com/archives/C09254R641L/p1751284317955159	2025-07-09 21:15:44 +00:00
				`@@ -0,0 +1 @@`
				`GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;`