diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 3142a36fa0..25b2fc702a 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -31,6 +31,7 @@ config-variables:
   - NEON_PROD_AWS_ACCOUNT_ID
   - PGREGRESS_PG16_PROJECT_ID
   - PGREGRESS_PG17_PROJECT_ID
+  - PREWARM_PGBENCH_SIZE
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_CICD_CHANNEL_ID
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 79371ec704..df80bad579 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -219,6 +219,7 @@ jobs:
           --ignore test_runner/performance/test_cumulative_statistics_persistence.py
           --ignore test_runner/performance/test_perf_many_relations.py
           --ignore test_runner/performance/test_perf_oltp_large_tenant.py
+          --ignore test_runner/performance/test_lfc_prewarm.py
       env:
         BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,6 +411,77 @@ jobs:
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  prewarm-test:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 17
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+    - name: Harden the runner (Audit all outbound calls)
+      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Run prewarm benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_lfc_prewarm.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 5400
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      id: create-allure-report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+      with:
+        store-test-results-into-db: true
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
   generate-matrices:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
diff --git a/.gitmodules b/.gitmodules
index d1330bf28c..e381fb079e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,16 +1,16 @@
 [submodule "vendor/postgres-v14"]
 	path = vendor/postgres-v14
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 	path = vendor/postgres-v15
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_15_STABLE_neon
 [submodule "vendor/postgres-v16"]
 	path = vendor/postgres-v16
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_16_STABLE_neon
 [submodule "vendor/postgres-v17"]
 	path = vendor/postgres-v17
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_17_STABLE_neon
diff --git a/Cargo.lock b/Cargo.lock
index caed814d5f..2f36790d30 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4294,7 +4294,9 @@ dependencies = [
  "humantime-serde",
  "pageserver_api",
  "pageserver_client",
+ "pageserver_client_grpc",
  "pageserver_page_api",
+ "pprof",
  "rand 0.8.5",
  "reqwest",
  "serde",
@@ -4323,6 +4325,7 @@ dependencies = [
  "pageserver_api",
  "postgres_ffi",
  "remote_storage",
+ "serde",
  "serde_json",
  "svg_fmt",
  "thiserror 1.0.69",
@@ -4499,6 +4502,7 @@ name = "pageserver_client_grpc"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "arc-swap",
  "bytes",
  "compute_api",
  "futures",
@@ -4506,6 +4510,7 @@ dependencies = [
  "pageserver_page_api",
  "tokio",
  "tokio-stream",
+ "tokio-util",
  "tonic 0.13.1",
  "tracing",
  "utils",
@@ -5285,6 +5290,7 @@ dependencies = [
  "async-trait",
  "atomic-take",
  "aws-config",
+ "aws-credential-types",
  "aws-sdk-iam",
  "aws-sigv4",
  "base64 0.22.1",
@@ -5324,6 +5330,7 @@ dependencies = [
  "itoa",
  "jose-jwa",
  "jose-jwk",
+ "json",
  "lasso",
  "measured",
  "metrics",
@@ -6987,6 +6994,7 @@ dependencies = [
  "pageserver_api",
  "pageserver_client",
  "reqwest",
+ "safekeeper_api",
  "serde_json",
  "storage_controller_client",
  "tokio",
@@ -7556,6 +7564,7 @@ dependencies = [
  "futures-core",
  "pin-project-lite",
  "tokio",
+ "tokio-util",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 14f2cfcb56..df2064a4a7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -201,7 +201,7 @@ tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
-tokio-stream = "0.1"
+tokio-stream = { version = "0.1", features = ["sync"] }
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
@@ -262,6 +262,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
diff --git a/compute_tools/README.md b/compute_tools/README.md
index 8d84031efc..49f1368f0e 100644
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -46,11 +46,14 @@ stateDiagram-v2
   Configuration --> Failed : Failed to configure the compute
   Configuration --> Running : Compute has been configured
   Empty --> Init : Compute spec is immediately available
-  Empty --> TerminationPending : Requested termination
+  Empty --> TerminationPendingFast : Requested termination
+  Empty --> TerminationPendingImmediate : Requested termination
   Init --> Failed : Failed to start Postgres
   Init --> Running : Started Postgres
-  Running --> TerminationPending : Requested termination
-  TerminationPending --> Terminated : Terminated compute
+  Running --> TerminationPendingFast : Requested termination
+  Running --> TerminationPendingImmediate : Requested termination
+  TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
+  TerminationPendingImmediate --> Terminated : Terminated compute immediately
   Failed --> [*] : Compute exited
   Terminated --> [*] : Compute exited
 ```
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0496d38e67..8f42cf699b 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -956,14 +956,20 @@ impl ComputeNode {
             None
         };
 
-        let mut delay_exit = false;
         let mut state = self.state.lock().unwrap();
         state.terminate_flush_lsn = lsn;
-        if let ComputeStatus::TerminationPending { mode } = state.status {
+
+        let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
+        if state.status == ComputeStatus::TerminationPendingFast
+            || state.status == ComputeStatus::TerminationPendingImmediate
+        {
+            info!(
+                "Changing compute status from {} to {}",
+                state.status,
+                ComputeStatus::Terminated
+            );
             state.status = ComputeStatus::Terminated;
             self.state_changed.notify_all();
-            // we were asked to terminate gracefully, don't exit to avoid restart
-            delay_exit = mode == compute_api::responses::TerminateMode::Fast
         }
         drop(state);
 
@@ -1034,6 +1040,8 @@ impl ComputeNode {
             PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
         };
 
+        self.fix_zenith_signal_neon_signal()?;
+
         let mut state = self.state.lock().unwrap();
         state.metrics.pageserver_connect_micros =
             connected.duration_since(started).as_micros() as u64;
@@ -1043,6 +1051,27 @@ impl ComputeNode {
         Ok(())
     }
 
+    /// Move the Zenith signal file to Neon signal file location.
+    /// This makes Compute compatible with older PageServers that don't yet
+    /// know about the Zenith->Neon rename.
+    fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
+        let datadir = Path::new(&self.params.pgdata);
+
+        let neonsig = datadir.join("neon.signal");
+
+        if neonsig.is_file() {
+            return Ok(());
+        }
+
+        let zenithsig = datadir.join("zenith.signal");
+
+        if zenithsig.is_file() {
+            fs::copy(zenithsig, neonsig)?;
+        }
+
+        Ok(())
+    }
+
     /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
     /// the connection was established, and the (compressed) size of the basebackup.
     fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
@@ -1805,6 +1834,8 @@ impl ComputeNode {
             tls_config,
         )?;
 
+        self.pg_reload_conf()?;
+
         if !spec.skip_pg_catalog_updates {
             let max_concurrent_connections = spec.reconfigure_concurrency;
             // Temporarily reset max_cluster_size in config
@@ -1824,10 +1855,9 @@ impl ComputeNode {
 
                 Ok(())
             })?;
+            self.pg_reload_conf()?;
         }
 
-        self.pg_reload_conf()?;
-
         let unknown_op = "unknown".to_string();
         let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
         info!(
@@ -1900,7 +1930,8 @@ impl ComputeNode {
 
                             // exit loop
                             ComputeStatus::Failed
-                            | ComputeStatus::TerminationPending { .. }
+                            | ComputeStatus::TerminationPendingFast
+                            | ComputeStatus::TerminationPendingImmediate
                             | ComputeStatus::Terminated => break 'cert_update,
 
                             // wait
@@ -2456,7 +2487,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
                 serde_json::to_string(&extensions).expect("failed to serialize extensions list")
             );
         }
-        Err(err) => error!("could not get installed extensions: {err:?}"),
+        Err(err) => error!("could not get installed extensions: {err}"),
     }
     Ok(())
 }
diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs
index 3f6f9a7ecc..d014a5bb72 100644
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -70,7 +70,7 @@ impl ComputeNode {
             }
         };
         let row = match client
-            .query_one("select * from get_prewarm_info()", &[])
+            .query_one("select * from neon.get_prewarm_info()", &[])
             .await
         {
             Ok(row) => row,
@@ -146,7 +146,7 @@ impl ComputeNode {
         ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
             .await
             .context("connecting to postgres")?
-            .query_one("select prewarm_local_cache($1)", &[&uncompressed])
+            .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
             .await
             .context("loading LFC state into postgres")
             .map(|_| ())
@@ -196,7 +196,7 @@ impl ComputeNode {
         ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
             .await
             .context("connecting to postgres")?
-            .query_one("select get_local_cache_state()", &[])
+            .query_one("select neon.get_local_cache_state()", &[])
             .await
             .context("querying LFC state")?
             .try_get::<usize, &[u8]>(0)
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 3c58b284b3..93a357e160 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -371,9 +371,28 @@ paths:
       summary: Terminate Postgres and wait for it to exit
       description: ""
       operationId: terminate
+      parameters:
+        - name: mode
+          in: query
+          description: "Terminate mode: fast (wait 30s before returning) and immediate"
+          required: false
+          schema:
+            type: string
+            enum: ["fast", "immediate"]
+            default: fast
       responses:
         200:
           description: Result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
+        201:
+          description: Result if compute is already terminated
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
         412:
           description: "wrong state"
           content:
@@ -530,11 +549,14 @@ components:
       type: string
       enum:
         - empty
-        - init
-        - failed
-        - running
         - configuration_pending
+        - init
+        - running
         - configuration
+        - failed
+        - termination_pending_fast
+        - termination_pending_immediate
+        - terminated
       example: running
 
     ExtensionInstallRequest:
@@ -660,6 +682,17 @@ components:
           description: Role name.
           example: "neon"
 
+    TerminateResponse:
+      type: object
+      required:
+        - lsn
+      properties:
+        lsn:
+          type: string
+          nullable: true
+          description: "last WAL flush LSN"
+          example: "0/028F10D8"
+
     SetRoleGrantsResponse:
       type: object
       required:
diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs
index 32d90a5990..5b30b020c8 100644
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
 use axum::extract::State;
 use axum::response::Response;
 use axum_extra::extract::OptionalQuery;
-use compute_api::responses::{ComputeStatus, TerminateResponse};
+use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
 use http::StatusCode;
 use serde::Deserialize;
 use std::sync::Arc;
@@ -12,7 +12,7 @@ use tracing::info;
 
 #[derive(Deserialize, Default)]
 pub struct TerminateQuery {
-    mode: compute_api::responses::TerminateMode,
+    mode: TerminateMode,
 }
 
 /// Terminate the compute.
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
     {
         let mut state = compute.state.lock().unwrap();
         if state.status == ComputeStatus::Terminated {
-            return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
+            let response = TerminateResponse {
+                lsn: state.terminate_flush_lsn,
+            };
+            return JsonResponse::success(StatusCode::CREATED, response);
         }
 
         if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
             return JsonResponse::invalid_status(state.status);
         }
-        state.set_status(
-            ComputeStatus::TerminationPending { mode },
-            &compute.state_changed,
-        );
+        state.set_status(mode.into(), &compute.state_changed);
     }
 
     forward_termination_signal(false);
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index 411e03b7ec..90e1a17be4 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 
 use anyhow::Result;
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
+use tokio_postgres::error::Error as PostgresError;
 use tokio_postgres::{Client, Config, NoTls};
 
 use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
 /// and to make database listing query here more explicit.
 ///
 /// Limit the number of databases to 500 to avoid excessive load.
-async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
+async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
     // `pg_database.datconnlimit = -2` means that the database is in the
     // invalid state
     let databases = client
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Same extension can be installed in multiple databases with different versions,
 /// so we report a separate metric (number of databases where it is installed)
 /// for each extension version.
-pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
+pub async fn get_installed_extensions(
+    mut conf: Config,
+) -> Result<InstalledExtensions, PostgresError> {
     conf.application_name("compute_ctl:get_installed_extensions");
     let databases: Vec<String> = {
         let (mut client, connection) = conf.connect(NoTls).await?;
diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs
index 91dedbb42a..6e4df73c0f 100644
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "compute_ctl_lfc_prewarm_errors_total",
-        "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option",
+        "Total number of LFC prewarm errors",
     )
     .expect("failed to define a metric")
 });
@@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "compute_ctl_lfc_offload_errors_total",
-        "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option",
+        "Total number of LFC offload errors",
     )
     .expect("failed to define a metric")
 });
diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql
index 6cb49f873f..8fc371eb8f 100644
--- a/compute_tools/src/migrations/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/0002-alter_roles.sql
@@ -1,3 +1,16 @@
+-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
+-- it was found that BYPASSRLS was being applied to all roles.
+--
+-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
+-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
+--
+-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
+-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
+-- keep the migration around for a long time, if not indefinitely, so any
+-- cluster can be fixed.
+--
+-- Branching is the gift that keeps on giving...
+
 DO $$
 DECLARE
     role_name text;
diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
new file mode 100644
index 0000000000..36e31544be
--- /dev/null
+++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
index deb7a364af..3464a2b1cf 100644
--- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
@@ -6,14 +6,18 @@ BEGIN
             admin_option AS admin
         INTO monitor
         FROM pg_auth_members
-        WHERE roleid = 'neon_superuser'::regrole
-            AND member = 'pg_monitor'::regrole;
+        WHERE roleid = 'pg_monitor'::regrole
+            AND member = 'neon_superuser'::regrole;
 
-    IF NOT monitor.member THEN
+    IF monitor IS NULL THEN
+        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
+    END IF;
+
+    IF monitor.admin IS NULL OR NOT monitor.member THEN
         RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
     END IF;
 
-    IF NOT monitor.admin THEN
+    IF monitor.admin IS NULL OR NOT monitor.admin THEN
         RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
     END IF;
 END $$;
diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
new file mode 100644
index 0000000000..e62b742d30
--- /dev/null
+++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1,23 @@
+DO $$
+DECLARE
+    signal_backend record;
+BEGIN
+    SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
+            admin_option AS admin
+        INTO signal_backend
+        FROM pg_auth_members
+        WHERE roleid = 'pg_signal_backend'::regrole
+            AND member = 'neon_superuser'::regrole;
+
+    IF signal_backend IS NULL THEN
+        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
+    END IF;
+
+    IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
+        RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
+    END IF;
+
+    IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
+        RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
+    END IF;
+END $$;
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 8a2f6addad..fa01545856 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -84,7 +84,8 @@ impl ComputeMonitor {
         if matches!(
             compute_status,
             ComputeStatus::Terminated
-                | ComputeStatus::TerminationPending { .. }
+                | ComputeStatus::TerminationPendingFast
+                | ComputeStatus::TerminationPendingImmediate
                 | ComputeStatus::Failed
         ) {
             info!(
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 43cfbb48f7..b6382b2f56 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
         include_str!(
             "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
         ),
+        include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
     ];
 
     MigrationRunner::new(client, &migrations)
diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs
index f43f459636..988b08e875 100644
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -36,7 +36,7 @@ impl StorageBroker {
     pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
         let broker = &self.env.broker;
 
-        print!("Starting neon broker at {}", broker.client_url());
+        println!("Starting neon broker at {}", broker.client_url());
 
         let mut args = Vec::new();
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 74ab15dc97..91a62b0ca4 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -32,7 +32,8 @@
 //!     config.json                 - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
-//!         zenith.signal
+//!         neon.signal
+//!         zenith.signal         - copy of neon.signal, for backward compatibility
 //!         <other PostgreSQL files>
 //! ```
 //!
@@ -922,7 +923,8 @@ impl Endpoint {
                         ComputeStatus::Empty
                         | ComputeStatus::ConfigurationPending
                         | ComputeStatus::Configuration
-                        | ComputeStatus::TerminationPending { .. }
+                        | ComputeStatus::TerminationPendingFast
+                        | ComputeStatus::TerminationPendingImmediate
                         | ComputeStatus::Terminated => {
                             bail!("unexpected compute status: {:?}", state.status)
                         }
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index d0611113e8..d34dd39f61 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
     pub posthog_config: Option<PostHogConfig>,
 
     pub kick_secondary_downloads: Option<bool>,
+
+    #[serde(with = "humantime_serde")]
+    pub shard_split_request_timeout: Option<Duration>,
 }
 
 impl NeonStorageControllerConf {
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
             timeline_safekeeper_count: None,
             posthog_config: None,
             kick_secondary_downloads: None,
+            shard_split_request_timeout: None,
         }
     }
 }
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 3f66960edd..843ead807d 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -303,7 +303,7 @@ impl PageServerNode {
     async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
-        print!(
+        println!(
             "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
             self.conf.id,
             self.pg_connection_config.raw_address(),
@@ -452,6 +452,12 @@ impl PageServerNode {
                 .map(|x| x.parse::<usize>())
                 .transpose()
                 .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+            // HADRON
+            image_layer_force_creation_period: settings
+                .remove("image_layer_force_creation_period")
+                .map(humantime::parse_duration)
+                .transpose()
+                .context("Failed to parse 'image_layer_force_creation_period' as duration")?,
             image_layer_creation_check_threshold: settings
                 .remove("image_layer_creation_check_threshold")
                 .map(|x| x.parse::<u8>())
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index da9dafd8e9..2ba2f3ebe4 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -127,7 +127,7 @@ impl SafekeeperNode {
         extra_opts: &[String],
         retry_timeout: &Duration,
     ) -> anyhow::Result<()> {
-        print!(
+        println!(
             "Starting safekeeper at '{}' in '{}', retrying for {:?}",
             self.pg_connection_config.raw_address(),
             self.datadir_path().display(),
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index bb83a6319c..f996f39967 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -648,6 +648,13 @@ impl StorageController {
             args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
         }
 
+        if let Some(duration) = self.config.shard_split_request_timeout {
+            args.push(format!(
+                "--shard-split-request-timeout={}",
+                humantime::Duration::from(duration)
+            ));
+        }
+
         let mut envs = vec![
             ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
             ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
@@ -660,7 +667,7 @@ impl StorageController {
             ));
         }
 
-        println!("Starting storage controller");
+        println!("Starting storage controller at {scheme}://{host}:{listen_port}");
 
         background_process::start_process(
             COMMAND,
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index ce89116691..61d48b2469 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -14,6 +14,7 @@ humantime.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
+safekeeper_api.workspace=true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
 tokio.workspace = true
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 701c4b3b2e..fcc5549beb 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
     PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
     ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
     SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
 };
 use pageserver_api::models::{
     EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
@@ -21,6 +21,7 @@ use pageserver_api::models::{
 use pageserver_api::shard::{ShardStripeSize, TenantShardId};
 use pageserver_client::mgmt_api::{self};
 use reqwest::{Certificate, Method, StatusCode, Url};
+use safekeeper_api::models::TimelineLocateResponse;
 use storage_controller_client::control_api::Client;
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -279,6 +280,23 @@ enum Command {
         #[arg(long)]
         concurrency: Option<usize>,
     },
+    /// Locate safekeepers for a timeline from the storcon DB.
+    TimelineLocate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+    },
+    /// Migrate a timeline to a new set of safekeepers
+    TimelineSafekeeperMigrate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+        /// Example: --new-sk-set 1,2,3
+        #[arg(long, required = true, value_delimiter = ',')]
+        new_sk_set: Vec<NodeId>,
+    },
 }
 
 #[derive(Parser)]
@@ -458,6 +476,7 @@ async fn main() -> anyhow::Result<()> {
                         listen_http_port,
                         listen_https_port,
                         availability_zone_id: AvailabilityZone(availability_zone_id),
+                        node_ip_addr: None,
                     }),
                 )
                 .await?;
@@ -1324,7 +1343,7 @@ async fn main() -> anyhow::Result<()> {
             concurrency,
         } => {
             let mut path = format!(
-                "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
+                "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
             );
 
             if let Some(c) = concurrency {
@@ -1335,6 +1354,41 @@ async fn main() -> anyhow::Result<()> {
                 .dispatch::<(), ()>(Method::POST, path, None)
                 .await?;
         }
+        Command::TimelineLocate {
+            tenant_id,
+            timeline_id,
+        } => {
+            let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
+
+            let resp = storcon_client
+                .dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
+                .await?;
+
+            let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
+            let new_sk_set = resp
+                .new_sk_set
+                .as_ref()
+                .map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
+
+            println!("generation = {}", resp.generation);
+            println!("sk_set = {sk_set:?}");
+            println!("new_sk_set = {new_sk_set:?}");
+        }
+        Command::TimelineSafekeeperMigrate {
+            tenant_id,
+            timeline_id,
+            new_sk_set,
+        } => {
+            let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
+
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::POST,
+                    path,
+                    Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
+                )
+                .await?;
+        }
     }
 
     Ok(())
diff --git a/docs/core_changes.md b/docs/core_changes.md
index 1388317728..abfd20af26 100644
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
 changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
 from WAL.
 
-This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
-at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
-checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
+This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup 
+code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
+instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without 
+any WAL redo.
 
 
 ### How to get rid of the patch
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index e10c381fb4..2fe233214a 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -121,6 +121,15 @@ pub enum TerminateMode {
     Immediate,
 }
 
+impl From<TerminateMode> for ComputeStatus {
+    fn from(mode: TerminateMode) -> Self {
+        match mode {
+            TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
+            TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
+        }
+    }
+}
+
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -141,7 +150,9 @@ pub enum ComputeStatus {
     // control-plane to terminate it.
     Failed,
     // Termination requested
-    TerminationPending { mode: TerminateMode },
+    TerminationPendingFast,
+    // Termination requested, without waiting 30s before returning from /terminate
+    TerminationPendingImmediate,
     // Terminated Postgres
     Terminated,
 }
@@ -160,7 +171,10 @@ impl Display for ComputeStatus {
             ComputeStatus::Running => f.write_str("running"),
             ComputeStatus::Configuration => f.write_str("configuration"),
             ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
+            ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
+            ComputeStatus::TerminationPendingImmediate => {
+                f.write_str("termination-pending-immediate")
+            }
             ComputeStatus::Terminated => f.write_str("terminated"),
         }
     }
diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs
index f32ced1180..a61bf8e08a 100644
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{Instrument, debug, info, info_span, warn};
 use utils::auth::{AuthError, Claims, SwappableJwtAuth};
+use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};
 
 use crate::error::{ApiError, api_error_handler, route_error_handler};
 use crate::request::{get_query_param, parse_query_param};
@@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter {
     }
 }
 
-pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(
+    req: Request<Body>,
+    force_metric_collection_on_scrape: bool,
+) -> Result<Response<Body>, ApiError> {
     SERVE_METRICS_COUNT.inc();
 
+    // HADRON
+    let requested_use_latest = parse_query_param(&req, "use_latest")?;
+
+    let use_latest = match requested_use_latest {
+        None => force_metric_collection_on_scrape,
+        Some(true) => true,
+        Some(false) => {
+            if force_metric_collection_on_scrape {
+                // We don't cache in this case
+                true
+            } else {
+                false
+            }
+        }
+    };
+
     let started_at = std::time::Instant::now();
 
     let (tx, rx) = mpsc::channel(1);
@@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
 
         let _span = span.entered();
 
-        let metrics = metrics::gather();
+        // HADRON
+        let collected = if use_latest {
+            // Skip caching the results if we always force metric collection on scrape.
+            METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
+        } else {
+            METRICS_COLLECTOR.last_collected()
+        };
 
         let gathered_at = std::time::Instant::now();
 
         let res = encoder
-            .encode(&metrics, &mut writer)
+            .encode(&collected.metrics, &mut writer)
             .and_then(|_| writer.flush().map_err(|e| e.into()));
 
         // this instant is not when we finally got the full response sent, sending is done by hyper
@@ -295,6 +321,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
         let encoded_in = encoded_at - gathered_at - writer.wait_time();
         let total = encoded_at - started_at;
 
+        // HADRON
+        let staleness_ms = (encoded_at - collected.collected_at).as_millis();
+        METRICS_STALE_MILLIS.set(staleness_ms as i64);
+
         match res {
             Ok(()) => {
                 tracing::info!(
@@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
                     spawning_ms = spawned_in.as_millis(),
                     collection_ms = collected_in.as_millis(),
                     encoding_ms = encoded_in.as_millis(),
+                    stalenss_ms = staleness_ms,
                     "responded /metrics"
                 );
             }
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index dc7e9aed7f..f01c65d1bd 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -226,6 +226,7 @@ pub struct ConfigToml {
     pub synthetic_size_calculation_interval: Duration,
     pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
     pub test_remote_failures: u64,
+    pub test_remote_failures_probability: u64,
     pub ondemand_download_behavior_treat_error_as_warn: bool,
     #[serde(with = "humantime_serde")]
     pub background_task_maximum_delay: Duration,
@@ -271,6 +272,9 @@ pub struct ConfigToml {
     pub timeline_import_config: TimelineImportConfig,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub basebackup_cache_config: Option<BasebackupCacheConfig>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
+    pub force_metric_collection_on_scrape: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -560,6 +564,11 @@ pub struct TenantConfigToml {
     pub gc_period: Duration,
     // Delta layer churn threshold to create L1 image layers.
     pub image_creation_threshold: usize,
+    // HADRON
+    // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
+    // (2) create image layers if there are any L1 deltas.
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
     // Determines how much history is retained, to allow
     // branching and read replicas at an older point in time.
     // The unit is time.
@@ -758,6 +767,7 @@ impl Default for ConfigToml {
             disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
 
             test_remote_failures: (0),
+            test_remote_failures_probability: (100),
 
             ondemand_download_behavior_treat_error_as_warn: (false),
 
@@ -821,6 +831,8 @@ impl Default for ConfigToml {
             },
             basebackup_cache_config: None,
             posthog_config: None,
+            image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
+            force_metric_collection_on_scrape: true,
         }
     }
 }
@@ -914,6 +926,7 @@ impl Default for TenantConfigToml {
             gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                 .expect("cannot parse default gc period"),
             image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
+            image_layer_force_creation_period: None,
             pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
                 .expect("cannot parse default PITR interval"),
             walreceiver_connect_timeout: humantime::parse_duration(
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a8c7083b17..8f86b03f72 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,5 +1,6 @@
 use std::collections::{HashMap, HashSet};
 use std::fmt::Display;
+use std::net::IpAddr;
 use std::str::FromStr;
 use std::time::{Duration, Instant};
 
@@ -10,7 +11,7 @@ use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 
-use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
+use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
 use crate::shard::{ShardStripeSize, TenantShardId};
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
     pub listen_https_port: Option<u16>,
 
     pub availability_zone_id: AvailabilityZone,
+
+    // Reachable IP address of the PS/SK registering, if known.
+    // Hadron Cluster Coordiantor will update the DNS record of the registering node
+    // with this IP address.
+    pub node_ip_addr: Option<IpAddr>,
 }
 
 #[derive(Serialize, Deserialize)]
@@ -126,6 +132,13 @@ pub struct TenantDescribeResponse {
     pub config: TenantConfig,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantTimelineDescribeResponse {
+    pub shards: Vec<TimelineInfo>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct NodeShardResponse {
     pub node_id: NodeId,
@@ -538,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
     pub scheduling_policy: SkSchedulingPolicy,
 }
 
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineSafekeeperPeer {
+    pub node_id: NodeId,
+    pub listen_http_addr: String,
+    pub http_port: i32,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimeline {
+    // SC does not know the tenant id.
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimelinesResponse {
+    pub timelines: Vec<SCSafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimeline {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimelinesResponse {
+    pub timelines: Vec<SafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
 #[derive(Serialize, Deserialize, Clone)]
 pub struct SafekeeperSchedulingPolicyRequest {
     pub scheduling_policy: SkSchedulingPolicy,
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 6735320484..11e02a8550 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -597,6 +597,9 @@ pub struct TenantConfigPatch {
     pub gc_period: FieldPatch<String>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub image_creation_threshold: FieldPatch<usize>,
+    // HADRON
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_force_creation_period: FieldPatch<String>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub pitr_interval: FieldPatch<String>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
@@ -700,6 +703,11 @@ pub struct TenantConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub image_creation_threshold: Option<usize>,
 
+    // HADRON
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
+
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(with = "humantime_serde")]
     pub pitr_interval: Option<Duration>,
@@ -798,6 +806,7 @@ impl TenantConfig {
             mut gc_horizon,
             mut gc_period,
             mut image_creation_threshold,
+            mut image_layer_force_creation_period,
             mut pitr_interval,
             mut walreceiver_connect_timeout,
             mut lagging_wal_timeout,
@@ -861,6 +870,11 @@ impl TenantConfig {
         patch
             .image_creation_threshold
             .apply(&mut image_creation_threshold);
+        // HADRON
+        patch
+            .image_layer_force_creation_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut image_layer_force_creation_period);
         patch
             .pitr_interval
             .map(|v| humantime::parse_duration(&v))?
@@ -942,6 +956,7 @@ impl TenantConfig {
             gc_horizon,
             gc_period,
             image_creation_threshold,
+            image_layer_force_creation_period,
             pitr_interval,
             walreceiver_connect_timeout,
             lagging_wal_timeout,
@@ -1016,6 +1031,9 @@ impl TenantConfig {
             image_creation_threshold: self
                 .image_creation_threshold
                 .unwrap_or(global_conf.image_creation_threshold),
+            image_layer_force_creation_period: self
+                .image_layer_force_creation_period
+                .or(global_conf.image_layer_force_creation_period),
             pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
             walreceiver_connect_timeout: self
                 .walreceiver_connect_timeout
@@ -1604,6 +1622,9 @@ pub struct TimelineInfo {
 
     /// Whether the timeline is invisible in synthetic size calculations.
     pub is_invisible: Option<bool>,
+    // HADRON: the largest LSN below which all page updates have been included in the image layers.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 69316fd493..0ae13552b8 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -43,6 +43,7 @@ itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }
 
 byteorder = "1.4"
+rand = "0.8.5"
 
 [dev-dependencies]
 camino-tempfile.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index ed416b2811..5885c3e791 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
         })
     }
 
-    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
-        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    /* BEGIN_HADRON */
+    pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(
+            s,
+            fail_first,
+            fail_probability,
+        )))
     }
+    /* END_HADRON */
 
     /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
     pub async fn upload_storage_object(
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index f9856a5856..e895380192 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use rand::Rng;
+use std::cmp;
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::num::NonZeroU32;
@@ -25,6 +27,13 @@ pub struct UnreliableWrapper {
 
     // Tracks how many failed attempts of each operation has been made.
     attempts: Mutex<HashMap<RemoteOp, u64>>,
+
+    /* BEGIN_HADRON */
+    // This the probability of failure for each operation, ranged from [0, 100].
+    // The probability is default to 100, which means that all operations will fail.
+    // Storage will fail by probability up to attempts_to_fail times.
+    attempt_failure_probability: u64,
+    /* END_HADRON */
 }
 
 /// Used to identify retries of different unique operation.
@@ -40,7 +49,11 @@ enum RemoteOp {
 }
 
 impl UnreliableWrapper {
-    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
+    pub fn new(
+        inner: crate::GenericRemoteStorage,
+        attempts_to_fail: u64,
+        attempt_failure_probability: u64,
+    ) -> Self {
         assert!(attempts_to_fail > 0);
         let inner = match inner {
             GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
@@ -51,9 +64,11 @@ impl UnreliableWrapper {
                 panic!("Can't wrap unreliable wrapper unreliably")
             }
         };
+        let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
         UnreliableWrapper {
             inner,
             attempts_to_fail,
+            attempt_failure_probability: actual_attempt_failure_probability,
             attempts: Mutex::new(HashMap::new()),
         }
     }
@@ -66,6 +81,7 @@ impl UnreliableWrapper {
     ///
     fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
         let mut attempts = self.attempts.lock().unwrap();
+        let mut rng = rand::thread_rng();
 
         match attempts.entry(op) {
             Entry::Occupied(mut e) => {
@@ -75,15 +91,19 @@ impl UnreliableWrapper {
                     *p
                 };
 
-                if attempts_before_this >= self.attempts_to_fail {
-                    // let it succeed
-                    e.remove();
-                    Ok(attempts_before_this)
-                } else {
+                /* BEGIN_HADRON */
+                // If there are more attempts to fail, fail the request by probability.
+                if (attempts_before_this < self.attempts_to_fail)
+                    && (rng.gen_range(0..=100) < self.attempt_failure_probability)
+                {
                     let error =
                         anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                     Err(error)
+                } else {
+                    e.remove();
+                    Ok(attempts_before_this)
                 }
+                /* END_HADRON */
             }
             Entry::Vacant(e) => {
                 let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index e87232474b..59e112654b 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use crate::membership::Configuration;
+use crate::membership::{Configuration, SafekeeperGeneration};
 use crate::{ServerInfo, Term};
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -311,3 +311,12 @@ pub struct PullTimelineResponse {
     pub safekeeper_host: Option<String>,
     // TODO: add more fields?
 }
+
+/// Response to a timeline locate request.
+/// Storcon-only API.
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineLocateResponse {
+    pub generation: SafekeeperGeneration,
+    pub sk_set: Vec<NodeId>,
+    pub new_sk_set: Option<Vec<NodeId>>,
+}
diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs
index 2a85f54a01..0b3b5e6c4f 100644
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -44,3 +44,63 @@ where
         }
     }
 }
+
+/* BEGIN_HADRON */
+pub enum DeploymentMode {
+    Local,
+    Dev,
+    Staging,
+    Prod,
+}
+
+pub fn get_deployment_mode() -> Option<DeploymentMode> {
+    match std::env::var("DEPLOYMENT_MODE") {
+        Ok(env) => match env.as_str() {
+            "development" => Some(DeploymentMode::Dev),
+            "staging" => Some(DeploymentMode::Staging),
+            "production" => Some(DeploymentMode::Prod),
+            _ => {
+                tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            // tracing::error!("DEPLOYMENT_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_dev_or_staging() -> bool {
+    matches!(
+        get_deployment_mode(),
+        Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
+    )
+}
+
+pub enum TestingMode {
+    Chaos,
+    Stress,
+}
+
+pub fn get_test_mode() -> Option<TestingMode> {
+    match std::env::var("HADRON_TEST_MODE") {
+        Ok(env) => match env.as_str() {
+            "chaos" => Some(TestingMode::Chaos),
+            "stress" => Some(TestingMode::Stress),
+            _ => {
+                tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            tracing::error!("HADRON_TEST_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_chaos_testing() -> bool {
+    matches!(get_test_mode(), Some(TestingMode::Chaos))
+}
+/* END_HADRON */
diff --git a/libs/utils/src/ip_address.rs b/libs/utils/src/ip_address.rs
new file mode 100644
index 0000000000..d0834d0ba5
--- /dev/null
+++ b/libs/utils/src/ip_address.rs
@@ -0,0 +1,73 @@
+use std::env::{VarError, var};
+use std::error::Error;
+use std::net::IpAddr;
+use std::str::FromStr;
+
+/// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this
+/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration.
+/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod
+/// template).
+pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS";
+
+/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS.
+/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template).
+pub fn read_node_ip_addr_from_env() -> Result<Option<IpAddr>, Box<dyn Error>> {
+    match var(HADRON_NODE_IP_ADDRESS) {
+        Ok(v) => {
+            if let Ok(addr) = IpAddr::from_str(&v) {
+                Ok(Some(addr))
+            } else {
+                Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into())
+            }
+        }
+        Err(VarError::NotPresent) => Ok(None),
+        Err(e) => Err(e.into()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::env;
+    use std::net::{Ipv4Addr, Ipv6Addr};
+
+    #[test]
+    fn test_read_node_ip_addr_from_env() {
+        // SAFETY: test code
+        unsafe {
+            // Test with a valid IPv4 address
+            env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1");
+            let result = read_node_ip_addr_from_env().unwrap();
+            assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))));
+
+            // Test with a valid IPv6 address
+            env::set_var(
+                HADRON_NODE_IP_ADDRESS,
+                "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
+            );
+        }
+        let result = read_node_ip_addr_from_env().unwrap();
+        assert_eq!(
+            result,
+            Some(IpAddr::V6(
+                Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap()
+            ))
+        );
+
+        // Test with an invalid IP address
+        // SAFETY: test code
+        unsafe {
+            env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip");
+        }
+        let result = read_node_ip_addr_from_env();
+        assert!(result.is_err());
+
+        // Test with no environment variable set
+        // SAFETY: test code
+        unsafe {
+            env::remove_var(HADRON_NODE_IP_ADDRESS);
+        }
+        let result = read_node_ip_addr_from_env().unwrap();
+        assert_eq!(result, None);
+    }
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 11f787562c..69771be5dc 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -26,6 +26,9 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
 
+// utility functions to obtain reachable IP addresses in PS/SK nodes.
+pub mod ip_address;
+
 pub mod shard;
 
 mod hex;
@@ -99,6 +102,8 @@ pub mod elapsed_accum;
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;
 
+pub mod metrics_collector;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 5828a400a0..d67c0f123b 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,4 +1,5 @@
 use std::future::Future;
+use std::pin::Pin;
 use std::str::FromStr;
 use std::time::Duration;
 
@@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};
 use tokio::time::Instant;
-use tracing::info;
+use tracing::{info, warn};
 
 /// Logs a critical error, similarly to `tracing::error!`. This will:
 ///
@@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString {
 ///
 /// TODO: consider upgrading this to a warning, but currently it fires too often.
 #[inline]
-pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
-where
-    F: Future<Output = O>,
-{
+pub async fn log_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
     monitor_slow_future(
         threshold,
         threshold, // period = threshold
@@ -394,16 +396,42 @@ where
             if !is_slow {
                 return;
             }
+            let elapsed = elapsed_total.as_secs_f64();
             if ready {
-                info!(
-                    "slow {name} completed after {:.3}s",
-                    elapsed_total.as_secs_f64()
-                );
+                info!("slow {name} completed after {elapsed:.3}s");
             } else {
-                info!(
-                    "slow {name} still running after {:.3}s",
-                    elapsed_total.as_secs_f64()
-                );
+                info!("slow {name} still running after {elapsed:.3}s");
+            }
+        },
+    )
+    .await
+}
+
+/// Logs a periodic warning if a future is slow to complete.
+#[inline]
+pub async fn warn_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
+    monitor_slow_future(
+        threshold,
+        threshold, // period = threshold
+        f,
+        |MonitorSlowFutureCallback {
+             ready,
+             is_slow,
+             elapsed_total,
+             elapsed_since_last_callback: _,
+         }| {
+            if !is_slow {
+                return;
+            }
+            let elapsed = elapsed_total.as_secs_f64();
+            if ready {
+                warn!("slow {name} completed after {elapsed:.3}s");
+            } else {
+                warn!("slow {name} still running after {elapsed:.3}s");
             }
         },
     )
@@ -416,7 +444,7 @@ where
 pub async fn monitor_slow_future<F, O>(
     threshold: Duration,
     period: Duration,
-    mut fut: std::pin::Pin<&mut F>,
+    mut fut: Pin<&mut F>,
     mut cb: impl FnMut(MonitorSlowFutureCallback),
 ) -> O
 where
diff --git a/libs/utils/src/metrics_collector.rs b/libs/utils/src/metrics_collector.rs
new file mode 100644
index 0000000000..9e57fcd643
--- /dev/null
+++ b/libs/utils/src/metrics_collector.rs
@@ -0,0 +1,75 @@
+use std::{
+    sync::{Arc, RwLock},
+    time::{Duration, Instant},
+};
+
+use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
+use once_cell::sync::Lazy;
+
+pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "metrics_metrics_stale_milliseconds",
+        "The current metrics stale time in milliseconds"
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(Debug)]
+pub struct CollectedMetrics {
+    pub metrics: Vec<MetricFamily>,
+    pub collected_at: Instant,
+}
+
+impl CollectedMetrics {
+    fn new(metrics: Vec<MetricFamily>) -> Self {
+        Self {
+            metrics,
+            collected_at: Instant::now(),
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct MetricsCollector {
+    last_collected: RwLock<Arc<CollectedMetrics>>,
+}
+
+impl MetricsCollector {
+    pub fn new() -> Self {
+        Self {
+            last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
+        }
+    }
+
+    #[tracing::instrument(name = "metrics_collector", skip_all)]
+    pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
+        let started = Instant::now();
+        let metrics = metrics::gather();
+        let collected = Arc::new(CollectedMetrics::new(metrics));
+        if cache_metrics {
+            let mut guard = self.last_collected.write().unwrap();
+            *guard = collected.clone();
+        }
+        tracing::info!(
+            "Collected {} metric families in {} ms",
+            collected.metrics.len(),
+            started.elapsed().as_millis()
+        );
+        collected
+    }
+
+    pub fn last_collected(&self) -> Arc<CollectedMetrics> {
+        self.last_collected.read().unwrap().clone()
+    }
+}
+
+impl Default for MetricsCollector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+// Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent
+pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
+
+pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 7c6abf252e..5f856a44d4 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
         shard_number: 0,
     };
 
+    let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
+        should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
+        sent_bytes: 0,
+        last_recorded_time_us: 0,
+    };
+
     crate::bindings::WalproposerShmemState {
         propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
         donor_name: [0; 64],
@@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
         num_shards: 0,
         replica_promote: false,
         min_ps_feedback: empty_feedback,
+        wal_rate_limiter: empty_wal_rate_limiter,
     }
 }
 
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index af4be23b9b..fe1ddc2e7d 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::error::Error as _;
 use std::time::Duration;
 
@@ -251,6 +251,70 @@ impl Client {
         Ok(())
     }
 
+    pub async fn tenant_timeline_compact(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        force_image_layer_creation: bool,
+        must_force_image_layer_creation: bool,
+        scheduled: bool,
+        wait_until_done: bool,
+    ) -> Result<()> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+
+        if force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("force_image_layer_creation", "true");
+        }
+
+        if must_force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("must_force_image_layer_creation", "true");
+        }
+
+        if scheduled {
+            path.query_pairs_mut().append_pair("scheduled", "true");
+        }
+        if wait_until_done {
+            path.query_pairs_mut()
+                .append_pair("wait_until_scheduled_compaction_done", "true");
+            path.query_pairs_mut()
+                .append_pair("wait_until_uploaded", "true");
+        }
+        self.request(Method::PUT, path, ()).await?;
+        Ok(())
+    }
+
+    /* BEGIN_HADRON */
+    pub async fn tenant_timeline_describe(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Result<TimelineInfo> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+        path.query_pairs_mut()
+            .append_pair("include-image-consistent-lsn", "true");
+
+        let response: reqwest::Response = self.request(Method::GET, path, ()).await?;
+        let body = response.json().await.map_err(Error::ReceiveBody)?;
+        Ok(body)
+    }
+
+    pub async fn list_tenant_visible_size(&self) -> Result<BTreeMap<TenantShardId, u64>> {
+        let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint);
+        let resp = self.get(&uri).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+    /* END_HADRON */
+
     pub async fn tenant_scan_remote_storage(
         &self,
         tenant_id: TenantId,
diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml
index 84e27abb84..e2741ad839 100644
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -4,8 +4,12 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+testing = ["pageserver_api/testing"]
+
 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 bytes.workspace = true
 compute_api.workspace = true
 futures.workspace = true
@@ -13,6 +17,7 @@ pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
+tokio-util.workspace = true
 tonic.workspace = true
 tracing.workspace = true
 utils.workspace = true
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 63852868c3..3a9edc7092 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -1,11 +1,16 @@
 use std::collections::HashMap;
 use std::num::NonZero;
+use std::pin::pin;
 use std::sync::Arc;
+use std::time::{Duration, Instant};
 
 use anyhow::anyhow;
+use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
-use tracing::instrument;
+use tonic::codec::CompressionEncoding;
+use tracing::{debug, instrument};
+use utils::logging::warn_slow;
 
 use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
 use crate::retry::Retry;
@@ -19,28 +24,40 @@ use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
 /// when full.
 ///
+/// Normal requests are small, and we don't pipeline them, so we can afford a large number of
+/// streams per connection.
+///
 /// TODO: tune all of these constants, and consider making them configurable.
-/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
-/// with only streams.
-const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
+const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(64).unwrap();
 
-/// Max number of concurrent unary request clients per shard.
-const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
+/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a
+/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and
+/// transmission delays. This also concentrates large window sizes on a smaller set of
+/// streams/connections, presumably reducing memory use.
+const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
 
-/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
-/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
-const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
+/// The batch size threshold at which a GetPage request will use the bulk stream pool.
+///
+/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window
+/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool.
+const BULK_THRESHOLD_BATCH_SIZE: usize = 5;
 
-/// Max number of pipelined requests per stream.
-const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
+/// The overall request call timeout, including retries and pool acquisition.
+/// TODO: should we retry forever? Should the caller decide?
+const CALL_TIMEOUT: Duration = Duration::from_secs(60);
 
-/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
-/// are more throughput-oriented, we have a smaller limit but higher queue depth.
-const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
+/// The per-request (retry attempt) timeout, including any lazy connection establishment.
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
 
-/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
-/// get a larger queue depth.
-const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
+/// The initial request retry backoff duration. The first retry does not back off.
+/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support.
+const BASE_BACKOFF: Duration = Duration::from_millis(5);
+
+/// The maximum request retry backoff duration.
+const MAX_BACKOFF: Duration = Duration::from_secs(5);
+
+/// Threshold and interval for warning about slow operation.
+const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
 
 /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
 /// basic `page_api::Client` gRPC client, and supports:
@@ -48,48 +65,113 @@ const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
 /// * Sharded tenants across multiple Pageservers.
 /// * Pooling of connections, clients, and streams for efficient resource use.
 /// * Concurrent use by many callers.
-/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
+/// * Internal handling of GetPage bidirectional streams.
 /// * Automatic retries.
 /// * Observability.
 ///
+/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These
+/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all
+/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly
+/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000
+/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with
+/// one stream per backend, but without the TCP connection overhead. In the common case we expect
+/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits,
+/// read coalescing, sharding (backends typically only talk to one shard at a time), etc.
+///
 /// TODO: this client does not support base backups or LSN leases, as these are only used by
 /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
 pub struct PageserverClient {
-    // TODO: support swapping out the shard map, e.g. via an ArcSwap.
-    shards: Shards,
-    retry: Retry,
+    /// The tenant ID.
+    tenant_id: TenantId,
+    /// The timeline ID.
+    timeline_id: TimelineId,
+    /// The JWT auth token for this tenant, if any.
+    auth_token: Option<String>,
+    /// The compression to use, if any.
+    compression: Option<CompressionEncoding>,
+    /// The shards for this tenant.
+    shards: ArcSwap<Shards>,
 }
 
 impl PageserverClient {
     /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
-    /// in the shard map, which must be complete and must use gRPC URLs.
+    /// in the shard spec, which must be complete and must use gRPC URLs.
     pub fn new(
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        shard_map: HashMap<ShardIndex, String>,
-        stripe_size: ShardStripeSize,
+        shard_spec: ShardSpec,
         auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
     ) -> anyhow::Result<Self> {
-        let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;
+        let shards = Shards::new(
+            tenant_id,
+            timeline_id,
+            shard_spec,
+            auth_token.clone(),
+            compression,
+        )?;
         Ok(Self {
-            shards,
-            retry: Retry,
+            tenant_id,
+            timeline_id,
+            auth_token,
+            compression,
+            shards: ArcSwap::new(Arc::new(shards)),
         })
     }
 
+    /// Updates the shards from the given shard spec. In-flight requests will complete using the
+    /// existing shards, but may retry with the new shards if they fail.
+    ///
+    /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
+    /// properly spun down and dropped afterwards.
+    pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
+        // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
+        // with concurrent updates, but that involves creating a new `Shards` on every attempt,
+        // which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere
+        // in the stack, and if they're violated then we already have problems elsewhere, so a
+        // best-effort but possibly-racy check is okay here.
+        let old = self.shards.load_full();
+        if shard_spec.count < old.count {
+            return Err(anyhow!(
+                "can't reduce shard count from {} to {}",
+                old.count,
+                shard_spec.count
+            ));
+        }
+        if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
+            return Err(anyhow!(
+                "can't change stripe size from {} to {}",
+                old.stripe_size,
+                shard_spec.stripe_size
+            ));
+        }
+
+        let shards = Shards::new(
+            self.tenant_id,
+            self.timeline_id,
+            shard_spec,
+            self.auth_token.clone(),
+            self.compression,
+        )?;
+        self.shards.store(Arc::new(shards));
+        Ok(())
+    }
+
     /// Returns whether a relation exists.
     #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
     pub async fn check_rel_exists(
         &self,
         req: page_api::CheckRelExistsRequest,
     ) -> tonic::Result<page_api::CheckRelExistsResponse> {
-        self.retry
-            .with(async || {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
-                client.check_rel_exists(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Returns the total size of a database, as # of bytes.
@@ -98,17 +180,20 @@ impl PageserverClient {
         &self,
         req: page_api::GetDbSizeRequest,
     ) -> tonic::Result<page_api::GetDbSizeResponse> {
-        self.retry
-            .with(async || {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
-                client.get_db_size(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
-    /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
-    /// splits requests that straddle shard boundaries, and assembles the responses.
+    /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
+    /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
+    /// shard boundaries, and assembles the responses.
     ///
     /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
     /// errors. All responses will have `GetPageStatusCode::Ok`.
@@ -128,72 +213,101 @@ impl PageserverClient {
         if req.block_numbers.is_empty() {
             return Err(tonic::Status::invalid_argument("no block number"));
         }
+        // The request attempt must be 0. The client will increment it internally.
+        if req.request_id.attempt != 0 {
+            return Err(tonic::Status::invalid_argument("request attempt must be 0"));
+        }
 
+        debug!("sending request: {req:?}");
+
+        // The shards may change while we're fetching pages. We execute the request using a stable
+        // view of the shards (especially important for requests that span shards), but retry the
+        // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
+        // retries and re-splits in some cases where requests span shards, but these are expected to
+        // be rare.
+        //
+        // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
+        // once we figure out how to handle these.
+        let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
+            let mut req = req.clone();
+            req.request_id.attempt = attempt as u32;
+            let shards = self.shards.load_full();
+            Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
+        })
+        .await?;
+
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
+    /// concurrent shard updates. Does not retry internally, but is retried by `get_page()`.
+    async fn get_page_with_shards(
+        req: page_api::GetPageRequest,
+        shards: &Shards,
+    ) -> tonic::Result<page_api::GetPageResponse> {
         // Fast path: request is for a single shard.
         if let Some(shard_id) =
-            GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size)
+            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
         {
-            return self.get_page_for_shard(shard_id, req).await;
+            return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
         }
 
         // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
         // reassemble the responses.
-        //
-        // TODO: when we support shard map updates, we need to detect when it changes and re-split
-        // the request on errors.
-        let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size);
+        let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size);
 
-        let mut shard_requests: FuturesUnordered<_> = splitter
-            .drain_requests()
-            .map(|(shard_id, shard_req)| {
-                // NB: each request will retry internally.
-                self.get_page_for_shard(shard_id, shard_req)
-                    .map(move |result| result.map(|resp| (shard_id, resp)))
-            })
-            .collect();
+        let mut shard_requests = FuturesUnordered::new();
+        for (shard_id, shard_req) in splitter.drain_requests() {
+            let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?)
+                .map(move |result| result.map(|resp| (shard_id, resp)));
+            shard_requests.push(future);
+        }
 
         while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
             splitter.add_response(shard_id, shard_response)?;
         }
 
-        splitter.assemble_response()
+        splitter.get_response()
     }
 
-    /// Fetches pages that belong to the given shard.
-    #[instrument(skip_all, fields(shard = %shard_id))]
-    async fn get_page_for_shard(
-        &self,
-        shard_id: ShardIndex,
+    /// Fetches pages on the given shard. Does not retry internally.
+    async fn get_page_with_shard(
         req: page_api::GetPageRequest,
+        shard: &Shard,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let resp = self
-            .retry
-            .with(async || {
-                let stream = self
-                    .shards
-                    .get(shard_id)?
-                    .stream(req.request_class.is_bulk())
-                    .await;
-                let resp = stream.send(req.clone()).await?;
+        let mut stream = shard.stream(Self::is_bulk(&req)).await?;
+        let resp = stream.send(req.clone()).await?;
 
-                // Convert per-request errors into a tonic::Status.
-                if resp.status_code != page_api::GetPageStatusCode::Ok {
-                    return Err(tonic::Status::new(
-                        resp.status_code.into(),
-                        resp.reason.unwrap_or_else(|| String::from("unknown error")),
-                    ));
-                }
+        // Convert per-request errors into a tonic::Status.
+        if resp.status_code != page_api::GetPageStatusCode::Ok {
+            return Err(tonic::Status::new(
+                resp.status_code.into(),
+                resp.reason.unwrap_or_else(|| String::from("unknown error")),
+            ));
+        }
 
-                Ok(resp)
-            })
-            .await?;
-
-        // Make sure we got the right number of pages.
-        // NB: check outside of the retry loop, since we don't want to retry this.
-        let (expected, actual) = (req.block_numbers.len(), resp.page_images.len());
-        if expected != actual {
+        // Check that we received the expected pages.
+        if req.rel != resp.rel {
             return Err(tonic::Status::internal(format!(
-                "expected {expected} pages for shard {shard_id}, got {actual}",
+                "shard {} returned wrong relation, expected {} got {}",
+                shard.id, req.rel, resp.rel
+            )));
+        }
+        if !req
+            .block_numbers
+            .iter()
+            .copied()
+            .eq(resp.pages.iter().map(|p| p.block_number))
+        {
+            return Err(tonic::Status::internal(format!(
+                "shard {} returned wrong pages, expected {:?} got {:?}",
+                shard.id,
+                req.block_numbers,
+                resp.pages
+                    .iter()
+                    .map(|page| page.block_number)
+                    .collect::<Vec<_>>()
             )));
         }
 
@@ -206,13 +320,15 @@ impl PageserverClient {
         &self,
         req: page_api::GetRelSizeRequest,
     ) -> tonic::Result<page_api::GetRelSizeResponse> {
-        self.retry
-            .with(async || {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
-                client.get_rel_size(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Fetches an SLRU segment.
@@ -221,51 +337,91 @@ impl PageserverClient {
         &self,
         req: page_api::GetSlruSegmentRequest,
     ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
-        self.retry
-            .with(async || {
-                // SLRU segments are only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
-                client.get_slru_segment(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // SLRU segments are only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Runs the given async closure with retries up to the given timeout. Only certain gRPC status
+    /// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout.
+    async fn with_retries<T, F, O>(timeout: Duration, f: F) -> tonic::Result<T>
+    where
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
+        O: Future<Output = tonic::Result<T>>,
+    {
+        Retry {
+            timeout: Some(timeout),
+            base_backoff: BASE_BACKOFF,
+            max_backoff: MAX_BACKOFF,
+        }
+        .with(f)
+        .await
+    }
+
+    /// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout.
+    async fn with_timeout<T>(
+        timeout: Duration,
+        f: impl Future<Output = tonic::Result<T>>,
+    ) -> tonic::Result<T> {
+        let started = Instant::now();
+        tokio::time::timeout(timeout, f).await.map_err(|_| {
+            tonic::Status::deadline_exceeded(format!(
+                "request timed out after {:.3}s",
+                started.elapsed().as_secs_f64()
+            ))
+        })?
+    }
+
+    /// Returns true if the request is considered a bulk request and should use the bulk pool.
+    fn is_bulk(req: &page_api::GetPageRequest) -> bool {
+        req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE
     }
 }
 
-/// Tracks the tenant's shards.
-struct Shards {
+/// Shard specification for a PageserverClient.
+pub struct ShardSpec {
+    /// Maps shard indices to gRPC URLs.
+    ///
+    /// INVARIANT: every shard 0..count is present, and shard 0 is always present.
+    /// INVARIANT: every URL is valid and uses grpc:// scheme.
+    urls: HashMap<ShardIndex, String>,
     /// The shard count.
     ///
     /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
     count: ShardCount,
-    /// The stripe size. Only used for sharded tenants.
+    /// The stripe size for these shards.
     stripe_size: ShardStripeSize,
-    /// Shards by shard index.
-    ///
-    /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`.
-    ///
-    /// INVARIANT: every shard 0..count is present.
-    /// INVARIANT: shard 0 is always present.
-    map: HashMap<ShardIndex, Shard>,
 }
 
-impl Shards {
-    /// Creates a new set of shards based on a shard map.
-    fn new(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        shard_map: HashMap<ShardIndex, String>,
-        stripe_size: ShardStripeSize,
-        auth_token: Option<String>,
+impl ShardSpec {
+    /// Creates a new shard spec with the given URLs and stripe size. All shards must be given.
+    /// The stripe size may be omitted for unsharded tenants.
+    pub fn new(
+        urls: HashMap<ShardIndex, String>,
+        stripe_size: Option<ShardStripeSize>,
     ) -> anyhow::Result<Self> {
-        let count = match shard_map.len() {
+        // Compute the shard count.
+        let count = match urls.len() {
             0 => return Err(anyhow!("no shards provided")),
             1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
             n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
             n => ShardCount::new(n as u8),
         };
 
-        let mut map = HashMap::new();
-        for (shard_id, url) in shard_map {
+        // Determine the stripe size. It doesn't matter for unsharded tenants.
+        if stripe_size.is_none() && !count.is_unsharded() {
+            return Err(anyhow!("stripe size must be given for sharded tenants"));
+        }
+        let stripe_size = stripe_size.unwrap_or_default();
+
+        // Validate the shard spec.
+        for (shard_id, url) in &urls {
             // The shard index must match the computed shard count, even for unsharded tenants.
             if shard_id.shard_count != count {
                 return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
@@ -276,21 +432,72 @@ impl Shards {
             }
             // The above conditions guarantee that we have all shards 0..count: len() matches count,
             // shard number < count, and numbers are unique (via hashmap).
-            let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?;
-            map.insert(shard_id, shard);
+
+            // Validate the URL.
+            if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc {
+                return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
+            }
         }
 
         Ok(Self {
+            urls,
             count,
             stripe_size,
-            map,
+        })
+    }
+}
+
+/// Tracks the tenant's shards.
+struct Shards {
+    /// Shards by shard index.
+    ///
+    /// INVARIANT: every shard 0..count is present.
+    /// INVARIANT: shard 0 is always present.
+    by_index: HashMap<ShardIndex, Shard>,
+    /// The shard count.
+    ///
+    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
+    count: ShardCount,
+    /// The stripe size. Only used for sharded tenants.
+    stripe_size: ShardStripeSize,
+}
+
+impl Shards {
+    /// Creates a new set of shards based on a shard spec.
+    fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_spec: ShardSpec,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        // NB: the shard spec has already been validated when constructed.
+        let mut shards = HashMap::with_capacity(shard_spec.urls.len());
+        for (shard_id, url) in shard_spec.urls {
+            shards.insert(
+                shard_id,
+                Shard::new(
+                    url,
+                    tenant_id,
+                    timeline_id,
+                    shard_id,
+                    auth_token.clone(),
+                    compression,
+                )?,
+            );
+        }
+
+        Ok(Self {
+            by_index: shards,
+            count: shard_spec.count,
+            stripe_size: shard_spec.stripe_size,
         })
     }
 
     /// Looks up the given shard.
     #[allow(clippy::result_large_err)] // TODO: check perf impact
     fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
-        self.map
+        self.by_index
             .get(&shard_id)
             .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
     }
@@ -302,21 +509,31 @@ impl Shards {
     }
 }
 
-/// A single shard. Uses dedicated resource pools with the following structure:
+/// A single shard. Has dedicated resource pools with the following structure:
 ///
-/// * Channel pool: unbounded.
-///   * Unary client pool: MAX_UNARY_CLIENTS.
-///   * Stream client pool: unbounded.
-///     * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
-/// * Bulk channel pool: unbounded.
+/// * Channel pool: MAX_CLIENTS_PER_CHANNEL.
+///   * Client pool: unbounded.
+///     * Stream pool: unbounded.
+/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL.
 ///   * Bulk client pool: unbounded.
-///     * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
+///     * Bulk stream pool: unbounded.
+///
+/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests.
+/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a
+/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools
+/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly
+/// similar (except for TCP transmission time).
+///
+/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However,
+/// the code is fairly trivial, so we may as well keep them around for now in case we need them.
 struct Shard {
+    /// The shard ID.
+    id: ShardIndex,
     /// Unary gRPC client pool.
     client_pool: Arc<ClientPool>,
     /// GetPage stream pool.
     stream_pool: Arc<StreamPool>,
-    /// GetPage stream pool for bulk requests, e.g. prefetches.
+    /// GetPage stream pool for bulk requests.
     bulk_stream_pool: Arc<StreamPool>,
 }
 
@@ -328,56 +545,36 @@ impl Shard {
         timeline_id: TimelineId,
         shard_id: ShardIndex,
         auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
     ) -> anyhow::Result<Self> {
-        // Sanity-check that the URL uses gRPC.
-        if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc {
-            return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
-        }
-
-        // Common channel pool for unary and stream requests. Bounded by client/stream pools.
-        let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
-
-        // Client pool for unary requests.
+        // Shard pools for unary requests and non-bulk GetPage requests.
         let client_pool = ClientPool::new(
-            channel_pool.clone(),
+            ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?,
             tenant_id,
             timeline_id,
             shard_id,
             auth_token.clone(),
-            Some(MAX_UNARY_CLIENTS),
+            compression,
+            None, // unbounded
         );
+        let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded
 
-        // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
-        // but shares a channel pool with it (as it's unbounded).
-        let stream_pool = StreamPool::new(
-            ClientPool::new(
-                channel_pool.clone(),
-                tenant_id,
-                timeline_id,
-                shard_id,
-                auth_token.clone(),
-                None, // unbounded, limited by stream pool
-            ),
-            Some(MAX_STREAMS),
-            MAX_STREAM_QUEUE_DEPTH,
-        );
-
-        // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
-        // to avoid head-of-line blocking of latency-sensitive requests.
+        // Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.).
         let bulk_stream_pool = StreamPool::new(
             ClientPool::new(
-                ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
+                ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?,
                 tenant_id,
                 timeline_id,
                 shard_id,
                 auth_token,
-                None, // unbounded, limited by stream pool
+                compression,
+                None, // unbounded,
             ),
-            Some(MAX_BULK_STREAMS),
-            MAX_BULK_STREAM_QUEUE_DEPTH,
+            None, // unbounded
         );
 
         Ok(Self {
+            id: shard_id,
             client_pool,
             stream_pool,
             bulk_stream_pool,
@@ -385,19 +582,23 @@ impl Shard {
     }
 
     /// Returns a pooled client for this shard.
+    #[instrument(skip_all)]
     async fn client(&self) -> tonic::Result<ClientGuard> {
-        self.client_pool
-            .get()
-            .await
-            .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
+        warn_slow(
+            "client pool acquisition",
+            SLOW_THRESHOLD,
+            pin!(self.client_pool.get()),
+        )
+        .await
     }
 
-    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
-    /// pool (e.g. for prefetches).
-    async fn stream(&self, bulk: bool) -> StreamGuard {
-        match bulk {
-            false => self.stream_pool.get().await,
-            true => self.bulk_stream_pool.get().await,
-        }
+    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool.
+    #[instrument(skip_all, fields(bulk))]
+    async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
+        let pool = match bulk {
+            false => &self.stream_pool,
+            true => &self.bulk_stream_pool,
+        };
+        warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await
     }
 }
diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs
index 3fc7178be2..14fb3fbd5a 100644
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -3,4 +3,4 @@ mod pool;
 mod retry;
 mod split;
 
-pub use client::PageserverClient;
+pub use client::{PageserverClient, ShardSpec};
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 5a50004fd1..98a649b4c8 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -9,19 +9,36 @@
 //!
 //! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
 //!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
-//!   per-channel client limit. Channels may be closed when they are no longer used by any clients.
+//!   per-channel client limit. Channels are closed immediately when empty, and indirectly rely on
+//!   client/stream idle timeouts.
 //!
 //! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
 //!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
-//!   single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
-//!   from the pool after some time, to free up the channel.
+//!   single caller at a time, and is returned to the pool when dropped. Idle clients are removed
+//!   from the pool after a while to free up resources.
 //!
 //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
-//!   ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
-//!   returns a guard that can be used to send a single request, to properly enforce queue depth and
-//!   route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
-//!   possibly pipelining multiple requests from multiple callers on the same stream (up to some
-//!   queue depth). Idle streams may be removed from the pool after a while to free up the client.
+//!   ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a
+//!   time, and is returned to the pool when dropped. Idle streams are removed from the pool after
+//!   a while to free up resources.
+//!
+//!   The stream only supports sending a single, synchronous request at a time, and does not support
+//!   pipelining multiple requests from different callers onto the same stream -- instead, we scale
+//!   out concurrent streams to improve throughput. There are many reasons for this design choice:
+//!
+//!     * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by
+//!       a single server task, which may block e.g. on layer downloads, LSN waits, etc.
+//!
+//!     * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away
+//!       (e.g. because of a timeout), the request would still be processed by the server and block
+//!       requests behind it in the stream. It might even block its own timeout retry.
+//!
+//!     * Stream scheduling becomes significantly simpler and cheaper.
+//!
+//!     * Individual callers can still use client-side batching for pipelining.
+//!
+//!     * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
+//!       per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
 //!
 //! Each channel corresponds to one TCP connection. Each client unary request and each stream
 //! corresponds to one HTTP/2 stream and server task.
@@ -29,22 +46,42 @@
 //! TODO: error handling (including custom error types).
 //! TODO: observability.
 
-use std::collections::{BTreeMap, HashMap};
+use std::collections::BTreeMap;
 use std::num::NonZero;
 use std::ops::{Deref, DerefMut};
+use std::pin::Pin;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, Weak};
+use std::time::{Duration, Instant};
 
-use futures::StreamExt as _;
-use tokio::sync::mpsc::{Receiver, Sender};
-use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use futures::{Stream, StreamExt as _};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch};
+use tokio_stream::wrappers::WatchStream;
+use tokio_util::sync::CancellationToken;
+use tonic::codec::CompressionEncoding;
 use tonic::transport::{Channel, Endpoint};
-use tracing::{error, warn};
 
 use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;
 
+/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when
+/// empty, and indirectly rely on the client/stream idle timeouts.
+///
+/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but
+/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to
+/// keep its client around in the pool for a while.
+const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(180),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};
+
+/// Reap idle resources with this interval.
+const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(10),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};
+
 /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
 /// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
 /// The pool does not limit the number of channels, and instead relies on `ClientPool` or
@@ -52,7 +89,6 @@ use utils::shard::ShardIndex;
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
 ///
-/// TODO: reap idle channels.
 /// TODO: consider prewarming a set of channels, to avoid initial connection latency.
 /// TODO: consider adding a circuit breaker for errors and fail fast.
 pub struct ChannelPool {
@@ -108,14 +144,15 @@ impl ChannelPool {
         let mut channels = self.channels.lock().unwrap();
 
         // Try to find an existing channel with available capacity. We check entries in BTreeMap
-        // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
-        // with lower-ordered channel IDs first. This will cluster clients in lower-ordered
+        // order, to fill up the lower-ordered channels first. The client/stream pools also prefer
+        // clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered
         // channels, and free up higher-ordered channels such that they can be reaped.
         for (&id, entry) in channels.iter_mut() {
             assert!(
                 entry.clients <= self.max_clients_per_channel.get(),
                 "channel overflow"
             );
+            assert_ne!(entry.clients, 0, "empty channel not reaped");
             if entry.clients < self.max_clients_per_channel.get() {
                 entry.clients += 1;
                 return ChannelGuard {
@@ -161,16 +198,22 @@ impl ChannelGuard {
     }
 }
 
-/// Returns the channel to the pool.
+/// Returns the channel to the pool. The channel is closed when empty.
 impl Drop for ChannelGuard {
     fn drop(&mut self) {
         let Some(pool) = self.pool.upgrade() else {
             return; // pool was dropped
         };
+
         let mut channels = pool.channels.lock().unwrap();
         let entry = channels.get_mut(&self.id).expect("unknown channel");
         assert!(entry.clients > 0, "channel underflow");
         entry.clients -= 1;
+
+        // Reap empty channels immediately.
+        if entry.clients == 0 {
+            channels.remove(&self.id);
+        }
     }
 }
 
@@ -179,8 +222,6 @@ impl Drop for ChannelGuard {
 /// number of concurrent clients to `max_clients` via semaphore.
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
-///
-/// TODO: reap idle clients.
 pub struct ClientPool {
     /// Tenant ID.
     tenant_id: TenantId,
@@ -190,6 +231,8 @@ pub struct ClientPool {
     shard_id: ShardIndex,
     /// Authentication token, if any.
     auth_token: Option<String>,
+    /// Compression to use.
+    compression: Option<CompressionEncoding>,
     /// Channel pool to acquire channels from.
     channel_pool: Arc<ChannelPool>,
     /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
@@ -198,9 +241,10 @@ pub struct ClientPool {
     ///
     /// The first client in the map will be acquired next. The map is sorted by client ID, which in
     /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
-    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
-    /// clients are reaped.
+    /// lower-ordered channels. This allows us to free up and reap higher-ordered channels.
     idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
+    /// Reaps idle clients.
+    idle_reaper: Reaper,
     /// Unique client ID generator.
     next_client_id: AtomicUsize,
 }
@@ -212,6 +256,9 @@ struct ClientEntry {
     client: page_api::Client,
     /// The channel guard for the channel used by the client.
     channel_guard: ChannelGuard,
+    /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by
+    /// definition, so this is the time when it was added back to the pool.
+    idle_since: Instant,
 }
 
 impl ClientPool {
@@ -224,18 +271,23 @@ impl ClientPool {
         timeline_id: TimelineId,
         shard_id: ShardIndex,
         auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
         max_clients: Option<NonZero<usize>>,
     ) -> Arc<Self> {
-        Arc::new(Self {
+        let pool = Arc::new(Self {
             tenant_id,
             timeline_id,
             shard_id,
             auth_token,
+            compression,
             channel_pool,
             idle: Mutex::default(),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
             limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
             next_client_id: AtomicUsize::default(),
-        })
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
     }
 
     /// Gets a client from the pool, or creates a new one if necessary. Connections are established
@@ -245,7 +297,7 @@ impl ClientPool {
     /// This is moderately performance-sensitive. It is called for every unary request, but these
     /// establish a new gRPC stream per request so they're already expensive. GetPage requests use
     /// the `StreamPool` instead.
-    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<ClientGuard> {
         // Acquire a permit if the pool is bounded.
         let mut permit = None;
         if let Some(limiter) = self.limiter.clone() {
@@ -263,7 +315,7 @@ impl ClientPool {
             });
         }
 
-        // Slow path: construct a new client.
+        // Construct a new client.
         let mut channel_guard = self.channel_pool.get();
         let client = page_api::Client::new(
             channel_guard.take(),
@@ -271,8 +323,9 @@ impl ClientPool {
             self.timeline_id,
             self.shard_id,
             self.auth_token.clone(),
-            None,
-        )?;
+            self.compression,
+        )
+        .map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?;
 
         Ok(ClientGuard {
             pool: Arc::downgrade(self),
@@ -287,6 +340,16 @@ impl ClientPool {
     }
 }
 
+impl Reapable for ClientPool {
+    /// Reaps clients that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff)
+    }
+}
+
 /// A client acquired from the pool. The inner client can be accessed via Deref. The client is
 /// returned to the pool when dropped.
 pub struct ClientGuard {
@@ -317,9 +380,11 @@ impl Drop for ClientGuard {
         let Some(pool) = self.pool.upgrade() else {
             return; // pool was dropped
         };
+
         let entry = ClientEntry {
             client: self.client.take().expect("dropped once"),
             channel_guard: self.channel_guard.take().expect("dropped once"),
+            idle_since: Instant::now(),
         };
         pool.idle.lock().unwrap().insert(self.id, entry);
 
@@ -330,269 +395,268 @@ impl Drop for ClientGuard {
 /// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
 /// acquires a client from the inner `ClientPool` for the stream's lifetime.
 ///
-/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
-/// a single request and await the response. Internally, requests are multiplexed across streams and
-/// channels. This allows proper queue depth enforcement and response routing.
+/// Individual streams only send a single request at a time, and do not pipeline multiple callers
+/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily
+/// to eliminate head-of-line blocking. See the module documentation for more details.
 ///
-/// TODO: reap idle streams.
 /// TODO: consider making this generic over request and response types; not currently needed.
 pub struct StreamPool {
     /// The client pool to acquire clients from. Must be unbounded.
     client_pool: Arc<ClientPool>,
-    /// All pooled streams.
+    /// Idle pooled streams. Acquired streams are removed from here and returned on drop.
     ///
-    /// Incoming requests will be sent over an existing stream with available capacity. If all
-    /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
-    /// stream has an associated Tokio task that processes requests and responses.
-    streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
-    /// The max number of concurrent streams, or None if unbounded.
-    max_streams: Option<NonZero<usize>>,
-    /// The max number of concurrent requests per stream.
-    max_queue_depth: NonZero<usize>,
-    /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
-    /// None if the pool is unbounded.
+    /// The first stream in the map will be acquired next. The map is sorted by stream ID, which is
+    /// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer
+    /// acquiring idle streams from lower-ordered channels, which allows us to free up and reap
+    /// higher-ordered channels.
+    idle: Mutex<BTreeMap<StreamID, StreamEntry>>,
+    /// Limits the max number of concurrent streams. None if the pool is unbounded.
     limiter: Option<Arc<Semaphore>>,
-    /// Stream ID generator.
-    next_stream_id: AtomicUsize,
+    /// Reaps idle streams.
+    idle_reaper: Reaper,
 }
 
-type StreamID = usize;
-type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
-type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
-type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
+/// The stream ID. Reuses the inner client ID.
+type StreamID = ClientID;
 
+/// A pooled stream.
 struct StreamEntry {
-    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
-    sender: RequestSender,
-    /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
-    /// completion without acquiring the `StreamPool::streams` lock.
-    queue_depth: Arc<AtomicUsize>,
+    /// The bidirectional stream.
+    stream: BiStream,
+    /// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`.
+    idle_since: Instant,
+}
+
+/// A bidirectional GetPage stream and its client. Can send requests and receive responses.
+struct BiStream {
+    /// The owning client. Holds onto the channel slot while the stream is alive.
+    client: ClientGuard,
+    /// Stream for sending requests. Uses a watch channel, so it can only send a single request at a
+    /// time, and the caller must await the response before sending another request. This is
+    /// enforced by `StreamGuard::send`.
+    sender: watch::Sender<page_api::GetPageRequest>,
+    /// Stream for receiving responses.
+    receiver: Pin<Box<dyn Stream<Item = tonic::Result<page_api::GetPageResponse>> + Send>>,
 }
 
 impl StreamPool {
-    /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
-    /// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
+    /// Creates a new stream pool, using the given client pool. It will use up to `max_streams`
+    /// concurrent streams.
     ///
     /// The client pool must be unbounded. The stream pool will enforce its own limits, and because
     /// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
     /// The stream pool should generally have its own dedicated client pool (but it can share a
     /// channel pool with others since these are always unbounded).
-    pub fn new(
-        client_pool: Arc<ClientPool>,
-        max_streams: Option<NonZero<usize>>,
-        max_queue_depth: NonZero<usize>,
-    ) -> Arc<Self> {
+    pub fn new(client_pool: Arc<ClientPool>, max_streams: Option<NonZero<usize>>) -> Arc<Self> {
         assert!(client_pool.limiter.is_none(), "bounded client pool");
-        Arc::new(Self {
+        let pool = Arc::new(Self {
             client_pool,
-            streams: Arc::default(),
-            limiter: max_streams.map(|max_streams| {
-                Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
-            }),
-            max_streams,
-            max_queue_depth,
-            next_stream_id: AtomicUsize::default(),
-        })
+            idle: Mutex::default(),
+            limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
     }
 
-    /// Acquires an available stream from the pool, or spins up a new stream async if all streams
-    /// are full. Returns a guard that can be used to send a single request on the stream and await
-    /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
-    /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
+    /// Acquires an available stream from the pool, or spins up a new stream if all streams are
+    /// full. Returns a guard that can be used to send requests and await the responses. Blocks if
+    /// the pool is full.
     ///
     /// This is very performance-sensitive, as it is on the GetPage hot path.
     ///
-    /// TODO: this must do something more sophisticated for performance. We want:
-    ///
-    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
-    /// * Quick acquisition of pooled streams with available capacity.
-    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
-    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
-    /// * Don't hold a lock while spinning up new streams.
-    /// * Allow concurrent clients to join onto streams while they're spun up.
-    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
-    ///
-    /// For now, we just do something simple and functional, but very inefficient (linear scan).
-    pub async fn get(&self) -> StreamGuard {
+    /// TODO: is a `Mutex<BTreeMap>` performant enough? Will it become too contended? We can't
+    /// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
+    /// to free up higher-ordered channels.
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<StreamGuard> {
         // Acquire a permit if the pool is bounded.
         let mut permit = None;
         if let Some(limiter) = self.limiter.clone() {
             permit = Some(limiter.acquire_owned().await.expect("never closed"));
         }
-        let mut streams = self.streams.lock().unwrap();
 
-        // Look for a pooled stream with available capacity.
-        for entry in streams.values() {
-            assert!(
-                entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(),
-                "stream queue overflow"
-            );
-            if entry
-                .queue_depth
-                .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
-                    // Increment the queue depth via compare-and-swap.
-                    // TODO: review ordering.
-                    (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1)
-                })
-                .is_ok()
-            {
-                return StreamGuard {
-                    sender: entry.sender.clone(),
-                    queue_depth: entry.queue_depth.clone(),
-                    permit,
-                };
-            }
+        // Fast path: acquire an idle stream from the pool.
+        if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
+            return Ok(StreamGuard {
+                pool: Arc::downgrade(self),
+                stream: Some(entry.stream),
+                can_reuse: true,
+                permit,
+            });
         }
 
-        // No available stream, spin up a new one. We install the stream entry in the pool first and
-        // return the guard, while spinning up the stream task async. This allows other callers to
-        // join onto this stream and also create additional streams concurrently if this fills up.
-        let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
-        let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
-        let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
-        let entry = StreamEntry {
-            sender: req_tx.clone(),
-            queue_depth: queue_depth.clone(),
-        };
-        streams.insert(id, entry);
+        // Spin up a new stream. Uses a watch channel to send a single request at a time, since
+        // `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
+        let mut client = self.client_pool.get().await?;
 
-        if let Some(max_streams) = self.max_streams {
-            assert!(streams.len() <= max_streams.get(), "stream overflow");
-        };
+        let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
+        let req_stream = WatchStream::from_changes(req_rx);
+        let resp_stream = client.get_pages(req_stream).await?;
 
-        let client_pool = self.client_pool.clone();
-        let streams = self.streams.clone();
-
-        tokio::spawn(async move {
-            if let Err(err) = Self::run_stream(client_pool, req_rx).await {
-                error!("stream failed: {err}");
-            }
-            // Remove stream from pool on exit.
-            let entry = streams.lock().unwrap().remove(&id);
-            assert!(entry.is_some(), "unknown stream ID: {id}");
-        });
-
-        StreamGuard {
-            sender: req_tx,
-            queue_depth,
+        Ok(StreamGuard {
+            pool: Arc::downgrade(self),
+            stream: Some(BiStream {
+                client,
+                sender: req_tx,
+                receiver: Box::pin(resp_stream),
+            }),
+            can_reuse: true,
             permit,
-        }
-    }
-
-    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
-    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
-    /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
-    /// atomic with pool stream acquisition.
-    ///
-    /// The task exits when the request channel is closed, or on a stream error. The caller is
-    /// responsible for removing the stream from the pool on exit.
-    async fn run_stream(
-        client_pool: Arc<ClientPool>,
-        mut caller_rx: RequestReceiver,
-    ) -> anyhow::Result<()> {
-        // Acquire a client from the pool and create a stream.
-        let mut client = client_pool.get().await?;
-
-        // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
-        // theoretically deadlock if both the client and server block on sends (since we're not
-        // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
-        // low queue depths, but it was seen to happen with the libpq protocol so better safe than
-        // sorry. It should never buffer more than the queue depth anyway, but using an unbounded
-        // channel guarantees that it will never block.
-        let (req_tx, req_rx) = mpsc::unbounded_channel();
-        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
-        let mut resp_stream = client.get_pages(req_stream).await?;
-
-        // Track caller response channels by request ID. If the task returns early, these response
-        // channels will be dropped and the waiting callers will receive an error.
-        let mut callers = HashMap::new();
-
-        // Process requests and responses.
-        loop {
-            tokio::select! {
-                // Receive requests from callers and send them to the stream.
-                req = caller_rx.recv() => {
-                    // Shut down if request channel is closed.
-                    let Some((req, resp_tx)) = req else {
-                        return Ok(());
-                    };
-
-                    // Store the response channel by request ID.
-                    if callers.contains_key(&req.request_id) {
-                        // Error on request ID duplicates. Ignore callers that went away.
-                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
-                            format!("duplicate request ID: {}", req.request_id),
-                        )));
-                        continue;
-                    }
-                    callers.insert(req.request_id, resp_tx);
-
-                    // Send the request on the stream. Bail out if the stream is closed.
-                    req_tx.send(req).map_err(|_| {
-                        tonic::Status::unavailable("stream closed")
-                    })?;
-                }
-
-                // Receive responses from the stream and send them to callers.
-                resp = resp_stream.next() => {
-                    // Shut down if the stream is closed, and bail out on stream errors.
-                    let Some(resp) = resp.transpose()? else {
-                        return Ok(())
-                    };
-
-                    // Send the response to the caller. Ignore errors if the caller went away.
-                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
-                        warn!("received response for unknown request ID: {}", resp.request_id);
-                        continue;
-                    };
-                    _ = resp_tx.send(Ok(resp));
-                }
-            }
-        }
+        })
     }
 }
 
-/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
-/// depth. Queue depth is already reserved and will be returned on drop.
+impl Reapable for StreamPool {
+    /// Reaps streams that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff);
+    }
+}
+
+/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
+/// in-flight requests on the stream, or the stream failed.
 pub struct StreamGuard {
-    sender: RequestSender,
-    queue_depth: Arc<AtomicUsize>,
+    pool: Weak<StreamPool>,
+    stream: Option<BiStream>,             // Some until dropped
+    can_reuse: bool,                      // returned to pool if true
     permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }
 
 impl StreamGuard {
-    /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
-    /// valid for a single request (to enforce queue depth). This also drops the guard on return and
-    /// returns the queue depth quota to the pool.
+    /// Sends a request on the stream and awaits the response. If the future is dropped before it
+    /// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
+    /// request and is not returned to the pool. The same is true if the stream errors, in which
+    /// case the caller can't send further requests on the stream.
     ///
-    /// The `GetPageRequest::request_id` must be unique across in-flight requests.
+    /// We only support sending a single request at a time, to eliminate head-of-line blocking. See
+    /// module documentation for details.
     ///
     /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
     /// to avoid tearing down the stream for per-request errors. Callers must check this.
     pub async fn send(
-        self,
+        &mut self,
         req: page_api::GetPageRequest,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let (resp_tx, resp_rx) = oneshot::channel();
+        let req_id = req.request_id;
+        let stream = self.stream.as_mut().expect("not dropped");
 
-        self.sender
-            .send((req, resp_tx))
-            .await
+        // Mark the stream as not reusable while the request is in flight. We can't return the
+        // stream to the pool until we receive the response, to avoid head-of-line blocking and
+        // stale responses. Failed streams can't be reused either.
+        if !self.can_reuse {
+            return Err(tonic::Status::internal("stream can't be reused"));
+        }
+        self.can_reuse = false;
+
+        // Send the request and receive the response.
+        //
+        // NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
+        stream
+            .sender
+            .send(req)
             .map_err(|_| tonic::Status::unavailable("stream closed"))?;
 
-        resp_rx
+        let resp = stream
+            .receiver
+            .next()
             .await
-            .map_err(|_| tonic::Status::unavailable("stream closed"))?
+            .ok_or_else(|| tonic::Status::unavailable("stream closed"))??;
+
+        if resp.request_id != req_id {
+            return Err(tonic::Status::internal(format!(
+                "response ID {} does not match request ID {}",
+                resp.request_id, req_id
+            )));
+        }
+
+        // Success, mark the stream as reusable.
+        self.can_reuse = true;
+
+        Ok(resp)
     }
 }
 
 impl Drop for StreamGuard {
     fn drop(&mut self) {
-        // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
-        // before the response is received, but that's okay.
-        let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst);
-        assert!(prev_queue_depth > 0, "stream queue underflow");
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+
+        // If the stream isn't reusable, it can't be returned to the pool.
+        if !self.can_reuse {
+            return;
+        }
+
+        // Place the idle stream back into the pool.
+        let entry = StreamEntry {
+            stream: self.stream.take().expect("dropped once"),
+            idle_since: Instant::now(),
+        };
+        pool.idle
+            .lock()
+            .unwrap()
+            .insert(entry.stream.client.id, entry);
 
         _ = self.permit; // returned on drop, referenced for visibility
     }
 }
+
+/// Periodically reaps idle resources from a pool.
+struct Reaper {
+    /// The task check interval.
+    interval: Duration,
+    /// The threshold for reaping idle resources.
+    threshold: Duration,
+    /// Cancels the reaper task. Cancelled when the reaper is dropped.
+    cancel: CancellationToken,
+}
+
+impl Reaper {
+    /// Creates a new reaper.
+    pub fn new(threshold: Duration, interval: Duration) -> Self {
+        Self {
+            cancel: CancellationToken::new(),
+            threshold,
+            interval,
+        }
+    }
+
+    /// Spawns a task to periodically reap idle resources from the given task pool. The task is
+    /// cancelled when the reaper is dropped.
+    pub fn spawn(&self, pool: &Arc<impl Reapable>) {
+        // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
+        let pool = Arc::downgrade(pool);
+        let cancel = self.cancel.clone();
+        let (interval, threshold) = (self.interval, self.threshold);
+
+        tokio::spawn(async move {
+            loop {
+                tokio::select! {
+                    _ = tokio::time::sleep(interval) => {
+                        let Some(pool) = pool.upgrade() else {
+                            return; // pool was dropped
+                        };
+                        pool.reap_idle(Instant::now() - threshold);
+                    }
+
+                    _ = cancel.cancelled() => return,
+                }
+            }
+        });
+    }
+}
+
+impl Drop for Reaper {
+    fn drop(&mut self) {
+        self.cancel.cancel(); // cancel reaper task
+    }
+}
+
+/// A reapable resource pool.
+trait Reapable: Send + Sync + 'static {
+    /// Reaps resources that have been idle since before the given cutoff.
+    fn reap_idle(&self, cutoff: Instant);
+}
diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs
index b0473204d7..8a138711e8 100644
--- a/pageserver/client_grpc/src/retry.rs
+++ b/pageserver/client_grpc/src/retry.rs
@@ -1,5 +1,6 @@
 use std::time::Duration;
 
+use futures::future::pending;
 use tokio::time::Instant;
 use tracing::{error, info, warn};
 
@@ -8,60 +9,54 @@ use utils::backoff::exponential_backoff_duration;
 /// A retry handler for Pageserver gRPC requests.
 ///
 /// This is used instead of backoff::retry for better control and observability.
-pub struct Retry;
+pub struct Retry {
+    /// Timeout across all retry attempts. If None, retries forever.
+    pub timeout: Option<Duration>,
+    /// The initial backoff duration. The first retry does not use a backoff.
+    pub base_backoff: Duration,
+    /// The maximum backoff duration.
+    pub max_backoff: Duration,
+}
 
 impl Retry {
-    /// The per-request timeout.
-    // TODO: tune these, and/or make them configurable. Should we retry forever?
-    const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
-    /// The total timeout across all attempts
-    const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
-    /// The initial backoff duration.
-    const BASE_BACKOFF: Duration = Duration::from_millis(10);
-    /// The maximum backoff duration.
-    const MAX_BACKOFF: Duration = Duration::from_secs(10);
-    /// If true, log successful requests. For debugging.
-    const LOG_SUCCESS: bool = false;
-
     /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
     /// using the current tracing span for context.
     ///
-    /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
-    /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
+    /// Only certain gRPC status codes are retried, see [`Self::should_retry`].
     pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
     where
-        F: FnMut() -> O,
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
         O: Future<Output = tonic::Result<T>>,
     {
         let started = Instant::now();
-        let deadline = started + Self::TOTAL_TIMEOUT;
+        let deadline = self.timeout.map(|timeout| started + timeout);
         let mut last_error = None;
         let mut retries = 0;
         loop {
-            // Set up a future to wait for the backoff (if any) and run the request with a timeout.
+            // Set up a future to wait for the backoff, if any, and run the closure.
             let backoff_and_try = async {
                 // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
                 // https://github.com/tokio-rs/tokio/issues/6866
-                if let Some(backoff) = Self::backoff_duration(retries) {
+                if let Some(backoff) = self.backoff_duration(retries) {
                     tokio::time::sleep(backoff).await;
                 }
 
-                let request_started = Instant::now();
-                tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
-                    .await
-                    .map_err(|_| {
-                        tonic::Status::deadline_exceeded(format!(
-                            "request timed out after {:.3}s",
-                            request_started.elapsed().as_secs_f64()
-                        ))
-                    })?
+                f(retries).await
             };
 
-            // Wait for the backoff and request, or bail out if the total timeout is exceeded.
+            // Set up a future for the timeout, if any.
+            let timeout = async {
+                match deadline {
+                    Some(deadline) => tokio::time::sleep_until(deadline).await,
+                    None => pending().await,
+                }
+            };
+
+            // Wait for the backoff and request, or bail out if the timeout is exceeded.
             let result = tokio::select! {
                 result = backoff_and_try => result,
 
-                _ = tokio::time::sleep_until(deadline) => {
+                _ = timeout => {
                     let last_error = last_error.unwrap_or_else(|| {
                         tonic::Status::deadline_exceeded(format!(
                             "request timed out after {:.3}s",
@@ -79,7 +74,7 @@ impl Retry {
             match result {
                 // Success, return the result.
                 Ok(result) => {
-                    if retries > 0 || Self::LOG_SUCCESS {
+                    if retries > 0 {
                         info!(
                             "request succeeded after {retries} retries in {:.3}s",
                             started.elapsed().as_secs_f64(),
@@ -112,12 +107,13 @@ impl Retry {
         }
     }
 
-    /// Returns the backoff duration for the given retry attempt, or None for no backoff.
-    fn backoff_duration(retry: usize) -> Option<Duration> {
+    /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
+    /// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
+    fn backoff_duration(&self, retries: usize) -> Option<Duration> {
         let backoff = exponential_backoff_duration(
-            retry as u32,
-            Self::BASE_BACKOFF.as_secs_f64(),
-            Self::MAX_BACKOFF.as_secs_f64(),
+            (retries as u32).saturating_sub(1), // first retry does not back off
+            self.base_backoff.as_secs_f64(),
+            self.max_backoff.as_secs_f64(),
         );
         (!backoff.is_zero()).then_some(backoff)
     }
@@ -131,7 +127,6 @@ impl Retry {
             tonic::Code::Aborted => true,
             tonic::Code::Cancelled => true,
             tonic::Code::DeadlineExceeded => true, // maybe transient slowness
-            tonic::Code::Internal => true,         // maybe transient failure?
             tonic::Code::ResourceExhausted => true,
             tonic::Code::Unavailable => true,
 
@@ -139,6 +134,10 @@ impl Retry {
             tonic::Code::AlreadyExists => false,
             tonic::Code::DataLoss => false,
             tonic::Code::FailedPrecondition => false,
+            // NB: don't retry Internal. It is intended for serious errors such as invariant
+            // violations, and is also used for client-side invariant checks that would otherwise
+            // result in retry loops.
+            tonic::Code::Internal => false,
             tonic::Code::InvalidArgument => false,
             tonic::Code::NotFound => false,
             tonic::Code::OutOfRange => false,
diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs
index 5bbcaab393..b7539b900c 100644
--- a/pageserver/client_grpc/src/split.rs
+++ b/pageserver/client_grpc/src/split.rs
@@ -5,27 +5,24 @@ use bytes::Bytes;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
 use pageserver_page_api as page_api;
-use utils::shard::{ShardCount, ShardIndex};
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 
 /// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
 /// TODO: add tests for this.
 pub struct GetPageSplitter {
-    /// The original request ID. Used for all shard requests.
-    request_id: page_api::RequestID,
     /// Split requests by shard index.
     requests: HashMap<ShardIndex, page_api::GetPageRequest>,
-    /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
-    /// the response pages in the same order as the original request.
+    /// The response being assembled. Preallocated with empty pages, to be filled in.
+    response: page_api::GetPageResponse,
+    /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
+    /// to assemble the response pages in the same order as the original request.
     block_shards: Vec<ShardIndex>,
-    /// Page responses by shard index. Will be assembled into a single response.
-    responses: HashMap<ShardIndex, Vec<Bytes>>,
 }
 
 impl GetPageSplitter {
     /// Checks if the given request only touches a single shard, and returns the shard ID. This is
     /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
-    /// The caller must ensure that the request has at least one block number, or this will panic.
-    pub fn is_single_shard(
+    pub fn for_single_shard(
         req: &page_api::GetPageRequest,
         count: ShardCount,
         stripe_size: ShardStripeSize,
@@ -35,8 +32,12 @@ impl GetPageSplitter {
             return Some(ShardIndex::unsharded());
         }
 
-        // Find the base shard index for the first page, and compare with the rest.
-        let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
+        // Find the first page's shard, for comparison. If there are no pages, just return the first
+        // shard (caller likely checked already, otherwise the server will reject it).
+        let Some(&first_page) = req.block_numbers.first() else {
+            return Some(ShardIndex::new(ShardNumber(0), count));
+        };
+        let key = rel_block_to_key(req.rel, first_page);
         let shard_number = key_to_shard_number(count, stripe_size, &key);
 
         req.block_numbers
@@ -57,19 +58,19 @@ impl GetPageSplitter {
     ) -> Self {
         // The caller should make sure we don't split requests unnecessarily.
         debug_assert!(
-            Self::is_single_shard(&req, count, stripe_size).is_none(),
+            Self::for_single_shard(&req, count, stripe_size).is_none(),
             "unnecessary request split"
         );
 
         // Split the requests by shard index.
         let mut requests = HashMap::with_capacity(2); // common case
         let mut block_shards = Vec::with_capacity(req.block_numbers.len());
-        for blkno in req.block_numbers {
+        for &blkno in &req.block_numbers {
             let key = rel_block_to_key(req.rel, blkno);
             let shard_number = key_to_shard_number(count, stripe_size, &key);
             let shard_id = ShardIndex::new(shard_number, count);
 
-            let shard_req = requests
+            requests
                 .entry(shard_id)
                 .or_insert_with(|| page_api::GetPageRequest {
                     request_id: req.request_id,
@@ -77,27 +78,47 @@ impl GetPageSplitter {
                     rel: req.rel,
                     read_lsn: req.read_lsn,
                     block_numbers: Vec::new(),
-                });
-            shard_req.block_numbers.push(blkno);
+                })
+                .block_numbers
+                .push(blkno);
             block_shards.push(shard_id);
         }
 
-        Self {
+        // Construct a response to be populated by shard responses. Preallocate empty page slots
+        // with the expected block numbers.
+        let response = page_api::GetPageResponse {
             request_id: req.request_id,
-            responses: HashMap::with_capacity(requests.len()),
+            status_code: page_api::GetPageStatusCode::Ok,
+            reason: None,
+            rel: req.rel,
+            pages: req
+                .block_numbers
+                .into_iter()
+                .map(|block_number| {
+                    page_api::Page {
+                        block_number,
+                        image: Bytes::new(), // empty page slot to be filled in
+                    }
+                })
+                .collect(),
+        };
+
+        Self {
             requests,
+            response,
             block_shards,
         }
     }
 
-    /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
+    /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
     pub fn drain_requests(
         &mut self,
     ) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
         self.requests.drain()
     }
 
-    /// Adds a response from the given shard.
+    /// Adds a response from the given shard. The response must match the request ID and have an OK
+    /// status code. A response must not already exist for the given shard ID.
     #[allow(clippy::result_large_err)]
     pub fn add_response(
         &mut self,
@@ -105,68 +126,84 @@ impl GetPageSplitter {
         response: page_api::GetPageResponse,
     ) -> tonic::Result<()> {
         // The caller should already have converted status codes into tonic::Status.
-        assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok);
-
-        // Make sure the response matches the request ID.
-        if response.request_id != self.request_id {
+        if response.status_code != page_api::GetPageStatusCode::Ok {
             return Err(tonic::Status::internal(format!(
-                "response ID {} does not match request ID {}",
-                response.request_id, self.request_id
+                "unexpected non-OK response for shard {shard_id}: {} {}",
+                response.status_code,
+                response.reason.unwrap_or_default()
             )));
         }
 
-        // Add the response data to the map.
-        let old = self.responses.insert(shard_id, response.page_images);
-
-        if old.is_some() {
+        if response.request_id != self.response.request_id {
             return Err(tonic::Status::internal(format!(
-                "duplicate response for shard {shard_id}",
+                "response ID mismatch for shard {shard_id}: expected {}, got {}",
+                self.response.request_id, response.request_id
+            )));
+        }
+
+        // Place the shard response pages into the assembled response, in request order.
+        let mut pages = response.pages.into_iter();
+
+        for (i, &s) in self.block_shards.iter().enumerate() {
+            if shard_id != s {
+                continue;
+            }
+
+            let Some(slot) = self.response.pages.get_mut(i) else {
+                return Err(tonic::Status::internal(format!(
+                    "no block_shards slot {i} for shard {shard_id}"
+                )));
+            };
+            let Some(page) = pages.next() else {
+                return Err(tonic::Status::internal(format!(
+                    "missing page {} in shard {shard_id} response",
+                    slot.block_number
+                )));
+            };
+            if page.block_number != slot.block_number {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned wrong page at index {i}, expected {} got {}",
+                    slot.block_number, page.block_number
+                )));
+            }
+            if !slot.image.is_empty() {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned duplicate page {} at index {i}",
+                    slot.block_number
+                )));
+            }
+
+            *slot = page;
+        }
+
+        // Make sure we've consumed all pages from the shard response.
+        if let Some(extra_page) = pages.next() {
+            return Err(tonic::Status::internal(format!(
+                "shard {shard_id} returned extra page: {}",
+                extra_page.block_number
             )));
         }
 
         Ok(())
     }
 
-    /// Assembles the shard responses into a single response. Responses must be present for all
-    /// relevant shards, and the total number of pages must match the original request.
+    /// Fetches the final, assembled response.
     #[allow(clippy::result_large_err)]
-    pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
-        let mut response = page_api::GetPageResponse {
-            request_id: self.request_id,
-            status_code: page_api::GetPageStatusCode::Ok,
-            reason: None,
-            page_images: Vec::with_capacity(self.block_shards.len()),
-        };
-
-        // Set up per-shard page iterators we can pull from.
-        let mut shard_responses = HashMap::with_capacity(self.responses.len());
-        for (shard_id, responses) in self.responses {
-            shard_responses.insert(shard_id, responses.into_iter());
-        }
-
-        // Reassemble the responses in the same order as the original request.
-        for shard_id in &self.block_shards {
-            let page = shard_responses
-                .get_mut(shard_id)
-                .ok_or_else(|| {
-                    tonic::Status::internal(format!("missing response for shard {shard_id}"))
-                })?
-                .next()
-                .ok_or_else(|| {
-                    tonic::Status::internal(format!("missing page from shard {shard_id}"))
-                })?;
-            response.page_images.push(page);
-        }
-
-        // Make sure there are no additional pages.
-        for (shard_id, mut pages) in shard_responses {
-            if pages.next().is_some() {
+    pub fn get_response(self) -> tonic::Result<page_api::GetPageResponse> {
+        // Check that the response is complete.
+        for (i, page) in self.response.pages.iter().enumerate() {
+            if page.image.is_empty() {
                 return Err(tonic::Status::internal(format!(
-                    "extra pages returned from shard {shard_id}"
+                    "missing page {} for shard {}",
+                    page.block_number,
+                    self.block_shards
+                        .get(i)
+                        .map(|s| s.to_string())
+                        .unwrap_or_else(|| "?".to_string())
                 )));
             }
         }
 
-        Ok(response)
+        Ok(self.response)
     }
 }
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
index 7b70f0dc87..ba34fa1f69 100644
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
 pageserver_api.workspace = true
 remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
+serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
diff --git a/pageserver/ctl/src/download_remote_object.rs b/pageserver/ctl/src/download_remote_object.rs
new file mode 100644
index 0000000000..aa09774701
--- /dev/null
+++ b/pageserver/ctl/src/download_remote_object.rs
@@ -0,0 +1,85 @@
+use camino::Utf8PathBuf;
+use clap::Parser;
+use tokio_util::sync::CancellationToken;
+
+/// Download a specific object from remote storage to a local file.
+///
+/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment
+/// variable, in the same TOML format that the pageserver itself understands. This allows the
+/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3,
+/// Azure Blob Storage and local files), as long as the credentials are available via the
+/// standard environment variables expected by the underlying SDKs.
+///
+/// Examples for setting the environment variable:
+///
+/// ```bash
+/// # AWS S3 (region can also be provided via AWS_REGION)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }'
+///
+/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }'
+/// ```
+#[derive(Parser)]
+pub(crate) struct DownloadRemoteObjectCmd {
+    /// Key / path of the object to download (relative to the remote storage prefix).
+    ///
+    /// Examples:
+    ///   "wal/3aa8f.../00000001000000000000000A"
+    ///   "pageserver/v1/tenants/<tenant_id>/timelines/<timeline_id>/layer_12345"
+    pub remote_path: String,
+
+    /// Path of the local file to create. Existing file will be overwritten.
+    ///
+    /// Examples:
+    ///   "./segment"
+    ///   "/tmp/layer_12345.parquet"
+    pub output_file: Utf8PathBuf,
+}
+
+pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> {
+    use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig};
+
+    // Fetch remote storage configuration from the environment
+    let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| {
+        anyhow::anyhow!(
+            "'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config"
+        )
+    })?;
+
+    let config = RemoteStorageConfig::from_toml_str(&config_str)?;
+
+    // Initialise remote storage client
+    let storage = GenericRemoteStorage::from_config(&config).await?;
+
+    // RemotePath must be relative – leading slashes confuse the parser.
+    let remote_path_str = cmd.remote_path.trim_start_matches('/');
+    let remote_path = RemotePath::from_string(remote_path_str)?;
+
+    let cancel = CancellationToken::new();
+
+    println!(
+        "Downloading '{remote_path}' from remote storage bucket {:?} ...",
+        config.storage.bucket_name()
+    );
+
+    // Start the actual download
+    let download = storage
+        .download(&remote_path, &DownloadOpts::default(), &cancel)
+        .await?;
+
+    // Stream to file
+    let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+    let tmp_path = cmd.output_file.with_extension("tmp");
+    let mut file = tokio::fs::File::create(&tmp_path).await?;
+    tokio::io::copy(&mut reader, &mut file).await?;
+    file.sync_all().await?;
+    // Atomically move into place
+    tokio::fs::rename(&tmp_path, &cmd.output_file).await?;
+
+    println!(
+        "Downloaded to '{}'. Last modified: {:?}, etag: {}",
+        cmd.output_file, download.last_modified, download.etag
+    );
+
+    Ok(())
+}
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 838d00e490..9801f3c9dc 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,14 +1,16 @@
 use std::str::FromStr;
 
-use anyhow::Context;
+use anyhow::{Context, Ok};
 use camino::Utf8PathBuf;
 use pageserver::tenant::{
     IndexPart,
     layer_map::{LayerMap, SearchResult},
-    remote_timeline_client::remote_layer_path,
-    storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
+    remote_timeline_client::{index::LayerFileMetadata, remote_layer_path},
+    storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak},
 };
 use pageserver_api::key::Key;
+use serde::Serialize;
+use std::collections::BTreeMap;
 use utils::{
     id::{TenantId, TimelineId},
     lsn::Lsn,
@@ -33,6 +35,31 @@ pub(crate) enum IndexPartCmd {
         #[arg(long)]
         lsn: String,
     },
+    /// List all visible delta and image layers at the latest LSN.
+    ListVisibleLayers {
+        #[arg(long)]
+        path: Utf8PathBuf,
+    },
+}
+
+fn create_layer_map_from_index_part(
+    index_part: &IndexPart,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+) -> LayerMap {
+    let mut layer_map = LayerMap::default();
+    {
+        let mut updates = layer_map.batch_update();
+        for (key, value) in index_part.layer_metadata.iter() {
+            updates.insert_historic(PersistentLayerDesc::from_filename(
+                tenant_shard_id,
+                timeline_id,
+                key.clone(),
+                value.file_size,
+            ));
+        }
+    }
+    layer_map
 }
 
 async fn search_layers(
@@ -49,18 +76,7 @@ async fn search_layers(
         let bytes = tokio::fs::read(path).await?;
         IndexPart::from_json_bytes(&bytes).unwrap()
     };
-    let mut layer_map = LayerMap::default();
-    {
-        let mut updates = layer_map.batch_update();
-        for (key, value) in index_json.layer_metadata.iter() {
-            updates.insert_historic(PersistentLayerDesc::from_filename(
-                tenant_shard_id,
-                timeline_id,
-                key.clone(),
-                value.file_size,
-            ));
-        }
-    }
+    let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id);
     let key = Key::from_hex(key)?;
 
     let lsn = Lsn::from_str(lsn).unwrap();
@@ -98,6 +114,69 @@ async fn search_layers(
     Ok(())
 }
 
+#[derive(Debug, Clone, Serialize)]
+struct VisibleLayers {
+    pub total_images: u64,
+    pub total_image_bytes: u64,
+    pub total_deltas: u64,
+    pub total_delta_bytes: u64,
+    pub layer_metadata: BTreeMap<LayerName, LayerFileMetadata>,
+}
+
+impl VisibleLayers {
+    pub fn new() -> Self {
+        Self {
+            layer_metadata: BTreeMap::new(),
+            total_images: 0,
+            total_image_bytes: 0,
+            total_deltas: 0,
+            total_delta_bytes: 0,
+        }
+    }
+
+    pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) {
+        match name {
+            LayerName::Image(_) => {
+                self.total_images += 1;
+                self.total_image_bytes += layer.file_size;
+            }
+            LayerName::Delta(_) => {
+                self.total_deltas += 1;
+                self.total_delta_bytes += layer.file_size;
+            }
+        }
+        self.layer_metadata.insert(name, layer);
+    }
+}
+
+async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> {
+    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let timeline_id = TimelineId::generate();
+
+    let bytes = tokio::fs::read(path).await.context("read file")?;
+    let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
+    let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id);
+    let mut visible_layers = VisibleLayers::new();
+    let (layers, _key_space) = layer_map.get_visibility(Vec::new());
+    for (layer, visibility) in layers {
+        if visibility == LayerVisibilityHint::Visible {
+            visible_layers.add_layer(
+                layer.layer_name(),
+                index_part
+                    .layer_metadata
+                    .get(&layer.layer_name())
+                    .unwrap()
+                    .clone(),
+            );
+        }
+    }
+    let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?;
+    println!("{output}");
+
+    Ok(())
+}
+
 pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
     match cmd {
         IndexPartCmd::Dump { path } => {
@@ -114,5 +193,6 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
             key,
             lsn,
         } => search_layers(tenant_id, timeline_id, path, key, lsn).await,
+        IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await,
     }
 }
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 3cd4faaf2e..e84ad2c87f 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -4,6 +4,7 @@
 //!
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
 
+mod download_remote_object;
 mod draw_timeline_dir;
 mod index_part;
 mod key;
@@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime};
 
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use download_remote_object::DownloadRemoteObjectCmd;
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use page_trace::PageTraceCmd;
@@ -63,6 +65,7 @@ enum Commands {
     /// Debug print a hex key found from logs
     Key(key::DescribeKeyCommand),
     PageTrace(PageTraceCmd),
+    DownloadRemoteObject(DownloadRemoteObjectCmd),
 }
 
 /// Read and update pageserver metadata file
@@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> {
         }
         Commands::Key(dkc) => dkc.execute(),
         Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
+        Commands::DownloadRemoteObject(cmd) => {
+            download_remote_object::main(&cmd).await?;
+        }
     };
     Ok(())
 }
diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto
index 1d6c230916..d113a04a42 100644
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -153,7 +153,7 @@ message GetDbSizeResponse {
 message GetPageRequest {
   // A request ID. Will be included in the response. Should be unique for
   // in-flight requests on the stream.
-  uint64 request_id = 1;
+  RequestID request_id = 1;
   // The request class.
   GetPageClass request_class = 2;
   // The LSN to read at.
@@ -177,6 +177,14 @@ message GetPageRequest {
   repeated uint32 block_number = 5;
 }
 
+// A Request ID. Should be unique for in-flight requests on a stream. Included in the response.
+message RequestID {
+  // The base request ID.
+  uint64 id = 1;
+  // The request attempt. Starts at 0, incremented on each retry.
+  uint32 attempt = 2;
+}
+
 // A GetPageRequest class. Primarily intended for observability, but may also be
 // used for prioritization in the future.
 enum GetPageClass {
@@ -199,13 +207,26 @@ enum GetPageClass {
 // the entire batch is ready, so no one can make use of the individual pages.
 message GetPageResponse {
   // The original request's ID.
-  uint64 request_id = 1;
-  // The response status code.
+  RequestID request_id = 1;
+  // The response status code. If not OK, the rel and page fields will be empty.
   GetPageStatusCode status_code = 2;
   // A string describing the status, if any.
   string reason = 3;
-  // The 8KB page images, in the same order as the request. Empty if status_code != OK.
-  repeated bytes page_image = 4;
+  // The relation that the pages belong to.
+  RelTag rel = 4;
+  // The page(s), in the same order as the request.
+  repeated Page page = 5;
+}
+
+// A page.
+//
+// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block
+// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway.
+message Page {
+  // The page number.
+  uint32 block_number = 1;
+  // The materialized page image, as an 8KB byte vector.
+  bytes image = 2;
 }
 
 // A GetPageResponse status code.
diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs
index 6523d00d3d..f70d0e7b28 100644
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -1,4 +1,5 @@
 use anyhow::Context as _;
+use futures::future::ready;
 use futures::{Stream, StreamExt as _, TryStreamExt as _};
 use tokio::io::AsyncRead;
 use tokio_util::io::StreamReader;
@@ -110,7 +111,7 @@ impl Client {
     ) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
         let reqs = reqs.map(proto::GetPageRequest::from);
         let resps = self.inner.get_pages(reqs).await?.into_inner();
-        Ok(resps.map_ok(GetPageResponse::from))
+        Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into()))))
     }
 
     /// Returns the size of a relation, as # of blocks.
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index d0d3517d41..a3286ecf15 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -49,7 +49,7 @@ impl From<ProtocolError> for tonic::Status {
 }
 
 /// The LSN a request should read at.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Default)]
 pub struct ReadLsn {
     /// The request's read LSN.
     pub request_lsn: Lsn,
@@ -329,7 +329,7 @@ impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
 }
 
 /// Requests one or more pages.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct GetPageRequest {
     /// A request ID. Will be included in the response. Should be unique for in-flight requests on
     /// the stream.
@@ -356,7 +356,10 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
             return Err(ProtocolError::Missing("block_number"));
         }
         Ok(Self {
-            request_id: pb.request_id,
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
             request_class: pb.request_class.into(),
             read_lsn: pb
                 .read_lsn
@@ -371,7 +374,7 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
 impl From<GetPageRequest> for proto::GetPageRequest {
     fn from(request: GetPageRequest) -> Self {
         Self {
-            request_id: request.request_id,
+            request_id: Some(request.request_id.into()),
             request_class: request.request_class.into(),
             read_lsn: Some(request.read_lsn.into()),
             rel: Some(request.rel.into()),
@@ -380,16 +383,60 @@ impl From<GetPageRequest> for proto::GetPageRequest {
     }
 }
 
-/// A GetPage request ID.
-pub type RequestID = u64;
+/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct RequestID {
+    /// The base request ID.
+    pub id: u64,
+    // The request attempt. Starts at 0, incremented on each retry.
+    pub attempt: u32,
+}
+
+impl RequestID {
+    /// Creates a new RequestID with the given ID and an initial attempt of 0.
+    pub fn new(id: u64) -> Self {
+        Self { id, attempt: 0 }
+    }
+}
+
+impl Display for RequestID {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}.{}", self.id, self.attempt)
+    }
+}
+
+impl From<proto::RequestId> for RequestID {
+    fn from(pb: proto::RequestId) -> Self {
+        Self {
+            id: pb.id,
+            attempt: pb.attempt,
+        }
+    }
+}
+
+impl From<u64> for RequestID {
+    fn from(id: u64) -> Self {
+        Self::new(id)
+    }
+}
+
+impl From<RequestID> for proto::RequestId {
+    fn from(request_id: RequestID) -> Self {
+        Self {
+            id: request_id.id,
+            attempt: request_id.attempt,
+        }
+    }
+}
 
 /// A GetPage request class.
-#[derive(Clone, Copy, Debug, strum_macros::Display)]
+#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
 pub enum GetPageClass {
     /// Unknown class. For backwards compatibility: used when an older client version sends a class
     /// that a newer server version has removed.
     Unknown,
     /// A normal request. This is the default.
+    #[default]
     Normal,
     /// A prefetch request. NB: can only be classified on pg < 18.
     Prefetch,
@@ -397,19 +444,6 @@ pub enum GetPageClass {
     Background,
 }
 
-impl GetPageClass {
-    /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
-    /// latency-sensitive).
-    pub fn is_bulk(&self) -> bool {
-        match self {
-            Self::Unknown => false,
-            Self::Normal => false,
-            Self::Prefetch => true,
-            Self::Background => true,
-        }
-    }
-}
-
 impl From<proto::GetPageClass> for GetPageClass {
     fn from(pb: proto::GetPageClass) -> Self {
         match pb {
@@ -456,32 +490,41 @@ impl From<GetPageClass> for i32 {
 pub struct GetPageResponse {
     /// The original request's ID.
     pub request_id: RequestID,
-    /// The response status code.
+    /// The response status code. If not OK, the `rel` and `pages` fields will be empty.
     pub status_code: GetPageStatusCode,
     /// A string describing the status, if any.
     pub reason: Option<String>,
-    /// The 8KB page images, in the same order as the request. Empty if status != OK.
-    pub page_images: Vec<Bytes>,
+    /// The relation that the pages belong to.
+    pub rel: RelTag,
+    // The page(s), in the same order as the request.
+    pub pages: Vec<Page>,
 }
 
-impl From<proto::GetPageResponse> for GetPageResponse {
-    fn from(pb: proto::GetPageResponse) -> Self {
-        Self {
-            request_id: pb.request_id,
+impl TryFrom<proto::GetPageResponse> for GetPageResponse {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetPageResponse) -> Result<Self, ProtocolError> {
+        Ok(Self {
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
             status_code: pb.status_code.into(),
             reason: Some(pb.reason).filter(|r| !r.is_empty()),
-            page_images: pb.page_image,
-        }
+            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
+            pages: pb.page.into_iter().map(Page::from).collect(),
+        })
     }
 }
 
 impl From<GetPageResponse> for proto::GetPageResponse {
     fn from(response: GetPageResponse) -> Self {
         Self {
-            request_id: response.request_id,
+            request_id: Some(response.request_id.into()),
             status_code: response.status_code.into(),
             reason: response.reason.unwrap_or_default(),
-            page_image: response.page_images,
+            rel: Some(response.rel.into()),
+            page: response.pages.into_iter().map(proto::Page::from).collect(),
         }
     }
 }
@@ -514,11 +557,39 @@ impl GetPageResponse {
             request_id,
             status_code,
             reason: Some(status.message().to_string()),
-            page_images: Vec::new(),
+            rel: RelTag::default(),
+            pages: Vec::new(),
         })
     }
 }
 
+// A page.
+#[derive(Clone, Debug)]
+pub struct Page {
+    /// The page number.
+    pub block_number: u32,
+    /// The materialized page image, as an 8KB byte vector.
+    pub image: Bytes,
+}
+
+impl From<proto::Page> for Page {
+    fn from(pb: proto::Page) -> Self {
+        Self {
+            block_number: pb.block_number,
+            image: pb.image,
+        }
+    }
+}
+
+impl From<Page> for proto::Page {
+    fn from(page: Page) -> Self {
+        Self {
+            block_number: page.block_number,
+            image: page.image,
+        }
+    }
+}
+
 /// A GetPage response status code.
 ///
 /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml
index f5dfc0db25..609fef2b4f 100644
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -16,6 +16,7 @@ futures.workspace = true
 hdrhistogram.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
+pprof.workspace = true
 rand.workspace = true
 reqwest.workspace = true
 serde.workspace = true
@@ -27,8 +28,9 @@ tokio-util.workspace = true
 tonic.workspace = true
 url.workspace = true
 
-pageserver_client.workspace = true
 pageserver_api.workspace = true
+pageserver_client.workspace = true
+pageserver_client_grpc.workspace = true
 pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index f14caf548c..30b30d36f6 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -10,12 +10,14 @@ use anyhow::Context;
 use async_trait::async_trait;
 use bytes::Bytes;
 use camino::Utf8PathBuf;
+use futures::stream::FuturesUnordered;
 use futures::{Stream, StreamExt as _};
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
+use pageserver_client_grpc::{self as client_grpc, ShardSpec};
 use pageserver_page_api as page_api;
 use rand::prelude::*;
 use tokio::task::JoinSet;
@@ -37,6 +39,10 @@ pub(crate) struct Args {
     /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
     #[clap(long, default_value = "postgres://postgres@localhost:64000")]
     page_service_connstring: String,
+    /// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic
+    /// no-frills `page_api::Client`. Only valid with grpc:// connstrings.
+    #[clap(long)]
+    rich_client: bool,
     #[clap(long)]
     pageserver_jwt: Option<String>,
     #[clap(long, default_value = "1")]
@@ -332,6 +338,7 @@ async fn main_impl(
             let client: Box<dyn Client> = match scheme.as_str() {
                 "postgresql" | "postgres" => {
                     assert!(!args.compression, "libpq does not support compression");
+                    assert!(!args.rich_client, "rich client requires grpc://");
                     Box::new(
                         LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
                             .await
@@ -339,6 +346,16 @@ async fn main_impl(
                     )
                 }
 
+                "grpc" if args.rich_client => Box::new(
+                    RichGrpcClient::new(
+                        &args.page_service_connstring,
+                        worker_id.timeline,
+                        args.compression,
+                    )
+                    .await
+                    .unwrap(),
+                ),
+
                 "grpc" => Box::new(
                     GrpcClient::new(
                         &args.page_service_connstring,
@@ -657,7 +674,7 @@ impl Client for GrpcClient {
         blks: Vec<u32>,
     ) -> anyhow::Result<()> {
         let req = page_api::GetPageRequest {
-            request_id: req_id,
+            request_id: req_id.into(),
             request_class: page_api::GetPageClass::Normal,
             read_lsn: page_api::ReadLsn {
                 request_lsn: req_lsn,
@@ -677,6 +694,79 @@ impl Client for GrpcClient {
             "unexpected status code: {}",
             resp.status_code,
         );
-        Ok((resp.request_id, resp.page_images))
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
+    }
+}
+
+/// A rich gRPC Pageserver client.
+struct RichGrpcClient {
+    inner: Arc<client_grpc::PageserverClient>,
+    requests: FuturesUnordered<
+        Pin<Box<dyn Future<Output = anyhow::Result<page_api::GetPageResponse>> + Send>>,
+    >,
+}
+
+impl RichGrpcClient {
+    async fn new(
+        connstring: &str,
+        ttid: TenantTimelineId,
+        compression: bool,
+    ) -> anyhow::Result<Self> {
+        let inner = Arc::new(client_grpc::PageserverClient::new(
+            ttid.tenant_id,
+            ttid.timeline_id,
+            ShardSpec::new(
+                [(ShardIndex::unsharded(), connstring.to_string())].into(),
+                None,
+            )?,
+            None,
+            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
+        )?);
+        Ok(Self {
+            inner,
+            requests: FuturesUnordered::new(),
+        })
+    }
+}
+
+#[async_trait]
+impl Client for RichGrpcClient {
+    async fn send_get_page(
+        &mut self,
+        req_id: u64,
+        req_lsn: Lsn,
+        mod_lsn: Lsn,
+        rel: RelTag,
+        blks: Vec<u32>,
+    ) -> anyhow::Result<()> {
+        let req = page_api::GetPageRequest {
+            request_id: req_id.into(),
+            request_class: page_api::GetPageClass::Normal,
+            read_lsn: page_api::ReadLsn {
+                request_lsn: req_lsn,
+                not_modified_since_lsn: Some(mod_lsn),
+            },
+            rel,
+            block_numbers: blks,
+        };
+        let inner = self.inner.clone();
+        self.requests.push(Box::pin(async move {
+            inner
+                .get_page(req)
+                .await
+                .map_err(|err| anyhow::anyhow!("{err}"))
+        }));
+        Ok(())
+    }
+
+    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
+        let resp = self.requests.next().await.unwrap()?;
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
     }
 }
diff --git a/pageserver/pagebench/src/cmd/idle_streams.rs b/pageserver/pagebench/src/cmd/idle_streams.rs
new file mode 100644
index 0000000000..73bc9f3f46
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/idle_streams.rs
@@ -0,0 +1,127 @@
+use std::sync::Arc;
+
+use anyhow::anyhow;
+use futures::StreamExt;
+use tonic::transport::Endpoint;
+use tracing::info;
+
+use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag};
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+use utils::shard::ShardIndex;
+
+/// Starts a large number of idle gRPC GetPage streams.
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    /// The Pageserver to connect to. Must use grpc://.
+    #[clap(long, default_value = "grpc://localhost:51051")]
+    server: String,
+    /// The Pageserver HTTP API.
+    #[clap(long, default_value = "http://localhost:9898")]
+    http_server: String,
+    /// The number of streams to open.
+    #[clap(long, default_value = "100000")]
+    count: usize,
+    /// Number of streams per connection.
+    #[clap(long, default_value = "100")]
+    per_connection: usize,
+    /// Send a single GetPage request on each stream.
+    #[clap(long, default_value_t = false)]
+    send_request: bool,
+}
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?;
+
+    rt.block_on(main_impl(args))
+}
+
+async fn main_impl(args: Args) -> anyhow::Result<()> {
+    // Discover a tenant and timeline to use.
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        reqwest::Client::new(),
+        args.http_server.clone(),
+        None,
+    ));
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: Some(1),
+            targets: None,
+        },
+    )
+    .await?;
+    let ttid = timelines
+        .first()
+        .ok_or_else(|| anyhow!("no timelines found"))?;
+
+    // Set up the initial client.
+    let endpoint = Endpoint::from_shared(args.server.clone())?;
+
+    let connect = async || {
+        pageserver_page_api::Client::new(
+            endpoint.connect().await?,
+            ttid.tenant_id,
+            ttid.timeline_id,
+            ShardIndex::unsharded(),
+            None,
+            None,
+        )
+    };
+
+    let mut client = connect().await?;
+    let mut streams = Vec::with_capacity(args.count);
+
+    // Create streams.
+    for i in 0..args.count {
+        if i % 100 == 0 {
+            info!("opened {}/{} streams", i, args.count);
+        }
+        if i % args.per_connection == 0 && i > 0 {
+            client = connect().await?;
+        }
+
+        let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel();
+        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
+        let mut resp_stream = client.get_pages(req_stream).await?;
+
+        // Send request if specified.
+        if args.send_request {
+            req_tx.send(GetPageRequest {
+                request_id: 1.into(),
+                request_class: GetPageClass::Normal,
+                read_lsn: ReadLsn {
+                    request_lsn: Lsn::MAX,
+                    not_modified_since_lsn: Some(Lsn(1)),
+                },
+                rel: RelTag {
+                    spcnode: 1664, // pg_global
+                    dbnode: 0,     // shared database
+                    relnode: 1262, // pg_authid
+                    forknum: 0,    // init
+                },
+                block_numbers: vec![0],
+            })?;
+            let resp = resp_stream
+                .next()
+                .await
+                .transpose()?
+                .ok_or_else(|| anyhow!("no response"))?;
+            if resp.status_code != GetPageStatusCode::Ok {
+                return Err(anyhow!("{} response", resp.status_code));
+            }
+        }
+
+        // Hold onto streams to avoid closing them.
+        streams.push((req_tx, resp_stream));
+    }
+
+    info!("opened {} streams, sleeping", args.count);
+
+    // Block forever, to hold the idle streams open for inspection.
+    futures::future::pending::<()>().await;
+
+    Ok(())
+}
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
index 5527557450..ceca58e032 100644
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -1,4 +1,7 @@
+use std::fs::File;
+
 use clap::Parser;
+use tracing::info;
 use utils::logging;
 
 /// Re-usable pieces of code that aren't CLI-specific.
@@ -17,38 +20,73 @@ mod cmd {
     pub(super) mod aux_files;
     pub(super) mod basebackup;
     pub(super) mod getpage_latest_lsn;
+    pub(super) mod idle_streams;
     pub(super) mod ondemand_download_churn;
     pub(super) mod trigger_initial_size_calculation;
 }
 
 /// Component-level performance test for pageserver.
 #[derive(clap::Parser)]
-enum Args {
+struct Args {
+    /// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's
+    /// written, e.g. via --runtime.
+    #[arg(long)]
+    profile: bool,
+
+    #[command(subcommand)]
+    subcommand: Subcommand,
+}
+
+#[derive(clap::Subcommand)]
+enum Subcommand {
     Basebackup(cmd::basebackup::Args),
     GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
     TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
     OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
     AuxFiles(cmd::aux_files::Args),
+    IdleStreams(cmd::idle_streams::Args),
 }
 
-fn main() {
+fn main() -> anyhow::Result<()> {
     logging::init(
         logging::LogFormat::Plain,
         logging::TracingErrorLayerEnablement::Disabled,
         logging::Output::Stderr,
-    )
-    .unwrap();
+    )?;
     logging::replace_panic_hook_with_tracing_panic_hook().forget();
 
     let args = Args::parse();
-    match args {
-        Args::Basebackup(args) => cmd::basebackup::main(args),
-        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
-        Args::TriggerInitialSizeCalculation(args) => {
+
+    // Start a CPU profile if requested.
+    let mut profiler = None;
+    if args.profile {
+        profiler = Some(
+            pprof::ProfilerGuardBuilder::default()
+                .frequency(1000)
+                .blocklist(&["libc", "libgcc", "pthread", "vdso"])
+                .build()?,
+        );
+    }
+
+    match args.subcommand {
+        Subcommand::Basebackup(args) => cmd::basebackup::main(args),
+        Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
+        Subcommand::TriggerInitialSizeCalculation(args) => {
             cmd::trigger_initial_size_calculation::main(args)
         }
-        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
-        Args::AuxFiles(args) => cmd::aux_files::main(args),
+        Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
+        Subcommand::AuxFiles(args) => cmd::aux_files::main(args),
+        Subcommand::IdleStreams(args) => cmd::idle_streams::main(args),
+    }?;
+
+    // Generate a CPU flamegraph if requested.
+    if let Some(profiler) = profiler {
+        let report = profiler.report().build()?;
+        drop(profiler); // stop profiling
+        let file = File::create("profile.svg")?;
+        report.flamegraph(file)?;
+        info!("wrote CPU profile flamegraph to profile.svg")
     }
-    .unwrap()
+
+    Ok(())
 }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 36dada1e89..1a44c80e2d 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -114,7 +114,7 @@ where
     // Compute postgres doesn't have any previous WAL files, but the first
     // record that it's going to write needs to include the LSN of the
     // previous record (xl_prev). We include prev_record_lsn in the
-    // "zenith.signal" file, so that postgres can read it during startup.
+    // "neon.signal" file, so that postgres can read it during startup.
     //
     // We don't keep full history of record boundaries in the page server,
     // however, only the predecessor of the latest record on each
@@ -751,34 +751,39 @@ where
 
     //
     // Add generated pg_control file and bootstrap WAL segment.
-    // Also send zenith.signal file with extra bootstrap data.
+    // Also send neon.signal and zenith.signal file with extra bootstrap data.
     //
     async fn add_pgcontrol_file(
         &mut self,
         pg_control_bytes: Bytes,
         system_identifier: u64,
     ) -> Result<(), BasebackupError> {
-        // add zenith.signal file
-        let mut zenith_signal = String::new();
+        // add neon.signal file
+        let mut neon_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
             if self.timeline.is_ancestor_lsn(self.lsn) {
-                write!(zenith_signal, "PREV LSN: none")
+                write!(neon_signal, "PREV LSN: none")
                     .map_err(|e| BasebackupError::Server(e.into()))?;
             } else {
-                write!(zenith_signal, "PREV LSN: invalid")
+                write!(neon_signal, "PREV LSN: invalid")
                     .map_err(|e| BasebackupError::Server(e.into()))?;
             }
         } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
+            write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn)
                 .map_err(|e| BasebackupError::Server(e.into()))?;
         }
-        self.ar
-            .append(
-                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
-                zenith_signal.as_bytes(),
-            )
-            .await
-            .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
+
+        // TODO: Remove zenith.signal once all historical computes have been replaced
+        // ... and thus support the neon.signal file.
+        for signalfilename in ["neon.signal", "zenith.signal"] {
+            self.ar
+                .append(
+                    &new_tar_header(signalfilename, neon_signal.len() as u64)?,
+                    neon_signal.as_bytes(),
+                )
+                .await
+                .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?;
+        }
 
         //send pg_control
         let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 327384fd82..dfb8b437c3 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
 };
 use pageserver::tenant::{TenantSharedResources, mgr, secondary};
 use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
-    page_cache, page_service, task_mgr, virtual_file,
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
+    MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::crashsafe::syncfs;
 use utils::logging::TracingErrorLayerEnablement;
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};
 
@@ -763,6 +764,41 @@ fn start_pageserver(
         (http_task, https_task)
     };
 
+    /* BEGIN_HADRON */
+    let metrics_collection_task = {
+        let cancel = shutdown_pageserver.child_token();
+        let task = crate::BACKGROUND_RUNTIME.spawn({
+            let cancel = cancel.clone();
+            let background_jobs_barrier = background_jobs_barrier.clone();
+            async move {
+                if conf.force_metric_collection_on_scrape {
+                    return;
+                }
+
+                // first wait until background jobs are cleared to launch.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return; },
+                    _ = background_jobs_barrier.wait() => {}
+                };
+                let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    tokio::select! {
+                        _ = cancel.cancelled() => {
+                            tracing::info!("cancelled metrics collection task, exiting...");
+                             break;
+                        },
+                        _ = interval.tick() => {}
+                    }
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            }
+        });
+        MetricsCollectionTask(CancellableTask { task, cancel })
+    };
+    /* END_HADRON */
+
     let consumption_metrics_tasks = {
         let cancel = shutdown_pageserver.child_token();
         let task = crate::BACKGROUND_RUNTIME.spawn({
@@ -844,6 +880,7 @@ fn start_pageserver(
             https_endpoint_listener,
             page_service,
             page_service_grpc,
+            metrics_collection_task,
             consumption_metrics_tasks,
             disk_usage_eviction_task,
             &tenant_manager,
@@ -880,17 +917,15 @@ async fn create_remote_storage_client(
     // If `test_remote_failures` is non-zero, wrap the client with a
     // wrapper that simulates failures.
     if conf.test_remote_failures > 0 {
-        if !cfg!(feature = "testing") {
-            anyhow::bail!(
-                "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"
-            );
-        }
         info!(
             "Simulating remote failures for first {} attempts of each op",
             conf.test_remote_failures
         );
-        remote_storage =
-            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
+        remote_storage = GenericRemoteStorage::unreliable_wrapper(
+            remote_storage,
+            conf.test_remote_failures,
+            conf.test_remote_failures_probability,
+        );
     }
 
     Ok(remote_storage)
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 99d7e0ca3a..bb73ae1dd5 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -147,7 +147,11 @@ pub struct PageServerConf {
 
     pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
 
+    // The number of allowed failures in remote storage operations.
     pub test_remote_failures: u64,
+    // The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
+    // Use 100 for 100% failure, 0 for no failure.
+    pub test_remote_failures_probability: u64,
 
     pub ondemand_download_behavior_treat_error_as_warn: bool,
 
@@ -248,6 +252,14 @@ pub struct PageServerConf {
     pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
 
     pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
+
+    /// Defines what is a big tenant for the purpose of image layer generation.
+    /// See Timeline::should_check_if_image_layers_required
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
+
+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+    pub force_metric_collection_on_scrape: bool,
 }
 
 /// Token for authentication to safekeepers
@@ -392,6 +404,7 @@ impl PageServerConf {
             synthetic_size_calculation_interval,
             disk_usage_based_eviction,
             test_remote_failures,
+            test_remote_failures_probability,
             ondemand_download_behavior_treat_error_as_warn,
             background_task_maximum_delay,
             control_plane_api,
@@ -427,6 +440,8 @@ impl PageServerConf {
             posthog_config,
             timeline_import_config,
             basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -461,6 +476,7 @@ impl PageServerConf {
             synthetic_size_calculation_interval,
             disk_usage_based_eviction,
             test_remote_failures,
+            test_remote_failures_probability,
             ondemand_download_behavior_treat_error_as_warn,
             background_task_maximum_delay,
             control_plane_api: control_plane_api
@@ -484,6 +500,8 @@ impl PageServerConf {
             dev_mode,
             timeline_import_config,
             basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index f1f9aaf43c..be1de43d18 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                         listen_http_port: m.http_port,
                         listen_https_port: m.https_port,
                         availability_zone_id: az_id.expect("Checked above"),
+                        node_ip_addr: None,
                     })
                 }
                 Err(e) => {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3612686b5d..3e844a375d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2,12 +2,15 @@
 //! Management HTTP API
 //!
 use std::cmp::Reverse;
-use std::collections::{BinaryHeap, HashMap};
+use std::collections::BTreeMap;
+use std::collections::BinaryHeap;
+use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 
 use anyhow::{Context, Result, anyhow};
+use bytes::Bytes;
 use enumset::EnumSet;
 use futures::future::join_all;
 use futures::{StreamExt, TryFutureExt};
@@ -44,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId};
 use postgres_ffi::PgMajorVersion;
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
 use scopeguard::defer;
+use serde::{Deserialize, Serialize};
 use serde_json::json;
 use tenant_size_model::svg::SvgBranchKind;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -55,6 +59,7 @@ use utils::auth::SwappableJwtAuth;
 use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
+use wal_decoder::models::record::NeonWalRecord;
 
 use crate::config::PageServerConf;
 use crate::context;
@@ -75,12 +80,13 @@ use crate::tenant::remote_timeline_client::{
 };
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
+use crate::tenant::storage_layer::ValuesReconstructState;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
 use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
-    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
-    WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
+    CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout,
+    WaitLsnWaiter, import_pgdata,
 };
 use crate::tenant::{
     GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
@@ -395,6 +401,7 @@ async fn build_timeline_info(
     timeline: &Arc<Timeline>,
     include_non_incremental_logical_size: bool,
     force_await_initial_logical_size: bool,
+    include_image_consistent_lsn: bool,
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
     crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -419,6 +426,10 @@ async fn build_timeline_info(
                 .await?,
         );
     }
+    // HADRON
+    if include_image_consistent_lsn {
+        info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?);
+    }
     Ok(info)
 }
 
@@ -508,6 +519,8 @@ async fn build_timeline_info_common(
         is_invisible: Some(is_invisible),
 
         walreceiver_status,
+        // HADRON
+        image_consistent_lsn: None,
     };
     Ok(info)
 }
@@ -710,6 +723,8 @@ async fn timeline_list_handler(
         parse_query_param(&request, "include-non-incremental-logical-size")?;
     let force_await_initial_logical_size: Option<bool> =
         parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let state = get_state(&request);
@@ -730,6 +745,7 @@ async fn timeline_list_handler(
                 &timeline,
                 include_non_incremental_logical_size.unwrap_or(false),
                 force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                 &ctx,
             )
             .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -758,6 +774,9 @@ async fn timeline_and_offloaded_list_handler(
         parse_query_param(&request, "include-non-incremental-logical-size")?;
     let force_await_initial_logical_size: Option<bool> =
         parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
+
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let state = get_state(&request);
@@ -778,6 +797,7 @@ async fn timeline_and_offloaded_list_handler(
                 &timeline,
                 include_non_incremental_logical_size.unwrap_or(false),
                 force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                 &ctx,
             )
             .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -962,6 +982,9 @@ async fn timeline_detail_handler(
         parse_query_param(&request, "include-non-incremental-logical-size")?;
     let force_await_initial_logical_size: Option<bool> =
         parse_query_param(&request, "force-await-initial-logical-size")?;
+    // HADRON
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     // Logical size calculation needs downloading.
@@ -982,6 +1005,7 @@ async fn timeline_detail_handler(
             &timeline,
             include_non_incremental_logical_size.unwrap_or(false),
             force_await_initial_logical_size.unwrap_or(false),
+            include_image_consistent_lsn.unwrap_or(false),
             ctx,
         )
         .await
@@ -2500,9 +2524,10 @@ async fn timeline_checkpoint_handler(
                 .compact(&cancel, flags, &ctx)
                 .await
                 .map_err(|e|
-                    match e {
-                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::Other(e) => ApiError::InternalServerError(e),
+                    if e.is_cancel() {
+                        ApiError::ShuttingDown
+                    } else {
+                        ApiError::InternalServerError(e.into_anyhow())
                     }
                 )?;
         }
@@ -2687,6 +2712,16 @@ async fn deletion_queue_flush(
     }
 }
 
+/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+struct GetPageResponse {
+    pub page: Bytes,
+    pub layers_visited: u32,
+    pub delta_layers_visited: u32,
+    pub records: Vec<(Lsn, NeonWalRecord)>,
+    pub img: Option<(Lsn, Bytes)>,
+}
+
 async fn getpage_at_lsn_handler(
     request: Request<Body>,
     cancel: CancellationToken,
@@ -2737,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner(
 
         // Use last_record_lsn if no lsn is provided
         let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let page = timeline.get(key.0, lsn, &ctx).await?;
 
         if touch {
             json_response(StatusCode::OK, ())
         } else {
-            Result::<_, ApiError>::Ok(
-                Response::builder()
-                    .status(StatusCode::OK)
-                    .header(header::CONTENT_TYPE, "application/octet-stream")
-                    .body(hyper::Body::from(page))
-                    .unwrap(),
-            )
+            let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential());
+            let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?;
+            let response = GetPageResponse {
+                page,
+                layers_visited: reconstruct_state.get_layers_visited(),
+                delta_layers_visited: reconstruct_state.get_delta_layers_visited(),
+                records: reconstruct_state.debug_state.records.clone(),
+                img: reconstruct_state.debug_state.img.clone(),
+            };
+
+            json_response(StatusCode::OK, response)
         }
     }
-    .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
     .await
 }
 
@@ -3213,6 +3251,30 @@ async fn get_utilization(
         .map_err(ApiError::InternalServerError)
 }
 
+/// HADRON
+async fn list_tenant_visible_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let state = get_state(&request);
+
+    let mut map = BTreeMap::new();
+    for (tenant_shard_id, slot) in state.tenant_manager.list() {
+        match slot {
+            TenantSlot::Attached(tenant) => {
+                let visible_size = tenant.get_visible_size();
+                map.insert(tenant_shard_id, visible_size);
+            }
+            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
+                continue;
+            }
+        }
+    }
+
+    json_response(StatusCode::OK, map)
+}
+
 async fn list_aux_files(
     mut request: Request<Body>,
     _cancel: CancellationToken,
@@ -3616,6 +3678,7 @@ async fn activate_post_import_handler(
         let timeline_info = build_timeline_info(
             &timeline, false, // include_non_incremental_logical_size,
             false, // force_await_initial_logical_size
+            false, // include_image_consistent_lsn
             &ctx,
         )
         .await
@@ -3937,9 +4000,14 @@ pub fn make_router(
         .expect("construct launch timestamp header middleware"),
     );
 
+    let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape;
+
+    let prometheus_metrics_handler_wrapper =
+        move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
+
     Ok(router
         .data(state)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper))
         .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
         .get("/profile/heap", |r| request_span(r, profile_heap_handler))
         .get("/v1/status", |r| api_handler(r, status_handler))
@@ -4132,7 +4200,7 @@ pub fn make_router(
         })
         .get(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
-            |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
+            |r|  testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
         )
         .get(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
@@ -4145,6 +4213,7 @@ pub fn make_router(
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler))
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
             |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 96fe0c1078..409cc2e3c5 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -610,13 +610,13 @@ async fn import_file(
         debug!("imported twophase file");
     } else if file_path.starts_with("pg_wal") {
         debug!("found wal file in base section. ignore it");
-    } else if file_path.starts_with("zenith.signal") {
+    } else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") {
         // Parse zenith signal file to set correct previous LSN
         let bytes = read_all_bytes(reader).await?;
-        // zenith.signal format is "PREV LSN: prev_lsn"
+        // neon.signal format is "PREV LSN: prev_lsn"
         // TODO write serialization and deserialization in the same place.
-        let zenith_signal = std::str::from_utf8(&bytes)?.trim();
-        let prev_lsn = match zenith_signal {
+        let neon_signal = std::str::from_utf8(&bytes)?.trim();
+        let prev_lsn = match neon_signal {
             "PREV LSN: none" => Lsn(0),
             "PREV LSN: invalid" => Lsn(0),
             other => {
@@ -624,17 +624,17 @@ async fn import_file(
                 split[1]
                     .trim()
                     .parse::<Lsn>()
-                    .context("can't parse zenith.signal")?
+                    .context("can't parse neon.signal")?
             }
         };
 
-        // zenith.signal is not necessarily the last file, that we handle
+        // neon.signal is not necessarily the last file, that we handle
         // but it is ok to call `finish_write()`, because final `modification.commit()`
         // will update lsn once more to the final one.
         let writer = modification.tline.writer().await;
         writer.finish_write(prev_lsn);
 
-        debug!("imported zenith signal {}", prev_lsn);
+        debug!("imported neon signal {}", prev_lsn);
     } else if file_path.starts_with("pg_tblspc") {
         // TODO Backups exported from neon won't have pg_tblspc, but we will need
         // this to import arbitrary postgres databases.
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 0dd3c465e0..0864026f6b 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask);
 pub struct HttpsEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
+// HADRON
+pub struct MetricsCollectionTask(pub CancellableTask);
+
 impl CancellableTask {
     pub async fn shutdown(self) {
         self.cancel.cancel();
@@ -87,6 +90,7 @@ pub async fn shutdown_pageserver(
     https_listener: Option<HttpsEndpointListener>,
     page_service: page_service::Listener,
     grpc_task: Option<CancellableTask>,
+    metrics_collection_task: MetricsCollectionTask,
     consumption_metrics_worker: ConsumptionMetricsTasks,
     disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
     tenant_manager: &TenantManager,
@@ -211,6 +215,14 @@ pub async fn shutdown_pageserver(
     // Best effort to persist any outstanding deletions, to avoid leaking objects
     deletion_queue.shutdown(Duration::from_secs(5)).await;
 
+    // HADRON
+    timed(
+        metrics_collection_task.0.shutdown(),
+        "shutdown metrics collections metrics",
+        Duration::from_secs(1),
+    )
+    .await;
+
     timed(
         consumption_metrics_worker.0.shutdown(),
         "shutdown consumption metrics",
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index eb89e166b2..1b783326a0 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2847,6 +2847,24 @@ pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(||
     .expect("failed to define a metric")
 });
 
+// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories:
+// - success
+// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors)
+// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.)
+pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_pagestream_handler_results_total",
+        "Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)",
+        &["outcome"]
+    )
+    .expect("failed to define a metric")
+});
+
+// Constants for pageserver_pagestream_handler_results_total's outcome labels
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error";
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 70fdb2e789..1fc7e4eac7 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -70,7 +70,7 @@ use crate::context::{
 };
 use crate::metrics::{
     self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
-    MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
+    MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
@@ -1441,20 +1441,57 @@ impl PageServerHandler {
             let (response_msg, ctx) = match handler_result {
                 Err(e) => match &e.err {
                     PageStreamError::Shutdown => {
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR])
+                            .inc();
+                        // END HADRON
+
                         // If we fail to fulfil a request during shutdown, which may be _because_ of
                         // shutdown, then do not send the error to the client.  Instead just drop the
                         // connection.
                         span.in_scope(|| info!("dropping connection due to shutdown"));
                         return Err(QueryError::Shutdown);
                     }
-                    PageStreamError::Reconnect(reason) => {
-                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                    PageStreamError::Reconnect(_reason) => {
+                        span.in_scope(|| {
+                            // BEGIN HADRON
+                            // We can get here because the compute node is pointing at the wrong PS. We
+                            // already have a metric to keep track of this so suppressing this log to
+                            // reduce log spam. The information in this log message is not going to be that
+                            // helpful given the volume of logs that can be generated.
+                            // info!("handler requested reconnect: {reason}")
+                            // END HADRON
+                        });
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[
+                                metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                            ])
+                            .inc();
+                        // END HADRON
                         return Err(QueryError::Reconnect);
                     }
                     PageStreamError::Read(_)
                     | PageStreamError::LsnTimeout(_)
                     | PageStreamError::NotFound(_)
                     | PageStreamError::BadRequest(_) => {
+                        // BEGIN HADRON
+                        if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                                ])
+                                .inc();
+                        } else {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR,
+                                ])
+                                .inc();
+                        }
+                        // END HADRON
+
                         // print the all details to the log with {:#}, but for the client the
                         // error message is enough.  Do not log if shutting down, as the anyhow::Error
                         // here includes cancellation which is not an error.
@@ -1472,7 +1509,15 @@ impl PageServerHandler {
                         )
                     }
                 },
-                Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)),
+                Ok((response_msg, _op_timer_already_observed, ctx)) => {
+                    // BEGIN HADRON
+                    PAGESTREAM_HANDLER_RESULTS_TOTAL
+                        .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS])
+                        .inc();
+                    // END HADRON
+
+                    (response_msg, Some(ctx))
+                }
             };
 
             let ctx = ctx.map(|req_ctx| {
@@ -3293,9 +3338,12 @@ impl GrpcPageServiceHandler {
     }
 
     /// Generates a PagestreamRequest header from a ReadLsn and request ID.
-    fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest {
+    fn make_hdr(
+        read_lsn: page_api::ReadLsn,
+        req_id: Option<page_api::RequestID>,
+    ) -> PagestreamRequest {
         PagestreamRequest {
-            reqid: req_id,
+            reqid: req_id.map(|r| r.id).unwrap_or_default(),
             request_lsn: read_lsn.request_lsn,
             not_modified_since: read_lsn
                 .not_modified_since_lsn
@@ -3405,7 +3453,7 @@ impl GrpcPageServiceHandler {
 
             batch.push(BatchedGetPageRequest {
                 req: PagestreamGetPageRequest {
-                    hdr: Self::make_hdr(req.read_lsn, req.request_id),
+                    hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)),
                     rel: req.rel,
                     blkno,
                 },
@@ -3435,12 +3483,16 @@ impl GrpcPageServiceHandler {
             request_id: req.request_id,
             status_code: page_api::GetPageStatusCode::Ok,
             reason: None,
-            page_images: Vec::with_capacity(results.len()),
+            rel: req.rel,
+            pages: Vec::with_capacity(results.len()),
         };
 
         for result in results {
             match result {
-                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page),
+                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page {
+                    block_number: r.req.blkno,
+                    image: r.page,
+                }),
                 Ok((resp, _, _)) => {
                     return Err(tonic::Status::internal(format!(
                         "unexpected response: {resp:?}"
@@ -3483,7 +3535,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(rel=%req.rel, lsn=%req.read_lsn);
 
         let req = PagestreamExistsRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             rel: req.rel,
         };
 
@@ -3633,7 +3685,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
 
         let req = PagestreamDbSizeRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             dbnode: req.db_oid,
         };
 
@@ -3683,7 +3735,7 @@ impl proto::PageService for GrpcPageServiceHandler {
                 .await?
                 .downgrade();
             while let Some(req) = reqs.message().await? {
-                let req_id = req.request_id;
+                let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
                 let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                     .instrument(span.clone()) // propagate request span
                     .await;
@@ -3722,7 +3774,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(rel=%req.rel, lsn=%req.read_lsn);
 
         let req = PagestreamNblocksRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             rel: req.rel,
         };
 
@@ -3755,7 +3807,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
 
         let req = PagestreamGetSlruSegmentRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             kind: req.kind as u8,
             segno: req.segno,
         };
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f576119db8..1a3016e7f1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3291,7 +3291,7 @@ impl TenantShard {
                         // Ignore this, we likely raced with unarchival.
                         OffloadError::NotArchived => Ok(()),
                         OffloadError::AlreadyInProgress => Ok(()),
-                        OffloadError::Cancelled => Err(CompactionError::ShuttingDown),
+                        OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
                         // don't break the anyhow chain
                         OffloadError::Other(err) => Err(CompactionError::Other(err)),
                     })?;
@@ -3321,16 +3321,13 @@ impl TenantShard {
 
     /// Trips the compaction circuit breaker if appropriate.
     pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
-        match err {
-            err if err.is_cancel() => {}
-            CompactionError::ShuttingDown => (),
-            CompactionError::Other(err) => {
-                self.compaction_circuit_breaker
-                    .lock()
-                    .unwrap()
-                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
-            }
+        if err.is_cancel() {
+            return;
         }
+        self.compaction_circuit_breaker
+            .lock()
+            .unwrap()
+            .fail(&CIRCUIT_BREAKERS_BROKEN, err);
     }
 
     /// Cancel scheduled compaction tasks
@@ -3396,7 +3393,13 @@ impl TenantShard {
                 .collect_vec();
 
             for timeline in timelines {
-                timeline.maybe_freeze_ephemeral_layer().await;
+                // Include a span with the timeline ID. The parent span already has the tenant ID.
+                let span =
+                    info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id);
+                timeline
+                    .maybe_freeze_ephemeral_layer()
+                    .instrument(span)
+                    .await;
             }
         }
 
@@ -4174,6 +4177,15 @@ impl TenantShard {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
+    // HADRON
+    pub fn get_image_creation_timeout(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf.image_layer_force_creation_period.or(self
+            .conf
+            .default_tenant_conf
+            .image_layer_force_creation_period)
+    }
+
     pub fn get_pitr_interval(&self) -> Duration {
         let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
@@ -5713,6 +5725,16 @@ impl TenantShard {
             .unwrap_or(0)
     }
 
+    /// HADRON
+    /// Return the visible size of all timelines in this tenant.
+    pub(crate) fn get_visible_size(&self) -> u64 {
+        let timelines = self.timelines.lock().unwrap();
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .sum()
+    }
+
     /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
     /// manifest in `Self::remote_tenant_manifest`.
     ///
@@ -12800,6 +12822,40 @@ mod tests {
                 },
             ]
         );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
+        let tenant_conf = pageserver_api::models::TenantConfig {
+            pitr_interval: Some(Duration::from_secs(7 * 3600)),
+            image_layer_force_creation_period: Some(Duration::from_secs(3600)),
+            ..Default::default()
+        };
+
+        let tenant_id = TenantId::generate();
+
+        let harness = TenantHarness::create_custom(
+            "test_get_force_image_creation_lsn",
+            tenant_conf,
+            tenant_id,
+            ShardIdentity::unsharded(),
+            Generation::new(1),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
+        {
+            let writer = timeline.writer().await;
+            writer.finish_write(Lsn(5000));
+        }
+
+        let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
+        assert_eq!(image_creation_lsn, Lsn(4300));
         Ok(())
     }
 }
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 23052ccee7..ba02602cfe 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -46,10 +46,11 @@
 mod historic_layer_coverage;
 mod layer_coverage;
 
-use std::collections::{HashMap, VecDeque};
+use std::collections::{BTreeMap, HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
+use std::time::Instant;
 
 use anyhow::Result;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
@@ -904,6 +905,103 @@ impl LayerMap {
         max_stacked_deltas
     }
 
+    /* BEGIN_HADRON */
+    /**
+     * Compute the image consistent LSN, the largest LSN below which all pages have been redone successfully.
+     * It works by first finding the latest image layers and store them into a map. Then for each delta layer,
+     * find all overlapping image layers in order to potentially increase the image LSN in case there are gaps
+     * (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], then we can increase
+     * image LSN to 150 because there is no WAL record in between).
+     * Finally, the image consistent LSN is computed by taking the minimum of all image layers.
+     */
+    pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn {
+        struct ImageLayerInfo {
+            // creation LSN of the image layer
+            image_lsn: Lsn,
+            // the current minimum LSN of newer delta layers with overlapping key ranges
+            min_delta_lsn: Lsn,
+        }
+        let started_at = Instant::now();
+
+        let min_l0_deltas_lsn = {
+            let l0_deltas = self.level0_deltas();
+            l0_deltas
+                .iter()
+                .map(|layer| layer.get_lsn_range().start)
+                .min()
+                .unwrap_or(disk_consistent_lsn)
+        };
+        let global_key_range = Key::MIN..Key::MAX;
+
+        // step 1: collect all most recent image layers into a map
+        // map: end key to image_layer_info
+        let mut image_map: BTreeMap<Key, ImageLayerInfo> = BTreeMap::new();
+        for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) {
+            let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0));
+            image_map.insert(
+                img_range.end,
+                ImageLayerInfo {
+                    image_lsn: img_lsn,
+                    min_delta_lsn: min_l0_deltas_lsn,
+                },
+            );
+        }
+
+        // step 2: go through all delta layers, and update the image layer info with overlapping
+        // key ranges
+        for layer in self.historic.iter() {
+            if !layer.is_delta {
+                continue;
+            }
+            let delta_key_range = layer.get_key_range();
+            let delta_lsn_range = layer.get_lsn_range();
+            for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) {
+                debug_assert!(img_end_key >= &delta_key_range.start);
+                if delta_lsn_range.end > img_info.image_lsn {
+                    // the delta layer includes WAL records after the image
+                    // it's possibel that the delta layer's start LSN < image LSN, which will be simply ignored by step 3
+                    img_info.min_delta_lsn =
+                        std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start);
+                }
+                if img_end_key >= &delta_key_range.end {
+                    // we have fully processed all overlapping image layers
+                    break;
+                }
+            }
+        }
+
+        // step 3, go through all image layers and find the image consistent LSN
+        let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap();
+        let mut prev_key = Key::MIN;
+        for (img_key, img_info) in image_map {
+            tracing::debug!(
+                "Image layer {:?}:{} has min delta lsn {}",
+                Range {
+                    start: prev_key,
+                    end: img_key,
+                },
+                img_info.image_lsn,
+                img_info.min_delta_lsn,
+            );
+            let image_lsn = std::cmp::max(
+                img_info.image_lsn,
+                img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)),
+            );
+            img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn);
+            prev_key = img_key;
+        }
+        tracing::info!(
+            "computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layrs in total.",
+            img_consistent_lsn,
+            disk_consistent_lsn,
+            started_at.elapsed().as_millis(),
+            self.historic.len()
+        );
+        img_consistent_lsn
+    }
+
+    /* END_HADRON */
+
     /// Return all L0 delta layers
     pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
         &self.l0_delta_layers
@@ -1579,6 +1677,138 @@ mod tests {
             LayerVisibilityHint::Visible
         ));
     }
+
+    /* BEGIN_HADRON */
+    #[test]
+    fn test_compute_image_consistent_lsn() {
+        let mut layer_map = LayerMap::default();
+
+        let disk_consistent_lsn = Lsn(1000);
+        // case 1: empty layer map
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(
+            disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(),
+            image_consistent_lsn
+        );
+
+        // case 2: only L0 delta layer
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(100),
+                Lsn(900)..Lsn(990),
+                true,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(100),
+                Lsn(850)..Lsn(899),
+                true,
+            ));
+        }
+
+        // should use min L0 delta LSN - 1 as image consistent LSN
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(849), image_consistent_lsn);
+
+        // case 3: 3 images, no L1 delta
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(40),
+                Lsn(100)..Lsn(100),
+                false,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(40)..Key::from_i128(70),
+                Lsn(200)..Lsn(200),
+                false,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(70)..Key::from_i128(100),
+                Lsn(150)..Lsn(150),
+                false,
+            ));
+        }
+        // should use min L0 delta LSN - 1 as image consistent LSN
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(849), image_consistent_lsn);
+
+        // case 4: 3 images with 1 L1 delta
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(50),
+                Lsn(300)..Lsn(350),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(299), image_consistent_lsn);
+
+        // case 5: 3 images with 1 more L1 delta with smaller LSN
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(50)..Key::from_i128(72),
+                Lsn(200)..Lsn(300),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 6: 3 images with more newer L1 deltas (no impact on final results)
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(30),
+                Lsn(400)..Lsn(500),
+                true,
+            ));
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(35)..Key::from_i128(100),
+                Lsn(450)..Lsn(600),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 7: 3 images with more older L1 deltas (no impact on final results)
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(40),
+                Lsn(0)..Lsn(50),
+                true,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(50)..Key::from_i128(100),
+                Lsn(10)..Lsn(60),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 8: 3 images with one more L1 delta with overlapping LSN range
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(50),
+                Lsn(50)..Lsn(250),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(100), image_consistent_lsn);
+    }
+
+    /* END_HADRON */
 }
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 15853d3614..52f67abde5 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1678,6 +1678,8 @@ impl TenantManager {
         // Phase 6: Release the InProgress on the parent shard
         drop(parent_slot_guard);
 
+        utils::pausable_failpoint!("shard-split-post-finish-pause");
+
         Ok(child_shards)
     }
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 9fbb9d2438..43ea8fffa3 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -75,7 +75,7 @@ where
 /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
 /// call, to collect more records.
 ///
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) struct ValueReconstructState {
     pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
     pub(crate) img: Option<(Lsn, Bytes)>,
@@ -308,6 +308,9 @@ pub struct ValuesReconstructState {
     layers_visited: u32,
     delta_layers_visited: u32,
 
+    pub(crate) enable_debug: bool,
+    pub(crate) debug_state: ValueReconstructState,
+
     pub(crate) io_concurrency: IoConcurrency,
     num_active_ios: Arc<AtomicUsize>,
 
@@ -657,6 +660,23 @@ impl ValuesReconstructState {
             layers_visited: 0,
             delta_layers_visited: 0,
             io_concurrency,
+            enable_debug: false,
+            debug_state: ValueReconstructState::default(),
+            num_active_ios: Arc::new(AtomicUsize::new(0)),
+            read_path: None,
+        }
+    }
+
+    pub(crate) fn new_with_debug(io_concurrency: IoConcurrency) -> Self {
+        Self {
+            keys: HashMap::new(),
+            keys_done: KeySpaceRandomAccum::new(),
+            keys_with_image_coverage: None,
+            layers_visited: 0,
+            delta_layers_visited: 0,
+            io_concurrency,
+            enable_debug: true,
+            debug_state: ValueReconstructState::default(),
             num_active_ios: Arc::new(AtomicUsize::new(0)),
             read_path: None,
         }
@@ -670,6 +690,12 @@ impl ValuesReconstructState {
         self.io_concurrency.spawn_io(fut).await;
     }
 
+    pub(crate) fn set_debug_state(&mut self, debug_state: &ValueReconstructState) {
+        if self.enable_debug {
+            self.debug_state = debug_state.clone();
+        }
+    }
+
     pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
         self.layers_visited += 1;
         if let ReadableLayer::PersistentLayer(layer) = layer {
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index 0f7995f87b..973852defc 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -225,7 +225,7 @@ impl fmt::Display for ImageLayerName {
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
 /// and [`crate::tenant::storage_layer::layer::local_layer_path`])
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)]
 pub enum LayerName {
     Image(ImageLayerName),
     Delta(DeltaLayerName),
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index bcece5589a..08fc7d61a5 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -17,17 +17,14 @@ use tracing::*;
 use utils::backoff::exponential_backoff_duration;
 use utils::completion::Barrier;
 use utils::pausable_failpoint;
-use utils::sync::gate::GateError;
 
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
-use crate::tenant::blob_io::WriteBlobError;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::compaction::CompactionOutcome;
 use crate::tenant::{TenantShard, TenantState};
-use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
 
 /// Semaphore limiting concurrent background tasks (across all tenants).
 ///
@@ -310,45 +307,12 @@ pub(crate) fn log_compaction_error(
     task_cancelled: bool,
     degrade_to_warning: bool,
 ) {
-    use CompactionError::*;
+    let is_cancel = err.is_cancel();
 
-    use crate::tenant::PageReconstructError;
-    use crate::tenant::upload_queue::NotInitialized;
-
-    let level = match err {
-        e if e.is_cancel() => return,
-        ShuttingDown => return,
-        _ if task_cancelled => Level::INFO,
-        Other(err) => {
-            let root_cause = err.root_cause();
-
-            let upload_queue = root_cause
-                .downcast_ref::<NotInitialized>()
-                .is_some_and(|e| e.is_stopping());
-            let timeline = root_cause
-                .downcast_ref::<PageReconstructError>()
-                .is_some_and(|e| e.is_cancel());
-            let buffered_writer_flush_task_canelled = root_cause
-                .downcast_ref::<FlushTaskError>()
-                .is_some_and(|e| e.is_cancel());
-            let write_blob_cancelled = root_cause
-                .downcast_ref::<WriteBlobError>()
-                .is_some_and(|e| e.is_cancel());
-            let gate_closed = root_cause
-                .downcast_ref::<GateError>()
-                .is_some_and(|e| e.is_cancel());
-            let is_stopping = upload_queue
-                || timeline
-                || buffered_writer_flush_task_canelled
-                || write_blob_cancelled
-                || gate_closed;
-
-            if is_stopping {
-                Level::INFO
-            } else {
-                Level::ERROR
-            }
-        }
+    let level = if is_cancel || task_cancelled {
+        Level::INFO
+    } else {
+        Level::ERROR
     };
 
     if let Some((error_count, sleep_duration)) = retry_info {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6088f40669..73d2d72b59 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1002,7 +1002,7 @@ impl From<WaitLsnError> for tonic::Status {
 impl From<CreateImageLayersError> for CompactionError {
     fn from(e: CreateImageLayersError) -> Self {
         match e {
-            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Cancelled => CompactionError::new_cancelled(),
             CreateImageLayersError::Other(e) => {
                 CompactionError::Other(e.context("create image layers"))
             }
@@ -1253,6 +1253,57 @@ impl Timeline {
         }
     }
 
+    #[inline(always)]
+    pub(crate) async fn debug_get(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) -> Result<Bytes, PageReconstructError> {
+        if !lsn.is_valid() {
+            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
+        }
+
+        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
+        // already checked the key against the shard_identity when looking up the Timeline from
+        // page_service.
+        debug_assert!(!self.shard_identity.is_key_disposable(&key));
+
+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
+        let vectored_res = self
+            .debug_get_vectored_impl(query, reconstruct_state, ctx)
+            .await;
+
+        let key_value = vectored_res?.pop_first();
+        match key_value {
+            Some((got_key, value)) => {
+                if got_key != key {
+                    error!(
+                        "Expected {}, but singular vectored get returned {}",
+                        key, got_key
+                    );
+                    Err(PageReconstructError::Other(anyhow!(
+                        "Singular vectored get returned wrong key"
+                    )))
+                } else {
+                    value
+                }
+            }
+            None => Err(PageReconstructError::MissingKey(Box::new(
+                MissingKeyError {
+                    keyspace: KeySpace::single(key..key.next()),
+                    shard: self.shard_identity.get_shard_number(&key),
+                    original_hwm_lsn: lsn,
+                    ancestor_lsn: None,
+                    backtrace: None,
+                    read_path: None,
+                    query: None,
+                },
+            ))),
+        }
+    }
+
     pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;
 
     /// Look up multiple page versions at a given LSN
@@ -1547,6 +1598,98 @@ impl Timeline {
         Ok(results)
     }
 
+    // A copy of the get_vectored_impl method except that we store the image and wal records into `reconstruct_state`.
+    // This is only used in the http getpage call for debugging purpose.
+    pub(super) async fn debug_get_vectored_impl(
+        &self,
+        query: VersionedKeySpaceQuery,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if query.is_empty() {
+            return Ok(BTreeMap::default());
+        }
+
+        let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
+            Some(ReadPath::new(
+                query.total_keyspace(),
+                query.high_watermark_lsn()?,
+            ))
+        } else {
+            None
+        };
+
+        reconstruct_state.read_path = read_path;
+
+        let traversal_res: Result<(), _> = self
+            .get_vectored_reconstruct_data(query.clone(), reconstruct_state, ctx)
+            .await;
+
+        if let Err(err) = traversal_res {
+            // Wait for all the spawned IOs to complete.
+            // See comments on `spawn_io` inside `storage_layer` for more details.
+            let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
+                .into_values()
+                .map(|state| state.collect_pending_ios())
+                .collect::<FuturesUnordered<_>>();
+            while collect_futs.next().await.is_some() {}
+            return Err(err);
+        };
+
+        let reconstruct_state = Arc::new(Mutex::new(reconstruct_state));
+        let futs = FuturesUnordered::new();
+
+        for (key, state) in std::mem::take(&mut reconstruct_state.lock().unwrap().keys) {
+            let req_lsn_for_key = query.map_key_to_lsn(&key);
+            futs.push({
+                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
+                let rc_clone = Arc::clone(&reconstruct_state);
+
+                async move {
+                    assert_eq!(state.situation, ValueReconstructSituation::Complete);
+
+                    let converted = match state.collect_pending_ios().await {
+                        Ok(ok) => ok,
+                        Err(err) => {
+                            return (key, Err(err));
+                        }
+                    };
+                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);
+
+                    // The walredo module expects the records to be descending in terms of Lsn.
+                    // And we submit the IOs in that order, so, there shuold be no need to sort here.
+                    debug_assert!(
+                        converted
+                            .records
+                            .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)),
+                        "{converted:?}"
+                    );
+                    {
+                        let mut guard = rc_clone.lock().unwrap();
+                        guard.set_debug_state(&converted);
+                    }
+                    (
+                        key,
+                        walredo_self
+                            .reconstruct_value(
+                                key,
+                                req_lsn_for_key,
+                                converted,
+                                RedoAttemptType::ReadPage,
+                            )
+                            .await,
+                    )
+                }
+            });
+        }
+
+        let results = futs
+            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
+            .await;
+
+        Ok(results)
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub(crate) fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
@@ -1893,6 +2036,8 @@ impl Timeline {
     // an ephemeral layer open forever when idle.  It also freezes layers if the global limit on
     // ephemeral layer bytes has been breached.
     pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
         let Ok(mut write_guard) = self.write_lock.try_lock() else {
             // If the write lock is held, there is an active wal receiver: rolling open layers
             // is their responsibility while they hold this lock.
@@ -2117,12 +2262,7 @@ impl Timeline {
         match &result {
             Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
             Err(e) if e.is_cancel() => {}
-            Err(CompactionError::ShuttingDown) => {
-                // Covered by the `Err(e) if e.is_cancel()` branch.
-            }
-            Err(CompactionError::Other(_)) => {
-                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
-            }
+            Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed),
         };
 
         result
@@ -2851,6 +2991,18 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
+    // HADRON
+    fn get_image_layer_force_creation_period(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .image_layer_force_creation_period
+            .or(self
+                .conf
+                .default_tenant_conf
+                .image_layer_force_creation_period)
+    }
+
     fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
         let tenant_conf = &self.tenant_conf.load();
         tenant_conf
@@ -3120,7 +3272,6 @@ impl Timeline {
                 repartition_threshold: 0,
                 last_image_layer_creation_check_at: AtomicLsn::new(0),
                 last_image_layer_creation_check_instant: Mutex::new(None),
-
                 last_received_wal: Mutex::new(None),
                 rel_size_latest_cache: RwLock::new(HashMap::new()),
                 rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
@@ -5041,6 +5192,7 @@ impl Timeline {
                 .create_image_layers(
                     &partitions,
                     self.initdb_lsn,
+                    None,
                     ImageLayerCreationMode::Initial,
                     ctx,
                     LastImageLayerCreationStatus::Initial,
@@ -5312,14 +5464,19 @@ impl Timeline {
     }
 
     // Is it time to create a new image layer for the given partition? True if we want to generate.
-    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
+    async fn time_for_new_image_layer(
+        &self,
+        partition: &KeySpace,
+        lsn: Lsn,
+        force_image_creation_lsn: Option<Lsn>,
+    ) -> bool {
         let threshold = self.get_image_creation_threshold();
 
         let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
         let Ok(layers) = guard.layer_map() else {
             return false;
         };
-
+        let mut min_image_lsn: Lsn = Lsn::MAX;
         let mut max_deltas = 0;
         for part_range in &partition.ranges {
             let image_coverage = layers.image_coverage(part_range, lsn);
@@ -5354,9 +5511,25 @@ impl Timeline {
                         return true;
                     }
                 }
+                min_image_lsn = min(min_image_lsn, img_lsn);
             }
         }
 
+        // HADRON
+        // for child timelines, we consider all pages up to ancestor_LSN are redone successfully by the parent timeline
+        min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn());
+        if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
+            info!(
+                "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}, num deltas: {}",
+                partition.ranges[0].start,
+                partition.ranges[0].end,
+                min_image_lsn,
+                force_image_creation_lsn.unwrap(),
+                max_deltas
+            );
+            return true;
+        }
+
         debug!(
             max_deltas,
             "none of the partitioned ranges had >= {threshold} deltas"
@@ -5576,13 +5749,14 @@ impl Timeline {
     /// Predicate function which indicates whether we should check if new image layers
     /// are required. Since checking if new image layers are required is expensive in
     /// terms of CPU, we only do it in the following cases:
-    /// 1. If the timeline has ingested sufficient WAL to justify the cost
+    /// 1. If the timeline has ingested sufficient WAL to justify the cost or ...
     /// 2. If enough time has passed since the last check:
     ///     1. For large tenants, we wish to perform the check more often since they
-    ///        suffer from the lack of image layers
+    ///        suffer from the lack of image layers. Note that we assume sharded tenants
+    ///        to be large since non-zero shards do not track the logical size.
     ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
     fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
-        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
+        let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;
 
         let last_checks_at = self.last_image_layer_creation_check_at.load();
         let distance = lsn
@@ -5593,30 +5767,39 @@ impl Timeline {
 
         let distance_based_decision = distance.0 >= min_distance;
 
-        let mut time_based_decision = false;
         let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
-        if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
-            let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
-            {
-                self.get_checkpoint_timeout()
-            } else {
-                Duration::from_secs(3600 * 48)
-            };
-
-            time_based_decision = match *last_check_instant {
-                Some(last_check) => {
-                    let elapsed = last_check.elapsed();
-                    elapsed >= check_required_after
+        let check_required_after = (|| {
+            if self.shard_identity.is_unsharded() {
+                if let CurrentLogicalSize::Exact(logical_size) =
+                    self.current_logical_size.current_size()
+                {
+                    if Some(Into::<u64>::into(&logical_size)) < large_timeline_threshold {
+                        return Duration::from_secs(3600 * 48);
+                    }
                 }
-                None => true,
-            };
-        }
+            }
+
+            self.get_checkpoint_timeout()
+        })();
+
+        let time_based_decision = match *last_check_instant {
+            Some(last_check) => {
+                let elapsed = last_check.elapsed();
+                elapsed >= check_required_after
+            }
+            None => true,
+        };
 
         // Do the expensive delta layer counting only if this timeline has ingested sufficient
         // WAL since the last check or a checkpoint timeout interval has elapsed since the last
         // check.
         let decision = distance_based_decision || time_based_decision;
-
+        tracing::info!(
+            "Decided to check image layers: {}. Distance-based decision: {}, time-based decision: {}",
+            decision,
+            distance_based_decision,
+            time_based_decision
+        );
         if decision {
             self.last_image_layer_creation_check_at.store(lsn);
             *last_check_instant = Some(Instant::now());
@@ -5629,10 +5812,12 @@ impl Timeline {
     /// true = we have generate all image layers, false = we preempt the process for L0 compaction.
     ///
     /// `partition_mode` is only for logging purpose and is not used anywhere in this function.
+    #[allow(clippy::too_many_arguments)]
     async fn create_image_layers(
         self: &Arc<Timeline>,
         partitioning: &KeyPartitioning,
         lsn: Lsn,
+        force_image_creation_lsn: Option<Lsn>,
         mode: ImageLayerCreationMode,
         ctx: &RequestContext,
         last_status: LastImageLayerCreationStatus,
@@ -5736,7 +5921,11 @@ impl Timeline {
             } else if let ImageLayerCreationMode::Try = mode {
                 // check_for_image_layers = false -> skip
                 // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
-                if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
+                if !check_for_image_layers
+                    || !self
+                        .time_for_new_image_layer(partition, lsn, force_image_creation_lsn)
+                        .await
+                {
                     start = img_range.end;
                     continue;
                 }
@@ -6057,26 +6246,88 @@ impl Drop for Timeline {
     }
 }
 
-/// Top-level failure to compact.
-#[derive(Debug, thiserror::Error)]
-pub(crate) enum CompactionError {
-    #[error("The timeline or pageserver is shutting down")]
-    ShuttingDown,
-    #[error(transparent)]
-    Other(anyhow::Error),
-}
+pub(crate) use compaction_error::CompactionError;
+/// In a private mod to enforce that [`CompactionError::is_cancel`] is used
+/// instead of `match`ing on [`CompactionError::ShuttingDown`].
+mod compaction_error {
+    use utils::sync::gate::GateError;
 
-impl CompactionError {
-    /// Errors that can be ignored, i.e., cancel and shutdown.
-    pub fn is_cancel(&self) -> bool {
-        matches!(self, Self::ShuttingDown)
+    use crate::{
+        pgdatadir_mapping::CollectKeySpaceError,
+        tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized},
+        virtual_file::owned_buffers_io::write::FlushTaskError,
+    };
+
+    /// Top-level failure to compact. Use [`Self::is_cancel`].
+    #[derive(Debug, thiserror::Error)]
+    pub(crate) enum CompactionError {
+        /// Use [`Self::is_cancel`] instead of checking for this variant.
+        #[error("The timeline or pageserver is shutting down")]
+        #[allow(private_interfaces)]
+        ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`].
+        #[error(transparent)]
+        Other(anyhow::Error),
     }
 
-    pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
-        if err.is_cancel() {
-            Self::ShuttingDown
-        } else {
-            Self::Other(err.into_anyhow())
+    #[derive(Debug)]
+    struct ForbidMatching;
+
+    impl CompactionError {
+        pub fn new_cancelled() -> Self {
+            Self::ShuttingDown(ForbidMatching)
+        }
+        /// Errors that can be ignored, i.e., cancel and shutdown.
+        pub fn is_cancel(&self) -> bool {
+            let other = match self {
+                CompactionError::ShuttingDown(_) => return true,
+                CompactionError::Other(other) => other,
+            };
+
+            // The write path of compaction in particular often lacks differentiated
+            // handling errors stemming from cancellation from other errors.
+            // So, if requested, we also check the ::Other variant by downcasting.
+            // The list below has been found empirically from flaky tests and production logs.
+            // The process is simple: on ::Other(), compaction will print the enclosed
+            // anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the
+            // line where the write path / compaction code does undifferentiated error handling
+            // from a non-anyhow type to an anyhow type. Add the type to the list of downcasts
+            // below, following the same is_cancel() pattern.
+
+            let root_cause = other.root_cause();
+
+            let upload_queue = root_cause
+                .downcast_ref::<NotInitialized>()
+                .is_some_and(|e| e.is_stopping());
+            let timeline = root_cause
+                .downcast_ref::<PageReconstructError>()
+                .is_some_and(|e| e.is_cancel());
+            let buffered_writer_flush_task_canelled = root_cause
+                .downcast_ref::<FlushTaskError>()
+                .is_some_and(|e| e.is_cancel());
+            let write_blob_cancelled = root_cause
+                .downcast_ref::<WriteBlobError>()
+                .is_some_and(|e| e.is_cancel());
+            let gate_closed = root_cause
+                .downcast_ref::<GateError>()
+                .is_some_and(|e| e.is_cancel());
+            upload_queue
+                || timeline
+                || buffered_writer_flush_task_canelled
+                || write_blob_cancelled
+                || gate_closed
+        }
+        pub fn into_anyhow(self) -> anyhow::Error {
+            match self {
+                CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self),
+                CompactionError::Other(e) => e,
+            }
+        }
+        pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
+            if err.is_cancel() {
+                Self::new_cancelled()
+            } else {
+                Self::Other(err.into_anyhow())
+            }
         }
     }
 }
@@ -6088,7 +6339,7 @@ impl From<super::upload_queue::NotInitialized> for CompactionError {
                 CompactionError::Other(anyhow::anyhow!(value))
             }
             super::upload_queue::NotInitialized::ShuttingDown
-            | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown,
+            | super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(),
         }
     }
 }
@@ -6098,7 +6349,7 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
         match e {
             super::storage_layer::layer::DownloadError::TimelineShutdown
             | super::storage_layer::layer::DownloadError::DownloadCancelled => {
-                CompactionError::ShuttingDown
+                CompactionError::new_cancelled()
             }
             super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
             | super::storage_layer::layer::DownloadError::DownloadRequired
@@ -6117,14 +6368,14 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
 
 impl From<layer_manager::Shutdown> for CompactionError {
     fn from(_: layer_manager::Shutdown) -> Self {
-        CompactionError::ShuttingDown
+        CompactionError::new_cancelled()
     }
 }
 
 impl From<super::storage_layer::errors::PutError> for CompactionError {
     fn from(e: super::storage_layer::errors::PutError) -> Self {
         if e.is_cancel() {
-            CompactionError::ShuttingDown
+            CompactionError::new_cancelled()
         } else {
             CompactionError::Other(e.into_anyhow())
         }
@@ -6223,7 +6474,7 @@ impl Timeline {
         let mut guard = tokio::select! {
             guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
             _ = self.cancel.cancelled() => {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
         };
 
@@ -7050,6 +7301,19 @@ impl Timeline {
             .unwrap()
             .clone()
     }
+
+    /* BEGIN_HADRON */
+    pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result<Lsn> {
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::ComputeImageConsistentLsn)
+            .await;
+        let layer_map = guard.layer_map()?;
+        let disk_consistent_lsn = self.get_disk_consistent_lsn();
+
+        Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn))
+    }
+    /* END_HADRON */
 }
 
 impl Timeline {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index c263df1eb2..aa1aa937b6 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,6 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.
 
+use std::cmp::min;
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
@@ -572,8 +573,8 @@ impl GcCompactionQueue {
         }
         match res {
             Ok(res) => Ok(res),
-            Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
-            Err(CompactionError::Other(_)) => {
+            Err(e) if e.is_cancel() => Err(e),
+            Err(_) => {
                 // There are some cases where traditional gc might collect some layer
                 // files causing gc-compaction cannot read the full history of the key.
                 // This needs to be resolved in the long-term by improving the compaction
@@ -1260,13 +1261,16 @@ impl Timeline {
         // Is the timeline being deleted?
         if self.is_stopping() {
             trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
         }
 
         let target_file_size = self.get_checkpoint_distance();
 
         // Define partitioning schema if needed
 
+        // HADRON
+        let force_image_creation_lsn = self.get_force_image_creation_lsn();
+
         // 1. L0 Compact
         let l0_outcome = {
             let timer = self.metrics.compact_time_histo.start_timer();
@@ -1274,6 +1278,7 @@ impl Timeline {
                 .compact_level0(
                     target_file_size,
                     options.flags.contains(CompactFlags::ForceL0Compaction),
+                    force_image_creation_lsn,
                     ctx,
                 )
                 .await?;
@@ -1376,6 +1381,7 @@ impl Timeline {
                     .create_image_layers(
                         &partitioning,
                         lsn,
+                        force_image_creation_lsn,
                         mode,
                         &image_ctx,
                         self.last_image_layer_creation_status
@@ -1472,6 +1478,41 @@ impl Timeline {
         Ok(CompactionOutcome::Done)
     }
 
+    /* BEGIN_HADRON */
+    // Get the force image creation LSN based on gc_cutoff_lsn.
+    // Note that this is an estimation and the workload rate may suddenly change. When that happens,
+    // the force image creation may be too early or too late, but eventually it should be able to catch up.
+    pub(crate) fn get_force_image_creation_lsn(self: &Arc<Self>) -> Option<Lsn> {
+        let image_creation_period = self.get_image_layer_force_creation_period()?;
+        let current_lsn = self.get_last_record_lsn();
+        let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?;
+        let pitr_interval = self.get_pitr_interval();
+        if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() {
+            tracing::warn!(
+                "pitr LSN/interval not found, skipping force image creation LSN calculation"
+            );
+            return None;
+        }
+
+        let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0
+            * image_creation_period.as_secs()
+            / pitr_interval.as_secs();
+        let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0));
+
+        tracing::info!(
+            "Tenant shard {} computed force_image_creation_lsn: {}. Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}",
+            self.tenant_shard_id,
+            force_image_creation_lsn,
+            current_lsn,
+            image_creation_period,
+            pitr_lsn,
+            pitr_interval
+        );
+
+        Some(force_image_creation_lsn)
+    }
+    /* END_HADRON */
+
     /// Check for layers that are elegible to be rewritten:
     /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
     ///   we don't indefinitely retain keys in this shard that aren't needed.
@@ -1624,7 +1665,7 @@ impl Timeline {
 
         for (i, layer) in layers_to_rewrite.into_iter().enumerate() {
             if self.cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
 
             info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total);
@@ -1722,7 +1763,7 @@ impl Timeline {
                     Ok(()) => {},
                     Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
                     Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                        return Err(CompactionError::ShuttingDown);
+                        return Err(CompactionError::new_cancelled());
                     }
                 },
                 // Don't wait if there's L0 compaction to do. We don't need to update the outcome
@@ -1801,6 +1842,7 @@ impl Timeline {
         self: &Arc<Self>,
         target_file_size: u64,
         force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<CompactionOutcome, CompactionError> {
         let CompactLevel0Phase1Result {
@@ -1821,6 +1863,7 @@ impl Timeline {
                 stats,
                 target_file_size,
                 force_compaction_ignore_threshold,
+                force_compaction_lsn,
                 &ctx,
             )
             .instrument(phase1_span)
@@ -1843,6 +1886,7 @@ impl Timeline {
         mut stats: CompactLevel0Phase1StatsBuilder,
         target_file_size: u64,
         force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<CompactLevel0Phase1Result, CompactionError> {
         let begin = tokio::time::Instant::now();
@@ -1872,11 +1916,28 @@ impl Timeline {
                     return Ok(CompactLevel0Phase1Result::default());
                 }
             } else {
-                debug!(
-                    level0_deltas = level0_deltas.len(),
-                    threshold, "too few deltas to compact"
-                );
-                return Ok(CompactLevel0Phase1Result::default());
+                // HADRON
+                let min_lsn = level0_deltas
+                    .iter()
+                    .map(|a| a.get_lsn_range().start)
+                    .reduce(min);
+                if force_compaction_lsn.is_some()
+                    && min_lsn.is_some()
+                    && min_lsn.unwrap() < force_compaction_lsn.unwrap()
+                {
+                    info!(
+                        "forcing L0 compaction of {} L0 deltas. Min lsn: {}, force compaction lsn: {}",
+                        level0_deltas.len(),
+                        min_lsn.unwrap(),
+                        force_compaction_lsn.unwrap()
+                    );
+                } else {
+                    debug!(
+                        level0_deltas = level0_deltas.len(),
+                        threshold, "too few deltas to compact"
+                    );
+                    return Ok(CompactLevel0Phase1Result::default());
+                }
             }
         }
 
@@ -1985,7 +2046,7 @@ impl Timeline {
             let mut all_keys = Vec::new();
             for l in deltas_to_compact.iter() {
                 if self.cancel.is_cancelled() {
-                    return Err(CompactionError::ShuttingDown);
+                    return Err(CompactionError::new_cancelled());
                 }
                 let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
                 let keys = delta
@@ -2078,7 +2139,7 @@ impl Timeline {
         stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
 
         if self.cancel.is_cancelled() {
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
         }
 
         stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
@@ -2186,7 +2247,7 @@ impl Timeline {
                 // avoid hitting the cancellation token on every key. in benches, we end up
                 // shuffling an order of million keys per layer, this means we'll check it
                 // around tens of times per layer.
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
 
             let same_key = prev_key == Some(key);
@@ -2271,7 +2332,7 @@ impl Timeline {
                 if writer.is_none() {
                     if self.cancel.is_cancelled() {
                         // to be somewhat responsive to cancellation, check for each new layer
-                        return Err(CompactionError::ShuttingDown);
+                        return Err(CompactionError::new_cancelled());
                     }
                     // Create writer if not initiaized yet
                     writer = Some(
@@ -2527,7 +2588,7 @@ impl Timeline {
         // Is the timeline being deleted?
         if self.is_stopping() {
             trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
         }
 
         let (dense_ks, _sparse_ks) = self
@@ -3189,7 +3250,7 @@ impl Timeline {
         let gc_lock = async {
             tokio::select! {
                 guard = self.gc_lock.lock() => Ok(guard),
-                _ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
+                _ = cancel.cancelled() => Err(CompactionError::new_cancelled()),
             }
         };
 
@@ -3462,7 +3523,7 @@ impl Timeline {
             }
             total_layer_size += layer.layer_desc().file_size;
             if cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
             let should_yield = yield_for_l0
                 && self
@@ -3609,7 +3670,7 @@ impl Timeline {
             }
 
             if cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
 
             let should_yield = yield_for_l0
diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 33c97287c0..7bca66190f 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -359,14 +359,14 @@ impl<T: Types> Cache<T> {
                 Err(e) => {
                     // Retry on tenant manager error to handle tenant split more gracefully
                     if attempt < GET_MAX_RETRIES {
-                        tracing::warn!(
-                            "Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
-                            attempt,
-                            e
-                        );
                         tokio::time::sleep(RETRY_BACKOFF).await;
                         continue;
                     } else {
+                        tracing::warn!(
+                            "Failed to resolve tenant shard after {} attempts: {:?}",
+                            GET_MAX_RETRIES,
+                            e
+                        );
                         return Err(e);
                     }
                 }
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 2eccf48579..d8d81a6c91 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -47,6 +47,7 @@ pub(crate) enum LayerManagerLockHolder {
     ImportPgData,
     DetachAncestor,
     Eviction,
+    ComputeImageConsistentLsn,
     #[cfg(test)]
     Testing,
 }
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index c6d3cafe9a..f053c9ed37 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -147,6 +147,16 @@ pub enum RedoAttemptType {
     GcCompaction,
 }
 
+impl std::fmt::Display for RedoAttemptType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RedoAttemptType::ReadPage => write!(f, "read page"),
+            RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"),
+            RedoAttemptType::GcCompaction => write!(f, "gc compaction"),
+        }
+    }
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -199,6 +209,7 @@ impl PostgresRedoManager {
                         self.conf.wal_redo_timeout,
                         pg_version,
                         max_retry_attempts,
+                        redo_attempt_type,
                     )
                     .await
                 };
@@ -221,6 +232,7 @@ impl PostgresRedoManager {
                 self.conf.wal_redo_timeout,
                 pg_version,
                 max_retry_attempts,
+                redo_attempt_type,
             )
             .await
         }
@@ -445,6 +457,7 @@ impl PostgresRedoManager {
         wal_redo_timeout: Duration,
         pg_version: PgMajorVersion,
         max_retry_attempts: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, Error> {
         *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
 
@@ -485,17 +498,28 @@ impl PostgresRedoManager {
                 );
 
                 if let Err(e) = result.as_ref() {
-                    error!(
-                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                        records.len(),
-                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                        nbytes,
-                        base_img_lsn,
-                        lsn,
-                        n_attempts,
-                        e,
-                    );
+                    macro_rules! message {
+                        ($level:tt) => {
+                            $level!(
+                                "error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                                records.len(),
+                                records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                                records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                                nbytes,
+                                key,
+                                redo_attempt_type,
+                                base_img_lsn,
+                                lsn,
+                                n_attempts,
+                                e,
+                            )
+                        }
+                    }
+                    match redo_attempt_type {
+                        RedoAttemptType::ReadPage => message!(error),
+                        RedoAttemptType::LegacyCompaction => message!(error),
+                        RedoAttemptType::GcCompaction => message!(warn),
+                    }
                 }
 
                 result.map_err(Error::Other)
diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c
index bd53855eab..158b8940a3 100644
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -421,7 +421,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
 {
 	if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse)
 	{
-		neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld",
+		neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=" UINT64_FORMAT ", ring_flush=" UINT64_FORMAT ", ring_unused=" UINT64_FORMAT "",
 					   resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused);
 	}
 	if (neon_protocol_version >= 3)
@@ -438,7 +438,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
 				getpage_resp->req.blkno != slot->buftag.blockNum)
 			{
 				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-											"Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
+											"Receive unexpected getpage response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
 											resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
 											slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum);
 			}
@@ -447,7 +447,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
 				 resp->lsn != slot->request_lsns.request_lsn ||
 				 resp->not_modified_since != slot->request_lsns.not_modified_since)
 		{
-			elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+			elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 				 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 				 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
 		}
@@ -496,9 +496,9 @@ communicator_prefetch_pump_state(void)
 			slot->my_ring_index != MyPState->ring_receive)
 		{
 			neon_shard_log(slot->shard_no, PANIC,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
 						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+						   slot->my_ring_index, MyPState->ring_receive);
 		}
 		/* update prefetch state */
 		MyPState->n_responses_buffered += 1;
@@ -789,9 +789,9 @@ prefetch_read(PrefetchRequest *slot)
 		slot->my_ring_index != MyPState->ring_receive)
 	{
 		neon_shard_log(slot->shard_no, PANIC,
-					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
+					   "Incorrect prefetch read: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
 					   slot->status, slot->response,
-					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
+					   slot->my_ring_index, MyPState->ring_receive);
 	}
 
 	/*
@@ -816,9 +816,9 @@ prefetch_read(PrefetchRequest *slot)
 			slot->my_ring_index != MyPState->ring_receive)
 		{
 			neon_shard_log(shard_no, PANIC,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
 						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+						   slot->my_ring_index, MyPState->ring_receive);
 		}
 
 		/* update prefetch state */
@@ -852,8 +852,8 @@ prefetch_read(PrefetchRequest *slot)
 		 * and the prefetch queue was flushed during the receive call
 		 */
 		neon_shard_log(shard_no, LOG,
-					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   (long) my_ring_index,
+					   "No response from reading prefetch entry " UINT64_FORMAT ": %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   my_ring_index,
 					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
 					   buftag.forkNum, buftag.blockNum);
 		return false;
@@ -1844,7 +1844,7 @@ nm_to_string(NeonMessage *msg)
 				NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
 
 				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
-				appendStringInfo(&s, ", \"db_size\": %ld}",
+				appendStringInfo(&s, ", \"db_size\": " INT64_FORMAT "}",
 								 msg_resp->db_size);
 				appendStringInfoChar(&s, '}');
 
@@ -2045,7 +2045,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
 						exists_resp->req.forknum != request.forknum)
 					{
 						NEON_PANIC_CONNECTION_STATE(0, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
 					}
@@ -2058,14 +2058,14 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
 				{
 					if (!equal_requests(resp, &request.hdr))
 					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 					}
 				}
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 								resp->reqid,
 								RelFileInfoFmt(rinfo),
 								forkNum,
@@ -2241,7 +2241,7 @@ Retry:
 			case T_NeonErrorResponse:
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[shard %d, reqid " UINT64_HEX_FORMAT "] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
 								slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
 								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
 						 errdetail("page server returned error: %s",
@@ -2294,7 +2294,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
 						relsize_resp->req.forknum != forknum)
 					{
 						NEON_PANIC_CONNECTION_STATE(0, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
 					}
@@ -2307,14 +2307,14 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
 				{
 					if (!equal_requests(resp, &request.hdr))
 					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 					}
 				}
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 								resp->reqid,
 								RelFileInfoFmt(rinfo),
 								forknum,
@@ -2364,7 +2364,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
 						dbsize_resp->req.dbNode != dbNode)
 					{
 						NEON_PANIC_CONNECTION_STATE(0, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
+													"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
 					}
@@ -2377,14 +2377,14 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
 				{
 					if (!equal_requests(resp, &request.hdr))
 					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 					}
 				}
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read db size of db %u from page server at lsn %X/%08X",
 								resp->reqid,
 								dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
 						 errdetail("page server returned error: %s",
@@ -2455,7 +2455,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 					slru_resp->req.segno != segno)
 				{
 					NEON_PANIC_CONNECTION_STATE(0, PANIC,
-												"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
+												"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
 												resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
 												request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
 				}
@@ -2469,14 +2469,14 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 			{
 				if (!equal_requests(resp, &request.hdr))
 				{
-					elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+					elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 						 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 						 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 				}
 			}
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X",
+					 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read SLRU %d segment %llu at lsn %X/%08X",
 							resp->reqid,
 							kind,
 							(unsigned long long) segno,
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 8cfa09bc87..2c87f139af 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -162,8 +162,34 @@ typedef struct FileCacheControl
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
 	dlist_head  holes;          /* double linked list of punched holes */
-	HyperLogLogState wss_estimation; /* estimation of working set size */
+
 	ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
+
+	/*
+	 * Estimation of working set size.
+	 *
+	 * This is not guarded by the lock. No locking is needed because all the
+	 * writes to the "registers" are simple 64-bit stores, to update a
+	 * timestamp. We assume that:
+	 *
+	 * - 64-bit stores are atomic. We could enforce that by using
+	 *   pg_atomic_uint64 instead of TimestampTz as the datatype in hll.h, but
+	 *   for now we just rely on it implicitly.
+	 *
+	 * - Even if they're not, and there is a race between two stores, it
+	 *   doesn't matter much which one wins because they're both updating the
+	 *   register with the current timestamp. Or you have a race between
+	 *   resetting the register and updating it, in which case it also doesn't
+	 *   matter much which one wins.
+	 *
+	 * - If they're not atomic, you might get an occasional "torn write" if
+	 *   you're really unlucky, but we tolerate that too. It just means that
+	 *   the estimate will be a little off, until the register is updated
+	 *   again.
+	 */
+	HyperLogLogState wss_estimation;
+
+	/* Prewarmer state */
 	PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
 	size_t n_prewarm_workers;
 	size_t n_prewarm_entries;
@@ -205,6 +231,8 @@ bool AmPrewarmWorker;
 
 #define LFC_ENABLED() (lfc_ctl->limit != 0)
 
+PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
+
 /*
  * Close LFC file if opened.
  * All backends should close their LFC files once LFC is disabled.
@@ -1142,6 +1170,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 
+	/* Update working set size estimate for the blocks */
+	for (int i = 0; i < nblocks; i++)
+	{
+		tag.blockNum = blkno + i;
+		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	}
+
 	/*
 	 * For every chunk that has blocks we're interested in, we
 	 * 1. get the chunk header
@@ -1220,14 +1255,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		}
 
 		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
-
-		/* Approximate working set for the blocks assumed in this entry */
-		for (int i = 0; i < blocks_in_chunk; i++)
-		{
-			tag.blockNum = blkno + i;
-			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-		}
-
 		if (entry == NULL)
 		{
 			/* Pages are not cached */
@@ -1504,9 +1531,15 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		return false;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	tag.forkNum = forknum;
 
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+	/* Update working set size estimate for the blocks */
+	if (lfc_prewarm_update_ws_estimation)
+	{
+		tag.blockNum = blkno;
+		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	}
 
 	tag.blockNum = blkno - chunk_offs;
 	hash = get_hash_value(lfc_hash, &tag);
@@ -1524,19 +1557,13 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 
 	if (lwlsn > lsn)
 	{
-		elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
+		elog(DEBUG1, "Skip LFC write for %u because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
 			 blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn));
 		LWLockRelease(lfc_lock);
 		return false;
 	}
 
 	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
-	if (lfc_prewarm_update_ws_estimation)
-	{
-		tag.blockNum = blkno;
-		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-	}
 	if (found)
 	{
 		state = GET_STATE(entry, chunk_offs);
@@ -1649,9 +1676,15 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		return;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	tag.forkNum = forkNum;
 
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+	/* Update working set size estimate for the blocks */
+	for (int i = 0; i < nblocks; i++)
+	{
+		tag.blockNum = blkno + i;
+		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	}
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
 
@@ -1692,14 +1725,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		cv = &lfc_ctl->cv[hash % N_COND_VARS];
 
 		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
-		/* Approximate working set for the blocks assumed in this entry */
-		for (int i = 0; i < blocks_in_chunk; i++)
-		{
-			tag.blockNum = blkno + i;
-			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-		}
-
 		if (found)
 		{
 			/*
@@ -2135,40 +2160,23 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		SRF_RETURN_DONE(funcctx);
 }
 
-PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
 
-Datum
-approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+/*
+ * Internal implementation of the approximate_working_set_size_seconds()
+ * function.
+ */
+int32
+lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
 {
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
-		LWLockAcquire(lfc_lock, LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
-}
+	int32		dc;
 
-PG_FUNCTION_INFO_V1(approximate_working_set_size);
+	if (lfc_size_limit == 0)
+		return -1;
 
-Datum
-approximate_working_set_size(PG_FUNCTION_ARGS)
-{
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		bool reset = PG_GETARG_BOOL(0);
-		LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
-		if (reset)
-			memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
+	dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
+	if (reset)
+		memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
+	return dc;
 }
 
 PG_FUNCTION_INFO_V1(get_local_cache_state);
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
index d5ac55d5ba..14e5d4f753 100644
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -47,7 +47,8 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk
 extern FileCacheState* lfc_get_state(size_t max_entries);
 extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
 
-PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
+extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
+
 
 static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 3b6c4247c3..05ba6da663 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -1410,7 +1410,7 @@ pg_init_libpagestore(void)
 							"sharding stripe size",
 							NULL,
 							&stripe_size,
-							32768, 1, INT_MAX,
+							2048, 1, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_BLOCKS,
 							NULL, NULL, NULL);
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 9e0ca16fed..7b749f1080 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -561,6 +561,8 @@ _PG_init(void)
 PG_FUNCTION_INFO_V1(pg_cluster_size);
 PG_FUNCTION_INFO_V1(backpressure_lsns);
 PG_FUNCTION_INFO_V1(backpressure_throttling_time);
+PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
+PG_FUNCTION_INFO_V1(approximate_working_set_size);
 
 Datum
 pg_cluster_size(PG_FUNCTION_ARGS)
@@ -607,6 +609,34 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
 
+Datum
+approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+{
+	time_t		duration;
+	int32		dc;
+
+	duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
+
+	dc = lfc_approximate_working_set_size_seconds(duration, false);
+	if (dc < 0)
+		PG_RETURN_NULL();
+	else
+		PG_RETURN_INT32(dc);
+}
+
+Datum
+approximate_working_set_size(PG_FUNCTION_ARGS)
+{
+	bool		reset = PG_GETARG_BOOL(0);
+	int32		dc;
+
+	dc = lfc_approximate_working_set_size_seconds(-1, reset);
+	if (dc < 0)
+		PG_RETURN_NULL();
+	else
+		PG_RETURN_INT32(dc);
+}
+
 #if PG_MAJORVERSION_NUM >= 16
 static void
 neon_shmem_startup_hook(void)
diff --git a/pgxn/neon/neon_ddl_handler.c b/pgxn/neon/neon_ddl_handler.c
index 2ce7b0086b..1f03e52c67 100644
--- a/pgxn/neon/neon_ddl_handler.c
+++ b/pgxn/neon/neon_ddl_handler.c
@@ -953,7 +953,9 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 
 	/*
 	 * Fire Event Trigger if both function owner and current user are
-	 * superuser, or none of them are.
+	 * superuser. Allow executing Event Trigger function that belongs to a
+	 * superuser when connected as a non-superuser, even when the function is
+	 * SECURITY DEFINER.
 	 */
     else if (event == FHET_START
 		/* still enable it to pass pg_regress tests */
@@ -976,32 +978,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 		function_is_owned_by_super = superuser_arg(function_owner);
 
 		/*
-		 * 1. Refuse to run SECURITY DEFINER function that belongs to a
-		 * superuser when the current user is not a superuser itself.
-		 */
-		if (!role_is_super
-			&& function_is_owned_by_super
-			&& function_is_secdef)
-		{
-			char *func_name = get_func_name(flinfo->fn_oid);
-
-			ereport(WARNING,
-					(errmsg("Skipping Event Trigger"),
-					 errdetail("Event Trigger function \"%s\" is owned by \"%s\" "
-							   "and is SECURITY DEFINER",
-							   func_name,
-							   GetUserNameFromId(function_owner, false))));
-
-			/*
-			 * we can't skip execution directly inside the fmgr_hook so
-			 * instead we change the event trigger function to a noop
-			 * function.
-			 */
-			force_noop(flinfo);
-		}
-
-		/*
-		 * 2. Refuse to run functions that belongs to a non-superuser when the
+		 * Refuse to run functions that belongs to a non-superuser when the
 		 * current user is a superuser.
 		 *
 		 * We could run a SECURITY DEFINER user-function here and be safe with
@@ -1009,7 +986,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 		 * infrastructure maintenance operations, where we prefer to skip
 		 * running user-defined code.
 		 */
-		else if (role_is_super && !function_is_owned_by_super)
+		if (role_is_super && !function_is_owned_by_super)
 		{
 			char *func_name = get_func_name(flinfo->fn_oid);
 
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index 787bd552f8..c7574ef0f9 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -165,4 +165,8 @@ extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
 extern TimeLineID GetWALInsertionTimeLine(void);
 #endif
 
+/* format codes not present in PG17-; but available in PG18+ */
+#define INT64_HEX_FORMAT "%" INT64_MODIFIER "x"
+#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "x"
+
 #endif							/* NEON_PGVERSIONCOMPAT_H */
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 4b223b6b18..e3a4022664 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -376,6 +376,18 @@ typedef struct PageserverFeedback
 	uint32		shard_number;
 } PageserverFeedback;
 
+/* BEGIN_HADRON */
+typedef struct WalRateLimiter
+{
+	/* If the value is 1, PG backends will hit backpressure. */
+	pg_atomic_uint32 should_limit;
+	/* The number of bytes sent in the current second. */
+	uint64		sent_bytes;
+	/* The last recorded time in microsecond. */
+	TimestampTz last_recorded_time_us;
+} WalRateLimiter;
+/* END_HADRON */
+
 typedef struct WalproposerShmemState
 {
 	pg_atomic_uint64 propEpochStartLsn;
@@ -395,6 +407,11 @@ typedef struct WalproposerShmemState
 
 	/* aggregated feedback with min LSNs across shards */
 	PageserverFeedback min_ps_feedback;
+
+	/* BEGIN_HADRON */
+	/* The WAL rate limiter */
+	WalRateLimiter wal_rate_limiter;
+	/* END_HADRON */
 } WalproposerShmemState;
 
 /*
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 185fc83ace..aaf8f43eeb 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -66,6 +66,9 @@ int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;
 int			safekeeper_proto_version = 3;
 char	   *safekeeper_conninfo_options = "";
+/* BEGIN_HADRON */
+int         databricks_max_wal_mb_per_second = -1;
+/* END_HADRON */
 
 /* Set to true in the walproposer bgw. */
 static bool am_walproposer;
@@ -252,6 +255,18 @@ nwp_register_gucs(void)
 							PGC_POSTMASTER,
 							0,
 							NULL, NULL, NULL);
+
+    /* BEGIN_HADRON */
+    DefineCustomIntVariable(
+                            "databricks.max_wal_mb_per_second",
+                            "The maximum WAL MB per second allowed. If breached, sending WAL hit the backpressure. Setting to -1 disables the limit.",
+                            NULL,
+                            &databricks_max_wal_mb_per_second,
+                            -1, -1, INT_MAX,
+                            PGC_SUSET,
+                            GUC_UNIT_MB,
+                            NULL, NULL, NULL);
+    /* END_HADRON */
 }
 
 
@@ -393,6 +408,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
 static uint64
 backpressure_lag_impl(void)
 {
+	struct WalproposerShmemState* state = NULL;
 	if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0)
 	{
 		XLogRecPtr	writePtr;
@@ -426,6 +442,18 @@ backpressure_lag_impl(void)
 			return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
 		}
 	}
+
+	/* BEGIN_HADRON */
+	if (databricks_max_wal_mb_per_second == -1) {
+		return 0;
+	}
+
+	state = GetWalpropShmemState();
+	if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1)
+	{
+		return 1;
+	}
+	/* END_HADRON */
 	return 0;
 }
 
@@ -472,6 +500,9 @@ WalproposerShmemInit(void)
 		pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
 		pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
 		pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
+		/* BEGIN_HADRON */
+		pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+		/* END_HADRON */
 	}
 	LWLockRelease(AddinShmemInitLock);
 
@@ -487,6 +518,9 @@ WalproposerShmemInit_SyncSafekeeper(void)
 	pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
 	pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
 	pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
+	/* BEGIN_HADRON */
+	pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+	/* END_HADRON */
 }
 
 #define BACK_PRESSURE_DELAY 10000L // 0.01 sec
@@ -521,7 +555,6 @@ backpressure_throttling_impl(void)
 	if (lag == 0)
 		return retry;
 
-
 	old_status = get_ps_display(&len);
 	new_status = (char *) palloc(len + 64 + 1);
 	memcpy(new_status, old_status, len);
@@ -1458,6 +1491,8 @@ XLogBroadcastWalProposer(WalProposer *wp)
 {
 	XLogRecPtr	startptr;
 	XLogRecPtr	endptr;
+	struct WalproposerShmemState *state = NULL;
+	TimestampTz now = 0;
 
 	/* Start from the last sent position */
 	startptr = sentPtr;
@@ -1502,13 +1537,36 @@ XLogBroadcastWalProposer(WalProposer *wp)
 	 * that arbitrary LSN is eventually reported as written, flushed and
 	 * applied, so that it can measure the elapsed time.
 	 */
-	LagTrackerWrite(endptr, GetCurrentTimestamp());
+	now = GetCurrentTimestamp();
+	LagTrackerWrite(endptr, now);
 
 	/* Do we have any work to do? */
 	Assert(startptr <= endptr);
 	if (endptr <= startptr)
 		return;
 
+	/* BEGIN_HADRON */
+	state = GetWalpropShmemState();
+	if (databricks_max_wal_mb_per_second != -1 && state != NULL)
+	{
+		uint64 max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024;
+		struct WalRateLimiter *limiter = &state->wal_rate_limiter;
+
+		if (now - limiter->last_recorded_time_us > USECS_PER_SEC)
+		{
+			/* Reset the rate limiter */
+			limiter->last_recorded_time_us = now;
+			limiter->sent_bytes = 0;
+			pg_atomic_exchange_u32(&limiter->should_limit, 0);
+		}
+		limiter->sent_bytes += (endptr - startptr);
+		if (limiter->sent_bytes > max_wal_bytes)
+		{
+			pg_atomic_exchange_u32(&limiter->should_limit, 1);
+		}
+	}
+	/* END_HADRON */
+
 	WalProposerBroadcast(wp, startptr, endptr);
 	sentPtr = endptr;
 
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index d37412f674..5f880dfd23 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -236,13 +236,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 	bool		save_neon_test_evict;
 
 	/*
-	 * Temporarily set the zenith_test_evict GUC, so that when we pin and
+	 * Temporarily set the neon_test_evict GUC, so that when we pin and
 	 * unpin a buffer, the buffer is evicted. We use that hack to evict all
 	 * buffers, as there is no explicit "evict this buffer" function in the
 	 * buffer manager.
 	 */
-	save_neon_test_evict = zenith_test_evict;
-	zenith_test_evict = true;
+	save_neon_test_evict = neon_test_evict;
+	neon_test_evict = true;
 	PG_TRY();
 	{
 		/* Scan through all the buffers */
@@ -273,7 +273,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 
 			/*
 			 * Pin the buffer, and release it again. Because we have
-			 * zenith_test_evict==true, this will evict the page from the
+			 * neon_test_evict==true, this will evict the page from the
 			 * buffer cache if no one else is holding a pin on it.
 			 */
 			if (isvalid)
@@ -286,7 +286,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 	PG_FINALLY();
 	{
 		/* restore the GUC */
-		zenith_test_evict = save_neon_test_evict;
+		neon_test_evict = save_neon_test_evict;
 	}
 	PG_END_TRY();
 
diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list
index 760f384212..3ea8b3b091 100644
--- a/pgxn/typedefs.list
+++ b/pgxn/typedefs.list
@@ -2953,17 +2953,17 @@ XmlTableBuilderData
 YYLTYPE
 YYSTYPE
 YY_BUFFER_STATE
-ZenithErrorResponse
-ZenithExistsRequest
-ZenithExistsResponse
-ZenithGetPageRequest
-ZenithGetPageResponse
-ZenithMessage
-ZenithMessageTag
-ZenithNblocksRequest
-ZenithNblocksResponse
-ZenithRequest
-ZenithResponse
+NeonErrorResponse
+NeonExistsRequest
+NeonExistsResponse
+NeonGetPageRequest
+NeonGetPageResponse
+NeonMessage
+NeonMessageTag
+NeonNblocksRequest
+NeonNblocksResponse
+NeonRequest
+NeonResponse
 _SPI_connection
 _SPI_plan
 __AssignProcessToJobObject
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index ce8610be24..82fe6818e3 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -16,6 +16,7 @@ async-compression.workspace = true
 async-trait.workspace = true
 atomic-take.workspace = true
 aws-config.workspace = true
+aws-credential-types.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
 base64.workspace = true
@@ -48,6 +49,7 @@ indexmap = { workspace = true, features = ["serde"] }
 ipnet.workspace = true
 itertools.workspace = true
 itoa.workspace = true
+json = { path = "../libs/proxy/json" }
 lasso = { workspace = true, features = ["multi-threaded"] }
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
@@ -127,4 +129,4 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
-tracing-test = "0.2"
\ No newline at end of file
+tracing-test = "0.2"
diff --git a/proxy/README.md b/proxy/README.md
index e10ff3d710..ff48f9f323 100644
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -123,6 +123,11 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl
 docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';"
 ```
 
+If you want to test query cancellation, redis is also required:
+```sh
+docker run --detach --name proxy-redis --publish 6379:6379 redis:7.0
+```
+
 Let's create self-signed certificate by running:
 ```sh
 openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build"
@@ -130,7 +135,10 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key
 
 Then we need to build proxy with 'testing' feature and run, e.g.:
 ```sh
-RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
+RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- \
+  --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' \
+  --redis-auth-type="plain" --redis-plain="redis://127.0.0.1:6379" \
+  -c server.crt -k server.key
 ```
 
 Now from client you can start a new session:
diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs
index 33e08797f2..cf866ab9a3 100644
--- a/proxy/src/batch.rs
+++ b/proxy/src/batch.rs
@@ -7,13 +7,17 @@ use std::pin::pin;
 use std::sync::Mutex;
 
 use scopeguard::ScopeGuard;
+use tokio::sync::oneshot;
 use tokio::sync::oneshot::error::TryRecvError;
 
 use crate::ext::LockExt;
 
+type ProcResult<P> = Result<<P as QueueProcessing>::Res, <P as QueueProcessing>::Err>;
+
 pub trait QueueProcessing: Send + 'static {
     type Req: Send + 'static;
     type Res: Send;
+    type Err: Send + Clone;
 
     /// Get the desired batch size.
     fn batch_size(&self, queue_size: usize) -> usize;
@@ -24,7 +28,18 @@ pub trait QueueProcessing: Send + 'static {
     /// If this apply can error, it's expected that errors be forwarded to each Self::Res.
     ///
     /// Batching does not need to happen atomically.
-    fn apply(&mut self, req: Vec<Self::Req>) -> impl Future<Output = Vec<Self::Res>> + Send;
+    fn apply(
+        &mut self,
+        req: Vec<Self::Req>,
+    ) -> impl Future<Output = Result<Vec<Self::Res>, Self::Err>> + Send;
+}
+
+#[derive(thiserror::Error)]
+pub enum BatchQueueError<E: Clone, C> {
+    #[error(transparent)]
+    Result(E),
+    #[error(transparent)]
+    Cancelled(C),
 }
 
 pub struct BatchQueue<P: QueueProcessing> {
@@ -34,7 +49,7 @@ pub struct BatchQueue<P: QueueProcessing> {
 
 struct BatchJob<P: QueueProcessing> {
     req: P::Req,
-    res: tokio::sync::oneshot::Sender<P::Res>,
+    res: tokio::sync::oneshot::Sender<Result<P::Res, P::Err>>,
 }
 
 impl<P: QueueProcessing> BatchQueue<P> {
@@ -55,11 +70,11 @@ impl<P: QueueProcessing> BatchQueue<P> {
         &self,
         req: P::Req,
         cancelled: impl Future<Output = R>,
-    ) -> Result<P::Res, R> {
+    ) -> Result<P::Res, BatchQueueError<P::Err, R>> {
         let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
 
         let mut cancelled = pin!(cancelled);
-        let resp = loop {
+        let resp: Option<Result<P::Res, P::Err>> = loop {
             // try become the leader, or try wait for success.
             let mut processor = tokio::select! {
                 // try become leader.
@@ -72,7 +87,7 @@ impl<P: QueueProcessing> BatchQueue<P> {
                     if inner.queue.remove(&id).is_some() {
                         tracing::warn!("batched task cancelled before completion");
                     }
-                    return Err(cancel);
+                    return Err(BatchQueueError::Cancelled(cancel));
                 },
             };
 
@@ -96,18 +111,30 @@ impl<P: QueueProcessing> BatchQueue<P> {
             // good: we didn't get cancelled.
             ScopeGuard::into_inner(cancel_safety);
 
-            if values.len() != resps.len() {
-                tracing::error!(
-                    "batch: invalid response size, expected={}, got={}",
-                    resps.len(),
-                    values.len()
-                );
-            }
+            match values {
+                Ok(values) => {
+                    if values.len() != resps.len() {
+                        tracing::error!(
+                            "batch: invalid response size, expected={}, got={}",
+                            resps.len(),
+                            values.len()
+                        );
+                    }
 
-            // send response values.
-            for (tx, value) in std::iter::zip(resps, values) {
-                if tx.send(value).is_err() {
-                    // receiver hung up but that's fine.
+                    // send response values.
+                    for (tx, value) in std::iter::zip(resps, values) {
+                        if tx.send(Ok(value)).is_err() {
+                            // receiver hung up but that's fine.
+                        }
+                    }
+                }
+
+                Err(err) => {
+                    for tx in resps {
+                        if tx.send(Err(err.clone())).is_err() {
+                            // receiver hung up but that's fine.
+                        }
+                    }
                 }
             }
 
@@ -129,7 +156,8 @@ impl<P: QueueProcessing> BatchQueue<P> {
 
         tracing::debug!(id, "batch: job completed");
 
-        Ok(resp.expect("no response found. batch processer should not panic"))
+        resp.expect("no response found. batch processer should not panic")
+            .map_err(BatchQueueError::Result)
     }
 }
 
@@ -139,8 +167,8 @@ struct BatchQueueInner<P: QueueProcessing> {
 }
 
 impl<P: QueueProcessing> BatchQueueInner<P> {
-    fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver<P::Res>) {
-        let (tx, rx) = tokio::sync::oneshot::channel();
+    fn register_job(&mut self, req: P::Req) -> (u64, oneshot::Receiver<ProcResult<P>>) {
+        let (tx, rx) = oneshot::channel();
 
         let id = self.version;
 
@@ -158,7 +186,7 @@ impl<P: QueueProcessing> BatchQueueInner<P> {
         (id, rx)
     }
 
-    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<tokio::sync::oneshot::Sender<P::Res>>) {
+    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<oneshot::Sender<ProcResult<P>>>) {
         let batch_size = p.batch_size(self.queue.len());
         let mut reqs = Vec::with_capacity(batch_size);
         let mut resps = Vec::with_capacity(batch_size);
diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index 691709ce2a..16a7dc7b67 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -522,15 +522,7 @@ pub async fn run() -> anyhow::Result<()> {
         maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
     }
 
-    if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
-        && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
-        && let Some(client) = redis_client
-    {
-        // project info cache and invalidation of that cache.
-        let cache = api.caches.project_info.clone();
-        maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone()));
-        maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
-
+    if let Some(client) = redis_client {
         // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
         // This prevents immediate exit and pod restart,
         // which can cause hammering of the redis in case of connection issues.
@@ -560,6 +552,16 @@ pub async fn run() -> anyhow::Result<()> {
                 }
             }
         }
+
+        #[allow(irrefutable_let_patterns)]
+        if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
+            && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
+        {
+            // project info cache and invalidation of that cache.
+            let cache = api.caches.project_info.clone();
+            maintenance_tasks.spawn(notifications::task_main(client, cache.clone()));
+            maintenance_tasks.spawn(async move { cache.gc_worker().await });
+        }
     }
 
     let maintenance = loop {
diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index 183e1ea449..e87cf53ab9 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -14,8 +14,8 @@ use std::time::{Duration, Instant};
 use hashlink::{LruCache, linked_hash_map::RawEntryMut};
 use tracing::debug;
 
+use super::Cache;
 use super::common::Cached;
-use super::{Cache, timed_lru};
 
 /// An implementation of timed LRU cache with fixed capacity.
 /// Key properties:
@@ -30,7 +30,7 @@ use super::{Cache, timed_lru};
 ///
 /// * There's an API for immediate invalidation (removal) of a cache entry;
 ///   It's useful in case we know for sure that the entry is no longer correct.
-///   See [`timed_lru::Cached`] for more information.
+///   See [`Cached`] for more information.
 ///
 /// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
 ///   or by a successful lookup (i.e. the entry hasn't expired yet).
@@ -217,15 +217,18 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
 }
 
 impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
-    /// Retrieve a cached entry in convenient wrapper.
-    pub(crate) fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
+    /// Retrieve a cached entry in convenient wrapper, alongside timing information.
+    pub(crate) fn get_with_created_at<Q>(
+        &self,
+        key: &Q,
+    ) -> Option<Cached<&Self, (<Self as Cache>::Value, Instant)>>
     where
         K: Borrow<Q> + Clone,
         Q: Hash + Eq + ?Sized,
     {
         self.get_raw(key, |key, entry| Cached {
             token: Some((self, key.clone())),
-            value: entry.value.clone(),
+            value: (entry.value.clone(), entry.created_at),
         })
     }
 }
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 74413f1a7d..77062d3bb4 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -4,12 +4,11 @@ use std::pin::pin;
 use std::sync::{Arc, OnceLock};
 use std::time::Duration;
 
-use anyhow::anyhow;
 use futures::FutureExt;
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::RawCancelToken;
 use postgres_client::tls::MakeTlsConnect;
-use redis::{Cmd, FromRedisValue, Value};
+use redis::{Cmd, FromRedisValue, SetExpiry, SetOptions, Value};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
@@ -18,7 +17,7 @@ use tracing::{debug, error, info};
 
 use crate::auth::AuthError;
 use crate::auth::backend::ComputeUserInfo;
-use crate::batch::{BatchQueue, QueueProcessing};
+use crate::batch::{BatchQueue, BatchQueueError, QueueProcessing};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::ControlPlaneApi;
@@ -28,23 +27,39 @@ use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, Redis
 use crate::pqproto::CancelKeyData;
 use crate::rate_limiter::LeakyBucketRateLimiter;
 use crate::redis::keys::KeyPrefix;
-use crate::redis::kv_ops::RedisKVClient;
+use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError};
+use crate::util::run_until;
 
 type IpSubnetKey = IpNet;
 
-const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600);
-const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570);
+const CANCEL_KEY_TTL: Duration = Duration::from_secs(600);
+const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570);
 
 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
-    StoreCancelKey {
+    Store {
         key: CancelKeyData,
         value: Box<str>,
-        expire: std::time::Duration,
+        expire: Duration,
     },
-    GetCancelData {
+    Refresh {
+        key: CancelKeyData,
+        expire: Duration,
+    },
+    Get {
         key: CancelKeyData,
     },
+    GetOld {
+        key: CancelKeyData,
+    },
+}
+
+#[derive(thiserror::Error, Debug, Clone)]
+pub enum PipelineError {
+    #[error("could not send cmd to redis: {0}")]
+    RedisKVClient(Arc<RedisKVClientError>),
+    #[error("incorrect number of responses from redis")]
+    IncorrectNumberOfResponses,
 }
 
 pub struct Pipeline {
@@ -60,7 +75,7 @@ impl Pipeline {
         }
     }
 
-    async fn execute(self, client: &mut RedisKVClient) -> Vec<anyhow::Result<Value>> {
+    async fn execute(self, client: &mut RedisKVClient) -> Result<Vec<Value>, PipelineError> {
         let responses = self.replies;
         let batch_size = self.inner.len();
 
@@ -78,43 +93,44 @@ impl Pipeline {
                     batch_size,
                     responses, "successfully completed cancellation jobs",
                 );
-                values.into_iter().map(Ok).collect()
+                Ok(values.into_iter().collect())
             }
             Ok(value) => {
                 error!(batch_size, ?value, "unexpected redis return value");
-                std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis")))
-                    .take(responses)
-                    .collect()
-            }
-            Err(err) => {
-                std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}")))
-                    .take(responses)
-                    .collect()
+                Err(PipelineError::IncorrectNumberOfResponses)
             }
+            Err(err) => Err(PipelineError::RedisKVClient(Arc::new(err))),
         }
     }
 
-    fn add_command_with_reply(&mut self, cmd: Cmd) {
+    fn add_command(&mut self, cmd: Cmd) {
         self.inner.add_command(cmd);
         self.replies += 1;
     }
-
-    fn add_command_no_reply(&mut self, cmd: Cmd) {
-        self.inner.add_command(cmd).ignore();
-    }
 }
 
 impl CancelKeyOp {
     fn register(&self, pipe: &mut Pipeline) {
         match self {
-            CancelKeyOp::StoreCancelKey { key, value, expire } => {
+            CancelKeyOp::Store { key, value, expire } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value));
-                pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64));
+                pipe.add_command(Cmd::set_options(
+                    &key,
+                    &**value,
+                    SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())),
+                ));
             }
-            CancelKeyOp::GetCancelData { key } => {
+            CancelKeyOp::Refresh { key, expire } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hget(key, "data"));
+                pipe.add_command(Cmd::expire(&key, expire.as_secs() as i64));
+            }
+            CancelKeyOp::GetOld { key } => {
+                let key = KeyPrefix::Cancel(*key).build_redis_key();
+                pipe.add_command(Cmd::hget(key, "data"));
+            }
+            CancelKeyOp::Get { key } => {
+                let key = KeyPrefix::Cancel(*key).build_redis_key();
+                pipe.add_command(Cmd::get(key));
             }
         }
     }
@@ -127,13 +143,14 @@ pub struct CancellationProcessor {
 
 impl QueueProcessing for CancellationProcessor {
     type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp);
-    type Res = anyhow::Result<redis::Value>;
+    type Res = redis::Value;
+    type Err = PipelineError;
 
     fn batch_size(&self, _queue_size: usize) -> usize {
         self.batch_size
     }
 
-    async fn apply(&mut self, batch: Vec<Self::Req>) -> Vec<Self::Res> {
+    async fn apply(&mut self, batch: Vec<Self::Req>) -> Result<Vec<Self::Res>, Self::Err> {
         if !self.client.credentials_refreshed() {
             // this will cause a timeout for cancellation operations
             tracing::debug!(
@@ -244,18 +261,18 @@ impl CancellationHandler {
         &self,
         key: CancelKeyData,
     ) -> Result<Option<CancelClosure>, CancelError> {
-        let guard = Metrics::get()
-            .proxy
-            .cancel_channel_size
-            .guard(RedisMsgKind::HGet);
-        let op = CancelKeyOp::GetCancelData { key };
+        const TIMEOUT: Duration = Duration::from_secs(5);
 
         let Some(tx) = self.tx.get() else {
             tracing::warn!("cancellation handler is not available");
             return Err(CancelError::InternalError);
         };
 
-        const TIMEOUT: Duration = Duration::from_secs(5);
+        let guard = Metrics::get()
+            .proxy
+            .cancel_channel_size
+            .guard(RedisMsgKind::Get);
+        let op = CancelKeyOp::Get { key };
         let result = timeout(
             TIMEOUT,
             tx.call((guard, op), std::future::pending::<Infallible>()),
@@ -264,10 +281,37 @@ impl CancellationHandler {
         .map_err(|_| {
             tracing::warn!("timed out waiting to receive GetCancelData response");
             CancelError::RateLimit
-        })?
-        // cannot be cancelled
-        .unwrap_or_else(|x| match x {})
-        .map_err(|e| {
+        })?;
+
+        // We may still have cancel keys set with HSET <key> "data".
+        // Check error type and retry with HGET.
+        // TODO: remove code after HSET is not used anymore.
+        let result = if let Err(err) = result.as_ref()
+            && let BatchQueueError::Result(err) = err
+            && let PipelineError::RedisKVClient(err) = err
+            && let RedisKVClientError::Redis(err) = &**err
+            && let Some(errcode) = err.code()
+            && errcode == "WRONGTYPE"
+        {
+            let guard = Metrics::get()
+                .proxy
+                .cancel_channel_size
+                .guard(RedisMsgKind::HGet);
+            let op = CancelKeyOp::GetOld { key };
+            timeout(
+                TIMEOUT,
+                tx.call((guard, op), std::future::pending::<Infallible>()),
+            )
+            .await
+            .map_err(|_| {
+                tracing::warn!("timed out waiting to receive GetCancelData response");
+                CancelError::RateLimit
+            })?
+        } else {
+            result
+        };
+
+        let result = result.map_err(|e| {
             tracing::warn!("failed to receive GetCancelData response: {e}");
             CancelError::InternalError
         })?;
@@ -438,39 +482,94 @@ impl Session {
 
         let mut cancel = pin!(cancel);
 
+        enum State {
+            Set,
+            Refresh,
+        }
+        let mut state = State::Set;
+
         loop {
-            let guard = Metrics::get()
-                .proxy
-                .cancel_channel_size
-                .guard(RedisMsgKind::HSet);
-            let op = CancelKeyOp::StoreCancelKey {
-                key: self.key,
-                value: closure_json.clone(),
-                expire: CANCEL_KEY_TTL,
+            let guard_op = match state {
+                State::Set => {
+                    let guard = Metrics::get()
+                        .proxy
+                        .cancel_channel_size
+                        .guard(RedisMsgKind::Set);
+                    let op = CancelKeyOp::Store {
+                        key: self.key,
+                        value: closure_json.clone(),
+                        expire: CANCEL_KEY_TTL,
+                    };
+                    tracing::debug!(
+                        src=%self.key,
+                        dest=?cancel_closure.cancel_token,
+                        "registering cancellation key"
+                    );
+                    (guard, op)
+                }
+
+                State::Refresh => {
+                    let guard = Metrics::get()
+                        .proxy
+                        .cancel_channel_size
+                        .guard(RedisMsgKind::Expire);
+                    let op = CancelKeyOp::Refresh {
+                        key: self.key,
+                        expire: CANCEL_KEY_TTL,
+                    };
+                    tracing::debug!(
+                        src=%self.key,
+                        dest=?cancel_closure.cancel_token,
+                        "refreshing cancellation key"
+                    );
+                    (guard, op)
+                }
             };
 
-            tracing::debug!(
-                src=%self.key,
-                dest=?cancel_closure.cancel_token,
-                "registering cancellation key"
-            );
-
-            match tx.call((guard, op), cancel.as_mut()).await {
-                Ok(Ok(_)) => {
+            match tx.call(guard_op, cancel.as_mut()).await {
+                // SET returns OK
+                Ok(Value::Okay) => {
                     tracing::debug!(
                         src=%self.key,
                         dest=?cancel_closure.cancel_token,
                         "registered cancellation key"
                     );
+                    state = State::Refresh;
+                }
 
-                    // wait before continuing.
-                    tokio::time::sleep(CANCEL_KEY_REFRESH).await;
+                // EXPIRE returns 1
+                Ok(Value::Int(1)) => {
+                    tracing::debug!(
+                        src=%self.key,
+                        dest=?cancel_closure.cancel_token,
+                        "refreshed cancellation key"
+                    );
                 }
+
+                Ok(_) => {
+                    // Any other response likely means the key expired.
+                    tracing::warn!(src=%self.key, "refreshing cancellation key failed");
+                    // Re-enter the SET loop to repush full data.
+                    state = State::Set;
+                }
+
                 // retry immediately.
-                Ok(Err(error)) => {
-                    tracing::warn!(?error, "error registering cancellation key");
+                Err(BatchQueueError::Result(error)) => {
+                    tracing::warn!(?error, "error refreshing cancellation key");
+                    // Small delay to prevent busy loop with high cpu and logging.
+                    tokio::time::sleep(Duration::from_millis(10)).await;
+                    continue;
                 }
-                Err(Err(_cancelled)) => break,
+
+                Err(BatchQueueError::Cancelled(Err(_cancelled))) => break,
+            }
+
+            // wait before continuing. break immediately if cancelled.
+            if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut())
+                .await
+                .is_err()
+            {
+                break;
             }
         }
 
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index b55cc14532..4d8df19476 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -267,7 +267,7 @@ async fn worker_inner(
 ) -> anyhow::Result<()> {
     #[cfg(any(test, feature = "testing"))]
     let storage = if config.test_remote_failures > 0 {
-        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
+        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100)
     } else {
         storage
     };
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs
index fc263b73b1..bb785b8b0c 100644
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -23,12 +23,13 @@ use crate::control_plane::errors::{
     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
 };
 use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
+use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse};
 use crate::control_plane::{
     AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
     RoleAccessControl,
 };
 use crate::metrics::Metrics;
+use crate::proxy::retry::CouldRetry;
 use crate::rate_limiter::WakeComputeRateLimiter;
 use crate::types::{EndpointCacheKey, EndpointId, RoleName};
 use crate::{compute, http, scram};
@@ -382,16 +383,31 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
 
         macro_rules! check_cache {
             () => {
-                if let Some(cached) = self.caches.node_info.get(&key) {
-                    let (cached, info) = cached.take_value();
-                    let info = info.map_err(|c| {
-                        info!(key = &*key, "found cached wake_compute error");
-                        WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
-                    })?;
+                if let Some(cached) = self.caches.node_info.get_with_created_at(&key) {
+                    let (cached, (info, created_at)) = cached.take_value();
+                    return match info {
+                        Err(mut msg) => {
+                            info!(key = &*key, "found cached wake_compute error");
 
-                    debug!(key = &*key, "found cached compute node info");
-                    ctx.set_project(info.aux.clone());
-                    return Ok(cached.map(|()| info));
+                            // if retry_delay_ms is set, reduce it by the amount of time it spent in cache
+                            if let Some(status) = &mut msg.status {
+                                if let Some(retry_info) = &mut status.details.retry_info {
+                                    retry_info.retry_delay_ms = retry_info
+                                        .retry_delay_ms
+                                        .saturating_sub(created_at.elapsed().as_millis() as u64)
+                                }
+                            }
+
+                            Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
+                                msg,
+                            )))
+                        }
+                        Ok(info) => {
+                            debug!(key = &*key, "found cached compute node info");
+                            ctx.set_project(info.aux.clone());
+                            Ok(cached.map(|()| info))
+                        }
+                    };
                 }
             };
         }
@@ -434,42 +450,29 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
                 Ok(cached.map(|()| node))
             }
             Err(err) => match err {
-                WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
-                    let Some(status) = &err.status else {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
-                    };
+                WakeComputeError::ControlPlane(ControlPlaneError::Message(ref msg)) => {
+                    let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info);
 
-                    let reason = status
-                        .details
-                        .error_info
-                        .map_or(Reason::Unknown, |x| x.reason);
-
-                    // if we can retry this error, do not cache it.
-                    if reason.can_retry() {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
+                    // If we can retry this error, do not cache it,
+                    // unless we were given a retry delay.
+                    if msg.could_retry() && retry_info.is_none() {
+                        return Err(err);
                     }
 
-                    // at this point, we should only have quota errors.
                     debug!(
                         key = &*key,
                         "created a cache entry for the wake compute error"
                     );
 
-                    self.caches.node_info.insert_ttl(
-                        key,
-                        Err(err.clone()),
-                        Duration::from_secs(30),
-                    );
+                    let ttl = retry_info.map_or(Duration::from_secs(30), |r| {
+                        Duration::from_millis(r.retry_delay_ms)
+                    });
 
-                    Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                        err,
-                    )))
+                    self.caches.node_info.insert_ttl(key, Err(msg.clone()), ttl);
+
+                    Err(err)
                 }
-                err => return Err(err),
+                err => Err(err),
             },
         }
     }
diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs
index f640657d90..12843e48c7 100644
--- a/proxy/src/control_plane/errors.rs
+++ b/proxy/src/control_plane/errors.rs
@@ -43,28 +43,35 @@ impl UserFacingError for ControlPlaneError {
 }
 
 impl ReportableError for ControlPlaneError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
+    fn get_error_kind(&self) -> ErrorKind {
         match self {
             ControlPlaneError::Message(e) => match e.get_reason() {
-                Reason::RoleProtected => ErrorKind::User,
-                Reason::ResourceNotFound => ErrorKind::User,
-                Reason::ProjectNotFound => ErrorKind::User,
-                Reason::EndpointNotFound => ErrorKind::User,
-                Reason::BranchNotFound => ErrorKind::User,
+                Reason::RoleProtected
+                | Reason::ResourceNotFound
+                | Reason::ProjectNotFound
+                | Reason::EndpointNotFound
+                | Reason::EndpointDisabled
+                | Reason::BranchNotFound
+                | Reason::InvalidEphemeralEndpointOptions => ErrorKind::User,
+
                 Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
-                Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
-                Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
-                Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
-                Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
-                Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
-                Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
-                Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
-                Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
-                Reason::RunningOperations => ErrorKind::ControlPlane,
-                Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
-                Reason::Unknown => ErrorKind::ControlPlane,
+
+                Reason::NonDefaultBranchComputeTimeExceeded
+                | Reason::ActiveTimeQuotaExceeded
+                | Reason::ComputeTimeQuotaExceeded
+                | Reason::WrittenDataQuotaExceeded
+                | Reason::DataTransferQuotaExceeded
+                | Reason::LogicalSizeQuotaExceeded
+                | Reason::ActiveEndpointsLimitExceeded => ErrorKind::Quota,
+
+                Reason::ConcurrencyLimitReached
+                | Reason::LockAlreadyTaken
+                | Reason::RunningOperations
+                | Reason::EndpointIdle
+                | Reason::ProjectUnderMaintenance
+                | Reason::Unknown => ErrorKind::ControlPlane,
             },
-            ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane,
+            ControlPlaneError::Transport(_) => ErrorKind::ControlPlane,
         }
     }
 }
@@ -120,10 +127,10 @@ impl UserFacingError for GetAuthInfoError {
 }
 
 impl ReportableError for GetAuthInfoError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
+    fn get_error_kind(&self) -> ErrorKind {
         match self {
-            Self::BadSecret => crate::error::ErrorKind::ControlPlane,
-            Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
+            Self::BadSecret => ErrorKind::ControlPlane,
+            Self::ApiError(_) => ErrorKind::ControlPlane,
         }
     }
 }
diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs
index f0314f91f0..cf193ed268 100644
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -126,10 +126,16 @@ pub(crate) enum Reason {
     /// or that the subject doesn't have enough permissions to access the requested endpoint.
     #[serde(rename = "ENDPOINT_NOT_FOUND")]
     EndpointNotFound,
+    /// EndpointDisabled indicates that the endpoint has been disabled and does not accept connections.
+    #[serde(rename = "ENDPOINT_DISABLED")]
+    EndpointDisabled,
     /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct,
     /// or that the subject doesn't have enough permissions to access the requested branch.
     #[serde(rename = "BRANCH_NOT_FOUND")]
     BranchNotFound,
+    /// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong.
+    #[serde(rename = "INVALID_EPHEMERAL_OPTIONS")]
+    InvalidEphemeralEndpointOptions,
     /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded.
     #[serde(rename = "RATE_LIMIT_EXCEEDED")]
     RateLimitExceeded,
@@ -152,6 +158,9 @@ pub(crate) enum Reason {
     /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded.
     #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
     LogicalSizeQuotaExceeded,
+    /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
+    #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
+    ActiveEndpointsLimitExceeded,
     /// RunningOperations indicates that the project already has some running operations
     /// and scheduling of new ones is prohibited.
     #[serde(rename = "RUNNING_OPERATIONS")]
@@ -162,9 +171,13 @@ pub(crate) enum Reason {
     /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken.
     #[serde(rename = "LOCK_ALREADY_TAKEN")]
     LockAlreadyTaken,
-    /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
-    #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
-    ActiveEndpointsLimitExceeded,
+    /// EndpointIdle indicates that the endpoint cannot become active, because it's idle.
+    #[serde(rename = "ENDPOINT_IDLE")]
+    EndpointIdle,
+    /// ProjectUnderMaintenance indicates that the project is currently ongoing maintenance,
+    /// and thus cannot accept connections.
+    #[serde(rename = "PROJECT_UNDER_MAINTENANCE")]
+    ProjectUnderMaintenance,
     #[default]
     #[serde(other)]
     Unknown,
@@ -184,13 +197,15 @@ impl Reason {
     pub(crate) fn can_retry(self) -> bool {
         match self {
             // do not retry role protected errors
-            // not a transitive error
+            // not a transient error
             Reason::RoleProtected => false,
-            // on retry, it will still not be found
+            // on retry, it will still not be found or valid
             Reason::ResourceNotFound
             | Reason::ProjectNotFound
             | Reason::EndpointNotFound
-            | Reason::BranchNotFound => false,
+            | Reason::EndpointDisabled
+            | Reason::BranchNotFound
+            | Reason::InvalidEphemeralEndpointOptions => false,
             // we were asked to go away
             Reason::RateLimitExceeded
             | Reason::NonDefaultBranchComputeTimeExceeded
@@ -200,11 +215,13 @@ impl Reason {
             | Reason::DataTransferQuotaExceeded
             | Reason::LogicalSizeQuotaExceeded
             | Reason::ActiveEndpointsLimitExceeded => false,
-            // transitive error. control plane is currently busy
+            // transient error. control plane is currently busy
             // but might be ready soon
             Reason::RunningOperations
             | Reason::ConcurrencyLimitReached
-            | Reason::LockAlreadyTaken => true,
+            | Reason::LockAlreadyTaken
+            | Reason::EndpointIdle
+            | Reason::ProjectUnderMaintenance => true,
             // unknown error. better not retry it.
             Reason::Unknown => false,
         }
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 9d1a3d4358..bf4d5a11eb 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -374,11 +374,10 @@ pub enum Waiting {
 #[label(singleton = "kind")]
 #[allow(clippy::enum_variant_names)]
 pub enum RedisMsgKind {
-    HSet,
-    HSetMultiple,
+    Set,
+    Get,
+    Expire,
     HGet,
-    HGetAll,
-    HDel,
 }
 
 #[derive(Default, Clone)]
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 08c81afa04..02651109e0 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -195,15 +195,18 @@ impl NeonOptions {
     // proxy options:
 
     /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute.
-    pub const PARAMS_COMPAT: &str = "proxy_params_compat";
+    pub const PARAMS_COMPAT: &'static str = "proxy_params_compat";
 
     // cplane options:
 
     /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN.
-    const LSN: &str = "lsn";
+    const LSN: &'static str = "lsn";
+
+    /// `TIMESTAMP` allows provisioning an ephemeral compute with time-travel to the provided timestamp.
+    const TIMESTAMP: &'static str = "timestamp";
 
     /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write.
-    const ENDPOINT_TYPE: &str = "endpoint_type";
+    const ENDPOINT_TYPE: &'static str = "endpoint_type";
 
     pub(crate) fn parse_params(params: &StartupMessageParams) -> Self {
         params
@@ -228,6 +231,7 @@ impl NeonOptions {
             // This is not a cplane option, we know it does not create ephemeral computes.
             Self::PARAMS_COMPAT => false,
             Self::LSN => true,
+            Self::TIMESTAMP => true,
             Self::ENDPOINT_TYPE => true,
             // err on the side of caution. any cplane options we don't know about
             // might lead to ephemeral computes.
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index 35a3fe4334..b0bf332e44 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -4,11 +4,12 @@ use std::time::Duration;
 
 use futures::FutureExt;
 use redis::aio::{ConnectionLike, MultiplexedConnection};
-use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult};
+use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisError, RedisResult};
 use tokio::task::AbortHandle;
 use tracing::{error, info, warn};
 
 use super::elasticache::CredentialsProvider;
+use crate::redis::elasticache::CredentialsProviderError;
 
 enum Credentials {
     Static(ConnectionInfo),
@@ -26,6 +27,14 @@ impl Clone for Credentials {
     }
 }
 
+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionProviderError {
+    #[error(transparent)]
+    Redis(#[from] RedisError),
+    #[error(transparent)]
+    CredentialsProvider(#[from] CredentialsProviderError),
+}
+
 /// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token.
 /// Provides PubSub connection without credentials refresh.
 pub struct ConnectionWithCredentialsProvider {
@@ -86,15 +95,18 @@ impl ConnectionWithCredentialsProvider {
         }
     }
 
-    async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> {
-        redis::cmd("PING").query_async(con).await
+    async fn ping(con: &mut MultiplexedConnection) -> Result<(), ConnectionProviderError> {
+        redis::cmd("PING")
+            .query_async(con)
+            .await
+            .map_err(Into::into)
     }
 
     pub(crate) fn credentials_refreshed(&self) -> bool {
         self.credentials_refreshed.load(Ordering::Relaxed)
     }
 
-    pub(crate) async fn connect(&mut self) -> anyhow::Result<()> {
+    pub(crate) async fn connect(&mut self) -> Result<(), ConnectionProviderError> {
         let _guard = self.mutex.lock().await;
         if let Some(con) = self.con.as_mut() {
             match Self::ping(con).await {
@@ -141,7 +153,7 @@ impl ConnectionWithCredentialsProvider {
         Ok(())
     }
 
-    async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
+    async fn get_connection_info(&self) -> Result<ConnectionInfo, ConnectionProviderError> {
         match &self.credentials {
             Credentials::Static(info) => Ok(info.clone()),
             Credentials::Dynamic(provider, addr) => {
@@ -160,7 +172,7 @@ impl ConnectionWithCredentialsProvider {
         }
     }
 
-    async fn get_client(&self) -> anyhow::Result<redis::Client> {
+    async fn get_client(&self) -> Result<redis::Client, ConnectionProviderError> {
         let client = redis::Client::open(self.get_connection_info().await?)?;
         self.credentials_refreshed.store(true, Ordering::Relaxed);
         Ok(client)
diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs
index 58e3c889a7..6f3b34d381 100644
--- a/proxy/src/redis/elasticache.rs
+++ b/proxy/src/redis/elasticache.rs
@@ -9,10 +9,12 @@ use aws_config::meta::region::RegionProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
 use aws_config::provider_config::ProviderConfig;
 use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
+use aws_credential_types::provider::error::CredentialsError;
 use aws_sdk_iam::config::ProvideCredentials;
 use aws_sigv4::http_request::{
-    self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
+    self, SignableBody, SignableRequest, SignatureLocation, SigningError, SigningSettings,
 };
+use aws_sigv4::sign::v4::signing_params::BuildError;
 use tracing::info;
 
 #[derive(Debug)]
@@ -40,6 +42,18 @@ impl AWSIRSAConfig {
     }
 }
 
+#[derive(thiserror::Error, Debug)]
+pub enum CredentialsProviderError {
+    #[error(transparent)]
+    AwsCredentials(#[from] CredentialsError),
+    #[error(transparent)]
+    AwsSigv4Build(#[from] BuildError),
+    #[error(transparent)]
+    AwsSigv4Singing(#[from] SigningError),
+    #[error(transparent)]
+    Http(#[from] http::Error),
+}
+
 /// Credentials provider for AWS elasticache authentication.
 ///
 /// Official documentation:
@@ -92,7 +106,9 @@ impl CredentialsProvider {
         })
     }
 
-    pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
+    pub(crate) async fn provide_credentials(
+        &self,
+    ) -> Result<(String, String), CredentialsProviderError> {
         let aws_credentials = self
             .credentials_provider
             .provide_credentials()
diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs
index cfdbc21839..d1e97b6b09 100644
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -2,9 +2,18 @@ use std::time::Duration;
 
 use futures::FutureExt;
 use redis::aio::ConnectionLike;
-use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
+use redis::{Cmd, FromRedisValue, Pipeline, RedisError, RedisResult};
 
 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
+use crate::redis::connection_with_credentials_provider::ConnectionProviderError;
+
+#[derive(thiserror::Error, Debug)]
+pub enum RedisKVClientError {
+    #[error(transparent)]
+    Redis(#[from] RedisError),
+    #[error(transparent)]
+    ConnectionProvider(#[from] ConnectionProviderError),
+}
 
 pub struct RedisKVClient {
     client: ConnectionWithCredentialsProvider,
@@ -32,12 +41,13 @@ impl RedisKVClient {
         Self { client }
     }
 
-    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
+    pub async fn try_connect(&mut self) -> Result<(), RedisKVClientError> {
         self.client
             .connect()
             .boxed()
             .await
             .inspect_err(|e| tracing::error!("failed to connect to redis: {e}"))
+            .map_err(Into::into)
     }
 
     pub(crate) fn credentials_refreshed(&self) -> bool {
@@ -47,7 +57,7 @@ impl RedisKVClient {
     pub(crate) async fn query<T: FromRedisValue>(
         &mut self,
         q: &impl Queryable,
-    ) -> anyhow::Result<T> {
+    ) -> Result<T, RedisKVClientError> {
         let e = match q.query(&mut self.client).await {
             Ok(t) => return Ok(t),
             Err(e) => e,
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index 2e67d07079..ef7c8a4d82 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -1,6 +1,7 @@
+use json::{ListSer, ObjectSer, ValueSer};
 use postgres_client::Row;
 use postgres_client::types::{Kind, Type};
-use serde_json::{Map, Value};
+use serde_json::Value;
 
 //
 // Convert json non-string types to strings, so that they can be passed to Postgres
@@ -74,44 +75,40 @@ pub(crate) enum JsonConversionError {
     UnbalancedString,
 }
 
-enum OutputMode {
-    Array(Vec<Value>),
-    Object(Map<String, Value>),
+enum OutputMode<'a> {
+    Array(ListSer<'a>),
+    Object(ObjectSer<'a>),
 }
 
-impl OutputMode {
-    fn key(&mut self, key: &str) -> &mut Value {
+impl OutputMode<'_> {
+    fn key(&mut self, key: &str) -> ValueSer<'_> {
         match self {
-            OutputMode::Array(values) => push_entry(values, Value::Null),
-            OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null),
+            OutputMode::Array(values) => values.entry(),
+            OutputMode::Object(map) => map.key(key),
         }
     }
 
-    fn finish(self) -> Value {
+    fn finish(self) {
         match self {
-            OutputMode::Array(values) => Value::Array(values),
-            OutputMode::Object(map) => Value::Object(map),
+            OutputMode::Array(values) => values.finish(),
+            OutputMode::Object(map) => map.finish(),
         }
     }
 }
 
-fn push_entry<T>(arr: &mut Vec<T>, t: T) -> &mut T {
-    arr.push(t);
-    arr.last_mut().expect("a value was just inserted")
-}
-
 //
 // Convert postgres row with text-encoded values to JSON object
 //
 pub(crate) fn pg_text_row_to_json(
+    output: ValueSer,
     row: &Row,
     raw_output: bool,
     array_mode: bool,
-) -> Result<Value, JsonConversionError> {
+) -> Result<(), JsonConversionError> {
     let mut entries = if array_mode {
-        OutputMode::Array(Vec::with_capacity(row.columns().len()))
+        OutputMode::Array(output.list())
     } else {
-        OutputMode::Object(Map::with_capacity(row.columns().len()))
+        OutputMode::Object(output.object())
     };
 
     for (i, column) in row.columns().iter().enumerate() {
@@ -120,53 +117,48 @@ pub(crate) fn pg_text_row_to_json(
         let value = entries.key(column.name());
 
         match pg_value {
-            Some(v) if raw_output => *value = Value::String(v.to_string()),
+            Some(v) if raw_output => value.value(v),
             Some(v) => pg_text_to_json(value, v, column.type_())?,
-            None => *value = Value::Null,
+            None => value.value(json::Null),
         }
     }
 
-    Ok(entries.finish())
+    entries.finish();
+    Ok(())
 }
 
 //
 // Convert postgres text-encoded value to JSON value
 //
-fn pg_text_to_json(
-    output: &mut Value,
-    val: &str,
-    pg_type: &Type,
-) -> Result<(), JsonConversionError> {
+fn pg_text_to_json(output: ValueSer, val: &str, pg_type: &Type) -> Result<(), JsonConversionError> {
     if let Kind::Array(elem_type) = pg_type.kind() {
         // todo: we should fetch this from postgres.
         let delimiter = ',';
 
-        let mut array = vec![];
-        pg_array_parse(&mut array, val, elem_type, delimiter)?;
-        *output = Value::Array(array);
+        json::value_as_list!(|output| pg_array_parse(output, val, elem_type, delimiter)?);
         return Ok(());
     }
 
     match *pg_type {
-        Type::BOOL => *output = Value::Bool(val == "t"),
+        Type::BOOL => output.value(val == "t"),
         Type::INT2 | Type::INT4 => {
             let val = val.parse::<i32>()?;
-            *output = Value::Number(serde_json::Number::from(val));
+            output.value(val);
         }
         Type::FLOAT4 | Type::FLOAT8 => {
             let fval = val.parse::<f64>()?;
-            let num = serde_json::Number::from_f64(fval);
-            if let Some(num) = num {
-                *output = Value::Number(num);
+            if fval.is_finite() {
+                output.value(fval);
             } else {
                 // Pass Nan, Inf, -Inf as strings
                 // JS JSON.stringify() does converts them to null, but we
                 // want to preserve them, so we pass them as strings
-                *output = Value::String(val.to_string());
+                output.value(val);
             }
         }
-        Type::JSON | Type::JSONB => *output = serde_json::from_str(val)?,
-        _ => *output = Value::String(val.to_string()),
+        // we assume that the string value is valid json.
+        Type::JSON | Type::JSONB => output.write_raw_json(val.as_bytes()),
+        _ => output.value(val),
     }
 
     Ok(())
@@ -192,7 +184,7 @@ fn pg_text_to_json(
 /// gets its own level of curly braces, and delimiters must be written between adjacent
 /// curly-braced entities of the same level.
 fn pg_array_parse(
-    elements: &mut Vec<Value>,
+    elements: &mut ListSer,
     mut pg_array: &str,
     elem: &Type,
     delim: char,
@@ -221,7 +213,7 @@ fn pg_array_parse(
 /// reads a single array from the `pg_array` string and pushes each values to `elements`.
 /// returns the rest of the `pg_array` string that was not read.
 fn pg_array_parse_inner<'a>(
-    elements: &mut Vec<Value>,
+    elements: &mut ListSer,
     mut pg_array: &'a str,
     elem: &Type,
     delim: char,
@@ -234,7 +226,7 @@ fn pg_array_parse_inner<'a>(
     let mut q = String::new();
 
     loop {
-        let value = push_entry(elements, Value::Null);
+        let value = elements.entry();
         pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?;
 
         // check for separator.
@@ -260,7 +252,7 @@ fn pg_array_parse_inner<'a>(
 ///
 /// `quoted` is a scratch allocation that has no defined output.
 fn pg_array_parse_item<'a>(
-    output: &mut Value,
+    output: ValueSer,
     quoted: &mut String,
     mut pg_array: &'a str,
     elem: &Type,
@@ -276,9 +268,8 @@ fn pg_array_parse_item<'a>(
 
     if pg_array.starts_with('{') {
         // nested array.
-        let mut nested = vec![];
-        pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?;
-        *output = Value::Array(nested);
+        pg_array =
+            json::value_as_list!(|output| pg_array_parse_inner(output, pg_array, elem, delim))?;
         return Ok(pg_array);
     }
 
@@ -306,7 +297,7 @@ fn pg_array_parse_item<'a>(
     // we might have an item string:
     // check for null
     if item == "NULL" {
-        *output = Value::Null;
+        output.value(json::Null);
     } else {
         pg_text_to_json(output, item, elem)?;
     }
@@ -440,15 +431,15 @@ mod tests {
     }
 
     fn pg_text_to_json(val: &str, pg_type: &Type) -> Value {
-        let mut v = Value::Null;
-        super::pg_text_to_json(&mut v, val, pg_type).unwrap();
-        v
+        let output = json::value_to_string!(|v| super::pg_text_to_json(v, val, pg_type).unwrap());
+        serde_json::from_str(&output).unwrap()
     }
 
     fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value {
-        let mut array = vec![];
-        super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap();
-        Value::Array(array)
+        let output = json::value_to_string!(|v| json::value_as_list!(|v| {
+            super::pg_array_parse(v, pg_array, pg_type, ',').unwrap();
+        }));
+        serde_json::from_str(&output).unwrap()
     }
 
     #[test]
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 7a718d0280..8a14f804b6 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -14,10 +14,7 @@ use hyper::http::{HeaderName, HeaderValue};
 use hyper::{Request, Response, StatusCode, header};
 use indexmap::IndexMap;
 use postgres_client::error::{DbError, ErrorPosition, SqlState};
-use postgres_client::{
-    GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction,
-};
-use serde::Serialize;
+use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use serde_json::Value;
 use serde_json::value::RawValue;
 use tokio::time::{self, Instant};
@@ -687,32 +684,21 @@ impl QueryData {
         let (inner, mut discard) = client.inner();
         let cancel_token = inner.cancel_token();
 
-        match select(
+        let mut json_buf = vec![];
+
+        let batch_result = match select(
             pin!(query_to_json(
                 config,
                 &mut *inner,
                 self,
-                &mut 0,
+                json::ValueSer::new(&mut json_buf),
                 parsed_headers
             )),
             pin!(cancel.cancelled()),
         )
         .await
         {
-            // The query successfully completed.
-            Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
-                discard.check_idle(status);
-
-                let json_output =
-                    serde_json::to_string(&results).expect("json serialization should not fail");
-                Ok(json_output)
-            }
-            // The query failed with an error
-            Either::Left((Err(e), __not_yet_cancelled)) => {
-                discard.discard();
-                Err(e)
-            }
-            // The query was cancelled.
+            Either::Left((res, __not_yet_cancelled)) => res,
             Either::Right((_cancelled, query)) => {
                 tracing::info!("cancelling query");
                 if let Err(err) = cancel_token.cancel_query(NoTls).await {
@@ -721,13 +707,7 @@ impl QueryData {
                 // wait for the query cancellation
                 match time::timeout(time::Duration::from_millis(100), query).await {
                     // query successed before it was cancelled.
-                    Ok(Ok((status, results))) => {
-                        discard.check_idle(status);
-
-                        let json_output = serde_json::to_string(&results)
-                            .expect("json serialization should not fail");
-                        Ok(json_output)
-                    }
+                    Ok(Ok(status)) => Ok(status),
                     // query failed or was cancelled.
                     Ok(Err(error)) => {
                         let db_error = match &error {
@@ -743,14 +723,29 @@ impl QueryData {
                             discard.discard();
                         }
 
-                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                        return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                     }
                     Err(_timeout) => {
                         discard.discard();
-                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                        return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                     }
                 }
             }
+        };
+
+        match batch_result {
+            // The query successfully completed.
+            Ok(status) => {
+                discard.check_idle(status);
+
+                let json_output = String::from_utf8(json_buf).expect("json should be valid utf8");
+                Ok(json_output)
+            }
+            // The query failed with an error
+            Err(e) => {
+                discard.discard();
+                Err(e)
+            }
         }
     }
 }
@@ -787,7 +782,7 @@ impl BatchQueryData {
             })
             .map_err(SqlOverHttpError::Postgres)?;
 
-        let json_output = match query_batch(
+        let json_output = match query_batch_to_json(
             config,
             cancel.child_token(),
             &mut transaction,
@@ -845,24 +840,21 @@ async fn query_batch(
     transaction: &mut Transaction<'_>,
     queries: BatchQueryData,
     parsed_headers: HttpHeaders,
-) -> Result<String, SqlOverHttpError> {
-    let mut results = Vec::with_capacity(queries.queries.len());
-    let mut current_size = 0;
+    results: &mut json::ListSer<'_>,
+) -> Result<(), SqlOverHttpError> {
     for stmt in queries.queries {
         let query = pin!(query_to_json(
             config,
             transaction,
             stmt,
-            &mut current_size,
+            results.entry(),
             parsed_headers,
         ));
         let cancelled = pin!(cancel.cancelled());
         let res = select(query, cancelled).await;
         match res {
             // TODO: maybe we should check that the transaction bit is set here
-            Either::Left((Ok((_, values)), _cancelled)) => {
-                results.push(values);
-            }
+            Either::Left((Ok(_), _cancelled)) => {}
             Either::Left((Err(e), _cancelled)) => {
                 return Err(e);
             }
@@ -872,8 +864,22 @@ async fn query_batch(
         }
     }
 
-    let results = json!({ "results": results });
-    let json_output = serde_json::to_string(&results).expect("json serialization should not fail");
+    Ok(())
+}
+
+async fn query_batch_to_json(
+    config: &'static HttpConfig,
+    cancel: CancellationToken,
+    tx: &mut Transaction<'_>,
+    queries: BatchQueryData,
+    headers: HttpHeaders,
+) -> Result<String, SqlOverHttpError> {
+    let json_output = json::value_to_string!(|obj| json::value_as_object!(|obj| {
+        let results = obj.key("results");
+        json::value_as_list!(|results| {
+            query_batch(config, cancel, tx, queries, headers, results).await?;
+        });
+    }));
 
     Ok(json_output)
 }
@@ -882,54 +888,54 @@ async fn query_to_json<T: GenericClient>(
     config: &'static HttpConfig,
     client: &mut T,
     data: QueryData,
-    current_size: &mut usize,
+    output: json::ValueSer<'_>,
     parsed_headers: HttpHeaders,
-) -> Result<(ReadyForQueryStatus, impl Serialize + use<T>), SqlOverHttpError> {
+) -> Result<ReadyForQueryStatus, SqlOverHttpError> {
     let query_start = Instant::now();
 
-    let query_params = data.params;
+    let mut output = json::ObjectSer::new(output);
     let mut row_stream = client
-        .query_raw_txt(&data.query, query_params)
+        .query_raw_txt(&data.query, data.params)
         .await
         .map_err(SqlOverHttpError::Postgres)?;
     let query_acknowledged = Instant::now();
 
-    let columns_len = row_stream.statement.columns().len();
-    let mut fields = Vec::with_capacity(columns_len);
-
+    let mut json_fields = output.key("fields").list();
     for c in row_stream.statement.columns() {
-        fields.push(json!({
-            "name": c.name().to_owned(),
-            "dataTypeID": c.type_().oid(),
-            "tableID": c.table_oid(),
-            "columnID": c.column_id(),
-            "dataTypeSize": c.type_size(),
-            "dataTypeModifier": c.type_modifier(),
-            "format": "text",
-        }));
+        let json_field = json_fields.entry();
+        json::value_as_object!(|json_field| {
+            json_field.entry("name", c.name());
+            json_field.entry("dataTypeID", c.type_().oid());
+            json_field.entry("tableID", c.table_oid());
+            json_field.entry("columnID", c.column_id());
+            json_field.entry("dataTypeSize", c.type_size());
+            json_field.entry("dataTypeModifier", c.type_modifier());
+            json_field.entry("format", "text");
+        });
     }
+    json_fields.finish();
 
-    let raw_output = parsed_headers.raw_output;
     let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
+    let raw_output = parsed_headers.raw_output;
 
     // Manually drain the stream into a vector to leave row_stream hanging
     // around to get a command tag. Also check that the response is not too
     // big.
-    let mut rows = Vec::new();
+    let mut rows = 0;
+    let mut json_rows = output.key("rows").list();
     while let Some(row) = row_stream.next().await {
         let row = row.map_err(SqlOverHttpError::Postgres)?;
-        *current_size += row.body_len();
 
         // we don't have a streaming response support yet so this is to prevent OOM
         // from a malicious query (eg a cross join)
-        if *current_size > config.max_response_size_bytes {
+        if json_rows.as_buffer().len() > config.max_response_size_bytes {
             return Err(SqlOverHttpError::ResponseTooLarge(
                 config.max_response_size_bytes,
             ));
         }
 
-        let row = pg_text_row_to_json(&row, raw_output, array_mode)?;
-        rows.push(row);
+        pg_text_row_to_json(json_rows.entry(), &row, raw_output, array_mode)?;
+        rows += 1;
 
         // assumption: parsing pg text and converting to json takes CPU time.
         // let's assume it is slightly expensive, so we should consume some cooperative budget.
@@ -937,16 +943,14 @@ async fn query_to_json<T: GenericClient>(
         // of rows and never hit the tokio mpsc for a long time (although unlikely).
         tokio::task::consume_budget().await;
     }
+    json_rows.finish();
 
     let query_resp_end = Instant::now();
-    let RowStream {
-        command_tag,
-        status: ready,
-        ..
-    } = row_stream;
+
+    let ready = row_stream.status;
 
     // grab the command tag and number of rows affected
-    let command_tag = command_tag.unwrap_or_default();
+    let command_tag = row_stream.command_tag.unwrap_or_default();
     let mut command_tag_split = command_tag.split(' ');
     let command_tag_name = command_tag_split.next().unwrap_or_default();
     let command_tag_count = if command_tag_name == "INSERT" {
@@ -959,7 +963,7 @@ async fn query_to_json<T: GenericClient>(
     .and_then(|s| s.parse::<i64>().ok());
 
     info!(
-        rows = rows.len(),
+        rows,
         ?ready,
         command_tag,
         acknowledgement = ?(query_acknowledged - query_start),
@@ -967,16 +971,12 @@ async fn query_to_json<T: GenericClient>(
         "finished executing query"
     );
 
-    // Resulting JSON format is based on the format of node-postgres result.
-    let results = json!({
-        "command": command_tag_name.to_string(),
-        "rowCount": command_tag_count,
-        "rows": rows,
-        "fields": fields,
-        "rowAsArray": array_mode,
-    });
+    output.entry("command", command_tag_name);
+    output.entry("rowCount", command_tag_count);
+    output.entry("rowAsArray", array_mode);
 
-    Ok((ready, results))
+    output.finish();
+    Ok(ready)
 }
 
 enum Client {
diff --git a/proxy/src/util.rs b/proxy/src/util.rs
index 7fc2d9fbdb..0291216d94 100644
--- a/proxy/src/util.rs
+++ b/proxy/src/util.rs
@@ -7,8 +7,16 @@ pub async fn run_until_cancelled<F: Future>(
     f: F,
     cancellation_token: &CancellationToken,
 ) -> Option<F::Output> {
-    match select(pin!(f), pin!(cancellation_token.cancelled())).await {
-        Either::Left((f, _)) => Some(f),
-        Either::Right(((), _)) => None,
+    run_until(f, cancellation_token.cancelled()).await.ok()
+}
+
+/// Runs the future `f` unless interrupted by future `condition`.
+pub async fn run_until<F1: Future, F2: Future>(
+    f: F1,
+    condition: F2,
+) -> Result<F1::Output, F2::Output> {
+    match select(pin!(f), pin!(condition)).await {
+        Either::Left((f1, _)) => Ok(f1),
+        Either::Right((f2, _)) => Err(f2),
     }
 }
diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index b4bb193a4b..3c8db3029e 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -6,10 +6,10 @@
 use std::error::Error as _;
 
 use http_utils::error::HttpErrorBody;
-use reqwest::{IntoUrl, Method, StatusCode};
+use reqwest::{IntoUrl, Method, Response, StatusCode};
 use safekeeper_api::models::{
     self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization,
-    TimelineCreateRequest, TimelineStatus,
+    TimelineCreateRequest,
 };
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
@@ -161,13 +161,12 @@ impl Client {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-    ) -> Result<TimelineStatus> {
+    ) -> Result<Response> {
         let uri = format!(
             "{}/v1/tenant/{}/timeline/{}",
             self.mgmt_api_endpoint, tenant_id, timeline_id
         );
-        let resp = self.get(&uri).await?;
-        resp.json().await.map_err(Error::ReceiveBody)
+        self.get(&uri).await
     }
 
     pub async fn snapshot(
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 8fda625817..79cf2f9149 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -23,6 +23,7 @@ use safekeeper::defaults::{
     DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
     DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
 };
+use safekeeper::hadron;
 use safekeeper::wal_backup::WalBackup;
 use safekeeper::{
     BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
@@ -37,6 +38,7 @@ use tracing::*;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
 use utils::id::NodeId;
 use utils::logging::{self, LogFormat, SecretString};
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{pid_file, project_build_tag, project_git_version, tcp_listener};
 
@@ -243,9 +245,18 @@ struct Args {
     #[arg(long)]
     enable_tls_wal_service_api: bool,
 
+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+    #[arg(long, default_value_t = true)]
+    force_metric_collection_on_scrape: bool,
+
     /// Run in development mode (disables security checks)
     #[arg(long, help = "Run in development mode (disables security checks)")]
     dev: bool,
+    /* BEGIN_HADRON */
+    #[arg(long)]
+    enable_pull_timeline_on_startup: bool,
+    /* END_HADRON */
 }
 
 // Like PathBufValueParser, but allows empty string.
@@ -428,6 +439,12 @@ async fn main() -> anyhow::Result<()> {
         ssl_ca_certs,
         use_https_safekeeper_api: args.use_https_safekeeper_api,
         enable_tls_wal_service_api: args.enable_tls_wal_service_api,
+        force_metric_collection_on_scrape: args.force_metric_collection_on_scrape,
+        /* BEGIN_HADRON */
+        advertise_pg_addr_tenant_only: None,
+        enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup,
+        hcc_base_url: None,
+        /* END_HADRON */
     });
 
     // initialize sentry if SENTRY_DSN is provided
@@ -522,6 +539,20 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
     // Load all timelines from disk to memory.
     global_timelines.init().await?;
 
+    /* BEGIN_HADRON */
+    if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 {
+        match hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await {
+            Ok(_) => {
+                info!("Successfully pulled all timelines from peer safekeepers");
+            }
+            Err(e) => {
+                error!("Failed to pull timelines from peer safekeepers: {:?}", e);
+                return Err(e);
+            }
+        }
+    }
+    /* END_HADRON */
+
     // Run everything in current thread rt, if asked.
     if conf.current_thread_runtime {
         info!("running in current thread runtime");
@@ -640,6 +671,26 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
         .map(|res| ("broker main".to_owned(), res));
     tasks_handles.push(Box::pin(broker_task_handle));
 
+    /* BEGIN_HADRON */
+    if conf.force_metric_collection_on_scrape {
+        let metrics_handle = current_thread_rt
+            .as_ref()
+            .unwrap_or_else(|| BACKGROUND_RUNTIME.handle())
+            .spawn(async move {
+                let mut interval: tokio::time::Interval =
+                    tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    interval.tick().await;
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            })
+            .map(|res| ("broker main".to_owned(), res));
+        tasks_handles.push(Box::pin(metrics_handle));
+    }
+    /* END_HADRON */
+
     set_build_info_metric(GIT_VERSION, BUILD_TAG);
 
     // TODO: update tokio-stream, convert to real async Stream with
diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs
new file mode 100644
index 0000000000..b41bf2c3da
--- /dev/null
+++ b/safekeeper/src/hadron.rs
@@ -0,0 +1,388 @@
+use pem::Pem;
+use safekeeper_api::models::PullTimelineRequest;
+use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration};
+use tokio::time::sleep;
+use tokio_util::sync::CancellationToken;
+use url::Url;
+use utils::{backoff, id::TenantTimelineId, ip_address};
+
+use anyhow::Result;
+use pageserver_api::controller_api::{
+    AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse,
+};
+
+use crate::{
+    GlobalTimelines, SafeKeeperConf,
+    metrics::{
+        SK_RECOVERY_PULL_TIMELINE_ERRORS, SK_RECOVERY_PULL_TIMELINE_OKS,
+        SK_RECOVERY_PULL_TIMELINE_SECONDS, SK_RECOVERY_PULL_TIMELINES_SECONDS,
+    },
+    pull_timeline,
+    timelines_global_map::DeleteOrExclude,
+};
+
+// Extract information in the SafeKeeperConf to build a NodeRegisterRequest used to register the safekeeper with the HCC.
+fn build_node_registeration_request(
+    conf: &SafeKeeperConf,
+    node_ip_addr: Option<IpAddr>,
+) -> Result<NodeRegisterRequest> {
+    let advertise_pg_addr_with_port = conf
+        .advertise_pg_addr_tenant_only
+        .as_deref()
+        .expect("advertise_pg_addr_tenant_only is required to register with HCC");
+
+    // Extract host/port from the string.
+    let (advertise_host_addr, pg_port_str) = advertise_pg_addr_with_port.split_at(
+        advertise_pg_addr_with_port
+            .rfind(':')
+            .ok_or(anyhow::anyhow!("Invalid advertise_pg_addr"))?,
+    );
+    // Need the `[1..]` to remove the leading ':'.
+    let pg_port = pg_port_str[1..]
+        .parse::<u16>()
+        .map_err(|e| anyhow::anyhow!("Cannot parse PG port: {}", e))?;
+
+    let (_, http_port_str) = conf.listen_http_addr.split_at(
+        conf.listen_http_addr
+            .rfind(':')
+            .ok_or(anyhow::anyhow!("Invalid listen_http_addr"))?,
+    );
+    let http_port = http_port_str[1..]
+        .parse::<u16>()
+        .map_err(|e| anyhow::anyhow!("Cannot parse HTTP port: {}", e))?;
+
+    Ok(NodeRegisterRequest {
+        node_id: conf.my_id,
+        listen_pg_addr: advertise_host_addr.to_string(),
+        listen_pg_port: pg_port,
+        listen_http_addr: advertise_host_addr.to_string(),
+        listen_http_port: http_port,
+        node_ip_addr,
+        availability_zone_id: AvailabilityZone("todo".to_string()),
+        listen_grpc_addr: None,
+        listen_grpc_port: None,
+        listen_https_port: None,
+    })
+}
+
+// Retrieve the JWT token used for authenticating with HCC from the environment variable.
+// Returns None if the token cannot be retrieved.
+fn get_hcc_auth_token() -> Option<String> {
+    match std::env::var("HCC_AUTH_TOKEN") {
+        Ok(v) => {
+            tracing::info!("Loaded JWT token for authentication with HCC");
+            Some(v)
+        }
+        Err(VarError::NotPresent) => {
+            tracing::info!("No JWT token for authentication with HCC detected");
+            None
+        }
+        Err(_) => {
+            tracing::info!(
+                "Failed to either load to detect non-present HCC_AUTH_TOKEN environment variable"
+            );
+            None
+        }
+    }
+}
+
+async fn send_safekeeper_register_request(
+    request_url: &Url,
+    auth_token: &Option<String>,
+    request: &NodeRegisterRequest,
+) -> Result<()> {
+    let client = reqwest::Client::new();
+    let mut req_builder = client
+        .post(request_url.clone())
+        .header("Content-Type", "application/json");
+    if let Some(token) = auth_token {
+        req_builder = req_builder.bearer_auth(token);
+    }
+    req_builder
+        .json(&request)
+        .send()
+        .await?
+        .error_for_status()?;
+    Ok(())
+}
+
+/// Registers this safe keeper with the HCC.
+pub async fn register(conf: &SafeKeeperConf) -> Result<()> {
+    match conf.hcc_base_url.as_ref() {
+        None => {
+            tracing::info!("HCC base URL is not set, skipping registration");
+            Ok(())
+        }
+        Some(hcc_base_url) => {
+            // The following operations acquiring the auth token and the node IP address both read environment
+            // variables. It's fine for now as this `register()` function is only called once during startup.
+            // If we start to talk to HCC more regularly in the safekeeper we should probably consider
+            // refactoring things into a "HadronClusterCoordinatorClient" struct.
+            let auth_token = get_hcc_auth_token();
+            let node_ip_addr =
+                ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address.");
+
+            let request = build_node_registeration_request(conf, node_ip_addr)?;
+            let cancel = CancellationToken::new();
+            let request_url = hcc_base_url.clone().join("/hadron-internal/v1/sk")?;
+
+            backoff::retry(
+                || async {
+                    send_safekeeper_register_request(&request_url, &auth_token, &request).await
+                },
+                |_| false,
+                3,
+                u32::MAX,
+                "Calling the HCC safekeeper register API",
+                &cancel,
+            )
+            .await
+            .ok_or(anyhow::anyhow!(
+                "Error in forever retry loop. This error should never be surfaced."
+            ))?
+        }
+    }
+}
+
+async fn safekeeper_list_timelines_request(
+    conf: &SafeKeeperConf,
+) -> Result<pageserver_api::controller_api::SafekeeperTimelinesResponse> {
+    if conf.hcc_base_url.is_none() {
+        tracing::info!("HCC base URL is not set, skipping registration");
+        return Err(anyhow::anyhow!("HCC base URL is not set"));
+    }
+
+    // The following operations acquiring the auth token and the node IP address both read environment
+    // variables. It's fine for now as this `register()` function is only called once during startup.
+    // If we start to talk to HCC more regularly in the safekeeper we should probably consider
+    // refactoring things into a "HadronClusterCoordinatorClient" struct.
+    let auth_token = get_hcc_auth_token();
+    let method = format!("/control/v1/safekeeper/{}/timelines", conf.my_id.0);
+    let request_url = conf.hcc_base_url.as_ref().unwrap().clone().join(&method)?;
+
+    let client = reqwest::Client::new();
+    let mut req_builder = client
+        .get(request_url.clone())
+        .header("Content-Type", "application/json")
+        .query(&[("id", conf.my_id.0)]);
+    if let Some(token) = auth_token {
+        req_builder = req_builder.bearer_auth(token);
+    }
+    let response = req_builder
+        .send()
+        .await?
+        .error_for_status()?
+        .json::<pageserver_api::controller_api::SafekeeperTimelinesResponse>()
+        .await?;
+    Ok(response)
+}
+
+// Returns true on success, false otherwise.
+pub async fn hcc_pull_timeline(
+    timeline: SafekeeperTimeline,
+    conf: &SafeKeeperConf,
+    global_timelines: Arc<GlobalTimelines>,
+    nodeid_http: &HashMap<u64, String>,
+) -> bool {
+    let mut request = PullTimelineRequest {
+        tenant_id: timeline.tenant_id,
+        timeline_id: timeline.timeline_id,
+        http_hosts: Vec::new(),
+        ignore_tombstone: None,
+    };
+    for host in timeline.peers {
+        if host.0 == conf.my_id.0 {
+            continue;
+        }
+        if let Some(http_host) = nodeid_http.get(&host.0) {
+            request.http_hosts.push(http_host.clone());
+        }
+    }
+
+    let ca_certs = match conf
+        .ssl_ca_certs
+        .iter()
+        .map(Pem::contents)
+        .map(reqwest::Certificate::from_der)
+        .collect::<Result<Vec<_>, _>>()
+    {
+        Ok(result) => result,
+        Err(_) => {
+            return false;
+        }
+    };
+    match pull_timeline::handle_request(
+        request,
+        conf.sk_auth_token.clone(),
+        ca_certs,
+        global_timelines.clone(),
+        true,
+    )
+    .await
+    {
+        Ok(resp) => {
+            tracing::info!(
+                "Completed pulling tenant {} timeline {} from SK {:?}",
+                timeline.tenant_id,
+                timeline.timeline_id,
+                resp.safekeeper_host
+            );
+            return true;
+        }
+        Err(e) => {
+            tracing::error!(
+                "Failed to pull tenant {} timeline {} from SK {}",
+                timeline.tenant_id,
+                timeline.timeline_id,
+                e
+            );
+
+            let ttid = TenantTimelineId {
+                tenant_id: timeline.tenant_id,
+                timeline_id: timeline.timeline_id,
+            };
+            // Revert the failed timeline pull.
+            // Notice that not found timeline returns OK also.
+            match global_timelines
+                .delete_or_exclude(&ttid, DeleteOrExclude::DeleteLocal)
+                .await
+            {
+                Ok(dr) => {
+                    tracing::info!(
+                        "Deleted tenant {} timeline {} DirExists: {}",
+                        timeline.tenant_id,
+                        timeline.timeline_id,
+                        dr.dir_existed,
+                    );
+                }
+                Err(e) => {
+                    tracing::error!(
+                        "Failed to delete tenant {} timeline {} from global_timelines: {}",
+                        timeline.tenant_id,
+                        timeline.timeline_id,
+                        e
+                    );
+                }
+            }
+        }
+    }
+    false
+}
+
+pub async fn hcc_pull_timeline_till_success(
+    timeline: SafekeeperTimeline,
+    conf: &SafeKeeperConf,
+    global_timelines: Arc<GlobalTimelines>,
+    nodeid_http: &HashMap<u64, String>,
+) {
+    const MAX_PULL_TIMELINE_RETRIES: u64 = 100;
+    for i in 0..MAX_PULL_TIMELINE_RETRIES {
+        if hcc_pull_timeline(
+            timeline.clone(),
+            conf,
+            global_timelines.clone(),
+            nodeid_http,
+        )
+        .await
+        {
+            SK_RECOVERY_PULL_TIMELINE_OKS.inc();
+            return;
+        }
+        tracing::error!(
+            "Failed to pull timeline {} from SK peers, retrying {}/{}",
+            timeline.timeline_id,
+            i + 1,
+            MAX_PULL_TIMELINE_RETRIES
+        );
+        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+    }
+    SK_RECOVERY_PULL_TIMELINE_ERRORS.inc();
+}
+
+pub async fn hcc_pull_timelines(
+    conf: &SafeKeeperConf,
+    global_timelines: Arc<GlobalTimelines>,
+) -> Result<()> {
+    let _timer = SK_RECOVERY_PULL_TIMELINES_SECONDS.start_timer();
+    tracing::info!("Start pulling timelines from SK peers");
+
+    let mut response = SafekeeperTimelinesResponse {
+        timelines: Vec::new(),
+        safekeeper_peers: Vec::new(),
+    };
+    for i in 0..100 {
+        match safekeeper_list_timelines_request(conf).await {
+            Ok(timelines) => {
+                response = timelines;
+            }
+            Err(e) => {
+                tracing::error!("Failed to list timelines from HCC: {}", e);
+                if i == 99 {
+                    return Err(e);
+                }
+            }
+        }
+        sleep(Duration::from_millis(100)).await;
+    }
+
+    let mut nodeid_http = HashMap::new();
+    for sk in response.safekeeper_peers {
+        nodeid_http.insert(
+            sk.node_id.0,
+            format!("http://{}:{}", sk.listen_http_addr, sk.http_port),
+        );
+    }
+    tracing::info!("Received {} timelines from HCC", response.timelines.len());
+    for timeline in response.timelines {
+        let _timer = SK_RECOVERY_PULL_TIMELINE_SECONDS
+            .with_label_values(&[
+                &timeline.tenant_id.to_string(),
+                &timeline.timeline_id.to_string(),
+            ])
+            .start_timer();
+        hcc_pull_timeline_till_success(timeline, conf, global_timelines.clone(), &nodeid_http)
+            .await;
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use utils::id::NodeId;
+
+    #[test]
+    fn test_build_node_registeration_request() {
+        // Test that:
+        // 1. We always extract the host name and port used to register with the HCC from the
+        //    `advertise_pg_addr` if it is set.
+        // 2. The correct ports are extracted from `advertise_pg_addr` and `listen_http_addr`.
+        let mut conf = SafeKeeperConf::dummy();
+        conf.my_id = NodeId(1);
+        conf.advertise_pg_addr_tenant_only =
+            Some("safe-keeper-1.safe-keeper.hadron.svc.cluster.local:5454".to_string());
+        // `listen_pg_addr` and `listen_pg_addr_tenant_only` are not used for node registration. Set them to a different
+        // host and port values and make sure that they don't show up in the node registration request.
+        conf.listen_pg_addr = "0.0.0.0:5456".to_string();
+        conf.listen_pg_addr_tenant_only = Some("0.0.0.0:5456".to_string());
+        conf.listen_http_addr = "0.0.0.0:7676".to_string();
+        let node_ip_addr: Option<IpAddr> = Some("127.0.0.1".parse().unwrap());
+
+        let request = build_node_registeration_request(&conf, node_ip_addr).unwrap();
+        assert_eq!(request.node_id, NodeId(1));
+        assert_eq!(
+            request.listen_pg_addr,
+            "safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
+        );
+        assert_eq!(request.listen_pg_port, 5454);
+        assert_eq!(
+            request.listen_http_addr,
+            "safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
+        );
+        assert_eq!(request.listen_http_port, 7676);
+        assert_eq!(
+            request.node_ip_addr,
+            Some(IpAddr::V4("127.0.0.1".parse().unwrap()))
+        );
+    }
+}
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 384c582678..a0ee2facb5 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -241,9 +241,14 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
             ApiError::InternalServerError(anyhow::anyhow!("failed to parse CA certs: {e}"))
         })?;
 
-    let resp =
-        pull_timeline::handle_request(data, conf.sk_auth_token.clone(), ca_certs, global_timelines)
-            .await?;
+    let resp = pull_timeline::handle_request(
+        data,
+        conf.sk_auth_token.clone(),
+        ca_certs,
+        global_timelines,
+        false,
+    )
+    .await?;
     json_response(StatusCode::OK, resp)
 }
 
@@ -699,6 +704,11 @@ pub fn make_router(
         }))
     }
 
+    let force_metric_collection_on_scrape = conf.force_metric_collection_on_scrape;
+
+    let prometheus_metrics_handler_wrapper =
+        move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
+
     // NB: on any changes do not forget to update the OpenAPI spec
     // located nearby (/safekeeper/src/http/openapi_spec.yaml).
     let auth = conf.http_auth.clone();
@@ -706,7 +716,9 @@ pub fn make_router(
         .data(conf)
         .data(global_timelines)
         .data(auth)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| {
+            request_span(r, prometheus_metrics_handler_wrapper)
+        })
         .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
         .get("/profile/heap", |r| request_span(r, profile_heap_handler))
         .get("/v1/status", |r| request_span(r, status_handler))
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index c461c071da..02533b804d 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -10,6 +10,7 @@ use pem::Pem;
 use remote_storage::RemoteStorageConfig;
 use storage_broker::Uri;
 use tokio::runtime::Runtime;
+use url::Url;
 use utils::auth::SwappableJwtAuth;
 use utils::id::NodeId;
 use utils::logging::SecretString;
@@ -20,6 +21,7 @@ pub mod control_file;
 pub mod control_file_upgrade;
 pub mod copy_timeline;
 pub mod debug_dump;
+pub mod hadron;
 pub mod handler;
 pub mod http;
 pub mod metrics;
@@ -100,6 +102,11 @@ pub struct SafeKeeperConf {
     pub advertise_pg_addr: Option<String>,
     pub availability_zone: Option<String>,
     pub no_sync: bool,
+    /* BEGIN_HADRON */
+    pub advertise_pg_addr_tenant_only: Option<String>,
+    pub enable_pull_timeline_on_startup: bool,
+    pub hcc_base_url: Option<Url>,
+    /* END_HADRON */
     pub broker_endpoint: Uri,
     pub broker_keepalive_interval: Duration,
     pub heartbeat_timeout: Duration,
@@ -134,6 +141,7 @@ pub struct SafeKeeperConf {
     pub ssl_ca_certs: Vec<Pem>,
     pub use_https_safekeeper_api: bool,
     pub enable_tls_wal_service_api: bool,
+    pub force_metric_collection_on_scrape: bool,
 }
 
 impl SafeKeeperConf {
@@ -183,6 +191,12 @@ impl SafeKeeperConf {
             ssl_ca_certs: Vec::new(),
             use_https_safekeeper_api: false,
             enable_tls_wal_service_api: false,
+            force_metric_collection_on_scrape: true,
+            /* BEGIN_HADRON */
+            advertise_pg_addr_tenant_only: None,
+            enable_pull_timeline_on_startup: false,
+            hcc_base_url: None,
+            /* END_HADRON */
         }
     }
 }
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 9baa80f73a..e1af51c115 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -59,6 +59,15 @@ pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     .expect("Failed to register safekeeper_flush_wal_seconds histogram")
 });
 /* BEGIN_HADRON */
+// Counter of all ProposerAcceptorMessage requests received
+pub static PROPOSER_ACCEPTOR_MESSAGES_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_proposer_acceptor_messages_total",
+        "Total number of ProposerAcceptorMessage requests received by the Safekeeper.",
+        &["outcome"]
+    )
+    .expect("Failed to register safekeeper_proposer_acceptor_messages_total counter")
+});
 pub static WAL_DISK_IO_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "safekeeper_wal_disk_io_errors",
@@ -76,6 +85,43 @@ pub static WAL_STORAGE_LIMIT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_wal_storage_limit_errors counter")
 });
+pub static SK_RECOVERY_PULL_TIMELINE_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_recovery_pull_timeline_errors",
+        concat!(
+            "Number of errors due to pull_timeline errors during SK lost disk recovery.",
+            "An increase in this metric indicates pull timelines runs into error."
+        )
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timeline_errors counter")
+});
+pub static SK_RECOVERY_PULL_TIMELINE_OKS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_recovery_pull_timeline_oks",
+        concat!(
+            "Number of successful pull_timeline during SK lost disk recovery.",
+            "An increase in this metric indicates pull timelines is successful."
+        )
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timeline_oks counter")
+});
+pub static SK_RECOVERY_PULL_TIMELINES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_recovery_pull_timelines_seconds",
+        "Seconds to pull timelines",
+        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timelines_seconds histogram")
+});
+pub static SK_RECOVERY_PULL_TIMELINE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "safekeeper_recovery_pull_timeline_seconds",
+        "Seconds to pull timeline",
+        &["tenant_id", "timeline_id"],
+        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timeline_seconds histogram vec")
+});
 /* END_HADRON */
 pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 1c9e5bade5..b4c4877b2c 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -8,6 +8,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
+use http::StatusCode;
 use http_utils::error::ApiError;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
 use remote_storage::GenericRemoteStorage;
@@ -21,10 +22,11 @@ use tokio::fs::OpenOptions;
 use tokio::io::AsyncWrite;
 use tokio::sync::mpsc;
 use tokio::task;
+use tokio::time::sleep;
 use tokio_tar::{Archive, Builder, Header};
 use tokio_util::io::{CopyToBytes, SinkWriter};
 use tokio_util::sync::PollSender;
-use tracing::{error, info, instrument};
+use tracing::{error, info, instrument, warn};
 use utils::crashsafe::fsync_async_opt;
 use utils::id::{NodeId, TenantTimelineId};
 use utils::logging::SecretString;
@@ -449,6 +451,7 @@ pub async fn handle_request(
     sk_auth_token: Option<SecretString>,
     ssl_ca_certs: Vec<Certificate>,
     global_timelines: Arc<GlobalTimelines>,
+    wait_for_peer_timeline_status: bool,
 ) -> Result<PullTimelineResponse, ApiError> {
     let existing_tli = global_timelines.get(TenantTimelineId::new(
         request.tenant_id,
@@ -472,37 +475,100 @@ pub async fn handle_request(
     let http_hosts = request.http_hosts.clone();
 
     // Figure out statuses of potential donors.
-    let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
-        futures::future::join_all(http_hosts.iter().map(|url| async {
-            let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
-            let info = cclient
-                .timeline_status(request.tenant_id, request.timeline_id)
-                .await?;
-            Ok(info)
-        }))
-        .await;
-
     let mut statuses = Vec::new();
-    for (i, response) in responses.into_iter().enumerate() {
-        match response {
-            Ok(status) => {
-                statuses.push((status, i));
-            }
-            Err(e) => {
-                info!("error fetching status from {}: {e}", http_hosts[i]);
+    if !wait_for_peer_timeline_status {
+        let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
+            futures::future::join_all(http_hosts.iter().map(|url| async {
+                let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
+                let resp = cclient
+                    .timeline_status(request.tenant_id, request.timeline_id)
+                    .await?;
+                let info: TimelineStatus = resp
+                    .json()
+                    .await
+                    .context("Failed to deserialize timeline status")
+                    .map_err(|e| mgmt_api::Error::ReceiveErrorBody(e.to_string()))?;
+                Ok(info)
+            }))
+            .await;
+
+        for (i, response) in responses.into_iter().enumerate() {
+            match response {
+                Ok(status) => {
+                    statuses.push((status, i));
+                }
+                Err(e) => {
+                    info!("error fetching status from {}: {e}", http_hosts[i]);
+                }
             }
         }
-    }
 
-    // Allow missing responses from up to one safekeeper (say due to downtime)
-    // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
-    // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
-    let min_required_successful = (http_hosts.len() - 1).max(1);
-    if statuses.len() < min_required_successful {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "only got {} successful status responses. required: {min_required_successful}",
-            statuses.len()
-        )));
+        // Allow missing responses from up to one safekeeper (say due to downtime)
+        // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
+        // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
+        let min_required_successful = (http_hosts.len() - 1).max(1);
+        if statuses.len() < min_required_successful {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "only got {} successful status responses. required: {min_required_successful}",
+                statuses.len()
+            )));
+        }
+    } else {
+        let mut retry = true;
+        // We must get status from all other peers.
+        // Otherwise, we may run into split-brain scenario.
+        while retry {
+            statuses.clear();
+            retry = false;
+            for (i, url) in http_hosts.iter().enumerate() {
+                let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
+                match cclient
+                    .timeline_status(request.tenant_id, request.timeline_id)
+                    .await
+                {
+                    Ok(resp) => {
+                        if resp.status() == StatusCode::NOT_FOUND {
+                            warn!(
+                                "Timeline {} not found on peer SK {}, no need to pull it",
+                                TenantTimelineId::new(request.tenant_id, request.timeline_id),
+                                url
+                            );
+                            return Ok(PullTimelineResponse {
+                                safekeeper_host: None,
+                            });
+                        }
+                        let info: TimelineStatus = resp
+                            .json()
+                            .await
+                            .context("Failed to deserialize timeline status")
+                            .map_err(ApiError::InternalServerError)?;
+                        statuses.push((info, i));
+                    }
+                    Err(e) => {
+                        match e {
+                            // If we get a 404, it means the timeline doesn't exist on this safekeeper.
+                            // We can ignore this error.
+                            mgmt_api::Error::ApiError(status, _)
+                                if status == StatusCode::NOT_FOUND =>
+                            {
+                                warn!(
+                                    "Timeline {} not found on peer SK {}, no need to pull it",
+                                    TenantTimelineId::new(request.tenant_id, request.timeline_id),
+                                    url
+                                );
+                                return Ok(PullTimelineResponse {
+                                    safekeeper_host: None,
+                                });
+                            }
+                            _ => {}
+                        }
+                        retry = true;
+                        error!("Failed to get timeline status from {}: {:#}", url, e);
+                    }
+                }
+            }
+            sleep(std::time::Duration::from_millis(100)).await;
+        }
     }
 
     // Find the most advanced safekeeper
@@ -511,6 +577,12 @@ pub async fn handle_request(
         .max_by_key(|(status, _)| {
             (
                 status.acceptor_state.epoch,
+                /* BEGIN_HADRON */
+                // We need to pull from the SK with the highest term.
+                // This is because another compute may come online and vote the same highest term again on the other two SKs.
+                // Then, there will be 2 computes running on the same term.
+                status.acceptor_state.term,
+                /* END_HADRON */
                 status.flush_lsn,
                 status.commit_lsn,
             )
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 4d15fc9de3..09ca041e22 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -24,7 +24,7 @@ use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use crate::metrics::MISC_OPERATION_SECONDS;
+use crate::metrics::{MISC_OPERATION_SECONDS, PROPOSER_ACCEPTOR_MESSAGES_TOTAL};
 use crate::state::TimelineState;
 use crate::{control_file, wal_storage};
 
@@ -938,7 +938,7 @@ where
         &mut self,
         msg: &ProposerAcceptorMessage,
     ) -> Result<Option<AcceptorProposerMessage>> {
-        match msg {
+        let res = match msg {
             ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
             ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
             ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
@@ -949,7 +949,20 @@ where
                 self.handle_append_request(msg, false).await
             }
             ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
-        }
+        };
+
+        // BEGIN HADRON
+        match &res {
+            Ok(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["success"])
+                .inc(),
+            Err(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["error"])
+                .inc(),
+        };
+
+        res
+        // END HADRON
     }
 
     /// Handle initial message from proposer: check its sanity and send my
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 7e10847a1b..0e8dfd64c3 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -166,7 +166,7 @@ fn hadron_determine_offloader(mgr: &Manager, state: &StateSnapshot) -> (Option<N
 
     let backup_lag = state.commit_lsn.checked_sub(state.backup_lsn);
     if backup_lag.is_none() {
-        info!("Backup lag is None. Skipping re-election.");
+        debug!("Backup lag is None. Skipping re-election.");
         return (offloader, election_dbg_str);
     }
 
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 1f6990c682..393df6228e 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -190,6 +190,12 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         ssl_ca_certs: Vec::new(),
         use_https_safekeeper_api: false,
         enable_tls_wal_service_api: false,
+        force_metric_collection_on_scrape: true,
+        /* BEGIN_HADRON */
+        enable_pull_timeline_on_startup: false,
+        advertise_pg_addr_tenant_only: None,
+        hcc_base_url: None,
+        /* END_HADRON */
     };
 
     let mut global = GlobalMap::new(disk, conf.clone())?;
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index e5a3a969d4..62fc212e12 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -850,6 +850,31 @@ async fn handle_tenant_describe(
     json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }
 
+/* BEGIN_HADRON */
+async fn handle_tenant_timeline_describe(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Scrubber)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
+    json_response(
+        StatusCode::OK,
+        service
+            .tenant_timeline_describe(tenant_id, timeline_id)
+            .await?,
+    )
+}
+/* END_HADRON */
+
 async fn handle_tenant_list(
     service: Arc<Service>,
     req: Request<Body>,
@@ -2480,6 +2505,13 @@ pub fn make_router(
             )
         })
         // Timeline operations
+        .get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_describe,
+                RequestName("v1_tenant_timeline_describe"),
+            )
+        })
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             tenant_service_handler(
                 r,
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 2a851dc25b..5d21feeb10 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -222,6 +222,9 @@ struct Cli {
     /// Primarily useful for testing to reduce test execution time.
     #[arg(long, default_value = "false", action=ArgAction::Set)]
     kick_secondary_downloads: bool,
+
+    #[arg(long)]
+    shard_split_request_timeout: Option<humantime::Duration>,
 }
 
 enum StrictMode {
@@ -470,6 +473,10 @@ async fn async_main() -> anyhow::Result<()> {
         timeline_safekeeper_count: args.timeline_safekeeper_count,
         posthog_config: posthog_config.clone(),
         kick_secondary_downloads: args.kick_secondary_downloads,
+        shard_split_request_timeout: args
+            .shard_split_request_timeout
+            .map(humantime::Duration::into)
+            .unwrap_or(Duration::MAX),
     };
 
     // Validate that we can connect to the database
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index d6fe173eb3..da0687895a 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -86,6 +86,23 @@ impl PageserverClient {
         )
     }
 
+    /* BEGIN_HADRON */
+    pub(crate) async fn tenant_timeline_describe(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Result<TimelineInfo> {
+        measured_request!(
+            "tenant_timeline_describe",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner
+                .tenant_timeline_describe(tenant_shard_id, timeline_id,)
+                .await
+        )
+    }
+    /* END_HADRON */
+
     pub(crate) async fn tenant_scan_remote_storage(
         &self,
         tenant_id: TenantId,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ed6643d641..638cb410fa 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -32,7 +32,7 @@ use pageserver_api::controller_api::{
     ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
     SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
     TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    TenantShardMigrateRequest, TenantShardMigrateResponse, TenantTimelineDescribeResponse,
 };
 use pageserver_api::models::{
     self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
@@ -60,6 +60,7 @@ use tokio::sync::mpsc::error::TrySendError;
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
 use utils::completion::Barrier;
+use utils::env;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -483,6 +484,9 @@ pub struct Config {
 
     /// When set, actively checks and initiates heatmap downloads/uploads.
     pub kick_secondary_downloads: bool,
+
+    /// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None.
+    pub shard_split_request_timeout: Duration,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -1984,11 +1988,14 @@ impl Service {
         });
 
         // Check that there is enough safekeepers configured that we can create new timelines
-        let test_sk_res = this.safekeepers_for_new_timeline().await;
+        let test_sk_res_str = match this.safekeepers_for_new_timeline().await {
+            Ok(v) => format!("Ok({v:?})"),
+            Err(v) => format!("Err({v:})"),
+        };
         tracing::info!(
             timeline_safekeeper_count = config.timeline_safekeeper_count,
             timelines_onto_safekeepers = config.timelines_onto_safekeepers,
-            "viability test result (test timeline creation on safekeepers): {test_sk_res:?}",
+            "viability test result (test timeline creation on safekeepers): {test_sk_res_str}",
         );
 
         Ok(this)
@@ -4428,7 +4435,7 @@ impl Service {
                 .await;
 
             let mut failed = 0;
-            for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) {
+            for (tid, (_, result)) in targeted_tenant_shards.iter().zip(results.into_iter()) {
                 match result {
                     Ok(ok) => {
                         if tid.is_shard_zero() {
@@ -4758,6 +4765,7 @@ impl Service {
         )
         .await;
 
+        let mut retry_if_not_attached = false;
         let targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -4774,6 +4782,24 @@ impl Service {
                         .expect("Pageservers may not be deleted while referenced");
 
                     targets.push((*tenant_shard_id, node.clone()));
+
+                    if let Some(location) = shard.observed.locations.get(node_id) {
+                        if let Some(ref conf) = location.conf {
+                            if conf.mode != LocationConfigMode::AttachedSingle
+                                && conf.mode != LocationConfigMode::AttachedMulti
+                            {
+                                // If the shard is attached as secondary, we need to retry if 404.
+                                retry_if_not_attached = true;
+                            }
+                            // If the shard is attached as primary, we should succeed.
+                        } else {
+                            // Location conf is not available yet, retry if 404.
+                            retry_if_not_attached = true;
+                        }
+                    } else {
+                        // The shard is not attached to the intended pageserver yet, retry if 404.
+                        retry_if_not_attached = true;
+                    }
                 }
             }
             targets
@@ -4795,7 +4821,7 @@ impl Service {
             .await;
 
         let mut valid_until = None;
-        for r in res {
+        for (node, r) in res {
             match r {
                 Ok(lease) => {
                     if let Some(ref mut valid_until) = valid_until {
@@ -4804,8 +4830,20 @@ impl Service {
                         valid_until = Some(lease.valid_until);
                     }
                 }
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _))
+                    if retry_if_not_attached =>
+                {
+                    // This is expected if the attach is not finished yet. Return 503 so that the client can retry.
+                    return Err(ApiError::ResourceUnavailable(
+                        format!(
+                            "Timeline is not attached to the pageserver {} yet, please retry",
+                            node.get_id()
+                        )
+                        .into(),
+                    ));
+                }
                 Err(e) => {
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
+                    return Err(passthrough_api_error(&node, e));
                 }
             }
         }
@@ -4919,7 +4957,7 @@ impl Service {
         max_retries: u32,
         timeout: Duration,
         cancel: &CancellationToken,
-    ) -> Vec<mgmt_api::Result<T>>
+    ) -> Vec<(Node, mgmt_api::Result<T>)>
     where
         O: Fn(TenantShardId, PageserverClient) -> F + Copy,
         F: std::future::Future<Output = mgmt_api::Result<T>>,
@@ -4940,16 +4978,16 @@ impl Service {
                         cancel,
                     )
                     .await;
-                (idx, r)
+                (idx, node, r)
             });
         }
 
-        while let Some((idx, r)) = futs.next().await {
-            results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled))));
+        while let Some((idx, node, r)) = futs.next().await {
+            results.push((idx, node, r.unwrap_or(Err(mgmt_api::Error::Cancelled))));
         }
 
-        results.sort_by_key(|(idx, _)| *idx);
-        results.into_iter().map(|(_, r)| r).collect()
+        results.sort_by_key(|(idx, _, _)| *idx);
+        results.into_iter().map(|(_, node, r)| (node, r)).collect()
     }
 
     /// Helper for safely working with the shards in a tenant remotely on pageservers, for example
@@ -5172,6 +5210,9 @@ impl Service {
                 match res {
                     Ok(ok) => Ok(ok),
                     Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
+                    Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) if msg.contains("Requested tenant is missing") => {
+                        Err(ApiError::ResourceUnavailable("Tenant migration in progress".into()))
+                    },
                     Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())),
                     Err(e) => {
                         Err(
@@ -5452,6 +5493,92 @@ impl Service {
         .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
     }
 
+    /* BEGIN_HADRON */
+    pub(crate) async fn tenant_timeline_describe(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<TenantTimelineDescribeResponse, ApiError> {
+        self.tenant_remote_mutation(tenant_id, |locations| async move {
+            if locations.0.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            };
+
+            let locations: Vec<(TenantShardId, Node)> = locations
+                .0
+                .iter()
+                .map(|t| (*t.0, t.1.latest.node.clone()))
+                .collect();
+            let mut futs = FuturesUnordered::new();
+
+            for (shard_id, node) in locations {
+                futs.push({
+                    async move {
+                        let result = node
+                            .with_client_retries(
+                                |client| async move {
+                                    client
+                                        .tenant_timeline_describe(&shard_id, &timeline_id)
+                                        .await
+                                },
+                                &self.http_client,
+                                &self.config.pageserver_jwt_token,
+                                3,
+                                3,
+                                Duration::from_secs(30),
+                                &self.cancel,
+                            )
+                            .await;
+                        (result, shard_id, node.get_id())
+                    }
+                });
+            }
+
+            let mut results: Vec<TimelineInfo> = Vec::new();
+            while let Some((result, tenant_shard_id, node_id)) = futs.next().await {
+                match result {
+                    Some(Ok(timeline_info)) => results.push(timeline_info),
+                    Some(Err(e)) => {
+                        tracing::warn!(
+                            "Failed to describe tenant {} timeline {} for pageserver {}: {e}",
+                            tenant_shard_id,
+                            timeline_id,
+                            node_id,
+                        );
+                        return Err(ApiError::ResourceUnavailable(format!("{e}").into()));
+                    }
+                    None => return Err(ApiError::Cancelled),
+                }
+            }
+            let mut image_consistent_lsn: Option<Lsn> = Some(Lsn::MAX);
+            for timeline_info in &results {
+                if let Some(tline_image_consistent_lsn) = timeline_info.image_consistent_lsn {
+                    image_consistent_lsn = Some(std::cmp::min(
+                        image_consistent_lsn.unwrap(),
+                        tline_image_consistent_lsn,
+                    ));
+                } else {
+                    tracing::warn!(
+                        "Timeline {} on shard {} does not have image consistent lsn",
+                        timeline_info.timeline_id,
+                        timeline_info.tenant_id
+                    );
+                    image_consistent_lsn = None;
+                    break;
+                }
+            }
+
+            Ok(TenantTimelineDescribeResponse {
+                shards: results,
+                image_consistent_lsn,
+            })
+        })
+        .await?
+    }
+    /* END_HADRON */
+
     /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
     /// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory
     /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses
@@ -5862,7 +5989,7 @@ impl Service {
             return;
         }
 
-        for result in self
+        for (_, result) in self
             .tenant_for_shards_api(
                 attached,
                 |tenant_shard_id, client| async move {
@@ -5881,7 +6008,7 @@ impl Service {
             }
         }
 
-        for result in self
+        for (_, result) in self
             .tenant_for_shards_api(
                 secondary,
                 |tenant_shard_id, client| async move {
@@ -6283,18 +6410,39 @@ impl Service {
         // TODO: issue split calls concurrently (this only matters once we're splitting
         // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
 
+        // HADRON: set a timeout for splitting individual shards on page servers.
+        // Currently we do not perform any retry because it's not clear if page server can handle
+        // partially split shards correctly.
+        let shard_split_timeout =
+            if let Some(env::DeploymentMode::Local) = env::get_deployment_mode() {
+                Duration::from_secs(30)
+            } else {
+                self.config.shard_split_request_timeout
+            };
+        let mut http_client_builder = reqwest::ClientBuilder::new()
+            .pool_max_idle_per_host(0)
+            .timeout(shard_split_timeout);
+
+        for ssl_ca_cert in &self.config.ssl_ca_certs {
+            http_client_builder = http_client_builder.add_root_certificate(ssl_ca_cert.clone());
+        }
+        let http_client = http_client_builder
+            .build()
+            .expect("Failed to construct HTTP client");
         for target in &targets {
             let ShardSplitTarget {
                 parent_id,
                 node,
                 child_ids,
             } = target;
+
             let client = PageserverClient::new(
                 node.get_id(),
-                self.http_client.clone(),
+                http_client.clone(),
                 node.base_url(),
                 self.config.pageserver_jwt_token.as_deref(),
             );
+
             let response = client
                 .tenant_shard_split(
                     *parent_id,
@@ -8768,7 +8916,7 @@ impl Service {
             )
             .await;
 
-        for ((tenant_shard_id, node, optimization), secondary_status) in
+        for ((tenant_shard_id, node, optimization), (_, secondary_status)) in
             want_secondary_status.into_iter().zip(results.into_iter())
         {
             match secondary_status {
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index d7179372b2..7521d7bd86 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -25,7 +25,8 @@ use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::PgVersionId;
 use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration};
 use safekeeper_api::models::{
-    PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse,
+    PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest,
+    TimelineMembershipSwitchResponse,
 };
 use safekeeper_api::{INITIAL_TERM, Term};
 use safekeeper_client::mgmt_api;
@@ -37,21 +38,14 @@ use utils::lsn::Lsn;
 
 use super::Service;
 
-#[derive(serde::Serialize, serde::Deserialize, Clone)]
-pub struct TimelineLocateResponse {
-    pub generation: SafekeeperGeneration,
-    pub sk_set: Vec<NodeId>,
-    pub new_sk_set: Option<Vec<NodeId>>,
-}
-
 impl Service {
-    fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, ApiError> {
+    fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, anyhow::Error> {
         let members = safekeepers
             .iter()
             .map(|sk| sk.get_safekeeper_id())
             .collect::<Vec<_>>();
 
-        MemberSet::new(members).map_err(ApiError::InternalServerError)
+        MemberSet::new(members)
     }
 
     fn get_safekeepers(&self, ids: &[i64]) -> Result<Vec<Safekeeper>, ApiError> {
@@ -86,7 +80,7 @@ impl Service {
     ) -> Result<Vec<NodeId>, ApiError> {
         let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?;
 
-        let mset = Self::make_member_set(&safekeepers)?;
+        let mset = Self::make_member_set(&safekeepers).map_err(ApiError::InternalServerError)?;
         let mconf = safekeeper_api::membership::Configuration::new(mset);
 
         let req = safekeeper_api::models::TimelineCreateRequest {
@@ -1111,6 +1105,26 @@ impl Service {
             }
         }
 
+        if new_sk_set.is_empty() {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "new safekeeper set is empty"
+            )));
+        }
+
+        if new_sk_set.len() < self.config.timeline_safekeeper_count {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "new safekeeper set must have at least {} safekeepers",
+                self.config.timeline_safekeeper_count
+            )));
+        }
+
+        let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
+        let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
+        // Construct new member set in advance to validate it.
+        // E.g. validates that there is no duplicate safekeepers.
+        let new_sk_member_set =
+            Self::make_member_set(&new_safekeepers).map_err(ApiError::BadRequest)?;
+
         // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
         let _tenant_lock = trace_shared_lock(
             &self.tenant_op_locks,
@@ -1141,6 +1155,18 @@ impl Service {
             .map(|&id| NodeId(id as u64))
             .collect::<Vec<_>>();
 
+        // Validate that we are not migrating to a decomissioned safekeeper.
+        for sk in new_safekeepers.iter() {
+            if !cur_sk_set.contains(&sk.get_id())
+                && sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned
+            {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "safekeeper {} is decomissioned",
+                    sk.get_id()
+                )));
+            }
+        }
+
         tracing::info!(
             ?cur_sk_set,
             ?new_sk_set,
@@ -1183,11 +1209,8 @@ impl Service {
         }
 
         let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
-        let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?;
-
-        let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
-        let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
-        let new_sk_member_set = Self::make_member_set(&new_safekeepers)?;
+        let cur_sk_member_set =
+            Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
 
         let joint_config = membership::Configuration {
             generation,
diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
index 703ee4b91e..a41906c956 100644
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -34,7 +34,9 @@ class NeonAPI:
         self.retries524 = 0
         self.retries4xx = 0
 
-    def __request(self, method: str | bytes, endpoint: str, **kwargs: Any) -> requests.Response:
+    def __request(
+        self, method: str | bytes, endpoint: str, retry404: bool = False, **kwargs: Any
+    ) -> requests.Response:
         kwargs["headers"] = kwargs.get("headers", {})
         kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
 
@@ -55,10 +57,12 @@ class NeonAPI:
                 resp.raise_for_status()
                 break
             elif resp.status_code >= 400:
-                if resp.status_code == 422:
-                    if resp.json()["message"] == "branch not ready yet":
-                        retry = True
-                        self.retries4xx += 1
+                if resp.status_code == 404 and retry404:
+                    retry = True
+                    self.retries4xx += 1
+                elif resp.status_code == 422 and resp.json()["message"] == "branch not ready yet":
+                    retry = True
+                    self.retries4xx += 1
                 elif resp.status_code == 423 and resp.json()["message"] in {
                     "endpoint is in some transitive state, could not suspend",
                     "project already has running conflicting operations, scheduling of new ones is prohibited",
@@ -66,7 +70,7 @@ class NeonAPI:
                     retry = True
                     self.retries4xx += 1
                 elif resp.status_code == 524:
-                    log.info("The request was timed out, trying to get operations")
+                    log.info("The request was timed out")
                     retry = True
                     self.retries524 += 1
             if retry:
@@ -203,6 +207,9 @@ class NeonAPI:
         resp = self.__request(
             "GET",
             f"/projects/{project_id}/branches/{branch_id}",
+            # XXX Retry get parent details to work around the issue
+            # https://databricks.atlassian.net/browse/LKB-279
+            retry404=True,
             headers={
                 "Accept": "application/json",
             },
@@ -317,6 +324,10 @@ class NeonAPI:
         if endpoint_type:
             data["endpoint"]["type"] = endpoint_type
         if settings:
+            # otherwise we get 400 "settings must not be nil"
+            # TODO(myrrc): fix on cplane side
+            if "pg_settings" not in settings:
+                settings["pg_settings"] = {}
             data["endpoint"]["settings"] = settings
 
         resp = self.__request(
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f54d5be635..b9fff05c6c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1795,6 +1795,33 @@ def neon_env_builder(
         record_property("preserve_database_files", builder.preserve_database_files)
 
 
+@pytest.fixture(scope="function")
+def neon_env_builder_local(
+    neon_env_builder: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_distrib_dir: Path,
+) -> NeonEnvBuilder:
+    """
+    Fixture to create a Neon environment for test with its own pg_install copy.
+
+    This allows the test to edit the list of available extensions in the
+    local instance of Postgres used for the test, and install extensions via
+    downloading them when a remote extension is tested, for instance, or
+    copying files around for local extension testing.
+    """
+    test_local_pginstall = test_output_dir / "pg_install"
+    log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}")
+
+    # We can't copy only the version that we are currently testing because other
+    # binaries like the storage controller need specific Postgres versions.
+    shutil.copytree(pg_distrib_dir, test_local_pginstall)
+
+    neon_env_builder.pg_distrib_dir = test_local_pginstall
+    log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}")
+
+    return neon_env_builder
+
+
 @dataclass
 class PageserverPort:
     pg: int
@@ -2315,6 +2342,20 @@ class NeonStorageController(MetricsGetter, LogUtils):
         response.raise_for_status()
         return response.json()
 
+    # HADRON
+    def tenant_timeline_describe(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
     def nodes(self):
         """
         :return: list of {"id": ""}
@@ -5368,6 +5409,7 @@ SKIP_FILES = frozenset(
     (
         "pg_internal.init",
         "pg.log",
+        "neon.signal",
         "zenith.signal",
         "pg_hba.conf",
         "postgresql.conf",
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 0e4dd571c0..59249f31ad 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -115,8 +115,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*Local data loss suspected.*",
     # Too many frozen layers error is normal during intensive benchmarks
     ".*too many frozen layers.*",
-    # Transient errors when resolving tenant shards by page service
-    ".*Fail to resolve tenant shard in attempt.*",
+    ".*Failed to resolve tenant shard after.*",
     # Expected warnings when pageserver has not refreshed GC info yet
     ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
     ".*No broker updates received for a while.*",
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 79cfba8da6..23b9d1c8c9 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -333,6 +333,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
         self.verbose_error(res)
 
+    def list_tenant_visible_size(self) -> dict[TenantShardId, int]:
+        res = self.get(f"http://localhost:{self.port}/v1/list_tenant_visible_size")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
     def tenant_list(self) -> list[dict[Any, Any]]:
         res = self.get(f"http://localhost:{self.port}/v1/tenant")
         self.verbose_error(res)
@@ -1002,7 +1009,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def get_metrics_str(self) -> str:
         """You probably want to use get_metrics() instead."""
-        res = self.get(f"http://localhost:{self.port}/metrics")
+        res = self.get(f"http://localhost:{self.port}/metrics?use_latest=true")
         self.verbose_error(res)
         return res.text
 
diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py
index 6a829a9399..e51d08e16e 100644
--- a/test_runner/fixtures/port_distributor.py
+++ b/test_runner/fixtures/port_distributor.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import re
 import socket
 from contextlib import closing
+from itertools import cycle
 
 from fixtures.log_helper import log
 
@@ -34,15 +35,23 @@ def can_bind(host: str, port: int) -> bool:
 
 class PortDistributor:
     def __init__(self, base_port: int, port_number: int):
-        self.iterator = iter(range(base_port, base_port + port_number))
+        self.base_port = base_port
+        self.port_number = port_number
+        self.cycle = cycle(range(base_port, base_port + port_number))
         self.port_map: dict[int, int] = {}
 
     def get_port(self) -> int:
-        for port in self.iterator:
+        checked = 0
+        for port in self.cycle:
             if can_bind("localhost", port):
                 return port
+            elif checked < self.port_number:
+                checked += 1
+            else:
+                break
+
         raise RuntimeError(
-            "port range configured for test is exhausted, consider enlarging the range"
+            f"port range ({self.base_port}..{self.base_port + self.port_number}) configured for test is exhausted, consider enlarging the range"
         )
 
     def replace_with_new_port(self, value: int | str) -> int | str:
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 942b620be6..ceb00c0f90 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -143,7 +143,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
 
     def get_metrics_str(self) -> str:
         """You probably want to use get_metrics() instead."""
-        request_result = self.get(f"http://localhost:{self.port}/metrics")
+        request_result = self.get(f"http://localhost:{self.port}/metrics?use_latest=true")
         request_result.raise_for_status()
         return request_result.text
 
diff --git a/test_runner/performance/test_lfc_prewarm.py b/test_runner/performance/test_lfc_prewarm.py
new file mode 100644
index 0000000000..6c0083de95
--- /dev/null
+++ b/test_runner/performance/test_lfc_prewarm.py
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+import os
+import timeit
+import traceback
+from concurrent.futures import ThreadPoolExecutor as Exec
+from pathlib import Path
+from time import sleep
+from typing import TYPE_CHECKING, Any, cast
+
+import pytest
+from fixtures.benchmark_fixture import NeonBenchmarker, PgBenchRunResult
+from fixtures.log_helper import log
+from fixtures.neon_api import NeonAPI, connection_parameters_to_env
+
+if TYPE_CHECKING:
+    from fixtures.compare_fixtures import NeonCompare
+    from fixtures.neon_fixtures import Endpoint, PgBin
+    from fixtures.pg_version import PgVersion
+
+from performance.test_perf_pgbench import utc_now_timestamp
+
+# These tests compare performance for a write-heavy and read-heavy workloads of an ordinary endpoint
+# compared to the endpoint which saves its LFC and prewarms using it on startup.
+
+
+def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    pg_bin = neon_compare.pg_bin
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    for ep in [ep_normal, ep_prewarmed]:
+        connstr: str = ep.connstr()
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", connstr, "-s100"])
+        ep.safe_psql("CREATE EXTENSION neon")
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr])
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        name: str = cast("str", ep.branch_name)
+        neon_compare.zenbenchmark.record_pg_bench_result(name, res)
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_compare_prewarmed_pgbench_perf_benchmark(
+    pg_bin: PgBin,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    name = f"Test prewarmed pgbench performance, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
+    project = neon_api.create_project(pg_version, name)
+    project_id = project["project"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+    err = False
+    try:
+        benchmark_impl(pg_bin, neon_api, project, zenbenchmark)
+    except Exception as e:
+        err = True
+        log.error(f"Caught exception: {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not err
+        neon_api.delete_project(project_id)
+
+
+def benchmark_impl(
+    pg_bin: PgBin, neon_api: NeonAPI, project: dict[str, Any], zenbenchmark: NeonBenchmarker
+):
+    pgbench_size = int(os.getenv("PGBENCH_SIZE") or "3424")  # 50GB
+    offload_secs = 20
+    test_duration_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60}"
+    # prewarm API is not publicly exposed. In order to test performance of a
+    # fully prewarmed endpoint, wait after it restarts.
+    # The number here is empirical, based on manual runs on staging
+    prewarmed_sleep_secs = 180
+
+    branch_id = project["branch"]["id"]
+    project_id = project["project"]["id"]
+    normal_env = connection_parameters_to_env(
+        project["connection_uris"][0]["connection_parameters"]
+    )
+    normal_id = project["endpoints"][0]["id"]
+
+    prewarmed_branch_id = neon_api.create_branch(
+        project_id, "prewarmed", parent_id=branch_id, add_endpoint=False
+    )["branch"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    ep_prewarmed = neon_api.create_endpoint(
+        project_id,
+        prewarmed_branch_id,
+        endpoint_type="read_write",
+        settings={"autoprewarm": True, "offload_lfc_interval_seconds": offload_secs},
+    )
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    prewarmed_env = normal_env.copy()
+    prewarmed_env["PGHOST"] = ep_prewarmed["endpoint"]["host"]
+    prewarmed_id = ep_prewarmed["endpoint"]["id"]
+
+    def bench(endpoint_name, endpoint_id, env):
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{pgbench_size}"], env)
+        sleep(offload_secs * 2)  # ensure LFC is offloaded after pgbench finishes
+        neon_api.restart_endpoint(project_id, endpoint_id)
+        sleep(prewarmed_sleep_secs)
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env)
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        zenbenchmark.record_pg_bench_result(endpoint_name, res)
+
+    with Exec(max_workers=2) as exe:
+        exe.submit(bench, "normal", normal_id, normal_env)
+        exe.submit(bench, "prewarmed", prewarmed_id, prewarmed_env)
+
+
+def test_compare_prewarmed_read_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    sql = [
+        "CREATE EXTENSION neon",
+        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
+        "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
+    ]
+    for ep in [ep_normal, ep_prewarmed]:
+        ep.safe_psql_many(sql)
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+        with neon_compare.record_duration(f"{ep.branch_name}_run_duration"):
+            ep.safe_psql("SELECT count(*) from foo")
diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py
index 63da47b555..aa21637159 100644
--- a/test_runner/random_ops/test_random_ops.py
+++ b/test_runner/random_ops/test_random_ops.py
@@ -13,7 +13,6 @@ from typing import TYPE_CHECKING, Any
 
 import pytest
 from fixtures.log_helper import log
-from requests import HTTPError
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -204,26 +203,11 @@ class NeonBranch:
         self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"])
         self.parent_timestamp = datetime.fromisoformat(res["branch"]["parent_timestamp"])
         parent_id: str = res["branch"]["parent_id"]
-        # XXX Retry get parent details to work around the issue
-        # https://databricks.atlassian.net/browse/LKB-279
-        target_time = datetime.now() + timedelta(seconds=30)
-        while datetime.now() < target_time:
-            try:
-                parent_def = self.neon_api.get_branch_details(self.project_id, parent_id)
-            except HTTPError as he:
-                if he.response.status_code == 404:
-                    log.info("Branch not found, waiting...")
-                    time.sleep(1)
-                else:
-                    raise HTTPError(he) from he
-            else:
-                break
-        else:
-            raise RuntimeError(f"Branch {parent_id} not found")
-
         # Creates an object for the parent branch
         # After the reset operation a new parent branch is created
-        parent = NeonBranch(self.project, parent_def, True)
+        parent = NeonBranch(
+            self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True
+        )
         self.project.branches[parent_id] = parent
         self.parent = parent
         parent.children[self.id] = self
diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql
new file mode 100644
index 0000000000..2b82102802
--- /dev/null
+++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql
@@ -0,0 +1,32 @@
+\echo Use "CREATE EXTENSION test_event_trigger_extension" to load this file. \quit
+
+CREATE SCHEMA event_trigger;
+
+create sequence if not exists event_trigger.seq_schema_version as int cycle;
+
+create or replace function event_trigger.increment_schema_version()
+    returns event_trigger
+    security definer
+    language plpgsql
+as $$
+begin
+    perform pg_catalog.nextval('event_trigger.seq_schema_version');
+end;
+$$;
+
+create or replace function event_trigger.get_schema_version()
+    returns int
+    security definer
+    language sql
+as $$
+    select last_value from event_trigger.seq_schema_version;
+$$;
+
+-- On DDL event, increment the schema version number
+create event trigger event_trigger_watch_ddl
+    on ddl_command_end
+    execute procedure event_trigger.increment_schema_version();
+
+create event trigger event_trigger_watch_drop
+    on sql_drop
+    execute procedure event_trigger.increment_schema_version();
diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control
new file mode 100644
index 0000000000..4fe8c3341b
--- /dev/null
+++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control
@@ -0,0 +1,8 @@
+default_version = '1.0'
+comment = 'Test extension with Event Trigger'
+
+# make sure the extension objects are owned by the bootstrap user
+# to check that the SECURITY DEFINER event trigger function is still
+# called during non-superuser DDL events.
+superuser = true
+trusted = true
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 7788faceb4..eaaa3014a5 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -165,6 +165,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "gc_horizon": 23 * (1024 * 1024),
         "gc_period": "2h 13m",
         "image_creation_threshold": 7,
+        "image_layer_force_creation_period": "1m",
         "pitr_interval": "1m",
         "lagging_wal_timeout": "23m",
         "lazy_slru_download": True,
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index 8447c9bf2d..148f469a95 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
 import pytest
 from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
+from fixtures.neon_fixtures import wait_for_last_flush_lsn
 from fixtures.pageserver.http import TimelineCreate406
 from fixtures.utils import query_scalar, skip_in_debug_build
 
@@ -162,6 +163,9 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     )
     lsn = Lsn(res[2][0][0])
 
+    # Wait for all WAL to reach the pageserver, so GC cutoff LSN is greater than `lsn`.
+    wait_for_last_flush_lsn(env, endpoint0, tenant, b0)
+
     # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the
     # branch creation task but the individual timeline GC iteration happens *after*
     # the branch creation task.
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 1570d40ae9..963a19d640 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -7,6 +7,7 @@ import time
 from enum import StrEnum
 
 import pytest
+from fixtures.common_types import TenantShardId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -944,3 +945,204 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool
                 f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
             )
             assert res[0][0] == 1
+
+
+# BEGIN_HADRON
+def get_layer_map(env, tenant_shard_id, timeline_id, ps_id):
+    client = env.pageservers[ps_id].http_client()
+    layer_map = client.layer_map_info(tenant_shard_id, timeline_id)
+    image_layer_count = 0
+    delta_layer_count = 0
+    for layer in layer_map.historic_layers:
+        if layer.kind == "Image":
+            image_layer_count += 1
+        elif layer.kind == "Delta":
+            delta_layer_count += 1
+    return image_layer_count, delta_layer_count
+
+
+def test_image_layer_creation_time_threshold(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that image layers can be created when the time threshold is reached on non-0 shards.
+    """
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        # disable distance based image layer creation check
+        "checkpoint_distance": 10 * 1024 * 1024 * 1024,
+        "checkpoint_timeout": "100ms",
+        "image_layer_force_creation_period": "1s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
+    }
+
+    # consider every tenant large to run the image layer generation check more eagerly
+    neon_env_builder.pageserver_config_override = (
+        "image_layer_generation_large_timeline_threshold=0"
+    )
+
+    neon_env_builder.num_pageservers = 1
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=tenant_conf,
+        initial_tenant_shard_count=2,
+        initial_tenant_shard_stripe_size=1,
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)")
+
+    for v in range(10):
+        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")
+
+    tenant_shard_id = TenantShardId(tenant_id, 1, 2)
+
+    # Generate some rows.
+    for v in range(20):
+        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")
+
+    # restart page server so that logical size on non-0 shards is missing
+    env.pageserver.restart()
+
+    (old_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0)
+    log.info(f"old images: {old_images}, old deltas: {old_deltas}")
+
+    def check_image_creation():
+        (new_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0)
+        log.info(f"images: {new_images}, deltas: {old_deltas}")
+        assert new_images > old_images
+
+    wait_until(check_image_creation)
+
+    endpoint.stop_and_destroy()
+
+
+def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that page server can force creating new images if image_layer_force_creation_period is enabled
+    """
+    # use large knobs to disable L0 compaction/image creation except for the force image creation
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        "checkpoint_distance": 10 * 1024,
+        "checkpoint_timeout": "1s",
+        "image_layer_force_creation_period": "1s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
+    }
+
+    # consider every tenant large to run the image layer generation check more eagerly
+    neon_env_builder.pageserver_config_override = (
+        "image_layer_generation_large_timeline_threshold=0"
+    )
+
+    neon_env_builder.num_pageservers = 1
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+    # Generate some rows.
+    for v in range(10):
+        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")
+
+    # Sleep a bit such that the inserts are considered when calculating the forced image layer creation LSN.
+    time.sleep(2)
+
+    def check_force_image_creation():
+        ps_http = env.pageserver.http_client()
+        ps_http.timeline_compact(tenant_id, timeline_id)
+        image, delta = get_layer_map(env, tenant_id, timeline_id, 0)
+        log.info(f"images: {image}, deltas: {delta}")
+        assert image > 0
+
+        env.pageserver.assert_log_contains("forcing L0 compaction of")
+        env.pageserver.assert_log_contains("forcing image creation for partitioned range")
+
+    wait_until(check_force_image_creation)
+
+    endpoint.stop_and_destroy()
+
+    env.pageserver.allowed_errors.append(
+        ".*created delta file of size.*larger than double of target.*"
+    )
+
+
+def test_image_consistent_lsn(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the /v1/tenant/<tenant_id>/timeline/<timeline_id> endpoint and the computation of image_consistent_lsn
+    """
+    # use large knobs to disable L0 compaction/image creation except for the force image creation
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        "checkpoint_distance": 10 * 1024,
+        "checkpoint_timeout": "1s",
+        "image_layer_force_creation_period": "1s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
+    }
+
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=tenant_conf,
+        initial_tenant_shard_count=4,
+        initial_tenant_shard_stripe_size=1,
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)")
+    for v in range(10):
+        endpoint.safe_psql(
+            f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False
+        )
+
+    response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id)
+    shards = response["shards"]
+    for shard in shards:
+        assert shard["image_consistent_lsn"] is not None
+    image_consistent_lsn = response["image_consistent_lsn"]
+    assert image_consistent_lsn is not None
+
+    # do more writes and wait for image_consistent_lsn to advance
+    for v in range(100):
+        endpoint.safe_psql(
+            f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False
+        )
+
+    def check_image_consistent_lsn_advanced():
+        response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id)
+        new_image_consistent_lsn = response["image_consistent_lsn"]
+        shards = response["shards"]
+        for shard in shards:
+            print(f"shard {shard['tenant_id']} image_consistent_lsn{shard['image_consistent_lsn']}")
+        assert new_image_consistent_lsn != image_consistent_lsn
+
+    wait_until(check_image_consistent_lsn_advanced)
+
+    endpoint.stop_and_destroy()
+
+    for ps in env.pageservers:
+        ps.allowed_errors.append(".*created delta file of size.*larger than double of target.*")
+
+
+# END_HADRON
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index fe3b220c67..d7f78afac8 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import os
 import platform
-import shutil
 import tarfile
 from enum import StrEnum
 from pathlib import Path
@@ -31,27 +30,6 @@ if TYPE_CHECKING:
     from werkzeug.wrappers.request import Request
 
 
-# use neon_env_builder_local fixture to override the default neon_env_builder fixture
-# and use a test-specific pg_install instead of shared one
-@pytest.fixture(scope="function")
-def neon_env_builder_local(
-    neon_env_builder: NeonEnvBuilder,
-    test_output_dir: Path,
-    pg_distrib_dir: Path,
-) -> NeonEnvBuilder:
-    test_local_pginstall = test_output_dir / "pg_install"
-    log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}")
-
-    # We can't copy only the version that we are currently testing because other
-    # binaries like the storage controller need specific Postgres versions.
-    shutil.copytree(pg_distrib_dir, test_local_pginstall)
-
-    neon_env_builder.pg_distrib_dir = test_local_pginstall
-    log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}")
-
-    return neon_env_builder
-
-
 @final
 class RemoteExtension(StrEnum):
     SQL_ONLY = "test_extension_sql_only"
diff --git a/test_runner/regress/test_event_trigger_extension.py b/test_runner/regress/test_event_trigger_extension.py
new file mode 100644
index 0000000000..ac4351dcd5
--- /dev/null
+++ b/test_runner/regress/test_event_trigger_extension.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+import shutil
+from pathlib import Path
+from typing import TYPE_CHECKING, cast
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.paths import BASE_DIR
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from fixtures.neon_fixtures import (
+        NeonEnvBuilder,
+    )
+    from fixtures.pg_version import PgVersion
+
+
+# use neon_env_builder_local fixture to override the default neon_env_builder fixture
+# and use a test-specific pg_install instead of shared one
+@pytest.fixture(scope="function")
+def neon_env_builder_event_trigger_extension(
+    neon_env_builder_local: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_version: PgVersion,
+) -> NeonEnvBuilder:
+    test_local_pginstall = test_output_dir / "pg_install"
+
+    # Now copy the SQL only extension test_event_trigger_extension in the local
+    # pginstall extension directory on-disk
+    test_event_trigger_extension_dir = (
+        BASE_DIR / "test_runner" / "regress" / "data" / "test_event_trigger_extension"
+    )
+
+    test_local_extension_dir = (
+        test_local_pginstall / f"v{pg_version}" / "share" / "postgresql" / "extension"
+    )
+
+    log.info(f"copy {test_event_trigger_extension_dir} to {test_local_extension_dir}")
+
+    for f in [
+        test_event_trigger_extension_dir / "test_event_trigger_extension.control",
+        test_event_trigger_extension_dir / "test_event_trigger_extension--1.0.sql",
+    ]:
+        shutil.copy(f, test_local_extension_dir)
+
+    return neon_env_builder_local
+
+
+def test_event_trigger_extension(neon_env_builder_event_trigger_extension: NeonEnvBuilder):
+    """
+    Test installing an extension that contains an Event Trigger.
+
+    The Event Trigger function is owned by the extension owner, which at
+    CREATE EXTENSION is going to be the Postgres bootstrap user, per the
+    extension control file where both superuser = true and trusted = true.
+
+    Also this function is SECURTY DEFINER, to allow for making changes to
+    the extension SQL objects, in our case a sequence.
+
+    This test makes sure that the event trigger function is fired correctly
+    by non-privileged user DDL actions such as CREATE TABLE.
+    """
+    env = neon_env_builder_event_trigger_extension.init_start()
+    env.create_branch("test_event_trigger_extension")
+
+    endpoint = env.endpoints.create_start("test_event_trigger_extension")
+    extension = "test_event_trigger_extension"
+    database = "test_event_trigger_extension"
+
+    endpoint.safe_psql(f"CREATE DATABASE {database}")
+    endpoint.safe_psql(f"CREATE EXTENSION {extension}", dbname=database)
+
+    # check that the extension is owned by the bootstrap superuser (cloud_admin)
+    pg_bootstrap_superuser_name = "cloud_admin"
+    with endpoint.connect(dbname=database) as pg_conn:
+        with pg_conn.cursor() as cur:
+            cur.execute(
+                f"select rolname from pg_roles r join pg_extension e on r.oid = e.extowner where extname = '{extension}'"
+            )
+            owner = cast("tuple[str]", cur.fetchone())[0]
+            assert owner == pg_bootstrap_superuser_name, (
+                f"extension {extension} is not owned by bootstrap user '{pg_bootstrap_superuser_name}'"
+            )
+
+    # test that the SQL-only Event Trigger (SECURITY DEFINER function) runs
+    # correctly now that the extension has been installed
+    #
+    # create table to trigger the event trigger, twice, check sequence count
+    with endpoint.connect(dbname=database) as pg_conn:
+        log.info("creating SQL objects (tables)")
+        with pg_conn.cursor() as cur:
+            cur.execute("CREATE TABLE foo1(id int primary key)")
+            cur.execute("CREATE TABLE foo2(id int)")
+
+            cur.execute("SELECT event_trigger.get_schema_version()")
+            res = cast("tuple[int]", cur.fetchone())
+            ver = res[0]
+
+            log.info(f"schema version is now {ver}")
+            assert ver == 2, "schema version is not 2"
diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py
index ae36bbda79..0f0cf4cc6d 100644
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,6 +1,7 @@
 import random
 import threading
 from enum import StrEnum
+from time import sleep
 from typing import Any
 
 import pytest
@@ -24,18 +25,7 @@ OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
 OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
 METHOD_VALUES = [e for e in PrewarmMethod]
 METHOD_IDS = [e.value for e in PrewarmMethod]
-
-
-def check_pinned_entries(cur: Cursor):
-    """
-    Wait till none of LFC buffers are pinned
-    """
-
-    def none_pinned():
-        cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
-        assert cur.fetchall()[0][0] == 0
-
-    wait_until(none_pinned)
+AUTOOFFLOAD_INTERVAL_SECS = 2
 
 
 def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
@@ -49,9 +39,18 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
 
 
 def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
+    if method == PrewarmMethod.POSTGRES:
+        cur.execute("select neon.get_local_cache_state()")
+        return cur.fetchall()[0][0]
+
     if method == PrewarmMethod.AUTOPREWARM:
+        # With autoprewarm, we need to be sure LFC was offloaded after all writes
+        # finish, so we sleep. Otherwise we'll have less prewarmed pages than we want
+        sleep(AUTOOFFLOAD_INTERVAL_SECS)
         client.offload_lfc_wait()
-    elif method == PrewarmMethod.COMPUTE_CTL:
+        return
+
+    if method == PrewarmMethod.COMPUTE_CTL:
         status = client.prewarm_lfc_status()
         assert status["status"] == "not_prewarmed"
         assert "error" not in status
@@ -60,11 +59,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
         parsed = prom_parse(client)
         desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
         assert parsed == desired, f"{parsed=} != {desired=}"
-    elif method == PrewarmMethod.POSTGRES:
-        cur.execute("select get_local_cache_state()")
-        return cur.fetchall()[0][0]
-    else:
-        raise AssertionError(f"{method} not in PrewarmMethod")
+        return
+
+    raise AssertionError(f"{method} not in PrewarmMethod")
 
 
 def prewarm_endpoint(
@@ -75,7 +72,7 @@ def prewarm_endpoint(
     elif method == PrewarmMethod.COMPUTE_CTL:
         client.prewarm_lfc()
     elif method == PrewarmMethod.POSTGRES:
-        cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
+        cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,))
 
 
 def check_prewarmed(
@@ -106,21 +103,20 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
         "neon.file_cache_size_limit=1GB",
         "neon.file_cache_prewarm_limit=1000",
     ]
-    offload_secs = 2
 
     if method == PrewarmMethod.AUTOPREWARM:
         endpoint = env.endpoints.create_start(
             branch_name="main",
             config_lines=cfg,
             autoprewarm=True,
-            offload_lfc_interval_seconds=offload_secs,
+            offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS,
         )
     else:
         endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
 
     pg_conn = endpoint.connect()
     pg_cur = pg_conn.cursor()
-    pg_cur.execute("create extension neon")
+    pg_cur.execute("create schema neon; create extension neon with schema neon")
     pg_cur.execute("create database lfc")
 
     lfc_conn = endpoint.connect(dbname="lfc")
@@ -135,7 +131,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
 
     endpoint.stop()
     if method == PrewarmMethod.AUTOPREWARM:
-        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
+        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS)
     else:
         endpoint.start()
 
@@ -146,10 +142,12 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
     lfc_cur = lfc_conn.cursor()
     prewarm_endpoint(method, client, pg_cur, lfc_state)
 
-    pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
+    pg_cur.execute(
+        "select lfc_value from neon.neon_lfc_stats where lfc_key='file_cache_used_pages'"
+    )
     lfc_used_pages = pg_cur.fetchall()[0][0]
     log.info(f"Used LFC size: {lfc_used_pages}")
-    pg_cur.execute("select * from get_prewarm_info()")
+    pg_cur.execute("select * from neon.get_prewarm_info()")
     total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
     log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
     progress = (prewarmed + skipped) * 100 // total
@@ -162,7 +160,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
     lfc_cur.execute("select sum(pk) from t")
     assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
 
-    check_pinned_entries(pg_cur)
     desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
     check_prewarmed(method, client, desired)
 
@@ -191,7 +188,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
 
     pg_conn = endpoint.connect()
     pg_cur = pg_conn.cursor()
-    pg_cur.execute("create extension neon")
+    pg_cur.execute("create schema neon; create extension neon with schema neon")
     pg_cur.execute("CREATE DATABASE lfc")
 
     lfc_conn = endpoint.connect(dbname="lfc")
@@ -243,9 +240,9 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
     prewarm_thread.start()
 
     def prewarmed():
-        assert n_prewarms > 5
+        assert n_prewarms > 3
 
-    wait_until(prewarmed)
+    wait_until(prewarmed, timeout=40)  # debug builds don't finish in 20s
 
     running = False
     for t in workload_threads:
@@ -256,7 +253,6 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
     total_balance = lfc_cur.fetchall()[0][0]
     assert total_balance == 0
 
-    check_pinned_entries(pg_cur)
     if method == PrewarmMethod.POSTGRES:
         return
     desired = {
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 7f9207047e..92889e5de3 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
@@ -164,3 +165,15 @@ def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder
             {"rel_size_migration": "legacy"},
         )
         assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
+
+
+def test_pageserver_get_tenant_visible_size(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 1
+    env = neon_env_builder.init_start()
+    env.create_tenant(shard_count=4)
+    env.create_tenant(shard_count=2)
+
+    json = env.pageserver.http_client().list_tenant_visible_size()
+    log.info(f"{json}")
+    # initial tennat + 2 newly created tenants
+    assert len(json) == 7
diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py
index 1f26269f40..8d39ac123a 100644
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -60,7 +60,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
 
     with primary.connect() as primary_conn:
         primary_cur = primary_conn.cursor()
-        primary_cur.execute("create extension neon")
+        primary_cur.execute("create schema neon;create extension neon with schema neon")
         primary_cur.execute(
             "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
         )
@@ -172,7 +172,7 @@ def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
     secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
 
     with primary.connect() as conn, conn.cursor() as cur:
-        cur.execute("create extension neon")
+        cur.execute("create schema neon;create extension neon with schema neon")
         cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
         cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
         cur.execute("show neon.safekeepers")
diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py
index b82d7b9bb0..170c1a3650 100644
--- a/test_runner/regress/test_safekeeper_migration.py
+++ b/test_runner/regress/test_safekeeper_migration.py
@@ -2,6 +2,9 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
+import pytest
+from fixtures.neon_fixtures import StorageControllerApiException
+
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder
 
@@ -75,3 +78,38 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
     ep.start(safekeeper_generation=1, safekeepers=[3])
 
     assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
+
+
+def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that safekeeper_migrate validates the new_sk_set before starting the migration.
+    """
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.storage_controller_config = {
+        "timelines_onto_safekeepers": True,
+        "timeline_safekeeper_count": 2,
+    }
+    env = neon_env_builder.init_start()
+
+    def expect_fail(sk_set: list[int], match: str):
+        with pytest.raises(StorageControllerApiException, match=match):
+            env.storage_controller.migrate_safekeepers(
+                env.initial_tenant, env.initial_timeline, sk_set
+            )
+        # Check that we failed before commiting to the database.
+        mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+        assert mconf["generation"] == 1
+
+    expect_fail([], "safekeeper set is empty")
+    expect_fail([1], "must have at least 2 safekeepers")
+    expect_fail([1, 1], "duplicate safekeeper")
+    expect_fail([1, 100500], "does not exist")
+
+    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+    sk_set = mconf["sk_set"]
+    assert len(sk_set) == 2
+
+    decom_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0]
+    env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
+
+    expect_fail([sk_set[0], decom_sk], "decomissioned")
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 8ff767eca4..5549105188 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1673,6 +1673,91 @@ def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
 # END_HADRON
 
 
+# HADRON
+@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
+def test_back_pressure_per_shard(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests back pressure knobs are enforced on the per shard basis instead of at the tenant level.
+    """
+    init_shard_count = 4
+    neon_env_builder.num_pageservers = init_shard_count
+    stripe_size = 1
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count,
+        initial_tenant_shard_stripe_size=stripe_size,
+        initial_tenant_conf={
+            # disable auto-flush of shards and set max_replication_flush_lag as 15MB.
+            # The backpressure parameters must be enforced at the shard level to avoid stalling PG.
+            "checkpoint_distance": 1 * 1024 * 1024 * 1024,
+            "checkpoint_timeout": "1h",
+        },
+    )
+
+    endpoint = env.endpoints.create(
+        "main",
+        config_lines=[
+            "max_replication_write_lag = 0",
+            "max_replication_apply_lag = 0",
+            "max_replication_flush_lag = 15MB",
+            "neon.max_cluster_size = 10GB",
+        ],
+    )
+    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
+    endpoint.start()
+
+    # generate 20MB of data
+    endpoint.safe_psql(
+        "CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, 20000) s;"
+    )
+    res = endpoint.safe_psql(
+        "SELECT neon.backpressure_throttling_time() as throttling_time", dbname="databricks_system"
+    )[0]
+    assert res[0] == 0, f"throttling_time should be 0, but got {res[0]}"
+
+    endpoint.stop()
+
+
+# HADRON
+def test_shard_split_page_server_timeout(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that shard split can correctly handle page server timeouts and abort the split
+    """
+    init_shard_count = 2
+    neon_env_builder.num_pageservers = 1
+    stripe_size = 1
+
+    if neon_env_builder.storage_controller_config is None:
+        neon_env_builder.storage_controller_config = {"shard_split_request_timeout": "5s"}
+    else:
+        neon_env_builder.storage_controller_config["shard_split_request_timeout"] = "5s"
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count,
+        initial_tenant_shard_stripe_size=stripe_size,
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Enqueuing background abort.*",
+            ".*failpoint.*",
+            ".*Failed to abort.*",
+            ".*Exclusive lock by ShardSplit was held.*",
+        ]
+    )
+    env.pageserver.allowed_errors.extend([".*request was dropped before completing.*"])
+
+    endpoint1 = env.endpoints.create_start(branch_name="main")
+
+    env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "pause"))
+
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=4)
+
+    env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "off"))
+    endpoint1.stop_and_destroy()
+
+
 def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
     """
     Check a scenario when one of the shards is much slower than others.
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index c0f163db32..45b7af719e 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -209,9 +209,9 @@ def test_ancestor_detach_branched_from(
     client.timeline_delete(env.initial_tenant, env.initial_timeline)
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
 
-    # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different
-    # as there is always "PREV_LSN: invalid" for "before"
-    skip_files = {"zenith.signal"}
+    # because we do the fullbackup from ancestor at the branch_lsn, the neon.signal and/or zenith.signal is always
+    # different as there is always "PREV_LSN: invalid" for "before"
+    skip_files = {"zenith.signal", "neon.signal"}
 
     assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files)
 
@@ -767,7 +767,7 @@ def test_compaction_induced_by_detaches_in_history(
         env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after
     )
 
-    # we don't need to skip any files, because zenith.signal will be identical
+    # we don't need to skip any files, because neon.signal will be identical
     assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
 
 
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 0bb63308bb..573016f772 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import sys
 import tarfile
 import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 import pytest
@@ -198,3 +199,115 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool)
     # the table is back now!
     restored = env.endpoints.create_start("main")
     assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
+
+
+# BEGIN_HADRON
+# TODO: re-enable once CM python is integreated.
+# def clear_directory(directory):
+#     for item in os.listdir(directory):
+#         item_path = os.path.join(directory, item)
+#         if os.path.isdir(item_path):
+#             log.info(f"removing SK directory: {item_path}")
+#             shutil.rmtree(item_path)
+#         else:
+#             log.info(f"removing SK file: {item_path}")
+#             os.remove(item_path)
+
+
+# def test_sk_pull_timelines(
+#     neon_env_builder: NeonEnvBuilder,
+# ):
+#     DBNAME = "regression"
+#     superuser_name = "databricks_superuser"
+#     neon_env_builder.num_safekeepers = 3
+#     neon_env_builder.num_pageservers = 4
+#     neon_env_builder.safekeeper_extra_opts = ["--enable-pull-timeline-on-startup"]
+#     neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
+
+#     env = neon_env_builder.init_start(initial_tenant_shard_count=4)
+
+#     env.compute_manager.start(base_port=env.compute_manager_port)
+
+#     test_creator = "test_creator"
+#     test_metastore_id = uuid4()
+#     test_account_id = uuid4()
+#     test_workspace_id = 1
+#     test_workspace_url = "http://test_workspace_url"
+#     test_metadata_version = 1
+#     test_metadata = {
+#         "state": "INSTANCE_PROVISIONING",
+#         "admin_rolename": "admin",
+#         "admin_password_scram": "abc123456",
+#     }
+
+#     test_instance_name_1 = "test_instance_1"
+#     test_instance_read_write_compute_pool_1 = {
+#         "instance_name": test_instance_name_1,
+#         "compute_pool_name": "compute_pool_1",
+#         "creator": test_creator,
+#         "capacity": 2.0,
+#         "node_count": 1,
+#         "metadata_version": 0,
+#         "metadata": {
+#             "state": "INSTANCE_PROVISIONING",
+#         },
+#     }
+
+#     test_instance_1_readable_secondaries_enabled = False
+
+#     # Test creation
+#     create_instance_with_retries(
+#         env,
+#         test_instance_name_1,
+#         test_creator,
+#         test_metastore_id,
+#         test_account_id,
+#         test_workspace_id,
+#         test_workspace_url,
+#         test_instance_read_write_compute_pool_1,
+#         test_metadata_version,
+#         test_metadata,
+#         test_instance_1_readable_secondaries_enabled,
+#     )
+#     instance = env.compute_manager.get_instance_by_name(test_instance_name_1, test_workspace_id)
+#     log.info(f"haoyu Instance created: {instance}")
+#     assert instance["instance_name"] == test_instance_name_1
+#     test_instance_id = instance["instance_id"]
+#     instance_detail = env.compute_manager.describe_instance(test_instance_id)
+#     log.info(f"haoyu Instance detail: {instance_detail}")
+
+#     env.initial_tenant = instance_detail[0]["tenant_id"]
+#     env.initial_timeline = instance_detail[0]["timeline_id"]
+
+#     # Connect to postgres and create a database called "regression".
+#     endpoint = env.endpoints.create_start("main")
+#     endpoint.safe_psql(f"CREATE ROLE {superuser_name}")
+#     endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
+
+#     endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
+#     # Write some data. ~20 MB.
+#     num_rows = 0
+#     for _i in range(0, 20000):
+#         endpoint.safe_psql(
+#             "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+#         )
+#         num_rows += 1
+
+#     log.info(f"SKs {env.storage_controller.hcc_sk_node_list()}")
+
+#     env.safekeepers[0].stop(immediate=True)
+#     clear_directory(env.safekeepers[0].data_dir)
+#     env.safekeepers[0].start()
+
+#     # PG can still write data. ~20 MB.
+#     for _i in range(0, 20000):
+#         endpoint.safe_psql(
+#             "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+#         )
+#         num_rows += 1
+
+#     tuples = endpoint.safe_psql("SELECT COUNT(*) FROM usertable;")
+#     assert tuples[0][0] == num_rows
+#     endpoint.stop_and_destroy()
+
+# END_HADRON
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 9085654ee8..8ce1f52303 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 9085654ee8022d5cc4ca719380a1dc53e5e3246f
+Subproject commit 8ce1f52303aec29e098309347b57c01a1962e221
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 8c3249f36c..afd46987f3 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9
+Subproject commit afd46987f3da50c9146a8aa59380052df0862c06
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 7a4c0eacae..e08c8d5f15 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1
+Subproject commit e08c8d5f1576ca0487d14d154510499c5f12adfb
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index db424d42d7..353c725b0c 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f
+Subproject commit 353c725b0c76cc82b15af21d8360d03391dc6814
diff --git a/vendor/revisions.json b/vendor/revisions.json
index b260698c86..992aa405b1 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.5",
-    "db424d42d748f8ad91ac00e28db2c7f2efa42f7f"
+    "353c725b0c76cc82b15af21d8360d03391dc6814"
   ],
   "v16": [
     "16.9",
-    "7a4c0eacaeb9b97416542fa19103061c166460b1"
+    "e08c8d5f1576ca0487d14d154510499c5f12adfb"
   ],
   "v15": [
     "15.13",
-    "8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9"
+    "afd46987f3da50c9146a8aa59380052df0862c06"
   ],
   "v14": [
     "14.18",
-    "9085654ee8022d5cc4ca719380a1dc53e5e3246f"
+    "8ce1f52303aec29e098309347b57c01a1962e221"
   ]
 }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index fc01deb92d..c61598cdf6 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -98,7 +98,7 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref
 time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["full", "test-util"] }
 tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] }
-tokio-stream = { version = "0.1", features = ["net"] }
+tokio-stream = { version = "0.1", features = ["net", "sync"] }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] }
 toml_edit = { version = "0.22", features = ["serde"] }
 tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] }