proxy: add benchmark for custom json logging vs official fmt logger

Don't depend on the postgres_ffi just for one type (#12610 )
We don't want to depend on postgres_ffi in an API crate. If there is no such dependency, we can compile stuff like `storcon_cli` without needing a full working postgres build. Fixes regression of #12548 (before we could compile it).
2026-05-16 20:50:37 +00:00 · 2025-07-15 19:44:41 +02:00 · 2025-07-15 17:28:08 +00:00 · 2025-07-15 14:36:28 +00:00 · 2025-07-15 11:16:29 +00:00 · 2025-07-15 11:06:58 +00:00
198 changed files with 7335 additions and 1600 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -27,4 +27,4 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
-!build_tools/patches
+!build-tools/patches
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -31,6 +31,7 @@ config-variables:
  - NEON_PROD_AWS_ACCOUNT_ID
  - PGREGRESS_PG16_PROJECT_ID
  - PGREGRESS_PG17_PROJECT_ID
+  - PREWARM_PGBENCH_SIZE
  - REMOTE_STORAGE_AZURE_CONTAINER
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_CICD_CHANNEL_ID
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -176,7 +176,11 @@ runs:
        fi

        if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
-          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+          # We don't use code coverage for regression tests (the step is disabled),
+          # so there's no need to collect it.
+          # Ref https://github.com/neondatabase/neon/issues/4540
+          # cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+          cov_prefix=()
        else
          cov_prefix=()
        fi
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -150,7 +150,7 @@ jobs:
          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
          use-fallback: false
          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
@@ -162,7 +162,7 @@ jobs:
          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
          use-fallback: false
          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
@@ -174,7 +174,7 @@ jobs:
          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
          use-fallback: false
          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}

      - name: Cache postgres v17 build
        id: cache_pg_17
@@ -186,7 +186,7 @@ jobs:
          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
          use-fallback: false
          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}

      - name: Build all
        # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -219,6 +219,7 @@ jobs:
          --ignore test_runner/performance/test_cumulative_statistics_persistence.py
          --ignore test_runner/performance/test_perf_many_relations.py
          --ignore test_runner/performance/test_perf_oltp_large_tenant.py
+          --ignore test_runner/performance/test_lfc_prewarm.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,6 +411,77 @@ jobs:
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

+  prewarm-test:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 17
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+    - name: Harden the runner (Audit all outbound calls)
+      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Run prewarm benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_lfc_prewarm.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 5400
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      id: create-allure-report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+      with:
+        store-test-results-into-db: true
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
  generate-matrices:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -72,7 +72,7 @@ jobs:
          ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
          DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
          IMAGE_TAG: |
-            ${{ hashFiles('build-tools.Dockerfile',
+            ${{ hashFiles('build-tools/Dockerfile',
                          '.github/workflows/build-build-tools-image.yml') }}
        run: |
          echo "archs=${ARCHS}"           | tee -a ${GITHUB_OUTPUT}
@@ -144,7 +144,7 @@ jobs:

      - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
        with:
-          file: build-tools.Dockerfile
+          file: build-tools/Dockerfile
          context: .
          provenance: false
          push: true
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -87,6 +87,24 @@ jobs:
    uses: ./.github/workflows/build-build-tools-image.yml
    secrets: inherit

+  lint-openapi-spec:
+    runs-on: ubuntu-22.04
+    needs: [ meta, check-permissions ]
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - run: make lint-openapi-spec
+
  check-codestyle-python:
    needs: [ meta, check-permissions, build-build-tools-image ]
    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,16 +1,16 @@
 [submodule "vendor/postgres-v14"]
 	path = vendor/postgres-v14
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 	path = vendor/postgres-v15
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_15_STABLE_neon
 [submodule "vendor/postgres-v16"]
 	path = vendor/postgres-v16
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_16_STABLE_neon
 [submodule "vendor/postgres-v17"]
 	path = vendor/postgres-v17
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_17_STABLE_neon
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1348,6 +1348,7 @@ dependencies = [
 "p256 0.13.2",
 "pageserver_page_api",
 "postgres",
+ "postgres-types",
 "postgres_initdb",
 "postgres_versioninfo",
 "regex",
@@ -4293,7 +4294,9 @@ dependencies = [
 "humantime-serde",
 "pageserver_api",
 "pageserver_client",
+ "pageserver_client_grpc",
 "pageserver_page_api",
+ "pprof",
 "rand 0.8.5",
 "reqwest",
 "serde",
@@ -4322,6 +4325,7 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "remote_storage",
+ "serde",
 "serde_json",
 "svg_fmt",
 "thiserror 1.0.69",
@@ -4498,10 +4502,15 @@ name = "pageserver_client_grpc"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "arc-swap",
+ "bytes",
+ "compute_api",
 "futures",
+ "pageserver_api",
 "pageserver_page_api",
 "tokio",
 "tokio-stream",
+ "tokio-util",
 "tonic 0.13.1",
 "tracing",
 "utils",
@@ -5281,6 +5290,7 @@ dependencies = [
 "async-trait",
 "atomic-take",
 "aws-config",
+ "aws-credential-types",
 "aws-sdk-iam",
 "aws-sigv4",
 "base64 0.22.1",
@@ -5293,6 +5303,7 @@ dependencies = [
 "clashmap",
 "compute_api",
 "consumption_metrics",
+ "criterion",
 "ecdsa 0.16.9",
 "ed25519-dalek",
 "env_logger",
@@ -5320,6 +5331,7 @@ dependencies = [
 "itoa",
 "jose-jwa",
 "jose-jwk",
+ "json",
 "lasso",
 "measured",
 "metrics",
@@ -6200,6 +6212,7 @@ dependencies = [
 "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
+ "postgres_ffi_types",
 "postgres_versioninfo",
 "pprof",
 "pq_proto",
@@ -6244,7 +6257,7 @@ dependencies = [
 "anyhow",
 "const_format",
 "pageserver_api",
- "postgres_ffi",
+ "postgres_ffi_types",
 "postgres_versioninfo",
 "pq_proto",
 "serde",
@@ -6983,6 +6996,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "reqwest",
+ "safekeeper_api",
 "serde_json",
 "storage_controller_client",
 "tokio",
@@ -7552,6 +7566,7 @@ dependencies = [
 "futures-core",
 "pin-project-lite",
 "tokio",
+ "tokio-util",
 ]

 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -201,7 +201,7 @@ tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
-tokio-stream = "0.1"
+tokio-stream = { version = "0.1", features = ["sync"] }
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
@@ -262,6 +262,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
--- a/9
+++ b/9
@@ -220,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

+.PHONY: lint-openapi-spec
+lint-openapi-spec:
+	# operation-2xx-response: pageserver timeline delete returns 404 on success
+	find . -iname "openapi_spec.y*ml" -exec\
+		docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
+			--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
+			--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
+			lint {} \+
+
 # Targets for building PostgreSQL are defined in postgres.mk.
 #
 # But if the caller has indicated that PostgreSQL is already
--- a/build-tools/Dockerfile
+++ b/build-tools/Dockerfile
@@ -35,7 +35,7 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
    echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

-COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
+COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch

 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        set -e && \
--- a/build-tools/patches/pgcopydbv017.patch
+++ b/build-tools/patches/pgcopydbv017.patch
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -9,7 +9,7 @@
 #
 # build-tools:   This contains Rust compiler toolchain and other tools needed at compile
 #                time. This is also used for the storage builds. This image is defined in
-#                build-tools.Dockerfile.
+#                build-tools/Dockerfile.
 #
 # build-deps:    Contains C compiler, other build tools, and compile-time dependencies
 #                needed to compile PostgreSQL and most extensions. (Some extensions need
@@ -115,7 +115,7 @@ ARG EXTENSIONS=all
 FROM $BASE_IMAGE_SHA AS build-deps
 ARG DEBIAN_VERSION

-# Keep in sync with build-tools.Dockerfile
+# Keep in sync with build-tools/Dockerfile
 ENV PROTOC_VERSION=25.1

 # Use strict mode for bash to catch errors early
@@ -1790,7 +1790,7 @@ RUN set -e \
 #########################################################################################
 FROM build-deps AS exporters
 ARG TARGETARCH
-# Keep sql_exporter version same as in build-tools.Dockerfile and
+# Keep sql_exporter version same as in build-tools/Dockerfile and
 # test_runner/regress/test_compute_metrics.py
 # See comment on the top of the file regading `echo`, `-e` and `\n`
 RUN if [ "$TARGETARCH" = "amd64" ]; then\
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -66,7 +66,7 @@ url.workspace = true
 uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true
-
+postgres-types.workspace = true
 postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -46,11 +46,14 @@ stateDiagram-v2
  Configuration --> Failed : Failed to configure the compute
  Configuration --> Running : Compute has been configured
  Empty --> Init : Compute spec is immediately available
-  Empty --> TerminationPending : Requested termination
+  Empty --> TerminationPendingFast : Requested termination
+  Empty --> TerminationPendingImmediate : Requested termination
  Init --> Failed : Failed to start Postgres
  Init --> Running : Started Postgres
-  Running --> TerminationPending : Requested termination
-  TerminationPending --> Terminated : Terminated compute
+  Running --> TerminationPendingFast : Requested termination
+  Running --> TerminationPendingImmediate : Requested termination
+  TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
+  TerminationPendingImmediate --> Terminated : Terminated compute immediately
  Failed --> [*] : Compute exited
  Terminated --> [*] : Compute exited
 ```
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
    ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
-    LfcPrewarmState, TlsConfig,
+    LfcPrewarmState, PromoteState, TlsConfig,
 };
 use compute_api::spec::{
    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
@@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
-use tokio::task::JoinHandle;
-use tokio::{spawn, time};
+use tokio::{spawn, sync::watch, task::JoinHandle, time};
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
@@ -175,6 +174,7 @@ pub struct ComputeState {
    /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
    /// mode == ComputeMode::Primary. None otherwise
    pub terminate_flush_lsn: Option<Lsn>,
+    pub promote_state: Option<watch::Receiver<PromoteState>>,

    pub metrics: ComputeMetrics,
 }
@@ -192,6 +192,7 @@ impl ComputeState {
            lfc_prewarm_state: LfcPrewarmState::default(),
            lfc_offload_state: LfcOffloadState::default(),
            terminate_flush_lsn: None,
+            promote_state: None,
        }
    }

@@ -955,14 +956,20 @@ impl ComputeNode {
            None
        };

-        let mut delay_exit = false;
        let mut state = self.state.lock().unwrap();
        state.terminate_flush_lsn = lsn;
-        if let ComputeStatus::TerminationPending { mode } = state.status {
+
+        let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
+        if state.status == ComputeStatus::TerminationPendingFast
+            || state.status == ComputeStatus::TerminationPendingImmediate
+        {
+            info!(
+                "Changing compute status from {} to {}",
+                state.status,
+                ComputeStatus::Terminated
+            );
            state.status = ComputeStatus::Terminated;
            self.state_changed.notify_all();
-            // we were asked to terminate gracefully, don't exit to avoid restart
-            delay_exit = mode == compute_api::responses::TerminateMode::Fast
        }
        drop(state);

@@ -1033,6 +1040,8 @@ impl ComputeNode {
            PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
        };

+        self.fix_zenith_signal_neon_signal()?;
+
        let mut state = self.state.lock().unwrap();
        state.metrics.pageserver_connect_micros =
            connected.duration_since(started).as_micros() as u64;
@@ -1042,6 +1051,27 @@ impl ComputeNode {
        Ok(())
    }

+    /// Move the Zenith signal file to Neon signal file location.
+    /// This makes Compute compatible with older PageServers that don't yet
+    /// know about the Zenith->Neon rename.
+    fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
+        let datadir = Path::new(&self.params.pgdata);
+
+        let neonsig = datadir.join("neon.signal");
+
+        if neonsig.is_file() {
+            return Ok(());
+        }
+
+        let zenithsig = datadir.join("zenith.signal");
+
+        if zenithsig.is_file() {
+            fs::copy(zenithsig, neonsig)?;
+        }
+
+        Ok(())
+    }
+
    /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
    /// the connection was established, and the (compressed) size of the basebackup.
    fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
@@ -1256,9 +1286,7 @@ impl ComputeNode {

        // In case of error, log and fail the check, but don't crash.
        // We're playing it safe because these errors could be transient
-        // and we don't yet retry. Also being careful here allows us to
-        // be backwards compatible with safekeepers that don't have the
-        // TIMELINE_STATUS API yet.
+        // and we don't yet retry.
        if responses.len() < quorum {
            error!(
                "failed sync safekeepers check {:?} {:?} {:?}",
@@ -1804,6 +1832,8 @@ impl ComputeNode {
            tls_config,
        )?;

+        self.pg_reload_conf()?;
+
        if !spec.skip_pg_catalog_updates {
            let max_concurrent_connections = spec.reconfigure_concurrency;
            // Temporarily reset max_cluster_size in config
@@ -1823,10 +1853,9 @@ impl ComputeNode {

                Ok(())
            })?;
+            self.pg_reload_conf()?;
        }

-        self.pg_reload_conf()?;
-
        let unknown_op = "unknown".to_string();
        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
        info!(
@@ -1899,7 +1928,8 @@ impl ComputeNode {

                            // exit loop
                            ComputeStatus::Failed
-                            | ComputeStatus::TerminationPending { .. }
+                            | ComputeStatus::TerminationPendingFast
+                            | ComputeStatus::TerminationPendingImmediate
                            | ComputeStatus::Terminated => break 'cert_update,

                            // wait
@@ -2455,7 +2485,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
                serde_json::to_string(&extensions).expect("failed to serialize extensions list")
            );
        }
-        Err(err) => error!("could not get installed extensions: {err:?}"),
+        Err(err) => error!("could not get installed extensions: {err}"),
    }
    Ok(())
 }
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -70,7 +70,7 @@ impl ComputeNode {
            }
        };
        let row = match client
-            .query_one("select * from get_prewarm_info()", &[])
+            .query_one("select * from neon.get_prewarm_info()", &[])
            .await
        {
            Ok(row) => row,
@@ -146,7 +146,7 @@ impl ComputeNode {
        ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?
-            .query_one("select prewarm_local_cache($1)", &[&uncompressed])
+            .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
            .await
            .context("loading LFC state into postgres")
            .map(|_| ())
@@ -196,7 +196,7 @@ impl ComputeNode {
        ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?
-            .query_one("select get_local_cache_state()", &[])
+            .query_one("select neon.get_local_cache_state()", &[])
            .await
            .context("querying LFC state")?
            .try_get::<usize, &[u8]>(0)
--- a/compute_tools/src/compute_promote.rs
+++ b/compute_tools/src/compute_promote.rs
@@ -0,0 +1,132 @@
+use crate::compute::ComputeNode;
+use anyhow::{Context, Result, bail};
+use compute_api::{
+    responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
+    spec::ComputeMode,
+};
+use std::{sync::Arc, time::Duration};
+use tokio::time::sleep;
+use utils::lsn::Lsn;
+
+impl ComputeNode {
+    /// Returns only when promote fails or succeeds. If a network error occurs
+    /// and http client disconnects, this does not stop promotion, and subsequent
+    /// calls block until promote finishes.
+    /// Called by control plane on secondary after primary endpoint is terminated
+    pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
+        let cloned = self.clone();
+        let start_promotion = || {
+            let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
+            tokio::spawn(async move {
+                tx.send(match cloned.promote_impl(safekeepers_lsn).await {
+                    Ok(_) => PromoteState::Completed,
+                    Err(err) => {
+                        tracing::error!(%err, "promoting");
+                        PromoteState::Failed {
+                            error: err.to_string(),
+                        }
+                    }
+                })
+            });
+            rx
+        };
+
+        let mut task;
+        // self.state is unlocked after block ends so we lock it in promote_impl
+        // and task.changed() is reached
+        {
+            task = self
+                .state
+                .lock()
+                .unwrap()
+                .promote_state
+                .get_or_insert_with(start_promotion)
+                .clone()
+        }
+        task.changed().await.expect("promote sender dropped");
+        task.borrow().clone()
+    }
+
+    // Why do we have to supply safekeepers?
+    // For secondary we use primary_connection_conninfo so safekeepers field is empty
+    async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
+        {
+            let state = self.state.lock().unwrap();
+            let mode = &state.pspec.as_ref().unwrap().spec.mode;
+            if *mode != ComputeMode::Replica {
+                bail!("{} is not replica", mode.to_type_str());
+            }
+
+            // we don't need to query Postgres so not self.lfc_prewarm_state()
+            match &state.lfc_prewarm_state {
+                LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
+                    bail!("prewarm not requested or pending")
+                }
+                LfcPrewarmState::Failed { error } => {
+                    tracing::warn!(%error, "replica prewarm failed")
+                }
+                _ => {}
+            }
+        }
+
+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+            .await
+            .context("connecting to postgres")?;
+
+        let primary_lsn = safekeepers_lsn.wal_flush_lsn;
+        let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
+        const RETRIES: i32 = 20;
+        for i in 0..=RETRIES {
+            let row = client
+                .query_one("SELECT pg_last_wal_replay_lsn()", &[])
+                .await
+                .context("getting last replay lsn")?;
+            let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
+            last_wal_replay_lsn = lsn.into();
+            if last_wal_replay_lsn >= primary_lsn {
+                break;
+            }
+            tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
+            sleep(Duration::from_secs(1)).await;
+        }
+        if last_wal_replay_lsn < primary_lsn {
+            bail!("didn't catch up with primary in {RETRIES} retries");
+        }
+
+        // using $1 doesn't work with ALTER SYSTEM SET
+        let safekeepers_sql = format!(
+            "ALTER SYSTEM SET neon.safekeepers='{}'",
+            safekeepers_lsn.safekeepers
+        );
+        client
+            .query(&safekeepers_sql, &[])
+            .await
+            .context("setting safekeepers")?;
+        client
+            .query("SELECT pg_reload_conf()", &[])
+            .await
+            .context("reloading postgres config")?;
+        let row = client
+            .query_one("SELECT * FROM pg_promote()", &[])
+            .await
+            .context("pg_promote")?;
+        if !row.get::<usize, bool>(0) {
+            bail!("pg_promote() returned false");
+        }
+
+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+            .await
+            .context("connecting to postgres")?;
+        let row = client
+            .query_one("SHOW transaction_read_only", &[])
+            .await
+            .context("getting transaction_read_only")?;
+        if row.get::<usize, &str>(0) == "on" {
+            bail!("replica in read only mode after promotion");
+        }
+
+        let mut state = self.state.lock().unwrap();
+        state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
+        Ok(())
+    }
+}
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -83,6 +83,87 @@ paths:
              schema:
                $ref: "#/components/schemas/DbsAndRoles"

+  /promote:
+    post:
+      tags:
+        - Promotion
+      summary: Promote secondary replica to primary
+      description: ""
+      operationId: promoteReplica
+      requestBody:
+        description: Promote requests data
+        required: true
+        content:
+          application/json:
+            schema:
+                $ref: "#/components/schemas/SafekeepersLsn"
+      responses:
+        200:
+          description: Promote succeeded or wasn't started
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PromoteState"
+        500:
+          description: Promote failed
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PromoteState"
+
+  /lfc/prewarm:
+    post:
+      summary: Request LFC Prewarm
+      parameters:
+        - name: from_endpoint
+          in: query
+          schema:
+            type: string
+      description: ""
+      operationId: lfcPrewarm
+      responses:
+        202:
+          description: LFC prewarm started
+        429:
+          description: LFC prewarm ongoing
+    get:
+      tags:
+        - Prewarm
+      summary: Get LFC prewarm state
+      description: ""
+      operationId: getLfcPrewarmState
+      responses:
+        200:
+          description: Prewarm state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LfcPrewarmState"
+
+  /lfc/offload:
+    post:
+      summary: Request LFC offload
+      description: ""
+      operationId: lfcOffload
+      responses:
+        202:
+          description: LFC offload started
+        429:
+          description: LFC offload ongoing
+    get:
+      tags:
+        - Prewarm
+      summary: Get LFC offloading state
+      description: ""
+      operationId: getLfcOffloadState
+      responses:
+        200:
+          description: Offload state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LfcOffloadState"
+
  /database_schema:
    get:
      tags:
@@ -290,9 +371,28 @@ paths:
      summary: Terminate Postgres and wait for it to exit
      description: ""
      operationId: terminate
+      parameters:
+        - name: mode
+          in: query
+          description: "Terminate mode: fast (wait 30s before returning) and immediate"
+          required: false
+          schema:
+            type: string
+            enum: ["fast", "immediate"]
+            default: fast
      responses:
        200:
          description: Result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
+        201:
+          description: Result if compute is already terminated
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
        412:
          description: "wrong state"
          content:
@@ -335,15 +435,6 @@ components:
        total_startup_ms:
          type: integer

-    Info:
-      type: object
-      description: Information about VM/Pod.
-      required:
-        - num_cpus
-      properties:
-        num_cpus:
-          type: integer
-
    DbsAndRoles:
      type: object
      description: Databases and Roles
@@ -458,11 +549,14 @@ components:
      type: string
      enum:
        - empty
-        - init
-        - failed
-        - running
        - configuration_pending
+        - init
+        - running
        - configuration
+        - failed
+        - termination_pending_fast
+        - termination_pending_immediate
+        - terminated
      example: running

    ExtensionInstallRequest:
@@ -497,25 +591,69 @@ components:
          type: string
          example: "1.0.0"

-    InstalledExtensions:
+    SafekeepersLsn:
      type: object
+      required:
+        - safekeepers
+        - wal_flush_lsn
      properties:
-        extensions:
-          description: Contains list of installed extensions.
-          type: array
-          items:
-            type: object
-            properties:
-              extname:
-                type: string
-              version:
-                type: string
-                items:
-                  type: string
-              n_databases:
-                type: integer
-              owned_by_superuser:
-                type: integer
+        safekeepers:
+          description: Primary replica safekeepers
+          type: string
+        wal_flush_lsn:
+          description: Primary last WAL flush LSN
+          type: string
+
+    LfcPrewarmState:
+      type: object
+      required:
+        - status
+        - total
+        - prewarmed
+        - skipped
+      properties:
+        status:
+          description: Lfc prewarm status
+          enum: [not_prewarmed, prewarming, completed, failed]
+          type: string
+        error:
+          description: Lfc prewarm error, if any
+          type: string
+        total:
+          description: Total pages processed
+          type: integer
+        prewarmed:
+          description: Total pages prewarmed
+          type: integer
+        skipped:
+          description: Pages processed but not prewarmed
+          type: integer
+
+    LfcOffloadState:
+      type: object
+      required:
+        - status
+      properties:
+        status:
+          description: Lfc offload status
+          enum: [not_offloaded, offloading, completed, failed]
+          type: string
+        error:
+          description: Lfc offload error, if any
+          type: string
+
+    PromoteState:
+      type: object
+      required:
+        - status
+      properties:
+        status:
+          description: Promote result
+          enum: [not_promoted, completed, failed]
+          type: string
+        error:
+          description: Promote error, if any
+          type: string

    SetRoleGrantsRequest:
      type: object
@@ -544,6 +682,17 @@ components:
          description: Role name.
          example: "neon"

+    TerminateResponse:
+      type: object
+      required:
+        - lsn
+      properties:
+        lsn:
+          type: string
+          nullable: true
+          description: "last WAL flush LSN"
+          example: "0/028F10D8"
+
    SetRoleGrantsResponse:
      type: object
      required:
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
 pub(in crate::http) mod lfc;
 pub(in crate::http) mod metrics;
 pub(in crate::http) mod metrics_json;
+pub(in crate::http) mod promote;
 pub(in crate::http) mod status;
 pub(in crate::http) mod terminate;

--- a/compute_tools/src/http/routes/promote.rs
+++ b/compute_tools/src/http/routes/promote.rs
@@ -0,0 +1,14 @@
+use crate::http::JsonResponse;
+use axum::Form;
+use http::StatusCode;
+
+pub(in crate::http) async fn promote(
+    compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
+    Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
+) -> axum::response::Response {
+    let state = compute.promote(safekeepers_lsn).await;
+    if let compute_api::responses::PromoteState::Failed { error } = state {
+        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
+    }
+    JsonResponse::success(StatusCode::OK, state)
+}
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
 use axum::extract::State;
 use axum::response::Response;
 use axum_extra::extract::OptionalQuery;
-use compute_api::responses::{ComputeStatus, TerminateResponse};
+use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
 use http::StatusCode;
 use serde::Deserialize;
 use std::sync::Arc;
@@ -12,7 +12,7 @@ use tracing::info;

 #[derive(Deserialize, Default)]
 pub struct TerminateQuery {
-    mode: compute_api::responses::TerminateMode,
+    mode: TerminateMode,
 }

 /// Terminate the compute.
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
    {
        let mut state = compute.state.lock().unwrap();
        if state.status == ComputeStatus::Terminated {
-            return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
+            let response = TerminateResponse {
+                lsn: state.terminate_flush_lsn,
+            };
+            return JsonResponse::success(StatusCode::CREATED, response);
        }

        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
            return JsonResponse::invalid_status(state.status);
        }
-        state.set_status(
-            ComputeStatus::TerminationPending { mode },
-            &compute.state_changed,
-        );
+        state.set_status(mode.into(), &compute.state_changed);
    }

    forward_termination_signal(false);
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -23,7 +23,7 @@ use super::{
    middleware::authorize::Authorize,
    routes::{
        check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-        grants, insights, lfc, metrics, metrics_json, status, terminate,
+        grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
    },
 };
 use crate::compute::ComputeNode;
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
                let authenticated_router = Router::<Arc<ComputeNode>>::new()
                    .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
                    .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
+                    .route("/promote", post(promote::promote))
                    .route("/check_writability", post(check_writability::is_writable))
                    .route("/configure", post(configure::configure))
                    .route("/database_schema", get(database_schema::get_schema_dump))
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;

 use anyhow::Result;
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
+use tokio_postgres::error::Error as PostgresError;
 use tokio_postgres::{Client, Config, NoTls};

 use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
 /// and to make database listing query here more explicit.
 ///
 /// Limit the number of databases to 500 to avoid excessive load.
-async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
+async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
    // `pg_database.datconnlimit = -2` means that the database is in the
    // invalid state
    let databases = client
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Same extension can be installed in multiple databases with different versions,
 /// so we report a separate metric (number of databases where it is installed)
 /// for each extension version.
-pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
+pub async fn get_installed_extensions(
+    mut conf: Config,
+) -> Result<InstalledExtensions, PostgresError> {
    conf.application_name("compute_ctl:get_installed_extensions");
    let databases: Vec<String> = {
        let (mut client, connection) = conf.connect(NoTls).await?;
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -12,6 +12,7 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod compute_prewarm;
+pub mod compute_promote;
 pub mod disk_quota;
 pub mod extension_server;
 pub mod installed_extensions;
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "compute_ctl_lfc_prewarm_errors_total",
-        "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option",
+        "Total number of LFC prewarm errors",
    )
    .expect("failed to define a metric")
 });
@@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "compute_ctl_lfc_offload_errors_total",
-        "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option",
+        "Total number of LFC offload errors",
    )
    .expect("failed to define a metric")
 });
--- a/compute_tools/src/migrations/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/0002-alter_roles.sql
@@ -1,3 +1,16 @@
+-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
+-- it was found that BYPASSRLS was being applied to all roles.
+--
+-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
+-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
+--
+-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
+-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
+-- keep the migration around for a long time, if not indefinitely, so any
+-- cluster can be fixed.
+--
+-- Branching is the gift that keeps on giving...
+
 DO $$
 DECLARE
    role_name text;
--- a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
--- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
@@ -7,13 +7,17 @@ BEGIN
        INTO monitor
        FROM pg_auth_members
        WHERE roleid = 'pg_monitor'::regrole
-            AND member = 'pg_monitor'::regrole;
+            AND member = 'neon_superuser'::regrole;

-    IF NOT monitor.member THEN
+    IF monitor IS NULL THEN
+        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
+    END IF;
+
+    IF monitor.admin IS NULL OR NOT monitor.member THEN
        RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
    END IF;

-    IF NOT monitor.admin THEN
+    IF monitor.admin IS NULL OR NOT monitor.admin THEN
        RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
    END IF;
 END $$;
--- a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1,23 @@
+DO $$
+DECLARE
+    signal_backend record;
+BEGIN
+    SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
+            admin_option AS admin
+        INTO signal_backend
+        FROM pg_auth_members
+        WHERE roleid = 'pg_signal_backend'::regrole
+            AND member = 'neon_superuser'::regrole;
+
+    IF signal_backend IS NULL THEN
+        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
+    END IF;
+
+    IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
+        RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
+    END IF;
+
+    IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
+        RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
+    END IF;
+END $$;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -84,7 +84,8 @@ impl ComputeMonitor {
        if matches!(
            compute_status,
            ComputeStatus::Terminated
-                | ComputeStatus::TerminationPending { .. }
+                | ComputeStatus::TerminationPendingFast
+                | ComputeStatus::TerminationPendingImmediate
                | ComputeStatus::Failed
        ) {
            info!(
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
        include_str!(
            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
        ),
+        include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
    ];

    MigrationRunner::new(client, &migrations)
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -36,7 +36,7 @@ impl StorageBroker {
    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        let broker = &self.env.broker;

-        print!("Starting neon broker at {}", broker.client_url());
+        println!("Starting neon broker at {}", broker.client_url());

        let mut args = Vec::new();

--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -32,7 +32,8 @@
 //!     config.json                 - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
-//!         zenith.signal
+//!         neon.signal
+//!         zenith.signal         - copy of neon.signal, for backward compatibility
 //!         <other PostgreSQL files>
 //! ```
 //!
@@ -463,7 +464,7 @@ impl Endpoint {
        conf.append("max_connections", "100");
        conf.append("wal_level", "logical");
        // wal_sender_timeout is the maximum time to wait for WAL replication.
-        // It also defines how often the walreciever will send a feedback message to the wal sender.
+        // It also defines how often the walreceiver will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
        conf.append("listen_addresses", &self.pg_address.ip().to_string());
        conf.append("port", &self.pg_address.port().to_string());
@@ -922,7 +923,8 @@ impl Endpoint {
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
                        | ComputeStatus::Configuration
-                        | ComputeStatus::TerminationPending { .. }
+                        | ComputeStatus::TerminationPendingFast
+                        | ComputeStatus::TerminationPendingImmediate
                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
    pub posthog_config: Option<PostHogConfig>,

    pub kick_secondary_downloads: Option<bool>,
+
+    #[serde(with = "humantime_serde")]
+    pub shard_split_request_timeout: Option<Duration>,
 }

 impl NeonStorageControllerConf {
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
            timeline_safekeeper_count: None,
            posthog_config: None,
            kick_secondary_downloads: None,
+            shard_split_request_timeout: None,
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -303,7 +303,7 @@ impl PageServerNode {
    async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
-        print!(
+        println!(
            "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
            self.conf.id,
            self.pg_connection_config.raw_address(),
@@ -452,6 +452,12 @@ impl PageServerNode {
                .map(|x| x.parse::<usize>())
                .transpose()
                .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+            // HADRON
+            image_layer_force_creation_period: settings
+                .remove("image_layer_force_creation_period")
+                .map(humantime::parse_duration)
+                .transpose()
+                .context("Failed to parse 'image_layer_force_creation_period' as duration")?,
            image_layer_creation_check_threshold: settings
                .remove("image_layer_creation_check_threshold")
                .map(|x| x.parse::<u8>())
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -127,7 +127,7 @@ impl SafekeeperNode {
        extra_opts: &[String],
        retry_timeout: &Duration,
    ) -> anyhow::Result<()> {
-        print!(
+        println!(
            "Starting safekeeper at '{}' in '{}', retrying for {:?}",
            self.pg_connection_config.raw_address(),
            self.datadir_path().display(),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -648,6 +648,13 @@ impl StorageController {
            args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
        }

+        if let Some(duration) = self.config.shard_split_request_timeout {
+            args.push(format!(
+                "--shard-split-request-timeout={}",
+                humantime::Duration::from(duration)
+            ));
+        }
+
        let mut envs = vec![
            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
@@ -660,7 +667,7 @@ impl StorageController {
            ));
        }

-        println!("Starting storage controller");
+        println!("Starting storage controller at {scheme}://{host}:{listen_port}");

        background_process::start_process(
            COMMAND,
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -14,6 +14,7 @@ humantime.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
+safekeeper_api.workspace=true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
 tokio.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
    PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
    ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
    SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
 };
 use pageserver_api::models::{
    EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
@@ -21,6 +21,7 @@ use pageserver_api::models::{
 use pageserver_api::shard::{ShardStripeSize, TenantShardId};
 use pageserver_client::mgmt_api::{self};
 use reqwest::{Certificate, Method, StatusCode, Url};
+use safekeeper_api::models::TimelineLocateResponse;
 use storage_controller_client::control_api::Client;
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -279,6 +280,23 @@ enum Command {
        #[arg(long)]
        concurrency: Option<usize>,
    },
+    /// Locate safekeepers for a timeline from the storcon DB.
+    TimelineLocate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+    },
+    /// Migrate a timeline to a new set of safekeepers
+    TimelineSafekeeperMigrate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+        /// Example: --new-sk-set 1,2,3
+        #[arg(long, required = true, value_delimiter = ',')]
+        new_sk_set: Vec<NodeId>,
+    },
 }

 #[derive(Parser)]
@@ -458,6 +476,7 @@ async fn main() -> anyhow::Result<()> {
                        listen_http_port,
                        listen_https_port,
                        availability_zone_id: AvailabilityZone(availability_zone_id),
+                        node_ip_addr: None,
                    }),
                )
                .await?;
@@ -1324,7 +1343,7 @@ async fn main() -> anyhow::Result<()> {
            concurrency,
        } => {
            let mut path = format!(
-                "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
+                "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
            );

            if let Some(c) = concurrency {
@@ -1335,6 +1354,41 @@ async fn main() -> anyhow::Result<()> {
                .dispatch::<(), ()>(Method::POST, path, None)
                .await?;
        }
+        Command::TimelineLocate {
+            tenant_id,
+            timeline_id,
+        } => {
+            let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
+
+            let resp = storcon_client
+                .dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
+                .await?;
+
+            let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
+            let new_sk_set = resp
+                .new_sk_set
+                .as_ref()
+                .map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
+
+            println!("generation = {}", resp.generation);
+            println!("sk_set = {sk_set:?}");
+            println!("new_sk_set = {new_sk_set:?}");
+        }
+        Command::TimelineSafekeeperMigrate {
+            tenant_id,
+            timeline_id,
+            new_sk_set,
+        } => {
+            let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
+
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::POST,
+                    path,
+                    Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
+                )
+                .await?;
+        }
    }

    Ok(())
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
 changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
 from WAL.

-This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
-at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
-checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
+This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup 
+code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
+instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without 
+any WAL redo.


 ### How to get rid of the patch
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -75,7 +75,7 @@ CLI examples:
 * AWS S3  : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`

 For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
-For local S3 installations, refer to the their documentation for name format and credentials.
+For local S3 installations, refer to their documentation for name format and credentials.

 Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
 Required sections are:
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
    pub version: ExtVersion,
 }

-#[derive(Serialize, Default, Debug, Clone)]
+#[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
    #[default]
@@ -58,6 +58,17 @@ pub enum LfcPrewarmState {
    },
 }

+impl Display for LfcPrewarmState {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
+            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
+            LfcPrewarmState::Completed => f.write_str("Completed"),
+            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
+        }
+    }
+}
+
 #[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcOffloadState {
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
    },
 }

+#[derive(Serialize, Debug, Clone, PartialEq)]
+#[serde(tag = "status", rename_all = "snake_case")]
+/// Response of /promote
+pub enum PromoteState {
+    NotPromoted,
+    Completed,
+    Failed { error: String },
+}
+
+#[derive(Deserialize, Serialize, Default, Debug, Clone)]
+#[serde(rename_all = "snake_case")]
+/// Result of /safekeepers_lsn
+pub struct SafekeepersLsn {
+    pub safekeepers: String,
+    pub wal_flush_lsn: utils::lsn::Lsn,
+}
+
 /// Response of the /status API
 #[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
@@ -93,6 +121,15 @@ pub enum TerminateMode {
    Immediate,
 }

+impl From<TerminateMode> for ComputeStatus {
+    fn from(mode: TerminateMode) -> Self {
+        match mode {
+            TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
+            TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
+        }
+    }
+}
+
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -113,7 +150,9 @@ pub enum ComputeStatus {
    // control-plane to terminate it.
    Failed,
    // Termination requested
-    TerminationPending { mode: TerminateMode },
+    TerminationPendingFast,
+    // Termination requested, without waiting 30s before returning from /terminate
+    TerminationPendingImmediate,
    // Terminated Postgres
    Terminated,
 }
@@ -132,7 +171,10 @@ impl Display for ComputeStatus {
            ComputeStatus::Running => f.write_str("running"),
            ComputeStatus::Configuration => f.write_str("configuration"),
            ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
+            ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
+            ComputeStatus::TerminationPendingImmediate => {
+                f.write_str("termination-pending-immediate")
+            }
            ComputeStatus::Terminated => f.write_str("terminated"),
        }
    }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -442,7 +442,7 @@ pub struct JwksSettings {
 }

 /// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
-#[derive(Clone, Copy, Debug, Default)]
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
 pub enum PageserverProtocol {
    /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
    #[default]
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{Instrument, debug, info, info_span, warn};
 use utils::auth::{AuthError, Claims, SwappableJwtAuth};
+use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};

 use crate::error::{ApiError, api_error_handler, route_error_handler};
 use crate::request::{get_query_param, parse_query_param};
@@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter {
    }
 }

-pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(
+    req: Request<Body>,
+    force_metric_collection_on_scrape: bool,
+) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

+    // HADRON
+    let requested_use_latest = parse_query_param(&req, "use_latest")?;
+
+    let use_latest = match requested_use_latest {
+        None => force_metric_collection_on_scrape,
+        Some(true) => true,
+        Some(false) => {
+            if force_metric_collection_on_scrape {
+                // We don't cache in this case
+                true
+            } else {
+                false
+            }
+        }
+    };
+
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
@@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<

        let _span = span.entered();

-        let metrics = metrics::gather();
+        // HADRON
+        let collected = if use_latest {
+            // Skip caching the results if we always force metric collection on scrape.
+            METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
+        } else {
+            METRICS_COLLECTOR.last_collected()
+        };

        let gathered_at = std::time::Instant::now();

        let res = encoder
-            .encode(&metrics, &mut writer)
+            .encode(&collected.metrics, &mut writer)
            .and_then(|_| writer.flush().map_err(|e| e.into()));

        // this instant is not when we finally got the full response sent, sending is done by hyper
@@ -295,6 +321,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
        let encoded_in = encoded_at - gathered_at - writer.wait_time();
        let total = encoded_at - started_at;

+        // HADRON
+        let staleness_ms = (encoded_at - collected.collected_at).as_millis();
+        METRICS_STALE_MILLIS.set(staleness_ms as i64);
+
        match res {
            Ok(()) => {
                tracing::info!(
@@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
                    spawning_ms = spawned_in.as_millis(),
                    collection_ms = collected_in.as_millis(),
                    encoding_ms = encoded_in.as_millis(),
+                    stalenss_ms = staleness_ms,
                    "responded /metrics"
                );
            }
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -226,6 +226,7 @@ pub struct ConfigToml {
    pub synthetic_size_calculation_interval: Duration,
    pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
    pub test_remote_failures: u64,
+    pub test_remote_failures_probability: u64,
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    #[serde(with = "humantime_serde")]
    pub background_task_maximum_delay: Duration,
@@ -271,6 +272,9 @@ pub struct ConfigToml {
    pub timeline_import_config: TimelineImportConfig,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
+    pub force_metric_collection_on_scrape: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -560,6 +564,11 @@ pub struct TenantConfigToml {
    pub gc_period: Duration,
    // Delta layer churn threshold to create L1 image layers.
    pub image_creation_threshold: usize,
+    // HADRON
+    // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
+    // (2) create image layers if there are any L1 deltas.
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is time.
@@ -758,6 +767,7 @@ impl Default for ConfigToml {
            disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),

            test_remote_failures: (0),
+            test_remote_failures_probability: (100),

            ondemand_download_behavior_treat_error_as_warn: (false),

@@ -821,6 +831,8 @@ impl Default for ConfigToml {
            },
            basebackup_cache_config: None,
            posthog_config: None,
+            image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
+            force_metric_collection_on_scrape: true,
        }
    }
 }
@@ -914,6 +926,7 @@ impl Default for TenantConfigToml {
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
+            image_layer_force_creation_period: None,
            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
                .expect("cannot parse default PITR interval"),
            walreceiver_connect_timeout: humantime::parse_duration(
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,5 +1,6 @@
 use std::collections::{HashMap, HashSet};
 use std::fmt::Display;
+use std::net::IpAddr;
 use std::str::FromStr;
 use std::time::{Duration, Instant};

@@ -10,7 +11,7 @@ use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;

-use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
+use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
 use crate::shard::{ShardStripeSize, TenantShardId};

 #[derive(Serialize, Deserialize, Debug)]
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
    pub listen_https_port: Option<u16>,

    pub availability_zone_id: AvailabilityZone,
+
+    // Reachable IP address of the PS/SK registering, if known.
+    // Hadron Cluster Coordiantor will update the DNS record of the registering node
+    // with this IP address.
+    pub node_ip_addr: Option<IpAddr>,
 }

 #[derive(Serialize, Deserialize)]
@@ -126,6 +132,13 @@ pub struct TenantDescribeResponse {
    pub config: TenantConfig,
 }

+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantTimelineDescribeResponse {
+    pub shards: Vec<TimelineInfo>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct NodeShardResponse {
    pub node_id: NodeId,
@@ -538,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
    pub scheduling_policy: SkSchedulingPolicy,
 }

+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineSafekeeperPeer {
+    pub node_id: NodeId,
+    pub listen_http_addr: String,
+    pub http_port: i32,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimeline {
+    // SC does not know the tenant id.
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimelinesResponse {
+    pub timelines: Vec<SCSafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimeline {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimelinesResponse {
+    pub timelines: Vec<SafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
 #[derive(Serialize, Deserialize, Clone)]
 pub struct SafekeeperSchedulingPolicyRequest {
    pub scheduling_policy: SkSchedulingPolicy,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
    pub safekeepers: Vec<SafekeeperInfo>,
 }

-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Debug)]
 pub struct SafekeeperInfo {
    pub id: NodeId,
    pub hostname: String,
@@ -597,6 +597,9 @@ pub struct TenantConfigPatch {
    pub gc_period: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_creation_threshold: FieldPatch<usize>,
+    // HADRON
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_force_creation_period: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub pitr_interval: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
@@ -700,6 +703,11 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_creation_threshold: Option<usize>,

+    // HADRON
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Option<Duration>,
@@ -798,6 +806,7 @@ impl TenantConfig {
            mut gc_horizon,
            mut gc_period,
            mut image_creation_threshold,
+            mut image_layer_force_creation_period,
            mut pitr_interval,
            mut walreceiver_connect_timeout,
            mut lagging_wal_timeout,
@@ -861,6 +870,11 @@ impl TenantConfig {
        patch
            .image_creation_threshold
            .apply(&mut image_creation_threshold);
+        // HADRON
+        patch
+            .image_layer_force_creation_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut image_layer_force_creation_period);
        patch
            .pitr_interval
            .map(|v| humantime::parse_duration(&v))?
@@ -942,6 +956,7 @@ impl TenantConfig {
            gc_horizon,
            gc_period,
            image_creation_threshold,
+            image_layer_force_creation_period,
            pitr_interval,
            walreceiver_connect_timeout,
            lagging_wal_timeout,
@@ -1016,6 +1031,9 @@ impl TenantConfig {
            image_creation_threshold: self
                .image_creation_threshold
                .unwrap_or(global_conf.image_creation_threshold),
+            image_layer_force_creation_period: self
+                .image_layer_force_creation_period
+                .or(global_conf.image_layer_force_creation_period),
            pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
            walreceiver_connect_timeout: self
                .walreceiver_connect_timeout
@@ -1604,6 +1622,9 @@ pub struct TimelineInfo {

    /// Whether the timeline is invisible in synthetic size calculations.
    pub is_invisible: Option<bool>,
+    // HADRON: the largest LSN below which all page updates have been included in the image layers.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 {
 ///
 /// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
 /// and will be handled at higher levels when shards are split.
-fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
+pub fn key_to_shard_number(
+    count: ShardCount,
+    stripe_size: ShardStripeSize,
+    key: &Key,
+) -> ShardNumber {
    // Fast path for un-sharded tenants or broadcast keys
    if count < ShardCount(2) || key_is_shard0(key) {
        return ShardNumber(0);
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -110,7 +110,6 @@ fn main() -> anyhow::Result<()> {
            .allowlist_type("XLogRecPtr")
            .allowlist_type("XLogSegNo")
            .allowlist_type("TimeLineID")
-            .allowlist_type("TimestampTz")
            .allowlist_type("MultiXactId")
            .allowlist_type("MultiXactOffset")
            .allowlist_type("MultiXactStatus")
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -227,8 +227,7 @@ pub mod walrecord;
 // Export some widely used datatypes that are unlikely to change across Postgres versions
 pub use v14::bindings::{
    BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData,
-    RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32,
-    uint64,
+    RepOriginId, TimeLineID, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, uint64,
 };
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -4,13 +4,14 @@
 //! TODO: Generate separate types for each supported PG version

 use bytes::{Buf, Bytes};
+use postgres_ffi_types::TimestampTz;
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;

 use crate::{
    BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion,
-    RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
+    RepOriginId, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
 };

 #[repr(C)]
@@ -863,7 +864,8 @@ pub mod v17 {
        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
        rm_neon,
    };
-    pub use crate::{TimeLineID, TimestampTz};
+    pub use crate::TimeLineID;
+    pub use postgres_ffi_types::TimestampTz;

    #[repr(C)]
    #[derive(Debug)]
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -9,10 +9,11 @@

 use super::super::waldecoder::WalStreamDecoder;
 use super::bindings::{
-    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
+    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID,
    XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
    MY_PGVERSION
 };
+use postgres_ffi_types::TimestampTz;
 use super::wal_generator::LogicalMessageGenerator;
 use crate::pg_constants;
 use crate::PG_TLI;
--- a/libs/postgres_ffi_types/src/lib.rs
+++ b/libs/postgres_ffi_types/src/lib.rs
@@ -11,3 +11,4 @@ pub mod forknum;

 pub type Oid = u32;
 pub type RepOriginId = u16;
+pub type TimestampTz = i64;
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -43,6 +43,7 @@ itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }

 byteorder = "1.4"
+rand = "0.8.5"

 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
        })
    }

-    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
-        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    /* BEGIN_HADRON */
+    pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(
+            s,
+            fail_first,
+            fail_probability,
+        )))
    }
+    /* END_HADRON */

    /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
    pub async fn upload_storage_object(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use rand::Rng;
+use std::cmp;
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::num::NonZeroU32;
@@ -25,6 +27,13 @@ pub struct UnreliableWrapper {

    // Tracks how many failed attempts of each operation has been made.
    attempts: Mutex<HashMap<RemoteOp, u64>>,
+
+    /* BEGIN_HADRON */
+    // This the probability of failure for each operation, ranged from [0, 100].
+    // The probability is default to 100, which means that all operations will fail.
+    // Storage will fail by probability up to attempts_to_fail times.
+    attempt_failure_probability: u64,
+    /* END_HADRON */
 }

 /// Used to identify retries of different unique operation.
@@ -40,7 +49,11 @@ enum RemoteOp {
 }

 impl UnreliableWrapper {
-    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
+    pub fn new(
+        inner: crate::GenericRemoteStorage,
+        attempts_to_fail: u64,
+        attempt_failure_probability: u64,
+    ) -> Self {
        assert!(attempts_to_fail > 0);
        let inner = match inner {
            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
@@ -51,9 +64,11 @@ impl UnreliableWrapper {
                panic!("Can't wrap unreliable wrapper unreliably")
            }
        };
+        let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
        UnreliableWrapper {
            inner,
            attempts_to_fail,
+            attempt_failure_probability: actual_attempt_failure_probability,
            attempts: Mutex::new(HashMap::new()),
        }
    }
@@ -66,6 +81,7 @@ impl UnreliableWrapper {
    ///
    fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
        let mut attempts = self.attempts.lock().unwrap();
+        let mut rng = rand::thread_rng();

        match attempts.entry(op) {
            Entry::Occupied(mut e) => {
@@ -75,15 +91,19 @@ impl UnreliableWrapper {
                    *p
                };

-                if attempts_before_this >= self.attempts_to_fail {
-                    // let it succeed
-                    e.remove();
-                    Ok(attempts_before_this)
-                } else {
+                /* BEGIN_HADRON */
+                // If there are more attempts to fail, fail the request by probability.
+                if (attempts_before_this < self.attempts_to_fail)
+                    && (rng.gen_range(0..=100) < self.attempt_failure_probability)
+                {
                    let error =
                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                    Err(error)
+                } else {
+                    e.remove();
+                    Ok(attempts_before_this)
                }
+                /* END_HADRON */
            }
            Entry::Vacant(e) => {
                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -9,7 +9,7 @@ anyhow.workspace = true
 const_format.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-postgres_ffi.workspace = true
+postgres_ffi_types.workspace = true
 postgres_versioninfo.workspace = true
 pq_proto.workspace = true
 tokio.workspace = true
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -3,7 +3,7 @@
 use std::net::SocketAddr;

 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::TimestampTz;
+use postgres_ffi_types::TimestampTz;
 use postgres_versioninfo::PgVersionId;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
@@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;

-use crate::membership::Configuration;
+use crate::membership::{Configuration, SafekeeperGeneration};
 use crate::{ServerInfo, Term};

 #[derive(Debug, Serialize, Deserialize)]
@@ -311,3 +311,12 @@ pub struct PullTimelineResponse {
    pub safekeeper_host: Option<String>,
    // TODO: add more fields?
 }
+
+/// Response to a timeline locate request.
+/// Storcon-only API.
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineLocateResponse {
+    pub generation: SafekeeperGeneration,
+    pub sk_set: Vec<NodeId>,
+    pub new_sk_set: Option<Vec<NodeId>>,
+}
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -44,3 +44,63 @@ where
        }
    }
 }
+
+/* BEGIN_HADRON */
+pub enum DeploymentMode {
+    Local,
+    Dev,
+    Staging,
+    Prod,
+}
+
+pub fn get_deployment_mode() -> Option<DeploymentMode> {
+    match std::env::var("DEPLOYMENT_MODE") {
+        Ok(env) => match env.as_str() {
+            "development" => Some(DeploymentMode::Dev),
+            "staging" => Some(DeploymentMode::Staging),
+            "production" => Some(DeploymentMode::Prod),
+            _ => {
+                tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            // tracing::error!("DEPLOYMENT_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_dev_or_staging() -> bool {
+    matches!(
+        get_deployment_mode(),
+        Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
+    )
+}
+
+pub enum TestingMode {
+    Chaos,
+    Stress,
+}
+
+pub fn get_test_mode() -> Option<TestingMode> {
+    match std::env::var("HADRON_TEST_MODE") {
+        Ok(env) => match env.as_str() {
+            "chaos" => Some(TestingMode::Chaos),
+            "stress" => Some(TestingMode::Stress),
+            _ => {
+                tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            tracing::error!("HADRON_TEST_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_chaos_testing() -> bool {
+    matches!(get_test_mode(), Some(TestingMode::Chaos))
+}
+/* END_HADRON */
--- a/libs/utils/src/ip_address.rs
+++ b/libs/utils/src/ip_address.rs
@@ -0,0 +1,73 @@
+use std::env::{VarError, var};
+use std::error::Error;
+use std::net::IpAddr;
+use std::str::FromStr;
+
+/// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this
+/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration.
+/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod
+/// template).
+pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS";
+
+/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS.
+/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template).
+pub fn read_node_ip_addr_from_env() -> Result<Option<IpAddr>, Box<dyn Error>> {
+    match var(HADRON_NODE_IP_ADDRESS) {
+        Ok(v) => {
+            if let Ok(addr) = IpAddr::from_str(&v) {
+                Ok(Some(addr))
+            } else {
+                Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into())
+            }
+        }
+        Err(VarError::NotPresent) => Ok(None),
+        Err(e) => Err(e.into()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::env;
+    use std::net::{Ipv4Addr, Ipv6Addr};
+
+    #[test]
+    fn test_read_node_ip_addr_from_env() {
+        // SAFETY: test code
+        unsafe {
+            // Test with a valid IPv4 address
+            env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1");
+            let result = read_node_ip_addr_from_env().unwrap();
+            assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))));
+
+            // Test with a valid IPv6 address
+            env::set_var(
+                HADRON_NODE_IP_ADDRESS,
+                "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
+            );
+        }
+        let result = read_node_ip_addr_from_env().unwrap();
+        assert_eq!(
+            result,
+            Some(IpAddr::V6(
+                Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap()
+            ))
+        );
+
+        // Test with an invalid IP address
+        // SAFETY: test code
+        unsafe {
+            env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip");
+        }
+        let result = read_node_ip_addr_from_env();
+        assert!(result.is_err());
+
+        // Test with no environment variable set
+        // SAFETY: test code
+        unsafe {
+            env::remove_var(HADRON_NODE_IP_ADDRESS);
+        }
+        let result = read_node_ip_addr_from_env().unwrap();
+        assert_eq!(result, None);
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -26,6 +26,9 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;

+// utility functions to obtain reachable IP addresses in PS/SK nodes.
+pub mod ip_address;
+
 pub mod shard;

 mod hex;
@@ -99,6 +102,8 @@ pub mod elapsed_accum;
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;

+pub mod metrics_collector;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,4 +1,5 @@
 use std::future::Future;
+use std::pin::Pin;
 use std::str::FromStr;
 use std::time::Duration;

@@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};
 use tokio::time::Instant;
-use tracing::info;
+use tracing::{info, warn};

 /// Logs a critical error, similarly to `tracing::error!`. This will:
 ///
@@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString {
 ///
 /// TODO: consider upgrading this to a warning, but currently it fires too often.
 #[inline]
-pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
-where
-    F: Future<Output = O>,
-{
+pub async fn log_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
    monitor_slow_future(
        threshold,
        threshold, // period = threshold
@@ -394,16 +396,42 @@ where
            if !is_slow {
                return;
            }
+            let elapsed = elapsed_total.as_secs_f64();
            if ready {
-                info!(
-                    "slow {name} completed after {:.3}s",
-                    elapsed_total.as_secs_f64()
-                );
+                info!("slow {name} completed after {elapsed:.3}s");
            } else {
-                info!(
-                    "slow {name} still running after {:.3}s",
-                    elapsed_total.as_secs_f64()
-                );
+                info!("slow {name} still running after {elapsed:.3}s");
+            }
+        },
+    )
+    .await
+}
+
+/// Logs a periodic warning if a future is slow to complete.
+#[inline]
+pub async fn warn_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
+    monitor_slow_future(
+        threshold,
+        threshold, // period = threshold
+        f,
+        |MonitorSlowFutureCallback {
+             ready,
+             is_slow,
+             elapsed_total,
+             elapsed_since_last_callback: _,
+         }| {
+            if !is_slow {
+                return;
+            }
+            let elapsed = elapsed_total.as_secs_f64();
+            if ready {
+                warn!("slow {name} completed after {elapsed:.3}s");
+            } else {
+                warn!("slow {name} still running after {elapsed:.3}s");
            }
        },
    )
@@ -416,7 +444,7 @@ where
 pub async fn monitor_slow_future<F, O>(
    threshold: Duration,
    period: Duration,
-    mut fut: std::pin::Pin<&mut F>,
+    mut fut: Pin<&mut F>,
    mut cb: impl FnMut(MonitorSlowFutureCallback),
 ) -> O
 where
--- a/libs/utils/src/metrics_collector.rs
+++ b/libs/utils/src/metrics_collector.rs
@@ -0,0 +1,75 @@
+use std::{
+    sync::{Arc, RwLock},
+    time::{Duration, Instant},
+};
+
+use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
+use once_cell::sync::Lazy;
+
+pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "metrics_metrics_stale_milliseconds",
+        "The current metrics stale time in milliseconds"
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(Debug)]
+pub struct CollectedMetrics {
+    pub metrics: Vec<MetricFamily>,
+    pub collected_at: Instant,
+}
+
+impl CollectedMetrics {
+    fn new(metrics: Vec<MetricFamily>) -> Self {
+        Self {
+            metrics,
+            collected_at: Instant::now(),
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct MetricsCollector {
+    last_collected: RwLock<Arc<CollectedMetrics>>,
+}
+
+impl MetricsCollector {
+    pub fn new() -> Self {
+        Self {
+            last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
+        }
+    }
+
+    #[tracing::instrument(name = "metrics_collector", skip_all)]
+    pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
+        let started = Instant::now();
+        let metrics = metrics::gather();
+        let collected = Arc::new(CollectedMetrics::new(metrics));
+        if cache_metrics {
+            let mut guard = self.last_collected.write().unwrap();
+            *guard = collected.clone();
+        }
+        tracing::info!(
+            "Collected {} metric families in {} ms",
+            collected.metrics.len(),
+            started.elapsed().as_millis()
+        );
+        collected
+    }
+
+    pub fn last_collected(&self) -> Arc<CollectedMetrics> {
+        self.last_collected.read().unwrap().clone()
+    }
+}
+
+impl Default for MetricsCollector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+// Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent
+pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
+
+pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -171,6 +171,12 @@ impl std::fmt::Display for ShardNumber {
    }
 }

+impl std::fmt::Display for ShardCount {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 impl std::fmt::Display for ShardSlug<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
--- a/libs/wal_decoder/src/models/record.rs
+++ b/libs/wal_decoder/src/models/record.rs
@@ -2,7 +2,8 @@

 use bytes::Bytes;
 use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record};
-use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
+use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId};
+use postgres_ffi_types::TimestampTz;
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::DeserializeError;

--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        shard_number: 0,
    };

+    let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
+        should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
+        sent_bytes: 0,
+        last_recorded_time_us: 0,
+    };
+
    crate::bindings::WalproposerShmemState {
        propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
        donor_name: [0; 64],
@@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        num_shards: 0,
        replica_promote: false,
        min_ps_feedback: empty_feedback,
+        wal_rate_limiter: empty_wal_rate_limiter,
    }
 }

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::error::Error as _;
 use std::time::Duration;

@@ -251,6 +251,70 @@ impl Client {
        Ok(())
    }

+    pub async fn tenant_timeline_compact(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        force_image_layer_creation: bool,
+        must_force_image_layer_creation: bool,
+        scheduled: bool,
+        wait_until_done: bool,
+    ) -> Result<()> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+
+        if force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("force_image_layer_creation", "true");
+        }
+
+        if must_force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("must_force_image_layer_creation", "true");
+        }
+
+        if scheduled {
+            path.query_pairs_mut().append_pair("scheduled", "true");
+        }
+        if wait_until_done {
+            path.query_pairs_mut()
+                .append_pair("wait_until_scheduled_compaction_done", "true");
+            path.query_pairs_mut()
+                .append_pair("wait_until_uploaded", "true");
+        }
+        self.request(Method::PUT, path, ()).await?;
+        Ok(())
+    }
+
+    /* BEGIN_HADRON */
+    pub async fn tenant_timeline_describe(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Result<TimelineInfo> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+        path.query_pairs_mut()
+            .append_pair("include-image-consistent-lsn", "true");
+
+        let response: reqwest::Response = self.request(Method::GET, path, ()).await?;
+        let body = response.json().await.map_err(Error::ReceiveBody)?;
+        Ok(body)
+    }
+
+    pub async fn list_tenant_visible_size(&self) -> Result<BTreeMap<TenantShardId, u64>> {
+        let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint);
+        let resp = self.get(&uri).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+    /* END_HADRON */
+
    pub async fn tenant_scan_remote_storage(
        &self,
        tenant_id: TenantId,
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -4,12 +4,20 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

+[features]
+testing = ["pageserver_api/testing"]
+
 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
+bytes.workspace = true
+compute_api.workspace = true
 futures.workspace = true
+pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
+tokio-util.workspace = true
 tonic.workspace = true
 tracing.workspace = true
 utils.workspace = true
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -0,0 +1,604 @@
+use std::collections::HashMap;
+use std::num::NonZero;
+use std::pin::pin;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use anyhow::anyhow;
+use arc_swap::ArcSwap;
+use futures::stream::FuturesUnordered;
+use futures::{FutureExt as _, StreamExt as _};
+use tonic::codec::CompressionEncoding;
+use tracing::{debug, instrument};
+use utils::logging::warn_slow;
+
+use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
+use crate::retry::Retry;
+use crate::split::GetPageSplitter;
+use compute_api::spec::PageserverProtocol;
+use pageserver_api::shard::ShardStripeSize;
+use pageserver_page_api as page_api;
+use utils::id::{TenantId, TimelineId};
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};
+
+/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
+/// when full.
+///
+/// Normal requests are small, and we don't pipeline them, so we can afford a large number of
+/// streams per connection.
+///
+/// TODO: tune all of these constants, and consider making them configurable.
+const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(64).unwrap();
+
+/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a
+/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and
+/// transmission delays. This also concentrates large window sizes on a smaller set of
+/// streams/connections, presumably reducing memory use.
+const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
+
+/// The batch size threshold at which a GetPage request will use the bulk stream pool.
+///
+/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window
+/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool.
+const BULK_THRESHOLD_BATCH_SIZE: usize = 5;
+
+/// The overall request call timeout, including retries and pool acquisition.
+/// TODO: should we retry forever? Should the caller decide?
+const CALL_TIMEOUT: Duration = Duration::from_secs(60);
+
+/// The per-request (retry attempt) timeout, including any lazy connection establishment.
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// The initial request retry backoff duration. The first retry does not back off.
+/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support.
+const BASE_BACKOFF: Duration = Duration::from_millis(5);
+
+/// The maximum request retry backoff duration.
+const MAX_BACKOFF: Duration = Duration::from_secs(5);
+
+/// Threshold and interval for warning about slow operation.
+const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
+
+/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
+/// basic `page_api::Client` gRPC client, and supports:
+///
+/// * Sharded tenants across multiple Pageservers.
+/// * Pooling of connections, clients, and streams for efficient resource use.
+/// * Concurrent use by many callers.
+/// * Internal handling of GetPage bidirectional streams.
+/// * Automatic retries.
+/// * Observability.
+///
+/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These
+/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all
+/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly
+/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000
+/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with
+/// one stream per backend, but without the TCP connection overhead. In the common case we expect
+/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits,
+/// read coalescing, sharding (backends typically only talk to one shard at a time), etc.
+///
+/// TODO: this client does not support base backups or LSN leases, as these are only used by
+/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
+pub struct PageserverClient {
+    /// The tenant ID.
+    tenant_id: TenantId,
+    /// The timeline ID.
+    timeline_id: TimelineId,
+    /// The JWT auth token for this tenant, if any.
+    auth_token: Option<String>,
+    /// The compression to use, if any.
+    compression: Option<CompressionEncoding>,
+    /// The shards for this tenant.
+    shards: ArcSwap<Shards>,
+}
+
+impl PageserverClient {
+    /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
+    /// in the shard spec, which must be complete and must use gRPC URLs.
+    pub fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_spec: ShardSpec,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        let shards = Shards::new(
+            tenant_id,
+            timeline_id,
+            shard_spec,
+            auth_token.clone(),
+            compression,
+        )?;
+        Ok(Self {
+            tenant_id,
+            timeline_id,
+            auth_token,
+            compression,
+            shards: ArcSwap::new(Arc::new(shards)),
+        })
+    }
+
+    /// Updates the shards from the given shard spec. In-flight requests will complete using the
+    /// existing shards, but may retry with the new shards if they fail.
+    ///
+    /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
+    /// properly spun down and dropped afterwards.
+    pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
+        // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
+        // with concurrent updates, but that involves creating a new `Shards` on every attempt,
+        // which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere
+        // in the stack, and if they're violated then we already have problems elsewhere, so a
+        // best-effort but possibly-racy check is okay here.
+        let old = self.shards.load_full();
+        if shard_spec.count < old.count {
+            return Err(anyhow!(
+                "can't reduce shard count from {} to {}",
+                old.count,
+                shard_spec.count
+            ));
+        }
+        if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
+            return Err(anyhow!(
+                "can't change stripe size from {} to {}",
+                old.stripe_size,
+                shard_spec.stripe_size
+            ));
+        }
+
+        let shards = Shards::new(
+            self.tenant_id,
+            self.timeline_id,
+            shard_spec,
+            self.auth_token.clone(),
+            self.compression,
+        )?;
+        self.shards.store(Arc::new(shards));
+        Ok(())
+    }
+
+    /// Returns whether a relation exists.
+    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
+    pub async fn check_rel_exists(
+        &self,
+        req: page_api::CheckRelExistsRequest,
+    ) -> tonic::Result<page_api::CheckRelExistsResponse> {
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Returns the total size of a database, as # of bytes.
+    #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))]
+    pub async fn get_db_size(
+        &self,
+        req: page_api::GetDbSizeRequest,
+    ) -> tonic::Result<page_api::GetDbSizeResponse> {
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
+    /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
+    /// shard boundaries, and assembles the responses.
+    ///
+    /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
+    /// errors. All responses will have `GetPageStatusCode::Ok`.
+    #[instrument(skip_all, fields(
+        req_id = %req.request_id,
+        class = %req.request_class,
+        rel = %req.rel,
+        blkno = %req.block_numbers[0],
+        blks = %req.block_numbers.len(),
+        lsn = %req.read_lsn,
+    ))]
+    pub async fn get_page(
+        &self,
+        req: page_api::GetPageRequest,
+    ) -> tonic::Result<page_api::GetPageResponse> {
+        // Make sure we have at least one page.
+        if req.block_numbers.is_empty() {
+            return Err(tonic::Status::invalid_argument("no block number"));
+        }
+        // The request attempt must be 0. The client will increment it internally.
+        if req.request_id.attempt != 0 {
+            return Err(tonic::Status::invalid_argument("request attempt must be 0"));
+        }
+
+        debug!("sending request: {req:?}");
+
+        // The shards may change while we're fetching pages. We execute the request using a stable
+        // view of the shards (especially important for requests that span shards), but retry the
+        // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
+        // retries and re-splits in some cases where requests span shards, but these are expected to
+        // be rare.
+        //
+        // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
+        // once we figure out how to handle these.
+        let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
+            let mut req = req.clone();
+            req.request_id.attempt = attempt as u32;
+            let shards = self.shards.load_full();
+            Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
+        })
+        .await?;
+
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
+    /// concurrent shard updates. Does not retry internally, but is retried by `get_page()`.
+    async fn get_page_with_shards(
+        req: page_api::GetPageRequest,
+        shards: &Shards,
+    ) -> tonic::Result<page_api::GetPageResponse> {
+        // Fast path: request is for a single shard.
+        if let Some(shard_id) =
+            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
+        {
+            return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
+        }
+
+        // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
+        // reassemble the responses.
+        let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size);
+
+        let mut shard_requests = FuturesUnordered::new();
+        for (shard_id, shard_req) in splitter.drain_requests() {
+            let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?)
+                .map(move |result| result.map(|resp| (shard_id, resp)));
+            shard_requests.push(future);
+        }
+
+        while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
+            splitter.add_response(shard_id, shard_response)?;
+        }
+
+        splitter.get_response()
+    }
+
+    /// Fetches pages on the given shard. Does not retry internally.
+    async fn get_page_with_shard(
+        req: page_api::GetPageRequest,
+        shard: &Shard,
+    ) -> tonic::Result<page_api::GetPageResponse> {
+        let mut stream = shard.stream(Self::is_bulk(&req)).await?;
+        let resp = stream.send(req.clone()).await?;
+
+        // Convert per-request errors into a tonic::Status.
+        if resp.status_code != page_api::GetPageStatusCode::Ok {
+            return Err(tonic::Status::new(
+                resp.status_code.into(),
+                resp.reason.unwrap_or_else(|| String::from("unknown error")),
+            ));
+        }
+
+        // Check that we received the expected pages.
+        if req.rel != resp.rel {
+            return Err(tonic::Status::internal(format!(
+                "shard {} returned wrong relation, expected {} got {}",
+                shard.id, req.rel, resp.rel
+            )));
+        }
+        if !req
+            .block_numbers
+            .iter()
+            .copied()
+            .eq(resp.pages.iter().map(|p| p.block_number))
+        {
+            return Err(tonic::Status::internal(format!(
+                "shard {} returned wrong pages, expected {:?} got {:?}",
+                shard.id,
+                req.block_numbers,
+                resp.pages
+                    .iter()
+                    .map(|page| page.block_number)
+                    .collect::<Vec<_>>()
+            )));
+        }
+
+        Ok(resp)
+    }
+
+    /// Returns the size of a relation, as # of blocks.
+    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
+    pub async fn get_rel_size(
+        &self,
+        req: page_api::GetRelSizeRequest,
+    ) -> tonic::Result<page_api::GetRelSizeResponse> {
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Fetches an SLRU segment.
+    #[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))]
+    pub async fn get_slru_segment(
+        &self,
+        req: page_api::GetSlruSegmentRequest,
+    ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // SLRU segments are only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Runs the given async closure with retries up to the given timeout. Only certain gRPC status
+    /// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout.
+    async fn with_retries<T, F, O>(timeout: Duration, f: F) -> tonic::Result<T>
+    where
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
+        O: Future<Output = tonic::Result<T>>,
+    {
+        Retry {
+            timeout: Some(timeout),
+            base_backoff: BASE_BACKOFF,
+            max_backoff: MAX_BACKOFF,
+        }
+        .with(f)
+        .await
+    }
+
+    /// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout.
+    async fn with_timeout<T>(
+        timeout: Duration,
+        f: impl Future<Output = tonic::Result<T>>,
+    ) -> tonic::Result<T> {
+        let started = Instant::now();
+        tokio::time::timeout(timeout, f).await.map_err(|_| {
+            tonic::Status::deadline_exceeded(format!(
+                "request timed out after {:.3}s",
+                started.elapsed().as_secs_f64()
+            ))
+        })?
+    }
+
+    /// Returns true if the request is considered a bulk request and should use the bulk pool.
+    fn is_bulk(req: &page_api::GetPageRequest) -> bool {
+        req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE
+    }
+}
+
+/// Shard specification for a PageserverClient.
+pub struct ShardSpec {
+    /// Maps shard indices to gRPC URLs.
+    ///
+    /// INVARIANT: every shard 0..count is present, and shard 0 is always present.
+    /// INVARIANT: every URL is valid and uses grpc:// scheme.
+    urls: HashMap<ShardIndex, String>,
+    /// The shard count.
+    ///
+    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
+    count: ShardCount,
+    /// The stripe size for these shards.
+    stripe_size: ShardStripeSize,
+}
+
+impl ShardSpec {
+    /// Creates a new shard spec with the given URLs and stripe size. All shards must be given.
+    /// The stripe size may be omitted for unsharded tenants.
+    pub fn new(
+        urls: HashMap<ShardIndex, String>,
+        stripe_size: Option<ShardStripeSize>,
+    ) -> anyhow::Result<Self> {
+        // Compute the shard count.
+        let count = match urls.len() {
+            0 => return Err(anyhow!("no shards provided")),
+            1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
+            n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
+            n => ShardCount::new(n as u8),
+        };
+
+        // Determine the stripe size. It doesn't matter for unsharded tenants.
+        if stripe_size.is_none() && !count.is_unsharded() {
+            return Err(anyhow!("stripe size must be given for sharded tenants"));
+        }
+        let stripe_size = stripe_size.unwrap_or_default();
+
+        // Validate the shard spec.
+        for (shard_id, url) in &urls {
+            // The shard index must match the computed shard count, even for unsharded tenants.
+            if shard_id.shard_count != count {
+                return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
+            }
+            // The shard index' number and count must be consistent.
+            if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 {
+                return Err(anyhow!("invalid shard index {shard_id}"));
+            }
+            // The above conditions guarantee that we have all shards 0..count: len() matches count,
+            // shard number < count, and numbers are unique (via hashmap).
+
+            // Validate the URL.
+            if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc {
+                return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
+            }
+        }
+
+        Ok(Self {
+            urls,
+            count,
+            stripe_size,
+        })
+    }
+}
+
+/// Tracks the tenant's shards.
+struct Shards {
+    /// Shards by shard index.
+    ///
+    /// INVARIANT: every shard 0..count is present.
+    /// INVARIANT: shard 0 is always present.
+    by_index: HashMap<ShardIndex, Shard>,
+    /// The shard count.
+    ///
+    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
+    count: ShardCount,
+    /// The stripe size. Only used for sharded tenants.
+    stripe_size: ShardStripeSize,
+}
+
+impl Shards {
+    /// Creates a new set of shards based on a shard spec.
+    fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_spec: ShardSpec,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        // NB: the shard spec has already been validated when constructed.
+        let mut shards = HashMap::with_capacity(shard_spec.urls.len());
+        for (shard_id, url) in shard_spec.urls {
+            shards.insert(
+                shard_id,
+                Shard::new(
+                    url,
+                    tenant_id,
+                    timeline_id,
+                    shard_id,
+                    auth_token.clone(),
+                    compression,
+                )?,
+            );
+        }
+
+        Ok(Self {
+            by_index: shards,
+            count: shard_spec.count,
+            stripe_size: shard_spec.stripe_size,
+        })
+    }
+
+    /// Looks up the given shard.
+    #[allow(clippy::result_large_err)] // TODO: check perf impact
+    fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
+        self.by_index
+            .get(&shard_id)
+            .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
+    }
+
+    /// Returns shard 0.
+    fn get_zero(&self) -> &Shard {
+        self.get(ShardIndex::new(ShardNumber(0), self.count))
+            .expect("always present")
+    }
+}
+
+/// A single shard. Has dedicated resource pools with the following structure:
+///
+/// * Channel pool: MAX_CLIENTS_PER_CHANNEL.
+///   * Client pool: unbounded.
+///     * Stream pool: unbounded.
+/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL.
+///   * Bulk client pool: unbounded.
+///     * Bulk stream pool: unbounded.
+///
+/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests.
+/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a
+/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools
+/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly
+/// similar (except for TCP transmission time).
+///
+/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However,
+/// the code is fairly trivial, so we may as well keep them around for now in case we need them.
+struct Shard {
+    /// The shard ID.
+    id: ShardIndex,
+    /// Unary gRPC client pool.
+    client_pool: Arc<ClientPool>,
+    /// GetPage stream pool.
+    stream_pool: Arc<StreamPool>,
+    /// GetPage stream pool for bulk requests.
+    bulk_stream_pool: Arc<StreamPool>,
+}
+
+impl Shard {
+    /// Creates a new shard. It has its own dedicated resource pools.
+    fn new(
+        url: String,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        // Shard pools for unary requests and non-bulk GetPage requests.
+        let client_pool = ClientPool::new(
+            ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?,
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token.clone(),
+            compression,
+            None, // unbounded
+        );
+        let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded
+
+        // Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.).
+        let bulk_stream_pool = StreamPool::new(
+            ClientPool::new(
+                ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?,
+                tenant_id,
+                timeline_id,
+                shard_id,
+                auth_token,
+                compression,
+                None, // unbounded,
+            ),
+            None, // unbounded
+        );
+
+        Ok(Self {
+            id: shard_id,
+            client_pool,
+            stream_pool,
+            bulk_stream_pool,
+        })
+    }
+
+    /// Returns a pooled client for this shard.
+    #[instrument(skip_all)]
+    async fn client(&self) -> tonic::Result<ClientGuard> {
+        warn_slow(
+            "client pool acquisition",
+            SLOW_THRESHOLD,
+            pin!(self.client_pool.get()),
+        )
+        .await
+    }
+
+    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool.
+    #[instrument(skip_all, fields(bulk))]
+    async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
+        let pool = match bulk {
+            false => &self.stream_pool,
+            true => &self.bulk_stream_pool,
+        };
+        warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await
+    }
+}
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -1,14 +1,6 @@
-//! A rich Pageserver gRPC client. This client is more capable than the basic `page_api::Client`
-//! gRPC client, and supports:
-//!
-//! * Sharded tenants across multiple Pageservers.
-//! * Pooling of connections, clients, and streams for efficient resource use.
-//! * Concurrent use by many callers.
-//! * Internal handling of GetPage bidirectional streams.
-//! * Automatic retries.
-//! * Observability.
-//!
-//! The client is under development, this package is just a shell.
-
-#[allow(unused)]
+mod client;
 mod pool;
+mod retry;
+mod split;
+
+pub use client::{PageserverClient, ShardSpec};
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -9,19 +9,36 @@
 //!
 //! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
 //!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
-//!   per-channel client limit. Channels may be closed when they are no longer used by any clients.
+//!   per-channel client limit. Channels are closed immediately when empty, and indirectly rely on
+//!   client/stream idle timeouts.
 //!
 //! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
 //!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
-//!   single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
-//!   from the pool after some time, to free up the channel.
+//!   single caller at a time, and is returned to the pool when dropped. Idle clients are removed
+//!   from the pool after a while to free up resources.
 //!
 //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
-//!   ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
-//!   returns a guard that can be used to send a single request, to properly enforce queue depth and
-//!   route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
-//!   possibly pipelining multiple requests from multiple callers on the same stream (up to some
-//!   queue depth). Idle streams may be removed from the pool after a while to free up the client.
+//!   ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a
+//!   time, and is returned to the pool when dropped. Idle streams are removed from the pool after
+//!   a while to free up resources.
+//!
+//!   The stream only supports sending a single, synchronous request at a time, and does not support
+//!   pipelining multiple requests from different callers onto the same stream -- instead, we scale
+//!   out concurrent streams to improve throughput. There are many reasons for this design choice:
+//!
+//!     * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by
+//!       a single server task, which may block e.g. on layer downloads, LSN waits, etc.
+//!
+//!     * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away
+//!       (e.g. because of a timeout), the request would still be processed by the server and block
+//!       requests behind it in the stream. It might even block its own timeout retry.
+//!
+//!     * Stream scheduling becomes significantly simpler and cheaper.
+//!
+//!     * Individual callers can still use client-side batching for pipelining.
+//!
+//!     * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
+//!       per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
 //!
 //! Each channel corresponds to one TCP connection. Each client unary request and each stream
 //! corresponds to one HTTP/2 stream and server task.
@@ -29,46 +46,56 @@
 //! TODO: error handling (including custom error types).
 //! TODO: observability.

-use std::collections::{BTreeMap, HashMap};
+use std::collections::BTreeMap;
+use std::num::NonZero;
 use std::ops::{Deref, DerefMut};
+use std::pin::Pin;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, Weak};
+use std::time::{Duration, Instant};

-use futures::StreamExt as _;
-use tokio::sync::mpsc::{Receiver, Sender};
-use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use futures::{Stream, StreamExt as _};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch};
+use tokio_stream::wrappers::WatchStream;
+use tokio_util::sync::CancellationToken;
+use tonic::codec::CompressionEncoding;
 use tonic::transport::{Channel, Endpoint};
-use tracing::{error, warn};

 use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;

-/// Max number of concurrent clients per channel.
+/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when
+/// empty, and indirectly rely on the client/stream idle timeouts.
 ///
-/// TODO: tune these constants, and make them configurable.
-/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
-/// with only streams.
-const CLIENTS_PER_CHANNEL: usize = 16;
+/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but
+/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to
+/// keep its client around in the pool for a while.
+const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(180),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};

-/// Maximum number of concurrent clients per `ClientPool`.
-const CLIENT_LIMIT: usize = 64;
-
-/// Max number of pipelined requests per gRPC GetPage stream.
-const STREAM_QUEUE_DEPTH: usize = 2;
+/// Reap idle resources with this interval.
+const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(10),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};

 /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
-/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of
-/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients.
+/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
+/// The pool does not limit the number of channels, and instead relies on `ClientPool` or
+/// `StreamPool` to limit the number of concurrent clients.
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
 ///
-/// TODO: reap idle channels.
 /// TODO: consider prewarming a set of channels, to avoid initial connection latency.
 /// TODO: consider adding a circuit breaker for errors and fail fast.
 pub struct ChannelPool {
    /// Pageserver endpoint to connect to.
    endpoint: Endpoint,
+    /// Max number of clients per channel. Beyond this, a new channel will be created.
+    max_clients_per_channel: NonZero<usize>,
    /// Open channels.
    channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
    /// Channel ID generator.
@@ -86,13 +113,14 @@ struct ChannelEntry {

 impl ChannelPool {
    /// Creates a new channel pool for the given Pageserver endpoint.
-    pub fn new<E>(endpoint: E) -> anyhow::Result<Arc<Self>>
+    pub fn new<E>(endpoint: E, max_clients_per_channel: NonZero<usize>) -> anyhow::Result<Arc<Self>>
    where
        E: TryInto<Endpoint> + Send + Sync + 'static,
        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
    {
        Ok(Arc::new(Self {
            endpoint: endpoint.try_into()?,
+            max_clients_per_channel,
            channels: Mutex::default(),
            next_channel_id: AtomicUsize::default(),
        }))
@@ -116,12 +144,16 @@ impl ChannelPool {
        let mut channels = self.channels.lock().unwrap();

        // Try to find an existing channel with available capacity. We check entries in BTreeMap
-        // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
-        // with lower-ordered channel IDs first. This will cluster clients in lower-ordered
+        // order, to fill up the lower-ordered channels first. The client/stream pools also prefer
+        // clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered
        // channels, and free up higher-ordered channels such that they can be reaped.
        for (&id, entry) in channels.iter_mut() {
-            assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow");
-            if entry.clients < CLIENTS_PER_CHANNEL {
+            assert!(
+                entry.clients <= self.max_clients_per_channel.get(),
+                "channel overflow"
+            );
+            assert_ne!(entry.clients, 0, "empty channel not reaped");
+            if entry.clients < self.max_clients_per_channel.get() {
                entry.clients += 1;
                return ChannelGuard {
                    pool: Arc::downgrade(self),
@@ -166,26 +198,30 @@ impl ChannelGuard {
    }
 }

-/// Returns the channel to the pool.
+/// Returns the channel to the pool. The channel is closed when empty.
 impl Drop for ChannelGuard {
    fn drop(&mut self) {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };
+
        let mut channels = pool.channels.lock().unwrap();
        let entry = channels.get_mut(&self.id).expect("unknown channel");
        assert!(entry.clients > 0, "channel underflow");
        entry.clients -= 1;
+
+        // Reap empty channels immediately.
+        if entry.clients == 0 {
+            channels.remove(&self.id);
+        }
    }
 }

 /// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
 /// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total
-/// number of concurrent clients to `CLIENT_LIMIT` via semaphore.
+/// number of concurrent clients to `max_clients` via semaphore.
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
-///
-/// TODO: reap idle clients.
 pub struct ClientPool {
    /// Tenant ID.
    tenant_id: TenantId,
@@ -195,17 +231,20 @@ pub struct ClientPool {
    shard_id: ShardIndex,
    /// Authentication token, if any.
    auth_token: Option<String>,
+    /// Compression to use.
+    compression: Option<CompressionEncoding>,
    /// Channel pool to acquire channels from.
    channel_pool: Arc<ChannelPool>,
-    /// Limits the max number of concurrent clients for this pool.
-    limiter: Arc<Semaphore>,
+    /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
+    limiter: Option<Arc<Semaphore>>,
    /// Idle pooled clients. Acquired clients are removed from here and returned on drop.
    ///
    /// The first client in the map will be acquired next. The map is sorted by client ID, which in
    /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
-    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
-    /// clients are reaped.
+    /// lower-ordered channels. This allows us to free up and reap higher-ordered channels.
    idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
+    /// Reaps idle clients.
+    idle_reaper: Reaper,
    /// Unique client ID generator.
    next_client_id: AtomicUsize,
 }
@@ -217,44 +256,53 @@ struct ClientEntry {
    client: page_api::Client,
    /// The channel guard for the channel used by the client.
    channel_guard: ChannelGuard,
+    /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by
+    /// definition, so this is the time when it was added back to the pool.
+    idle_since: Instant,
 }

 impl ClientPool {
    /// Creates a new client pool for the given tenant shard. Channels are acquired from the given
-    /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard.
+    /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to
+    /// `max_clients` concurrent clients, or unbounded if None.
    pub fn new(
        channel_pool: Arc<ChannelPool>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        shard_id: ShardIndex,
        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+        max_clients: Option<NonZero<usize>>,
    ) -> Arc<Self> {
-        Arc::new(Self {
+        let pool = Arc::new(Self {
            tenant_id,
            timeline_id,
            shard_id,
            auth_token,
+            compression,
            channel_pool,
            idle: Mutex::default(),
-            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
+            limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
            next_client_id: AtomicUsize::default(),
-        })
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
    }

    /// Gets a client from the pool, or creates a new one if necessary. Connections are established
-    /// lazily and do not block, but this call can block if the pool is at `CLIENT_LIMIT`. The
-    /// client is returned to the pool when the guard is dropped.
+    /// lazily and do not block, but this call can block if the pool is at `max_clients`. The client
+    /// is returned to the pool when the guard is dropped.
    ///
    /// This is moderately performance-sensitive. It is called for every unary request, but these
    /// establish a new gRPC stream per request so they're already expensive. GetPage requests use
    /// the `StreamPool` instead.
-    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
-        let permit = self
-            .limiter
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("never closed");
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<ClientGuard> {
+        // Acquire a permit if the pool is bounded.
+        let mut permit = None;
+        if let Some(limiter) = self.limiter.clone() {
+            permit = Some(limiter.acquire_owned().await.expect("never closed"));
+        }

        // Fast path: acquire an idle client from the pool.
        if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
@@ -267,7 +315,7 @@ impl ClientPool {
            });
        }

-        // Slow path: construct a new client.
+        // Construct a new client.
        let mut channel_guard = self.channel_pool.get();
        let client = page_api::Client::new(
            channel_guard.take(),
@@ -275,8 +323,9 @@ impl ClientPool {
            self.timeline_id,
            self.shard_id,
            self.auth_token.clone(),
-            None,
-        )?;
+            self.compression,
+        )
+        .map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?;

        Ok(ClientGuard {
            pool: Arc::downgrade(self),
@@ -291,14 +340,24 @@ impl ClientPool {
    }
 }

+impl Reapable for ClientPool {
+    /// Reaps clients that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff)
+    }
+}
+
 /// A client acquired from the pool. The inner client can be accessed via Deref. The client is
 /// returned to the pool when dropped.
 pub struct ClientGuard {
    pool: Weak<ClientPool>,
    id: ClientID,
-    client: Option<page_api::Client>,    // Some until dropped
-    channel_guard: Option<ChannelGuard>, // Some until dropped
-    permit: OwnedSemaphorePermit,
+    client: Option<page_api::Client>,     // Some until dropped
+    channel_guard: Option<ChannelGuard>,  // Some until dropped
+    permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }

 impl Deref for ClientGuard {
@@ -321,9 +380,11 @@ impl Drop for ClientGuard {
        let Some(pool) = self.pool.upgrade() else {
            return; // pool was dropped
        };
+
        let entry = ClientEntry {
            client: self.client.take().expect("dropped once"),
            channel_guard: self.channel_guard.take().expect("dropped once"),
+            idle_since: Instant::now(),
        };
        pool.idle.lock().unwrap().insert(self.id, entry);

@@ -334,253 +395,268 @@ impl Drop for ClientGuard {
 /// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
 /// acquires a client from the inner `ClientPool` for the stream's lifetime.
 ///
-/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
-/// a single request and await the response. Internally, requests are multiplexed across streams and
-/// channels. This allows proper queue depth enforcement and response routing.
+/// Individual streams only send a single request at a time, and do not pipeline multiple callers
+/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily
+/// to eliminate head-of-line blocking. See the module documentation for more details.
 ///
-/// TODO: reap idle streams.
 /// TODO: consider making this generic over request and response types; not currently needed.
 pub struct StreamPool {
-    /// The client pool to acquire clients from.
+    /// The client pool to acquire clients from. Must be unbounded.
    client_pool: Arc<ClientPool>,
-    /// All pooled streams.
+    /// Idle pooled streams. Acquired streams are removed from here and returned on drop.
    ///
-    /// Incoming requests will be sent over an existing stream with available capacity. If all
-    /// streams are full, a new one is spun up and added to the pool (up to the `ClientPool` limit).
-    /// Each stream has an associated Tokio task that processes requests and responses.
-    streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
-    /// Limits the max number of concurrent requests (not streams).
-    limiter: Arc<Semaphore>,
-    /// Stream ID generator.
-    next_stream_id: AtomicUsize,
+    /// The first stream in the map will be acquired next. The map is sorted by stream ID, which is
+    /// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer
+    /// acquiring idle streams from lower-ordered channels, which allows us to free up and reap
+    /// higher-ordered channels.
+    idle: Mutex<BTreeMap<StreamID, StreamEntry>>,
+    /// Limits the max number of concurrent streams. None if the pool is unbounded.
+    limiter: Option<Arc<Semaphore>>,
+    /// Reaps idle streams.
+    idle_reaper: Reaper,
 }

-type StreamID = usize;
-type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
-type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
-type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
+/// The stream ID. Reuses the inner client ID.
+type StreamID = ClientID;

+/// A pooled stream.
 struct StreamEntry {
-    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
-    sender: RequestSender,
-    /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
-    /// completion without acquiring the `StreamPool::streams` lock.
-    queue_depth: Arc<AtomicUsize>,
+    /// The bidirectional stream.
+    stream: BiStream,
+    /// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`.
+    idle_since: Instant,
+}
+
+/// A bidirectional GetPage stream and its client. Can send requests and receive responses.
+struct BiStream {
+    /// The owning client. Holds onto the channel slot while the stream is alive.
+    client: ClientGuard,
+    /// Stream for sending requests. Uses a watch channel, so it can only send a single request at a
+    /// time, and the caller must await the response before sending another request. This is
+    /// enforced by `StreamGuard::send`.
+    sender: watch::Sender<page_api::GetPageRequest>,
+    /// Stream for receiving responses.
+    receiver: Pin<Box<dyn Stream<Item = tonic::Result<page_api::GetPageResponse>> + Send>>,
 }

 impl StreamPool {
-    /// Creates a new stream pool, using the given client pool.
+    /// Creates a new stream pool, using the given client pool. It will use up to `max_streams`
+    /// concurrent streams.
    ///
-    /// NB: the stream pool should use a dedicated client pool. Otherwise, long-lived streams may
-    /// fill up the client pool and starve out unary requests. Client pools can share the same
-    /// `ChannelPool` though, since the channel pool is unbounded.
-    pub fn new(client_pool: Arc<ClientPool>) -> Arc<Self> {
-        Arc::new(Self {
+    /// The client pool must be unbounded. The stream pool will enforce its own limits, and because
+    /// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
+    /// The stream pool should generally have its own dedicated client pool (but it can share a
+    /// channel pool with others since these are always unbounded).
+    pub fn new(client_pool: Arc<ClientPool>, max_streams: Option<NonZero<usize>>) -> Arc<Self> {
+        assert!(client_pool.limiter.is_none(), "bounded client pool");
+        let pool = Arc::new(Self {
            client_pool,
-            streams: Arc::default(),
-            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH)),
-            next_stream_id: AtomicUsize::default(),
-        })
+            idle: Mutex::default(),
+            limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
    }

-    /// Acquires an available stream from the pool, or spins up a new stream async if all streams
-    /// are full. Returns a guard that can be used to send a single request on the stream and await
-    /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
-    /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
+    /// Acquires an available stream from the pool, or spins up a new stream if all streams are
+    /// full. Returns a guard that can be used to send requests and await the responses. Blocks if
+    /// the pool is full.
    ///
    /// This is very performance-sensitive, as it is on the GetPage hot path.
    ///
-    /// TODO: this must do something more sophisticated for performance. We want:
-    ///
-    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
-    /// * Quick acquisition of pooled streams with available capacity.
-    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
-    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
-    /// * Don't hold a lock while spinning up new streams.
-    /// * Allow concurrent clients to join onto streams while they're spun up.
-    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
-    ///
-    /// For now, we just do something simple and functional, but very inefficient (linear scan).
-    pub async fn get(&self) -> StreamGuard {
-        let permit = self
-            .limiter
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("never closed");
-        let mut streams = self.streams.lock().unwrap();
-
-        // Look for a pooled stream with available capacity.
-        for entry in streams.values() {
-            assert!(
-                entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH,
-                "stream queue overflow"
-            );
-            if entry
-                .queue_depth
-                .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
-                    // Increment the queue depth via compare-and-swap.
-                    // TODO: review ordering.
-                    (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1)
-                })
-                .is_ok()
-            {
-                return StreamGuard {
-                    sender: entry.sender.clone(),
-                    queue_depth: entry.queue_depth.clone(),
-                    permit,
-                };
-            }
+    /// TODO: is a `Mutex<BTreeMap>` performant enough? Will it become too contended? We can't
+    /// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
+    /// to free up higher-ordered channels.
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<StreamGuard> {
+        // Acquire a permit if the pool is bounded.
+        let mut permit = None;
+        if let Some(limiter) = self.limiter.clone() {
+            permit = Some(limiter.acquire_owned().await.expect("never closed"));
        }

-        // No available stream, spin up a new one. We install the stream entry in the pool first and
-        // return the guard, while spinning up the stream task async. This allows other callers to
-        // join onto this stream and also create additional streams concurrently if this fills up.
-        let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
-        let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
-        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
-        let entry = StreamEntry {
-            sender: req_tx.clone(),
-            queue_depth: queue_depth.clone(),
-        };
-        streams.insert(id, entry);
+        // Fast path: acquire an idle stream from the pool.
+        if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
+            return Ok(StreamGuard {
+                pool: Arc::downgrade(self),
+                stream: Some(entry.stream),
+                can_reuse: true,
+                permit,
+            });
+        }

-        // NB: make sure we don't overshoot the client limit. The semaphore limit is CLIENT_LIMIT *
-        // STREAM_QUEUE_DEPTH, but if we were to misaccount queue depth we'd try to spin up more
-        // streams than CLIENT_LIMIT and block on the client pool ~forever. This should not happen
-        // because we only acquire queue depth under lock and after acquiring a semaphore permit.
-        assert!(streams.len() <= CLIENT_LIMIT, "stream overflow");
+        // Spin up a new stream. Uses a watch channel to send a single request at a time, since
+        // `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
+        let mut client = self.client_pool.get().await?;

-        let client_pool = self.client_pool.clone();
-        let streams = self.streams.clone();
+        let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
+        let req_stream = WatchStream::from_changes(req_rx);
+        let resp_stream = client.get_pages(req_stream).await?;

-        tokio::spawn(async move {
-            if let Err(err) = Self::run_stream(client_pool, req_rx).await {
-                error!("stream failed: {err}");
-            }
-            // Remove stream from pool on exit.
-            let entry = streams.lock().unwrap().remove(&id);
-            assert!(entry.is_some(), "unknown stream ID: {id}");
-        });
-
-        StreamGuard {
-            sender: req_tx,
-            queue_depth,
+        Ok(StreamGuard {
+            pool: Arc::downgrade(self),
+            stream: Some(BiStream {
+                client,
+                sender: req_tx,
+                receiver: Box::pin(resp_stream),
+            }),
+            can_reuse: true,
            permit,
-        }
-    }
-
-    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
-    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
-    /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
-    /// atomic with pool stream acquisition.
-    ///
-    /// The task exits when the request channel is closed, or on a stream error. The caller is
-    /// responsible for removing the stream from the pool on exit.
-    async fn run_stream(
-        client_pool: Arc<ClientPool>,
-        mut caller_rx: RequestReceiver,
-    ) -> anyhow::Result<()> {
-        // Acquire a client from the pool and create a stream.
-        let mut client = client_pool.get().await?;
-
-        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
-        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
-        let mut resp_stream = client.get_pages(req_stream).await?;
-
-        // Track caller response channels by request ID. If the task returns early, these response
-        // channels will be dropped and the waiting callers will receive an error.
-        let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH);
-
-        // Process requests and responses.
-        loop {
-            // NB: this can trip if the server doesn't respond to a request, so only debug_assert.
-            debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream queue overflow");
-
-            tokio::select! {
-                // Receive requests from callers and send them to the stream.
-                req = caller_rx.recv() => {
-                    // Shut down if request channel is closed.
-                    let Some((req, resp_tx)) = req else {
-                        return Ok(());
-                    };
-
-                    // Store the response channel by request ID.
-                    if callers.contains_key(&req.request_id) {
-                        // Error on request ID duplicates. Ignore callers that went away.
-                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
-                            format!("duplicate request ID: {}", req.request_id),
-                        )));
-                        continue;
-                    }
-                    callers.insert(req.request_id, resp_tx);
-
-                    // Send the request on the stream. Bail out if the send fails.
-                    req_tx.send(req).await.map_err(|_| {
-                        tonic::Status::unavailable("stream closed")
-                    })?;
-                }
-
-                // Receive responses from the stream and send them to callers.
-                resp = resp_stream.next() => {
-                    // Shut down if the stream is closed, and bail out on stream errors.
-                    let Some(resp) = resp.transpose()? else {
-                        return Ok(())
-                    };
-
-                    // Send the response to the caller. Ignore errors if the caller went away.
-                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
-                        warn!("received response for unknown request ID: {}", resp.request_id);
-                        continue;
-                    };
-                    _ = resp_tx.send(Ok(resp));
-                }
-            }
-        }
+        })
    }
 }

-/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
-/// depth. Queue depth is already reserved and will be returned on drop.
+impl Reapable for StreamPool {
+    /// Reaps streams that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff);
+    }
+}
+
+/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
+/// in-flight requests on the stream, or the stream failed.
 pub struct StreamGuard {
-    sender: RequestSender,
-    queue_depth: Arc<AtomicUsize>,
-    permit: OwnedSemaphorePermit,
+    pool: Weak<StreamPool>,
+    stream: Option<BiStream>,             // Some until dropped
+    can_reuse: bool,                      // returned to pool if true
+    permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }

 impl StreamGuard {
-    /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
-    /// valid for a single request (to enforce queue depth). This also drops the guard on return and
-    /// returns the queue depth quota to the pool.
+    /// Sends a request on the stream and awaits the response. If the future is dropped before it
+    /// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
+    /// request and is not returned to the pool. The same is true if the stream errors, in which
+    /// case the caller can't send further requests on the stream.
    ///
-    /// The `GetPageRequest::request_id` must be unique across in-flight requests.
+    /// We only support sending a single request at a time, to eliminate head-of-line blocking. See
+    /// module documentation for details.
    ///
    /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
    /// to avoid tearing down the stream for per-request errors. Callers must check this.
    pub async fn send(
-        self,
+        &mut self,
        req: page_api::GetPageRequest,
    ) -> tonic::Result<page_api::GetPageResponse> {
-        let (resp_tx, resp_rx) = oneshot::channel();
+        let req_id = req.request_id;
+        let stream = self.stream.as_mut().expect("not dropped");

-        self.sender
-            .send((req, resp_tx))
-            .await
+        // Mark the stream as not reusable while the request is in flight. We can't return the
+        // stream to the pool until we receive the response, to avoid head-of-line blocking and
+        // stale responses. Failed streams can't be reused either.
+        if !self.can_reuse {
+            return Err(tonic::Status::internal("stream can't be reused"));
+        }
+        self.can_reuse = false;
+
+        // Send the request and receive the response.
+        //
+        // NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
+        stream
+            .sender
+            .send(req)
            .map_err(|_| tonic::Status::unavailable("stream closed"))?;

-        resp_rx
+        let resp = stream
+            .receiver
+            .next()
            .await
-            .map_err(|_| tonic::Status::unavailable("stream closed"))?
+            .ok_or_else(|| tonic::Status::unavailable("stream closed"))??;
+
+        if resp.request_id != req_id {
+            return Err(tonic::Status::internal(format!(
+                "response ID {} does not match request ID {}",
+                resp.request_id, req_id
+            )));
+        }
+
+        // Success, mark the stream as reusable.
+        self.can_reuse = true;
+
+        Ok(resp)
    }
 }

 impl Drop for StreamGuard {
    fn drop(&mut self) {
-        // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
-        // before the response is received, but that's okay.
-        let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst);
-        assert!(prev_queue_depth > 0, "stream queue underflow");
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+
+        // If the stream isn't reusable, it can't be returned to the pool.
+        if !self.can_reuse {
+            return;
+        }
+
+        // Place the idle stream back into the pool.
+        let entry = StreamEntry {
+            stream: self.stream.take().expect("dropped once"),
+            idle_since: Instant::now(),
+        };
+        pool.idle
+            .lock()
+            .unwrap()
+            .insert(entry.stream.client.id, entry);

        _ = self.permit; // returned on drop, referenced for visibility
    }
 }
+
+/// Periodically reaps idle resources from a pool.
+struct Reaper {
+    /// The task check interval.
+    interval: Duration,
+    /// The threshold for reaping idle resources.
+    threshold: Duration,
+    /// Cancels the reaper task. Cancelled when the reaper is dropped.
+    cancel: CancellationToken,
+}
+
+impl Reaper {
+    /// Creates a new reaper.
+    pub fn new(threshold: Duration, interval: Duration) -> Self {
+        Self {
+            cancel: CancellationToken::new(),
+            threshold,
+            interval,
+        }
+    }
+
+    /// Spawns a task to periodically reap idle resources from the given task pool. The task is
+    /// cancelled when the reaper is dropped.
+    pub fn spawn(&self, pool: &Arc<impl Reapable>) {
+        // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
+        let pool = Arc::downgrade(pool);
+        let cancel = self.cancel.clone();
+        let (interval, threshold) = (self.interval, self.threshold);
+
+        tokio::spawn(async move {
+            loop {
+                tokio::select! {
+                    _ = tokio::time::sleep(interval) => {
+                        let Some(pool) = pool.upgrade() else {
+                            return; // pool was dropped
+                        };
+                        pool.reap_idle(Instant::now() - threshold);
+                    }
+
+                    _ = cancel.cancelled() => return,
+                }
+            }
+        });
+    }
+}
+
+impl Drop for Reaper {
+    fn drop(&mut self) {
+        self.cancel.cancel(); // cancel reaper task
+    }
+}
+
+/// A reapable resource pool.
+trait Reapable: Send + Sync + 'static {
+    /// Reaps resources that have been idle since before the given cutoff.
+    fn reap_idle(&self, cutoff: Instant);
+}
--- a/pageserver/client_grpc/src/retry.rs
+++ b/pageserver/client_grpc/src/retry.rs
@@ -0,0 +1,150 @@
+use std::time::Duration;
+
+use futures::future::pending;
+use tokio::time::Instant;
+use tracing::{error, info, warn};
+
+use utils::backoff::exponential_backoff_duration;
+
+/// A retry handler for Pageserver gRPC requests.
+///
+/// This is used instead of backoff::retry for better control and observability.
+pub struct Retry {
+    /// Timeout across all retry attempts. If None, retries forever.
+    pub timeout: Option<Duration>,
+    /// The initial backoff duration. The first retry does not use a backoff.
+    pub base_backoff: Duration,
+    /// The maximum backoff duration.
+    pub max_backoff: Duration,
+}
+
+impl Retry {
+    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
+    /// using the current tracing span for context.
+    ///
+    /// Only certain gRPC status codes are retried, see [`Self::should_retry`].
+    pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
+    where
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
+        O: Future<Output = tonic::Result<T>>,
+    {
+        let started = Instant::now();
+        let deadline = self.timeout.map(|timeout| started + timeout);
+        let mut last_error = None;
+        let mut retries = 0;
+        loop {
+            // Set up a future to wait for the backoff, if any, and run the closure.
+            let backoff_and_try = async {
+                // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
+                // https://github.com/tokio-rs/tokio/issues/6866
+                if let Some(backoff) = self.backoff_duration(retries) {
+                    tokio::time::sleep(backoff).await;
+                }
+
+                f(retries).await
+            };
+
+            // Set up a future for the timeout, if any.
+            let timeout = async {
+                match deadline {
+                    Some(deadline) => tokio::time::sleep_until(deadline).await,
+                    None => pending().await,
+                }
+            };
+
+            // Wait for the backoff and request, or bail out if the timeout is exceeded.
+            let result = tokio::select! {
+                result = backoff_and_try => result,
+
+                _ = timeout => {
+                    let last_error = last_error.unwrap_or_else(|| {
+                        tonic::Status::deadline_exceeded(format!(
+                            "request timed out after {:.3}s",
+                            started.elapsed().as_secs_f64()
+                        ))
+                    });
+                    error!(
+                        "giving up after {:.3}s and {retries} retries, last error {:?}: {}",
+                        started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
+                    );
+                    return Err(last_error);
+                }
+            };
+
+            match result {
+                // Success, return the result.
+                Ok(result) => {
+                    if retries > 0 {
+                        info!(
+                            "request succeeded after {retries} retries in {:.3}s",
+                            started.elapsed().as_secs_f64(),
+                        );
+                    }
+
+                    return Ok(result);
+                }
+
+                // Error, retry or bail out.
+                Err(status) => {
+                    let (code, message) = (status.code(), status.message());
+                    let attempt = retries + 1;
+
+                    if !Self::should_retry(code) {
+                        // NB: include the attempt here too. This isn't necessarily the first
+                        // attempt, because the error may change between attempts.
+                        error!(
+                            "request failed with {code:?}: {message}, not retrying (attempt {attempt})"
+                        );
+                        return Err(status);
+                    }
+
+                    warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
+
+                    retries += 1;
+                    last_error = Some(status);
+                }
+            }
+        }
+    }
+
+    /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
+    /// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
+    fn backoff_duration(&self, retries: usize) -> Option<Duration> {
+        let backoff = exponential_backoff_duration(
+            (retries as u32).saturating_sub(1), // first retry does not back off
+            self.base_backoff.as_secs_f64(),
+            self.max_backoff.as_secs_f64(),
+        );
+        (!backoff.is_zero()).then_some(backoff)
+    }
+
+    /// Returns true if the given status code should be retries.
+    fn should_retry(code: tonic::Code) -> bool {
+        match code {
+            tonic::Code::Ok => panic!("unexpected Ok status code"),
+
+            // These codes are transient, so retry them.
+            tonic::Code::Aborted => true,
+            tonic::Code::Cancelled => true,
+            tonic::Code::DeadlineExceeded => true, // maybe transient slowness
+            tonic::Code::ResourceExhausted => true,
+            tonic::Code::Unavailable => true,
+
+            // The following codes will like continue to fail, so don't retry.
+            tonic::Code::AlreadyExists => false,
+            tonic::Code::DataLoss => false,
+            tonic::Code::FailedPrecondition => false,
+            // NB: don't retry Internal. It is intended for serious errors such as invariant
+            // violations, and is also used for client-side invariant checks that would otherwise
+            // result in retry loops.
+            tonic::Code::Internal => false,
+            tonic::Code::InvalidArgument => false,
+            tonic::Code::NotFound => false,
+            tonic::Code::OutOfRange => false,
+            tonic::Code::PermissionDenied => false,
+            tonic::Code::Unauthenticated => false,
+            tonic::Code::Unimplemented => false,
+            tonic::Code::Unknown => false,
+        }
+    }
+}
--- a/pageserver/client_grpc/src/split.rs
+++ b/pageserver/client_grpc/src/split.rs
@@ -0,0 +1,209 @@
+use std::collections::HashMap;
+
+use bytes::Bytes;
+
+use pageserver_api::key::rel_block_to_key;
+use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
+use pageserver_page_api as page_api;
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};
+
+/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
+/// TODO: add tests for this.
+pub struct GetPageSplitter {
+    /// Split requests by shard index.
+    requests: HashMap<ShardIndex, page_api::GetPageRequest>,
+    /// The response being assembled. Preallocated with empty pages, to be filled in.
+    response: page_api::GetPageResponse,
+    /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
+    /// to assemble the response pages in the same order as the original request.
+    block_shards: Vec<ShardIndex>,
+}
+
+impl GetPageSplitter {
+    /// Checks if the given request only touches a single shard, and returns the shard ID. This is
+    /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
+    pub fn for_single_shard(
+        req: &page_api::GetPageRequest,
+        count: ShardCount,
+        stripe_size: ShardStripeSize,
+    ) -> Option<ShardIndex> {
+        // Fast path: unsharded tenant.
+        if count.is_unsharded() {
+            return Some(ShardIndex::unsharded());
+        }
+
+        // Find the first page's shard, for comparison. If there are no pages, just return the first
+        // shard (caller likely checked already, otherwise the server will reject it).
+        let Some(&first_page) = req.block_numbers.first() else {
+            return Some(ShardIndex::new(ShardNumber(0), count));
+        };
+        let key = rel_block_to_key(req.rel, first_page);
+        let shard_number = key_to_shard_number(count, stripe_size, &key);
+
+        req.block_numbers
+            .iter()
+            .skip(1) // computed above
+            .all(|&blkno| {
+                let key = rel_block_to_key(req.rel, blkno);
+                key_to_shard_number(count, stripe_size, &key) == shard_number
+            })
+            .then_some(ShardIndex::new(shard_number, count))
+    }
+
+    /// Splits the given request.
+    pub fn split(
+        req: page_api::GetPageRequest,
+        count: ShardCount,
+        stripe_size: ShardStripeSize,
+    ) -> Self {
+        // The caller should make sure we don't split requests unnecessarily.
+        debug_assert!(
+            Self::for_single_shard(&req, count, stripe_size).is_none(),
+            "unnecessary request split"
+        );
+
+        // Split the requests by shard index.
+        let mut requests = HashMap::with_capacity(2); // common case
+        let mut block_shards = Vec::with_capacity(req.block_numbers.len());
+        for &blkno in &req.block_numbers {
+            let key = rel_block_to_key(req.rel, blkno);
+            let shard_number = key_to_shard_number(count, stripe_size, &key);
+            let shard_id = ShardIndex::new(shard_number, count);
+
+            requests
+                .entry(shard_id)
+                .or_insert_with(|| page_api::GetPageRequest {
+                    request_id: req.request_id,
+                    request_class: req.request_class,
+                    rel: req.rel,
+                    read_lsn: req.read_lsn,
+                    block_numbers: Vec::new(),
+                })
+                .block_numbers
+                .push(blkno);
+            block_shards.push(shard_id);
+        }
+
+        // Construct a response to be populated by shard responses. Preallocate empty page slots
+        // with the expected block numbers.
+        let response = page_api::GetPageResponse {
+            request_id: req.request_id,
+            status_code: page_api::GetPageStatusCode::Ok,
+            reason: None,
+            rel: req.rel,
+            pages: req
+                .block_numbers
+                .into_iter()
+                .map(|block_number| {
+                    page_api::Page {
+                        block_number,
+                        image: Bytes::new(), // empty page slot to be filled in
+                    }
+                })
+                .collect(),
+        };
+
+        Self {
+            requests,
+            response,
+            block_shards,
+        }
+    }
+
+    /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
+    pub fn drain_requests(
+        &mut self,
+    ) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
+        self.requests.drain()
+    }
+
+    /// Adds a response from the given shard. The response must match the request ID and have an OK
+    /// status code. A response must not already exist for the given shard ID.
+    #[allow(clippy::result_large_err)]
+    pub fn add_response(
+        &mut self,
+        shard_id: ShardIndex,
+        response: page_api::GetPageResponse,
+    ) -> tonic::Result<()> {
+        // The caller should already have converted status codes into tonic::Status.
+        if response.status_code != page_api::GetPageStatusCode::Ok {
+            return Err(tonic::Status::internal(format!(
+                "unexpected non-OK response for shard {shard_id}: {} {}",
+                response.status_code,
+                response.reason.unwrap_or_default()
+            )));
+        }
+
+        if response.request_id != self.response.request_id {
+            return Err(tonic::Status::internal(format!(
+                "response ID mismatch for shard {shard_id}: expected {}, got {}",
+                self.response.request_id, response.request_id
+            )));
+        }
+
+        // Place the shard response pages into the assembled response, in request order.
+        let mut pages = response.pages.into_iter();
+
+        for (i, &s) in self.block_shards.iter().enumerate() {
+            if shard_id != s {
+                continue;
+            }
+
+            let Some(slot) = self.response.pages.get_mut(i) else {
+                return Err(tonic::Status::internal(format!(
+                    "no block_shards slot {i} for shard {shard_id}"
+                )));
+            };
+            let Some(page) = pages.next() else {
+                return Err(tonic::Status::internal(format!(
+                    "missing page {} in shard {shard_id} response",
+                    slot.block_number
+                )));
+            };
+            if page.block_number != slot.block_number {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned wrong page at index {i}, expected {} got {}",
+                    slot.block_number, page.block_number
+                )));
+            }
+            if !slot.image.is_empty() {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned duplicate page {} at index {i}",
+                    slot.block_number
+                )));
+            }
+
+            *slot = page;
+        }
+
+        // Make sure we've consumed all pages from the shard response.
+        if let Some(extra_page) = pages.next() {
+            return Err(tonic::Status::internal(format!(
+                "shard {shard_id} returned extra page: {}",
+                extra_page.block_number
+            )));
+        }
+
+        Ok(())
+    }
+
+    /// Fetches the final, assembled response.
+    #[allow(clippy::result_large_err)]
+    pub fn get_response(self) -> tonic::Result<page_api::GetPageResponse> {
+        // Check that the response is complete.
+        for (i, page) in self.response.pages.iter().enumerate() {
+            if page.image.is_empty() {
+                return Err(tonic::Status::internal(format!(
+                    "missing page {} for shard {}",
+                    page.block_number,
+                    self.block_shards
+                        .get(i)
+                        .map(|s| s.to_string())
+                        .unwrap_or_else(|| "?".to_string())
+                )));
+            }
+        }
+
+        Ok(self.response)
+    }
+}
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
 pageserver_api.workspace = true
 remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
+serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
--- a/pageserver/ctl/src/download_remote_object.rs
+++ b/pageserver/ctl/src/download_remote_object.rs
@@ -0,0 +1,85 @@
+use camino::Utf8PathBuf;
+use clap::Parser;
+use tokio_util::sync::CancellationToken;
+
+/// Download a specific object from remote storage to a local file.
+///
+/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment
+/// variable, in the same TOML format that the pageserver itself understands. This allows the
+/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3,
+/// Azure Blob Storage and local files), as long as the credentials are available via the
+/// standard environment variables expected by the underlying SDKs.
+///
+/// Examples for setting the environment variable:
+///
+/// ```bash
+/// # AWS S3 (region can also be provided via AWS_REGION)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }'
+///
+/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }'
+/// ```
+#[derive(Parser)]
+pub(crate) struct DownloadRemoteObjectCmd {
+    /// Key / path of the object to download (relative to the remote storage prefix).
+    ///
+    /// Examples:
+    ///   "wal/3aa8f.../00000001000000000000000A"
+    ///   "pageserver/v1/tenants/<tenant_id>/timelines/<timeline_id>/layer_12345"
+    pub remote_path: String,
+
+    /// Path of the local file to create. Existing file will be overwritten.
+    ///
+    /// Examples:
+    ///   "./segment"
+    ///   "/tmp/layer_12345.parquet"
+    pub output_file: Utf8PathBuf,
+}
+
+pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> {
+    use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig};
+
+    // Fetch remote storage configuration from the environment
+    let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| {
+        anyhow::anyhow!(
+            "'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config"
+        )
+    })?;
+
+    let config = RemoteStorageConfig::from_toml_str(&config_str)?;
+
+    // Initialise remote storage client
+    let storage = GenericRemoteStorage::from_config(&config).await?;
+
+    // RemotePath must be relative – leading slashes confuse the parser.
+    let remote_path_str = cmd.remote_path.trim_start_matches('/');
+    let remote_path = RemotePath::from_string(remote_path_str)?;
+
+    let cancel = CancellationToken::new();
+
+    println!(
+        "Downloading '{remote_path}' from remote storage bucket {:?} ...",
+        config.storage.bucket_name()
+    );
+
+    // Start the actual download
+    let download = storage
+        .download(&remote_path, &DownloadOpts::default(), &cancel)
+        .await?;
+
+    // Stream to file
+    let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+    let tmp_path = cmd.output_file.with_extension("tmp");
+    let mut file = tokio::fs::File::create(&tmp_path).await?;
+    tokio::io::copy(&mut reader, &mut file).await?;
+    file.sync_all().await?;
+    // Atomically move into place
+    tokio::fs::rename(&tmp_path, &cmd.output_file).await?;
+
+    println!(
+        "Downloaded to '{}'. Last modified: {:?}, etag: {}",
+        cmd.output_file, download.last_modified, download.etag
+    );
+
+    Ok(())
+}
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,14 +1,16 @@
 use std::str::FromStr;

-use anyhow::Context;
+use anyhow::{Context, Ok};
 use camino::Utf8PathBuf;
 use pageserver::tenant::{
    IndexPart,
    layer_map::{LayerMap, SearchResult},
-    remote_timeline_client::remote_layer_path,
-    storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
+    remote_timeline_client::{index::LayerFileMetadata, remote_layer_path},
+    storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak},
 };
 use pageserver_api::key::Key;
+use serde::Serialize;
+use std::collections::BTreeMap;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -33,6 +35,31 @@ pub(crate) enum IndexPartCmd {
        #[arg(long)]
        lsn: String,
    },
+    /// List all visible delta and image layers at the latest LSN.
+    ListVisibleLayers {
+        #[arg(long)]
+        path: Utf8PathBuf,
+    },
+}
+
+fn create_layer_map_from_index_part(
+    index_part: &IndexPart,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+) -> LayerMap {
+    let mut layer_map = LayerMap::default();
+    {
+        let mut updates = layer_map.batch_update();
+        for (key, value) in index_part.layer_metadata.iter() {
+            updates.insert_historic(PersistentLayerDesc::from_filename(
+                tenant_shard_id,
+                timeline_id,
+                key.clone(),
+                value.file_size,
+            ));
+        }
+    }
+    layer_map
 }

 async fn search_layers(
@@ -49,18 +76,7 @@ async fn search_layers(
        let bytes = tokio::fs::read(path).await?;
        IndexPart::from_json_bytes(&bytes).unwrap()
    };
-    let mut layer_map = LayerMap::default();
-    {
-        let mut updates = layer_map.batch_update();
-        for (key, value) in index_json.layer_metadata.iter() {
-            updates.insert_historic(PersistentLayerDesc::from_filename(
-                tenant_shard_id,
-                timeline_id,
-                key.clone(),
-                value.file_size,
-            ));
-        }
-    }
+    let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id);
    let key = Key::from_hex(key)?;

    let lsn = Lsn::from_str(lsn).unwrap();
@@ -98,6 +114,69 @@ async fn search_layers(
    Ok(())
 }

+#[derive(Debug, Clone, Serialize)]
+struct VisibleLayers {
+    pub total_images: u64,
+    pub total_image_bytes: u64,
+    pub total_deltas: u64,
+    pub total_delta_bytes: u64,
+    pub layer_metadata: BTreeMap<LayerName, LayerFileMetadata>,
+}
+
+impl VisibleLayers {
+    pub fn new() -> Self {
+        Self {
+            layer_metadata: BTreeMap::new(),
+            total_images: 0,
+            total_image_bytes: 0,
+            total_deltas: 0,
+            total_delta_bytes: 0,
+        }
+    }
+
+    pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) {
+        match name {
+            LayerName::Image(_) => {
+                self.total_images += 1;
+                self.total_image_bytes += layer.file_size;
+            }
+            LayerName::Delta(_) => {
+                self.total_deltas += 1;
+                self.total_delta_bytes += layer.file_size;
+            }
+        }
+        self.layer_metadata.insert(name, layer);
+    }
+}
+
+async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> {
+    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let timeline_id = TimelineId::generate();
+
+    let bytes = tokio::fs::read(path).await.context("read file")?;
+    let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
+    let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id);
+    let mut visible_layers = VisibleLayers::new();
+    let (layers, _key_space) = layer_map.get_visibility(Vec::new());
+    for (layer, visibility) in layers {
+        if visibility == LayerVisibilityHint::Visible {
+            visible_layers.add_layer(
+                layer.layer_name(),
+                index_part
+                    .layer_metadata
+                    .get(&layer.layer_name())
+                    .unwrap()
+                    .clone(),
+            );
+        }
+    }
+    let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?;
+    println!("{output}");
+
+    Ok(())
+}
+
 pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
    match cmd {
        IndexPartCmd::Dump { path } => {
@@ -114,5 +193,6 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            key,
            lsn,
        } => search_layers(tenant_id, timeline_id, path, key, lsn).await,
+        IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await,
    }
 }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -4,6 +4,7 @@
 //!
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.

+mod download_remote_object;
 mod draw_timeline_dir;
 mod index_part;
 mod key;
@@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime};

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use download_remote_object::DownloadRemoteObjectCmd;
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use page_trace::PageTraceCmd;
@@ -63,6 +65,7 @@ enum Commands {
    /// Debug print a hex key found from logs
    Key(key::DescribeKeyCommand),
    PageTrace(PageTraceCmd),
+    DownloadRemoteObject(DownloadRemoteObjectCmd),
 }

 /// Read and update pageserver metadata file
@@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> {
        }
        Commands::Key(dkc) => dkc.execute(),
        Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
+        Commands::DownloadRemoteObject(cmd) => {
+            download_remote_object::main(&cmd).await?;
+        }
    };
    Ok(())
 }
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -153,7 +153,7 @@ message GetDbSizeResponse {
 message GetPageRequest {
  // A request ID. Will be included in the response. Should be unique for
  // in-flight requests on the stream.
-  uint64 request_id = 1;
+  RequestID request_id = 1;
  // The request class.
  GetPageClass request_class = 2;
  // The LSN to read at.
@@ -177,6 +177,14 @@ message GetPageRequest {
  repeated uint32 block_number = 5;
 }

+// A Request ID. Should be unique for in-flight requests on a stream. Included in the response.
+message RequestID {
+  // The base request ID.
+  uint64 id = 1;
+  // The request attempt. Starts at 0, incremented on each retry.
+  uint32 attempt = 2;
+}
+
 // A GetPageRequest class. Primarily intended for observability, but may also be
 // used for prioritization in the future.
 enum GetPageClass {
@@ -199,13 +207,26 @@ enum GetPageClass {
 // the entire batch is ready, so no one can make use of the individual pages.
 message GetPageResponse {
  // The original request's ID.
-  uint64 request_id = 1;
-  // The response status code.
+  RequestID request_id = 1;
+  // The response status code. If not OK, the rel and page fields will be empty.
  GetPageStatusCode status_code = 2;
  // A string describing the status, if any.
  string reason = 3;
-  // The 8KB page images, in the same order as the request. Empty if status_code != OK.
-  repeated bytes page_image = 4;
+  // The relation that the pages belong to.
+  RelTag rel = 4;
+  // The page(s), in the same order as the request.
+  repeated Page page = 5;
+}
+
+// A page.
+//
+// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block
+// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway.
+message Page {
+  // The page number.
+  uint32 block_number = 1;
+  // The materialized page image, as an 8KB byte vector.
+  bytes image = 2;
 }

 // A GetPageResponse status code.
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -1,4 +1,5 @@
 use anyhow::Context as _;
+use futures::future::ready;
 use futures::{Stream, StreamExt as _, TryStreamExt as _};
 use tokio::io::AsyncRead;
 use tokio_util::io::StreamReader;
@@ -110,7 +111,7 @@ impl Client {
    ) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
        let reqs = reqs.map(proto::GetPageRequest::from);
        let resps = self.inner.get_pages(reqs).await?.into_inner();
-        Ok(resps.map_ok(GetPageResponse::from))
+        Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into()))))
    }

    /// Returns the size of a relation, as # of blocks.
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -49,7 +49,7 @@ impl From<ProtocolError> for tonic::Status {
 }

 /// The LSN a request should read at.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Default)]
 pub struct ReadLsn {
    /// The request's read LSN.
    pub request_lsn: Lsn,
@@ -329,7 +329,7 @@ impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
 }

 /// Requests one or more pages.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct GetPageRequest {
    /// A request ID. Will be included in the response. Should be unique for in-flight requests on
    /// the stream.
@@ -356,7 +356,10 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
            return Err(ProtocolError::Missing("block_number"));
        }
        Ok(Self {
-            request_id: pb.request_id,
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
            request_class: pb.request_class.into(),
            read_lsn: pb
                .read_lsn
@@ -371,7 +374,7 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
 impl From<GetPageRequest> for proto::GetPageRequest {
    fn from(request: GetPageRequest) -> Self {
        Self {
-            request_id: request.request_id,
+            request_id: Some(request.request_id.into()),
            request_class: request.request_class.into(),
            read_lsn: Some(request.read_lsn.into()),
            rel: Some(request.rel.into()),
@@ -380,16 +383,60 @@ impl From<GetPageRequest> for proto::GetPageRequest {
    }
 }

-/// A GetPage request ID.
-pub type RequestID = u64;
+/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct RequestID {
+    /// The base request ID.
+    pub id: u64,
+    // The request attempt. Starts at 0, incremented on each retry.
+    pub attempt: u32,
+}
+
+impl RequestID {
+    /// Creates a new RequestID with the given ID and an initial attempt of 0.
+    pub fn new(id: u64) -> Self {
+        Self { id, attempt: 0 }
+    }
+}
+
+impl Display for RequestID {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}.{}", self.id, self.attempt)
+    }
+}
+
+impl From<proto::RequestId> for RequestID {
+    fn from(pb: proto::RequestId) -> Self {
+        Self {
+            id: pb.id,
+            attempt: pb.attempt,
+        }
+    }
+}
+
+impl From<u64> for RequestID {
+    fn from(id: u64) -> Self {
+        Self::new(id)
+    }
+}
+
+impl From<RequestID> for proto::RequestId {
+    fn from(request_id: RequestID) -> Self {
+        Self {
+            id: request_id.id,
+            attempt: request_id.attempt,
+        }
+    }
+}

 /// A GetPage request class.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
 pub enum GetPageClass {
    /// Unknown class. For backwards compatibility: used when an older client version sends a class
    /// that a newer server version has removed.
    Unknown,
    /// A normal request. This is the default.
+    #[default]
    Normal,
    /// A prefetch request. NB: can only be classified on pg < 18.
    Prefetch,
@@ -443,32 +490,41 @@ impl From<GetPageClass> for i32 {
 pub struct GetPageResponse {
    /// The original request's ID.
    pub request_id: RequestID,
-    /// The response status code.
+    /// The response status code. If not OK, the `rel` and `pages` fields will be empty.
    pub status_code: GetPageStatusCode,
    /// A string describing the status, if any.
    pub reason: Option<String>,
-    /// The 8KB page images, in the same order as the request. Empty if status != OK.
-    pub page_images: Vec<Bytes>,
+    /// The relation that the pages belong to.
+    pub rel: RelTag,
+    // The page(s), in the same order as the request.
+    pub pages: Vec<Page>,
 }

-impl From<proto::GetPageResponse> for GetPageResponse {
-    fn from(pb: proto::GetPageResponse) -> Self {
-        Self {
-            request_id: pb.request_id,
+impl TryFrom<proto::GetPageResponse> for GetPageResponse {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetPageResponse) -> Result<Self, ProtocolError> {
+        Ok(Self {
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
            status_code: pb.status_code.into(),
            reason: Some(pb.reason).filter(|r| !r.is_empty()),
-            page_images: pb.page_image,
-        }
+            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
+            pages: pb.page.into_iter().map(Page::from).collect(),
+        })
    }
 }

 impl From<GetPageResponse> for proto::GetPageResponse {
    fn from(response: GetPageResponse) -> Self {
        Self {
-            request_id: response.request_id,
+            request_id: Some(response.request_id.into()),
            status_code: response.status_code.into(),
            reason: response.reason.unwrap_or_default(),
-            page_image: response.page_images,
+            rel: Some(response.rel.into()),
+            page: response.pages.into_iter().map(proto::Page::from).collect(),
        }
    }
 }
@@ -501,11 +557,39 @@ impl GetPageResponse {
            request_id,
            status_code,
            reason: Some(status.message().to_string()),
-            page_images: Vec::new(),
+            rel: RelTag::default(),
+            pages: Vec::new(),
        })
    }
 }

+// A page.
+#[derive(Clone, Debug)]
+pub struct Page {
+    /// The page number.
+    pub block_number: u32,
+    /// The materialized page image, as an 8KB byte vector.
+    pub image: Bytes,
+}
+
+impl From<proto::Page> for Page {
+    fn from(pb: proto::Page) -> Self {
+        Self {
+            block_number: pb.block_number,
+            image: pb.image,
+        }
+    }
+}
+
+impl From<Page> for proto::Page {
+    fn from(page: Page) -> Self {
+        Self {
+            block_number: page.block_number,
+            image: page.image,
+        }
+    }
+}
+
 /// A GetPage response status code.
 ///
 /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
@@ -602,6 +686,21 @@ impl TryFrom<tonic::Code> for GetPageStatusCode {
    }
 }

+impl From<GetPageStatusCode> for tonic::Code {
+    fn from(status_code: GetPageStatusCode) -> Self {
+        use tonic::Code;
+
+        match status_code {
+            GetPageStatusCode::Unknown => Code::Unknown,
+            GetPageStatusCode::Ok => Code::Ok,
+            GetPageStatusCode::NotFound => Code::NotFound,
+            GetPageStatusCode::InvalidRequest => Code::InvalidArgument,
+            GetPageStatusCode::InternalError => Code::Internal,
+            GetPageStatusCode::SlowDown => Code::ResourceExhausted,
+        }
+    }
+}
+
 // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
 // shards will error.
 #[derive(Clone, Copy, Debug)]
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -16,6 +16,7 @@ futures.workspace = true
 hdrhistogram.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
+pprof.workspace = true
 rand.workspace = true
 reqwest.workspace = true
 serde.workspace = true
@@ -27,8 +28,9 @@ tokio-util.workspace = true
 tonic.workspace = true
 url.workspace = true

-pageserver_client.workspace = true
 pageserver_api.workspace = true
+pageserver_client.workspace = true
+pageserver_client_grpc.workspace = true
 pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -10,12 +10,14 @@ use anyhow::Context;
 use async_trait::async_trait;
 use bytes::Bytes;
 use camino::Utf8PathBuf;
+use futures::stream::FuturesUnordered;
 use futures::{Stream, StreamExt as _};
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
+use pageserver_client_grpc::{self as client_grpc, ShardSpec};
 use pageserver_page_api as page_api;
 use rand::prelude::*;
 use tokio::task::JoinSet;
@@ -37,6 +39,10 @@ pub(crate) struct Args {
    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
    page_service_connstring: String,
+    /// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic
+    /// no-frills `page_api::Client`. Only valid with grpc:// connstrings.
+    #[clap(long)]
+    rich_client: bool,
    #[clap(long)]
    pageserver_jwt: Option<String>,
    #[clap(long, default_value = "1")]
@@ -332,6 +338,7 @@ async fn main_impl(
            let client: Box<dyn Client> = match scheme.as_str() {
                "postgresql" | "postgres" => {
                    assert!(!args.compression, "libpq does not support compression");
+                    assert!(!args.rich_client, "rich client requires grpc://");
                    Box::new(
                        LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
                            .await
@@ -339,6 +346,16 @@ async fn main_impl(
                    )
                }

+                "grpc" if args.rich_client => Box::new(
+                    RichGrpcClient::new(
+                        &args.page_service_connstring,
+                        worker_id.timeline,
+                        args.compression,
+                    )
+                    .await
+                    .unwrap(),
+                ),
+
                "grpc" => Box::new(
                    GrpcClient::new(
                        &args.page_service_connstring,
@@ -657,7 +674,7 @@ impl Client for GrpcClient {
        blks: Vec<u32>,
    ) -> anyhow::Result<()> {
        let req = page_api::GetPageRequest {
-            request_id: req_id,
+            request_id: req_id.into(),
            request_class: page_api::GetPageClass::Normal,
            read_lsn: page_api::ReadLsn {
                request_lsn: req_lsn,
@@ -677,6 +694,79 @@ impl Client for GrpcClient {
            "unexpected status code: {}",
            resp.status_code,
        );
-        Ok((resp.request_id, resp.page_images))
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
+    }
+}
+
+/// A rich gRPC Pageserver client.
+struct RichGrpcClient {
+    inner: Arc<client_grpc::PageserverClient>,
+    requests: FuturesUnordered<
+        Pin<Box<dyn Future<Output = anyhow::Result<page_api::GetPageResponse>> + Send>>,
+    >,
+}
+
+impl RichGrpcClient {
+    async fn new(
+        connstring: &str,
+        ttid: TenantTimelineId,
+        compression: bool,
+    ) -> anyhow::Result<Self> {
+        let inner = Arc::new(client_grpc::PageserverClient::new(
+            ttid.tenant_id,
+            ttid.timeline_id,
+            ShardSpec::new(
+                [(ShardIndex::unsharded(), connstring.to_string())].into(),
+                None,
+            )?,
+            None,
+            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
+        )?);
+        Ok(Self {
+            inner,
+            requests: FuturesUnordered::new(),
+        })
+    }
+}
+
+#[async_trait]
+impl Client for RichGrpcClient {
+    async fn send_get_page(
+        &mut self,
+        req_id: u64,
+        req_lsn: Lsn,
+        mod_lsn: Lsn,
+        rel: RelTag,
+        blks: Vec<u32>,
+    ) -> anyhow::Result<()> {
+        let req = page_api::GetPageRequest {
+            request_id: req_id.into(),
+            request_class: page_api::GetPageClass::Normal,
+            read_lsn: page_api::ReadLsn {
+                request_lsn: req_lsn,
+                not_modified_since_lsn: Some(mod_lsn),
+            },
+            rel,
+            block_numbers: blks,
+        };
+        let inner = self.inner.clone();
+        self.requests.push(Box::pin(async move {
+            inner
+                .get_page(req)
+                .await
+                .map_err(|err| anyhow::anyhow!("{err}"))
+        }));
+        Ok(())
+    }
+
+    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
+        let resp = self.requests.next().await.unwrap()?;
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
    }
 }
--- a/pageserver/pagebench/src/cmd/idle_streams.rs
+++ b/pageserver/pagebench/src/cmd/idle_streams.rs
@@ -0,0 +1,127 @@
+use std::sync::Arc;
+
+use anyhow::anyhow;
+use futures::StreamExt;
+use tonic::transport::Endpoint;
+use tracing::info;
+
+use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag};
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+use utils::shard::ShardIndex;
+
+/// Starts a large number of idle gRPC GetPage streams.
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    /// The Pageserver to connect to. Must use grpc://.
+    #[clap(long, default_value = "grpc://localhost:51051")]
+    server: String,
+    /// The Pageserver HTTP API.
+    #[clap(long, default_value = "http://localhost:9898")]
+    http_server: String,
+    /// The number of streams to open.
+    #[clap(long, default_value = "100000")]
+    count: usize,
+    /// Number of streams per connection.
+    #[clap(long, default_value = "100")]
+    per_connection: usize,
+    /// Send a single GetPage request on each stream.
+    #[clap(long, default_value_t = false)]
+    send_request: bool,
+}
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?;
+
+    rt.block_on(main_impl(args))
+}
+
+async fn main_impl(args: Args) -> anyhow::Result<()> {
+    // Discover a tenant and timeline to use.
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        reqwest::Client::new(),
+        args.http_server.clone(),
+        None,
+    ));
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: Some(1),
+            targets: None,
+        },
+    )
+    .await?;
+    let ttid = timelines
+        .first()
+        .ok_or_else(|| anyhow!("no timelines found"))?;
+
+    // Set up the initial client.
+    let endpoint = Endpoint::from_shared(args.server.clone())?;
+
+    let connect = async || {
+        pageserver_page_api::Client::new(
+            endpoint.connect().await?,
+            ttid.tenant_id,
+            ttid.timeline_id,
+            ShardIndex::unsharded(),
+            None,
+            None,
+        )
+    };
+
+    let mut client = connect().await?;
+    let mut streams = Vec::with_capacity(args.count);
+
+    // Create streams.
+    for i in 0..args.count {
+        if i % 100 == 0 {
+            info!("opened {}/{} streams", i, args.count);
+        }
+        if i % args.per_connection == 0 && i > 0 {
+            client = connect().await?;
+        }
+
+        let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel();
+        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
+        let mut resp_stream = client.get_pages(req_stream).await?;
+
+        // Send request if specified.
+        if args.send_request {
+            req_tx.send(GetPageRequest {
+                request_id: 1.into(),
+                request_class: GetPageClass::Normal,
+                read_lsn: ReadLsn {
+                    request_lsn: Lsn::MAX,
+                    not_modified_since_lsn: Some(Lsn(1)),
+                },
+                rel: RelTag {
+                    spcnode: 1664, // pg_global
+                    dbnode: 0,     // shared database
+                    relnode: 1262, // pg_authid
+                    forknum: 0,    // init
+                },
+                block_numbers: vec![0],
+            })?;
+            let resp = resp_stream
+                .next()
+                .await
+                .transpose()?
+                .ok_or_else(|| anyhow!("no response"))?;
+            if resp.status_code != GetPageStatusCode::Ok {
+                return Err(anyhow!("{} response", resp.status_code));
+            }
+        }
+
+        // Hold onto streams to avoid closing them.
+        streams.push((req_tx, resp_stream));
+    }
+
+    info!("opened {} streams, sleeping", args.count);
+
+    // Block forever, to hold the idle streams open for inspection.
+    futures::future::pending::<()>().await;
+
+    Ok(())
+}
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -1,4 +1,7 @@
+use std::fs::File;
+
 use clap::Parser;
+use tracing::info;
 use utils::logging;

 /// Re-usable pieces of code that aren't CLI-specific.
@@ -17,38 +20,73 @@ mod cmd {
    pub(super) mod aux_files;
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
+    pub(super) mod idle_streams;
    pub(super) mod ondemand_download_churn;
    pub(super) mod trigger_initial_size_calculation;
 }

 /// Component-level performance test for pageserver.
 #[derive(clap::Parser)]
-enum Args {
+struct Args {
+    /// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's
+    /// written, e.g. via --runtime.
+    #[arg(long)]
+    profile: bool,
+
+    #[command(subcommand)]
+    subcommand: Subcommand,
+}
+
+#[derive(clap::Subcommand)]
+enum Subcommand {
    Basebackup(cmd::basebackup::Args),
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
    AuxFiles(cmd::aux_files::Args),
+    IdleStreams(cmd::idle_streams::Args),
 }

-fn main() {
+fn main() -> anyhow::Result<()> {
    logging::init(
        logging::LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stderr,
-    )
-    .unwrap();
+    )?;
    logging::replace_panic_hook_with_tracing_panic_hook().forget();

    let args = Args::parse();
-    match args {
-        Args::Basebackup(args) => cmd::basebackup::main(args),
-        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
-        Args::TriggerInitialSizeCalculation(args) => {
+
+    // Start a CPU profile if requested.
+    let mut profiler = None;
+    if args.profile {
+        profiler = Some(
+            pprof::ProfilerGuardBuilder::default()
+                .frequency(1000)
+                .blocklist(&["libc", "libgcc", "pthread", "vdso"])
+                .build()?,
+        );
+    }
+
+    match args.subcommand {
+        Subcommand::Basebackup(args) => cmd::basebackup::main(args),
+        Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
+        Subcommand::TriggerInitialSizeCalculation(args) => {
            cmd::trigger_initial_size_calculation::main(args)
        }
-        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
-        Args::AuxFiles(args) => cmd::aux_files::main(args),
+        Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
+        Subcommand::AuxFiles(args) => cmd::aux_files::main(args),
+        Subcommand::IdleStreams(args) => cmd::idle_streams::main(args),
+    }?;
+
+    // Generate a CPU flamegraph if requested.
+    if let Some(profiler) = profiler {
+        let report = profiler.report().build()?;
+        drop(profiler); // stop profiling
+        let file = File::create("profile.svg")?;
+        report.flamegraph(file)?;
+        info!("wrote CPU profile flamegraph to profile.svg")
    }
-    .unwrap()
+
+    Ok(())
 }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -114,7 +114,7 @@ where
    // Compute postgres doesn't have any previous WAL files, but the first
    // record that it's going to write needs to include the LSN of the
    // previous record (xl_prev). We include prev_record_lsn in the
-    // "zenith.signal" file, so that postgres can read it during startup.
+    // "neon.signal" file, so that postgres can read it during startup.
    //
    // We don't keep full history of record boundaries in the page server,
    // however, only the predecessor of the latest record on each
@@ -751,34 +751,39 @@ where

    //
    // Add generated pg_control file and bootstrap WAL segment.
-    // Also send zenith.signal file with extra bootstrap data.
+    // Also send neon.signal and zenith.signal file with extra bootstrap data.
    //
    async fn add_pgcontrol_file(
        &mut self,
        pg_control_bytes: Bytes,
        system_identifier: u64,
    ) -> Result<(), BasebackupError> {
-        // add zenith.signal file
-        let mut zenith_signal = String::new();
+        // add neon.signal file
+        let mut neon_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
            if self.timeline.is_ancestor_lsn(self.lsn) {
-                write!(zenith_signal, "PREV LSN: none")
+                write!(neon_signal, "PREV LSN: none")
                    .map_err(|e| BasebackupError::Server(e.into()))?;
            } else {
-                write!(zenith_signal, "PREV LSN: invalid")
+                write!(neon_signal, "PREV LSN: invalid")
                    .map_err(|e| BasebackupError::Server(e.into()))?;
            }
        } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
+            write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn)
                .map_err(|e| BasebackupError::Server(e.into()))?;
        }
-        self.ar
-            .append(
-                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
-                zenith_signal.as_bytes(),
-            )
-            .await
-            .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
+
+        // TODO: Remove zenith.signal once all historical computes have been replaced
+        // ... and thus support the neon.signal file.
+        for signalfilename in ["neon.signal", "zenith.signal"] {
+            self.ar
+                .append(
+                    &new_tar_header(signalfilename, neon_signal.len() as u64)?,
+                    neon_signal.as_bytes(),
+                )
+                .await
+                .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?;
+        }

        //send pg_control
        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
 };
 use pageserver::tenant::{TenantSharedResources, mgr, secondary};
 use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
-    page_cache, page_service, task_mgr, virtual_file,
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
+    MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::crashsafe::syncfs;
 use utils::logging::TracingErrorLayerEnablement;
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};

@@ -763,6 +764,41 @@ fn start_pageserver(
        (http_task, https_task)
    };

+    /* BEGIN_HADRON */
+    let metrics_collection_task = {
+        let cancel = shutdown_pageserver.child_token();
+        let task = crate::BACKGROUND_RUNTIME.spawn({
+            let cancel = cancel.clone();
+            let background_jobs_barrier = background_jobs_barrier.clone();
+            async move {
+                if conf.force_metric_collection_on_scrape {
+                    return;
+                }
+
+                // first wait until background jobs are cleared to launch.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return; },
+                    _ = background_jobs_barrier.wait() => {}
+                };
+                let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    tokio::select! {
+                        _ = cancel.cancelled() => {
+                            tracing::info!("cancelled metrics collection task, exiting...");
+                             break;
+                        },
+                        _ = interval.tick() => {}
+                    }
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            }
+        });
+        MetricsCollectionTask(CancellableTask { task, cancel })
+    };
+    /* END_HADRON */
+
    let consumption_metrics_tasks = {
        let cancel = shutdown_pageserver.child_token();
        let task = crate::BACKGROUND_RUNTIME.spawn({
@@ -844,6 +880,7 @@ fn start_pageserver(
            https_endpoint_listener,
            page_service,
            page_service_grpc,
+            metrics_collection_task,
            consumption_metrics_tasks,
            disk_usage_eviction_task,
            &tenant_manager,
@@ -880,17 +917,15 @@ async fn create_remote_storage_client(
    // If `test_remote_failures` is non-zero, wrap the client with a
    // wrapper that simulates failures.
    if conf.test_remote_failures > 0 {
-        if !cfg!(feature = "testing") {
-            anyhow::bail!(
-                "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"
-            );
-        }
        info!(
            "Simulating remote failures for first {} attempts of each op",
            conf.test_remote_failures
        );
-        remote_storage =
-            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
+        remote_storage = GenericRemoteStorage::unreliable_wrapper(
+            remote_storage,
+            conf.test_remote_failures,
+            conf.test_remote_failures_probability,
+        );
    }

    Ok(remote_storage)
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -147,7 +147,11 @@ pub struct PageServerConf {

    pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,

+    // The number of allowed failures in remote storage operations.
    pub test_remote_failures: u64,
+    // The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
+    // Use 100 for 100% failure, 0 for no failure.
+    pub test_remote_failures_probability: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,

@@ -248,6 +252,14 @@ pub struct PageServerConf {
    pub timeline_import_config: pageserver_api::config::TimelineImportConfig,

    pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
+
+    /// Defines what is a big tenant for the purpose of image layer generation.
+    /// See Timeline::should_check_if_image_layers_required
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
+
+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+    pub force_metric_collection_on_scrape: bool,
 }

 /// Token for authentication to safekeepers
@@ -392,6 +404,7 @@ impl PageServerConf {
            synthetic_size_calculation_interval,
            disk_usage_based_eviction,
            test_remote_failures,
+            test_remote_failures_probability,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
            control_plane_api,
@@ -427,6 +440,8 @@ impl PageServerConf {
            posthog_config,
            timeline_import_config,
            basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -461,6 +476,7 @@ impl PageServerConf {
            synthetic_size_calculation_interval,
            disk_usage_based_eviction,
            test_remote_failures,
+            test_remote_failures_probability,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
            control_plane_api: control_plane_api
@@ -484,6 +500,8 @@ impl PageServerConf {
            dev_mode,
            timeline_import_config,
            basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                        listen_http_port: m.http_port,
                        listen_https_port: m.https_port,
                        availability_zone_id: az_id.expect("Checked above"),
+                        node_ip_addr: None,
                    })
                }
                Err(e) => {
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -1,5 +1,5 @@
 //! The validator is responsible for validating DeletionLists for execution,
-//! based on whethe the generation in the DeletionList is still the latest
+//! based on whether the generation in the DeletionList is still the latest
 //! generation for a tenant.
 //!
 //! The purpose of validation is to ensure split-brain safety in the cluster
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -409,11 +409,12 @@ impl TenantFeatureResolver {

    /// Refresh the cached properties and flags on the critical path.
    pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
-        let mut remote_size_mb = None;
+        let mut remote_size_mb = Some(0.0);
        for timeline in tenant_shard.list_timelines() {
            let size = timeline.metrics.resident_physical_size_get();
            if size == 0 {
                remote_size_mb = None;
+                break;
            }
            if let Some(ref mut remote_size_mb) = remote_size_mb {
                *remote_size_mb += size as f64 / 1024.0 / 1024.0;
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -116,26 +116,6 @@ paths:
              schema:
                type: string

-  /v1/tenant/{tenant_id}/timeline:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    get:
-      description: Get timelines for tenant
-      responses:
-        "200":
-          description: TimelineInfo
-          content:
-            application/json:
-              schema:
-                type: array
-                items:
-                  $ref: "#/components/schemas/TimelineInfo"
-
-
  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
      - name: tenant_id
@@ -618,7 +598,7 @@ paths:
              schema:
                $ref: "#/components/schemas/SecondaryProgress"

-  /v1/tenant/{tenant_id}/timeline/:
+  /v1/tenant/{tenant_id}/timeline:
    parameters:
      - name: tenant_id
        in: path
@@ -685,6 +665,17 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+    get:
+      description: Get timelines for tenant
+      responses:
+        "200":
+          description: TimelineInfo
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/TimelineInfo"

  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
    parameters:
@@ -767,7 +758,7 @@ paths:
                $ref: "#/components/schemas/ServiceUnavailableError"


-  /v1/tenant/:
+  /v1/tenant:
    get:
      description: Get tenants list
      responses:
@@ -847,7 +838,7 @@ paths:
                items:
                  $ref: "#/components/schemas/TenantInfo"

-  /v1/tenant/{tenant_id}/config/:
+  /v1/tenant/{tenant_id}/config:
    parameters:
      - name: tenant_id
        in: path
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2,12 +2,15 @@
 //! Management HTTP API
 //!
 use std::cmp::Reverse;
-use std::collections::{BinaryHeap, HashMap};
+use std::collections::BTreeMap;
+use std::collections::BinaryHeap;
+use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result, anyhow};
+use bytes::Bytes;
 use enumset::EnumSet;
 use futures::future::join_all;
 use futures::{StreamExt, TryFutureExt};
@@ -44,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId};
 use postgres_ffi::PgMajorVersion;
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
 use scopeguard::defer;
+use serde::{Deserialize, Serialize};
 use serde_json::json;
 use tenant_size_model::svg::SvgBranchKind;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -55,6 +59,7 @@ use utils::auth::SwappableJwtAuth;
 use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
+use wal_decoder::models::record::NeonWalRecord;

 use crate::config::PageServerConf;
 use crate::context;
@@ -75,12 +80,13 @@ use crate::tenant::remote_timeline_client::{
 };
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
+use crate::tenant::storage_layer::ValuesReconstructState;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
 use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
-    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
-    WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
+    CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout,
+    WaitLsnWaiter, import_pgdata,
 };
 use crate::tenant::{
    GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
@@ -395,6 +401,7 @@ async fn build_timeline_info(
    timeline: &Arc<Timeline>,
    include_non_incremental_logical_size: bool,
    force_await_initial_logical_size: bool,
+    include_image_consistent_lsn: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -419,6 +426,10 @@ async fn build_timeline_info(
                .await?,
        );
    }
+    // HADRON
+    if include_image_consistent_lsn {
+        info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?);
+    }
    Ok(info)
 }

@@ -508,6 +519,8 @@ async fn build_timeline_info_common(
        is_invisible: Some(is_invisible),

        walreceiver_status,
+        // HADRON
+        image_consistent_lsn: None,
    };
    Ok(info)
 }
@@ -710,6 +723,8 @@ async fn timeline_list_handler(
        parse_query_param(&request, "include-non-incremental-logical-size")?;
    let force_await_initial_logical_size: Option<bool> =
        parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);
@@ -730,6 +745,7 @@ async fn timeline_list_handler(
                &timeline,
                include_non_incremental_logical_size.unwrap_or(false),
                force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                &ctx,
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -758,6 +774,9 @@ async fn timeline_and_offloaded_list_handler(
        parse_query_param(&request, "include-non-incremental-logical-size")?;
    let force_await_initial_logical_size: Option<bool> =
        parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
+
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);
@@ -778,6 +797,7 @@ async fn timeline_and_offloaded_list_handler(
                &timeline,
                include_non_incremental_logical_size.unwrap_or(false),
                force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                &ctx,
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -962,6 +982,9 @@ async fn timeline_detail_handler(
        parse_query_param(&request, "include-non-incremental-logical-size")?;
    let force_await_initial_logical_size: Option<bool> =
        parse_query_param(&request, "force-await-initial-logical-size")?;
+    // HADRON
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    // Logical size calculation needs downloading.
@@ -982,6 +1005,7 @@ async fn timeline_detail_handler(
            &timeline,
            include_non_incremental_logical_size.unwrap_or(false),
            force_await_initial_logical_size.unwrap_or(false),
+            include_image_consistent_lsn.unwrap_or(false),
            ctx,
        )
        .await
@@ -2500,9 +2524,10 @@ async fn timeline_checkpoint_handler(
                .compact(&cancel, flags, &ctx)
                .await
                .map_err(|e|
-                    match e {
-                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::Other(e) => ApiError::InternalServerError(e),
+                    if e.is_cancel() {
+                        ApiError::ShuttingDown
+                    } else {
+                        ApiError::InternalServerError(e.into_anyhow())
                    }
                )?;
        }
@@ -2687,6 +2712,16 @@ async fn deletion_queue_flush(
    }
 }

+/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+struct GetPageResponse {
+    pub page: Bytes,
+    pub layers_visited: u32,
+    pub delta_layers_visited: u32,
+    pub records: Vec<(Lsn, NeonWalRecord)>,
+    pub img: Option<(Lsn, Bytes)>,
+}
+
 async fn getpage_at_lsn_handler(
    request: Request<Body>,
    cancel: CancellationToken,
@@ -2737,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner(

        // Use last_record_lsn if no lsn is provided
        let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let page = timeline.get(key.0, lsn, &ctx).await?;

        if touch {
            json_response(StatusCode::OK, ())
        } else {
-            Result::<_, ApiError>::Ok(
-                Response::builder()
-                    .status(StatusCode::OK)
-                    .header(header::CONTENT_TYPE, "application/octet-stream")
-                    .body(hyper::Body::from(page))
-                    .unwrap(),
-            )
+            let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential());
+            let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?;
+            let response = GetPageResponse {
+                page,
+                layers_visited: reconstruct_state.get_layers_visited(),
+                delta_layers_visited: reconstruct_state.get_delta_layers_visited(),
+                records: reconstruct_state.debug_state.records.clone(),
+                img: reconstruct_state.debug_state.img.clone(),
+            };
+
+            json_response(StatusCode::OK, response)
        }
    }
-    .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
    .await
 }

@@ -3213,6 +3251,30 @@ async fn get_utilization(
        .map_err(ApiError::InternalServerError)
 }

+/// HADRON
+async fn list_tenant_visible_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let state = get_state(&request);
+
+    let mut map = BTreeMap::new();
+    for (tenant_shard_id, slot) in state.tenant_manager.list() {
+        match slot {
+            TenantSlot::Attached(tenant) => {
+                let visible_size = tenant.get_visible_size();
+                map.insert(tenant_shard_id, visible_size);
+            }
+            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
+                continue;
+            }
+        }
+    }
+
+    json_response(StatusCode::OK, map)
+}
+
 async fn list_aux_files(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -3616,6 +3678,7 @@ async fn activate_post_import_handler(
        let timeline_info = build_timeline_info(
            &timeline, false, // include_non_incremental_logical_size,
            false, // force_await_initial_logical_size
+            false, // include_image_consistent_lsn
            &ctx,
        )
        .await
@@ -3691,6 +3754,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
    Ok(())
 }

+async fn force_refresh_feature_flag(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant
+        .feature_resolver
+        .refresh_properties_and_flags(&tenant);
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_evaluate_feature_flag(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -3920,9 +4000,14 @@ pub fn make_router(
        .expect("construct launch timestamp header middleware"),
    );

+    let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape;
+
+    let prometheus_metrics_handler_wrapper =
+        move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
+
    Ok(router
        .data(state)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper))
        .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
        .get("/profile/heap", |r| request_span(r, profile_heap_handler))
        .get("/v1/status", |r| api_handler(r, status_handler))
@@ -4115,7 +4200,7 @@ pub fn make_router(
        })
        .get(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
-            |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
+            |r|  testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
        )
        .get(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
@@ -4128,6 +4213,7 @@ pub fn make_router(
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
        .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
        .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler))
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
            |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
@@ -4156,6 +4242,9 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
            api_handler(r, tenant_evaluate_feature_flag)
        })
+        .post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| {
+            api_handler(r, force_refresh_feature_flag)
+        })
        .put("/v1/feature_flag/:flag_key", |r| {
            testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
        })
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -610,13 +610,13 @@ async fn import_file(
        debug!("imported twophase file");
    } else if file_path.starts_with("pg_wal") {
        debug!("found wal file in base section. ignore it");
-    } else if file_path.starts_with("zenith.signal") {
+    } else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") {
        // Parse zenith signal file to set correct previous LSN
        let bytes = read_all_bytes(reader).await?;
-        // zenith.signal format is "PREV LSN: prev_lsn"
+        // neon.signal format is "PREV LSN: prev_lsn"
        // TODO write serialization and deserialization in the same place.
-        let zenith_signal = std::str::from_utf8(&bytes)?.trim();
-        let prev_lsn = match zenith_signal {
+        let neon_signal = std::str::from_utf8(&bytes)?.trim();
+        let prev_lsn = match neon_signal {
            "PREV LSN: none" => Lsn(0),
            "PREV LSN: invalid" => Lsn(0),
            other => {
@@ -624,17 +624,17 @@ async fn import_file(
                split[1]
                    .trim()
                    .parse::<Lsn>()
-                    .context("can't parse zenith.signal")?
+                    .context("can't parse neon.signal")?
            }
        };

-        // zenith.signal is not necessarily the last file, that we handle
+        // neon.signal is not necessarily the last file, that we handle
        // but it is ok to call `finish_write()`, because final `modification.commit()`
        // will update lsn once more to the final one.
        let writer = modification.tline.writer().await;
        writer.finish_write(prev_lsn);

-        debug!("imported zenith signal {}", prev_lsn);
+        debug!("imported neon signal {}", prev_lsn);
    } else if file_path.starts_with("pg_tblspc") {
        // TODO Backups exported from neon won't have pg_tblspc, but we will need
        // this to import arbitrary postgres databases.
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask);
 pub struct HttpsEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
+// HADRON
+pub struct MetricsCollectionTask(pub CancellableTask);
+
 impl CancellableTask {
    pub async fn shutdown(self) {
        self.cancel.cancel();
@@ -87,6 +90,7 @@ pub async fn shutdown_pageserver(
    https_listener: Option<HttpsEndpointListener>,
    page_service: page_service::Listener,
    grpc_task: Option<CancellableTask>,
+    metrics_collection_task: MetricsCollectionTask,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -211,6 +215,14 @@ pub async fn shutdown_pageserver(
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    deletion_queue.shutdown(Duration::from_secs(5)).await;

+    // HADRON
+    timed(
+        metrics_collection_task.0.shutdown(),
+        "shutdown metrics collections metrics",
+        Duration::from_secs(1),
+    )
+    .await;
+
    timed(
        consumption_metrics_worker.0.shutdown(),
        "shutdown consumption metrics",
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2847,6 +2847,24 @@ pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(||
    .expect("failed to define a metric")
 });

+// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories:
+// - success
+// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors)
+// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.)
+pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_pagestream_handler_results_total",
+        "Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)",
+        &["outcome"]
+    )
+    .expect("failed to define a metric")
+});
+
+// Constants for pageserver_pagestream_handler_results_total's outcome labels
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error";
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -70,7 +70,7 @@ use crate::context::{
 };
 use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
-    MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
+    MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
@@ -1441,20 +1441,57 @@ impl PageServerHandler {
            let (response_msg, ctx) = match handler_result {
                Err(e) => match &e.err {
                    PageStreamError::Shutdown => {
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR])
+                            .inc();
+                        // END HADRON
+
                        // If we fail to fulfil a request during shutdown, which may be _because_ of
                        // shutdown, then do not send the error to the client.  Instead just drop the
                        // connection.
                        span.in_scope(|| info!("dropping connection due to shutdown"));
                        return Err(QueryError::Shutdown);
                    }
-                    PageStreamError::Reconnect(reason) => {
-                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                    PageStreamError::Reconnect(_reason) => {
+                        span.in_scope(|| {
+                            // BEGIN HADRON
+                            // We can get here because the compute node is pointing at the wrong PS. We
+                            // already have a metric to keep track of this so suppressing this log to
+                            // reduce log spam. The information in this log message is not going to be that
+                            // helpful given the volume of logs that can be generated.
+                            // info!("handler requested reconnect: {reason}")
+                            // END HADRON
+                        });
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[
+                                metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                            ])
+                            .inc();
+                        // END HADRON
                        return Err(QueryError::Reconnect);
                    }
                    PageStreamError::Read(_)
                    | PageStreamError::LsnTimeout(_)
                    | PageStreamError::NotFound(_)
                    | PageStreamError::BadRequest(_) => {
+                        // BEGIN HADRON
+                        if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                                ])
+                                .inc();
+                        } else {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR,
+                                ])
+                                .inc();
+                        }
+                        // END HADRON
+
                        // print the all details to the log with {:#}, but for the client the
                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
                        // here includes cancellation which is not an error.
@@ -1472,7 +1509,15 @@ impl PageServerHandler {
                        )
                    }
                },
-                Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)),
+                Ok((response_msg, _op_timer_already_observed, ctx)) => {
+                    // BEGIN HADRON
+                    PAGESTREAM_HANDLER_RESULTS_TOTAL
+                        .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS])
+                        .inc();
+                    // END HADRON
+
+                    (response_msg, Some(ctx))
+                }
            };

            let ctx = ctx.map(|req_ctx| {
@@ -3293,9 +3338,12 @@ impl GrpcPageServiceHandler {
    }

    /// Generates a PagestreamRequest header from a ReadLsn and request ID.
-    fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest {
+    fn make_hdr(
+        read_lsn: page_api::ReadLsn,
+        req_id: Option<page_api::RequestID>,
+    ) -> PagestreamRequest {
        PagestreamRequest {
-            reqid: req_id,
+            reqid: req_id.map(|r| r.id).unwrap_or_default(),
            request_lsn: read_lsn.request_lsn,
            not_modified_since: read_lsn
                .not_modified_since_lsn
@@ -3353,6 +3401,8 @@ impl GrpcPageServiceHandler {
    /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
    /// GetPageResponse with an appropriate status code to avoid terminating the stream.
    ///
+    /// TODO: verify that the requested pages belong to this shard.
+    ///
    /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
    /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
    /// split them up in the client or server.
@@ -3403,7 +3453,7 @@ impl GrpcPageServiceHandler {

            batch.push(BatchedGetPageRequest {
                req: PagestreamGetPageRequest {
-                    hdr: Self::make_hdr(req.read_lsn, req.request_id),
+                    hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)),
                    rel: req.rel,
                    blkno,
                },
@@ -3433,12 +3483,16 @@ impl GrpcPageServiceHandler {
            request_id: req.request_id,
            status_code: page_api::GetPageStatusCode::Ok,
            reason: None,
-            page_images: Vec::with_capacity(results.len()),
+            rel: req.rel,
+            pages: Vec::with_capacity(results.len()),
        };

        for result in results {
            match result {
-                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page),
+                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page {
+                    block_number: r.req.blkno,
+                    image: r.page,
+                }),
                Ok((resp, _, _)) => {
                    return Err(tonic::Status::internal(format!(
                        "unexpected response: {resp:?}"
@@ -3481,7 +3535,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(rel=%req.rel, lsn=%req.read_lsn);

        let req = PagestreamExistsRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            rel: req.rel,
        };

@@ -3631,7 +3685,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);

        let req = PagestreamDbSizeRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            dbnode: req.db_oid,
        };

@@ -3681,7 +3735,7 @@ impl proto::PageService for GrpcPageServiceHandler {
                .await?
                .downgrade();
            while let Some(req) = reqs.message().await? {
-                let req_id = req.request_id;
+                let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
                let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                    .instrument(span.clone()) // propagate request span
                    .await;
@@ -3720,7 +3774,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(rel=%req.rel, lsn=%req.read_lsn);

        let req = PagestreamNblocksRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            rel: req.rel,
        };

@@ -3753,7 +3807,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);

        let req = PagestreamGetSlruSegmentRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
            kind: req.kind as u8,
            segno: req.segno,
        };
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -25,9 +25,9 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
+use postgres_ffi::{BLCKSZ, PgMajorVersion, TransactionId};
 use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi_types::{Oid, RepOriginId};
+use postgres_ffi_types::{Oid, RepOriginId, TimestampTz};
 use serde::{Deserialize, Serialize};
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3291,7 +3291,7 @@ impl TenantShard {
                        // Ignore this, we likely raced with unarchival.
                        OffloadError::NotArchived => Ok(()),
                        OffloadError::AlreadyInProgress => Ok(()),
-                        OffloadError::Cancelled => Err(CompactionError::ShuttingDown),
+                        OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
                        // don't break the anyhow chain
                        OffloadError::Other(err) => Err(CompactionError::Other(err)),
                    })?;
@@ -3321,16 +3321,13 @@ impl TenantShard {

    /// Trips the compaction circuit breaker if appropriate.
    pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
-        match err {
-            err if err.is_cancel() => {}
-            CompactionError::ShuttingDown => (),
-            CompactionError::Other(err) => {
-                self.compaction_circuit_breaker
-                    .lock()
-                    .unwrap()
-                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
-            }
+        if err.is_cancel() {
+            return;
        }
+        self.compaction_circuit_breaker
+            .lock()
+            .unwrap()
+            .fail(&CIRCUIT_BREAKERS_BROKEN, err);
    }

    /// Cancel scheduled compaction tasks
@@ -3396,7 +3393,13 @@ impl TenantShard {
                .collect_vec();

            for timeline in timelines {
-                timeline.maybe_freeze_ephemeral_layer().await;
+                // Include a span with the timeline ID. The parent span already has the tenant ID.
+                let span =
+                    info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id);
+                timeline
+                    .maybe_freeze_ephemeral_layer()
+                    .instrument(span)
+                    .await;
            }
        }

@@ -4174,6 +4177,15 @@ impl TenantShard {
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

+    // HADRON
+    pub fn get_image_creation_timeout(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf.image_layer_force_creation_period.or(self
+            .conf
+            .default_tenant_conf
+            .image_layer_force_creation_period)
+    }
+
    pub fn get_pitr_interval(&self) -> Duration {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -5713,6 +5725,16 @@ impl TenantShard {
            .unwrap_or(0)
    }

+    /// HADRON
+    /// Return the visible size of all timelines in this tenant.
+    pub(crate) fn get_visible_size(&self) -> u64 {
+        let timelines = self.timelines.lock().unwrap();
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .sum()
+    }
+
    /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
    /// manifest in `Self::remote_tenant_manifest`.
    ///
@@ -12800,6 +12822,40 @@ mod tests {
                },
            ]
        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
+        let tenant_conf = pageserver_api::models::TenantConfig {
+            pitr_interval: Some(Duration::from_secs(7 * 3600)),
+            image_layer_force_creation_period: Some(Duration::from_secs(3600)),
+            ..Default::default()
+        };
+
+        let tenant_id = TenantId::generate();
+
+        let harness = TenantHarness::create_custom(
+            "test_get_force_image_creation_lsn",
+            tenant_conf,
+            tenant_id,
+            ShardIdentity::unsharded(),
+            Generation::new(1),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
+        {
+            let writer = timeline.writer().await;
+            writer.finish_write(Lsn(5000));
+        }
+
+        let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
+        assert_eq!(image_creation_lsn, Lsn(4300));
        Ok(())
    }
 }
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;`