Compare commits

...

41 Commits

Author SHA1 Message Date
Konstantin Knizhnik
27315d985d Address review comments 2025-07-16 18:39:31 +03:00
Konstantin Knizhnik
fb139345c4 Update pgxn/neon/neon--1.6--1.7.sql
Co-authored-by: Matthias van de Meent <matthias@neon.tech>
2025-07-16 18:25:59 +03:00
Konstantin Knizhnik
502c36695c Address review comments 2025-07-16 18:18:47 +03:00
Konstantin Knizhnik
8002c591c2 Update pgxn/neon/communicator.c
Co-authored-by: Matthias van de Meent <matthias@neon.tech>
2025-07-16 17:38:28 +03:00
Konstantin Knizhnik
b147722e93 Add neon_communicator_min_inflight_request_lsn function to neon extension 2025-07-16 14:53:26 +03:00
Konstantin Knizhnik
855b6ea6aa Replace MaxBackends with MAX_BACKENDS in array size estimation 2025-07-16 09:30:22 +03:00
Konstantin Knizhnik
6e2af7ac3f Add function calculating min prefetch request LSN to be used for replica leases 2025-07-15 18:17:11 +03:00
Alexander Bayandin
921a4f2009 CI(run-python-test-set): don't collect code coverage (#12601)
## Problem

We don't use code coverage produced by `regress-tests`
(neondatabase/neon#6798), so there's no need to collect it. Potentially,
disabling it should reduce the load on disks and improve the stability
of debug builds.

## Summary of changes
- Disable code coverage collection for regression tests
2025-07-15 11:16:29 +00:00
dependabot[bot]
eb93c3e3c6 build(deps): bump aiohttp from 3.10.11 to 3.12.14 in the pip group across 1 directory (#12600)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-15 11:06:58 +00:00
Alexander Bayandin
7a7ab2a1d1 Move build-tools.Dockerfile -> build-tools/Dockerfile (#12590)
## Problem

This is a prerequisite for neondatabase/neon#12575 to keep all things
relevant to `build-tools` image in a single directory

## Summary of changes
- Rename `build_tools/` to `build-tools/`
- Move `build-tools.Dockerfile` to `build-tools/Dockerfile`
2025-07-15 10:45:49 +00:00
Krzysztof Szafrański
ff526a1051 [proxy] Recognize more cplane errors, use retry_delay_ms as TTL (#12543)
## Problem

Not all cplane errors are properly recognized and cached/retried.

## Summary of changes

Add more cplane error reasons. Also, use retry_delay_ms as cache TTL if
present.

Related to https://github.com/neondatabase/cloud/issues/19353
2025-07-15 07:42:48 +00:00
Heikki Linnakangas
9a2456bea5 Reduce noise from get_installed_extensions during e.g. shutdown (#12479)
All errors that can occur during get_installed_extensions() come from
tokio-postgres functions, e.g. if the database is being shut down
("FATAL: terminating connection due to administrator command"). I'm
seeing a lot of such errors in the logs with the regression tests, with
very verbose stack traces. The compute_ctl stack trace is useless
for errors originating from the Postgres connection; the error message
has all the information, so stop printing the stack trace.

I changed the result type of the functions to return the originating
tokio_postgres Error rather than anyhow::Error, so that if we introduce
other error sources to the functions where the stack trace might be
useful, we'll be forced to revisit this, probably by introducing a new
Error type that separates postgres errors from other errors. But this
will do for now.
2025-07-14 18:42:36 +00:00
Mikhail
a456e818af LFC prewarm perftest: increase timeout for initialization job (#12594)
Tests on
https://github.com/neondatabase/neon/actions/runs/16268609007/job/45930162686
time out due to the pgbench init job taking more than 30 minutes to run.
Increase the test timeout duration to 2 hours.
2025-07-14 17:37:47 +00:00
Matthias van de Meent
3e6fdb0aa6 Add and use [U]INT64_[HEX_]FORMAT for various [u]int64 needs (#12592)
We didn't apply these format macros consistently, and the problem wasn't
solved uniformly across the codebase. With this patch we have a more
consistent approach, and fewer issues porting changes to newer versions.

This also removes some potentially buggy casts from `uint64` to `long`:
they could've truncated the value on systems where `long` only has 32
bits.
2025-07-14 16:47:07 +00:00
Vlad Lazar
f8d3f86f58 pageserver: include records in get page debug handler (#12578)
Include records and image in the debug get page handler.
This endpoint does not update the metrics and does not support tracing.

Note that this now returns individual bytes which need to be encoded
properly for debugging.

Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
2025-07-14 16:37:28 +00:00
HaoyuHuang
f67a8a173e A few SK changes (#12577)
# TLDR 
This PR is a no-op. 

## Problem
When a SK loses a disk, it must recover all WALs from the very
beginning. This may take days/weeks to catch up to the latest WALs for
all timelines it owns.

## Summary of changes
When the SK starts up and finds that it has 0 timelines:
- it will ask the SC for the timelines it owns;
- it will then pull those timelines from its peer safekeepers to restore
WAL redundancy right away.

After pulling the timelines is complete, the SK becomes active and
accepts new WALs.

The current implementation is a prototype. We can optimize it further,
e.g., by pulling timelines in parallel.

---------

Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
2025-07-14 16:37:04 +00:00
Mikhail
2288efae66 Performance test for LFC prewarm (#12524)
https://github.com/neondatabase/cloud/issues/19011

Measure relative performance for prewarmed and non-prewarmed endpoints.
Add test that runs on every commit, and one performance test with a
remote cluster.
2025-07-14 13:41:31 +00:00
a-masterov
4fedcbc0ac Leverage the existing mechanism to retry 404 errors instead of implementing new code. (#12567)
## Problem
In https://github.com/neondatabase/neon/pull/12513, new code was
implemented to retry 404 errors caused by replication lag. However, this
duplicated logic that already exists in `neon_api.py`, making the script
more complicated.
## Summary of changes
The existing mechanism in `neon_api.py` is now used to retry 404 errors.

---------

Co-authored-by: Alexey Masterov <alexey.masterov@databricks.com>
2025-07-14 13:25:25 +00:00
Erik Grinaker
eb830fa547 pageserver/client_grpc: use unbounded pools (#12585)
## Problem

The communicator gRPC client currently uses bounded client/stream pools.
This can artificially constrain clients, especially after we remove
pipelining in #12584.

[Benchmarks](https://github.com/neondatabase/neon/pull/12583) show that
the cost of an idle server-side GetPage worker task is about 26 KB (2.5
GB for 100,000), so we can afford to scale out.

In the worst case, we'll degenerate to the current libpq state with one
stream per backend, but without the TCP connection overhead. In the
common case we expect significantly lower stream counts due to stream
sharing, driven e.g. by idle backends, LFC hits, read coalescing,
sharding (backends typically only talk to one shard at a time), etc.

Currently, Pageservers rarely serve more than 4000 backend connections,
so we have at least 2 orders of magnitude of headroom.

Touches #11735.
Requires #12584.

## Summary of changes

Remove the pool limits, and restructure the pools.

We still keep a separate bulk pool for GetPage batches of >4 pages (>32
KB), with fewer streams per connection. This reduces TCP-level
congestion and head-of-line blocking for non-bulk requests, and
concentrates larger window sizes on a smaller set of
streams/connections, presumably reducing memory usage. Apart from this,
bulk requests don't have any latency penalty compared to other requests.
2025-07-14 13:22:38 +00:00
Erik Grinaker
a203f9829a pageserver: add timeline_id span when freezing layers (#12572)
## Problem

We don't log the timeline ID when rolling ephemeral layers during
housekeeping.

Resolves [LKB-179](https://databricks.atlassian.net/browse/LKB-179)

## Summary of changes

Add a span with timeline ID when calling `maybe_freeze_ephemeral_layer`
from the housekeeping loop.

We don't instrument the function itself, since future callers may not
have a span including the tenant_id already, but we don't want to
duplicate the tenant_id for these spans.
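
A minimal sketch of the call-site instrumentation with the `tracing` crate (the
surrounding function and field handling here are illustrative, not the actual
housekeeping code):

```rust
use tracing::{Instrument, info_span};

// Hypothetical housekeeping step: attach the timeline ID at the call site
// rather than instrumenting the function itself, so other callers that
// already carry a tenant_id span don't get duplicated fields.
async fn housekeeping_roll_layers(timeline_id: &str) {
    maybe_freeze_ephemeral_layer()
        .instrument(info_span!("maybe_freeze_ephemeral_layer", %timeline_id))
        .await;
}

async fn maybe_freeze_ephemeral_layer() {
    // ... freeze the ephemeral layer if it is due to be rolled ...
}
```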
2025-07-14 12:30:28 +00:00
Erik Grinaker
42ab34dc36 pageserver/client_grpc: don't pipeline GetPage requests (#12584)
## Problem

The communicator gRPC client currently attempts to pipeline GetPage
requests from multiple callers onto the same gRPC stream. This has a
number of issues:

* Head-of-line blocking: the request may block on e.g. layer download or
LSN wait, delaying the next request.
* Cancellation: we can't easily cancel in-progress requests (e.g. due to
timeout or backend termination), so it may keep blocking the next
request (even its own retry).
* Complex stream scheduling: picking a stream becomes harder/slower, and
additional Tokio tasks and synchronization are needed for stream
management.

Touches #11735.
Requires #12579.

## Summary of changes

This patch removes pipelining of gRPC stream requests, and instead
prefers to scale out the number of streams to achieve the same
throughput. Stream scheduling has been rewritten, and mostly follows the
same pattern as the client pool with exclusive acquisition by a single
caller.

[Benchmarks](https://github.com/neondatabase/neon/pull/12583) show that
the cost of an idle server-side GetPage worker task is about 26 KB (2.5
GB for 100,000), so we can afford to scale out.

This has a number of advantages:

* It (mostly) eliminates head-of-line blocking (except at the TCP
level).
* Cancellation becomes trivial, by closing the stream.
* Stream scheduling becomes significantly simpler and cheaper.
* Individual callers can still use client-side batching for pipelining.
2025-07-14 12:11:33 +00:00
Erik Grinaker
30b877074c pagebench: add CPU profiling support (#12478)
## Problem

The new communicator gRPC client has significantly worse Pagebench
performance than a basic gRPC client. We need to find out why.

## Summary of changes

Add a `pagebench --profile` flag which takes a client CPU profile of the
benchmark and writes a flamegraph to `profile.svg`.
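
The Cargo.lock hunk further below adds `pprof` to pagebench's dependencies. A
rough sketch of how such a flag could be wired up (assuming the `pprof` crate
with its `flamegraph` feature; flag plumbing omitted, not the actual pagebench
code):

```rust
use std::fs::File;

// Sample the process at ~100 Hz while the benchmark runs, then render the
// collected samples as a flamegraph SVG.
fn run_with_cpu_profile(benchmark: impl FnOnce()) -> anyhow::Result<()> {
    let guard = pprof::ProfilerGuard::new(100)?;
    benchmark();
    let report = guard.report().build()?;
    report.flamegraph(File::create("profile.svg")?)?;
    Ok(())
}
```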
2025-07-14 11:44:53 +00:00
Erik Grinaker
f18cc808f0 pageserver/client_grpc: reap idle channels immediately (#12587)
## Problem

It can take 3x the idle timeout to reap a channel. We have to wait for
the idle timeout to trigger first for the stream, then the client, then
the channel.

Touches #11735.

## Summary of changes

Reap empty channels immediately, and rely indirectly on the
channel/stream timeouts.

This can still lead to 2x the idle timeout for streams (first stream
then client), but that's okay -- if the stream closes abruptly (e.g. due
to timeout or error) we want to keep the client around in the pool for a
while.
2025-07-14 10:47:26 +00:00
Erik Grinaker
d14d8271b8 pageserver/client_grpc: improve retry logic (#12579)
## Problem

gRPC client retries currently include pool acquisition under the
per-attempt timeout. If pool acquisition is slow (e.g. full pool), this
will cause spurious timeout warnings, and the caller will lose its place
in the pool queue.

Touches #11735.

## Summary of changes

Makes several improvements to retries and related logic:

* Don't include pool acquisition time under request timeouts.
* Move attempt timeouts out of `Retry` and into the closure.
* Make `Retry` configurable, move constants into main module.
* Don't back off on the first retry, and reduce initial/max backoffs to
5ms and 5s respectively.
* Add `with_retries` and `with_timeout` helpers.
* Add slow logging for pool acquisition, and a `warn_slow` counterpart
to `log_slow`.
* Add debug logging for requests and responses at the client boundary.
2025-07-14 10:43:10 +00:00
Erik Grinaker
fecb707b19 pagebench: add idle-streams (#12583)
## Problem

For the communicator scheduling policy, we need to understand the
server-side cost of idle gRPC streams.

Touches #11735.

## Summary of changes

Add an `idle-streams` benchmark to `pagebench` which opens a large
number of idle gRPC GetPage streams.
2025-07-14 09:41:58 +00:00
Folke Behrens
296c9190b2 proxy: Use EXPIRE command to refresh cancel entries (#12580)
## Problem

When refreshing cancellation data we resend the entire value again just
to reset the TTL, which causes unnecessary load in proxy, on network and
possibly on redis side.

## Summary of changes

* Switch from using SET with the full value to using EXPIRE to reset the
TTL (see the sketch below).
* Add a tiny delay between retries to prevent a busy loop.
* Shorten CancelKeyOp variants: drop the redundant suffix.
* Retry with SET when EXPIRE fails.
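
As an illustration only (not the proxy's actual code, and assuming a recent
`redis` crate; key and value handling are invented), the refresh path could
look like:

```rust
use redis::AsyncCommands;

// Refresh the TTL of an existing cancel entry. EXPIRE only resets the TTL
// and returns false if the key no longer exists, in which case we fall back
// to a full SET .. EX that recreates the entry.
async fn refresh_cancel_entry(
    con: &mut redis::aio::MultiplexedConnection,
    key: &str,
    value: &str,
    ttl_secs: i64,
) -> redis::RedisResult<()> {
    let refreshed: bool = con.expire(key, ttl_secs).await?;
    if !refreshed {
        let _: () = con.set_ex(key, value, ttl_secs as u64).await?;
    }
    Ok(())
}
```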
2025-07-13 22:49:23 +00:00
Folke Behrens
a5fe67f361 proxy: cancel maintain_cancel_key task immediately (#12586)
## Problem

When a connection terminates, its maintain_cancel_key task keeps running
until the CANCEL_KEY_REFRESH sleep finishes, and then it triggers another
cancel key TTL refresh before exiting.

## Summary of changes

* Check for cancellation while sleeping and interrupt sleep.
* If cancelled, break the loop, don't send a refresh cmd.
2025-07-13 17:27:39 +00:00
Dmitrii Kovalkov
ee7bb1a667 storcon: validate new_sk_set before starting safekeeper migration (#12546)
## Problem
We don't validate the validity of the `new_sk_set` before starting the
migration. It is validated later, so the migration to an invalid
safekeeper set will fail anyway. But at this point we might already
commited an invalid `new_sk_set` to the database and there is no `abort`
command yet (I ran into this issue in neon_local and ruined the timeline
:)

- Part of https://github.com/neondatabase/neon/issues/11669

## Summary of changes
- Add safekeeper count and safekeeper duplication checks before starting
the migration (see the sketch below)
- Test that we validate the `new_sk_set` before starting the migration
- Add a `force` option to the `TimelineSafekeeperMigrateRequest` to
disable non-mandatory checks
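
An illustrative pre-check only (invented names, not the storcon
implementation); the point is that both checks run before anything is
committed to the database:

```rust
use std::collections::HashSet;

fn validate_new_sk_set(new_sk_set: &[u64], expected_count: usize) -> Result<(), String> {
    // Safekeeper count check.
    if new_sk_set.len() != expected_count {
        return Err(format!(
            "expected {expected_count} safekeepers, got {}",
            new_sk_set.len()
        ));
    }
    // Safekeeper duplication check.
    let unique: HashSet<u64> = new_sk_set.iter().copied().collect();
    if unique.len() != new_sk_set.len() {
        return Err("new_sk_set contains duplicate safekeeper ids".into());
    }
    Ok(())
}
```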
2025-07-12 04:57:04 +00:00
Conrad Ludgate
9bba31bf68 proxy: encode json as we parse rows (#11992)
Serialize query row responses directly into JSON. Some of this code
should be using the `json::value_as_object/list` macros, but I've
avoided it for now to minimize the size of the diff.
2025-07-11 19:39:08 +00:00
Folke Behrens
380d167b7c proxy: For cancellation data replace HSET+EXPIRE/HGET with SET..EX/GET (#12553)
## Problem

To store cancellation data we send two commands to redis because the
redis server version doesn't support HSET with EX. Also, HSET is not
really needed.

## Summary of changes

* Replace the HSET + EXPIRE command pair with one SET .. EX command.
* Replace HGET with GET.
* Leave a workaround for old keys set with HSET.
* Replace some anyhow errors with specific errors to surface the
WRONGTYPE error from redis.
2025-07-11 19:35:42 +00:00
HaoyuHuang
cb991fba42 A few more PS changes (#12552)
# TLDR
Problem-I is a bug fix. The rest are no-ops. 

## Problem I
The page server checks whether to create image layers based on elapsed
time, but this check depends on the current logical size, which is only
computed on shard 0. Thus, for non-zero shards, the check is ineffective
and image creation is never done for idle tenants.

## Summary of changes I
This PR fixes the problem by simply removing the dependency on current
logical size.

## Summary of changes II
This PR adds a timeout when calling the page server to split a shard, to
make sure the SC does not wait for the API call forever. The PR doesn't
add any retry logic, because it's not clear whether a page server shard
split can be safely retried if the existing operation is still ongoing or
has left the storage in a bad state. Thus it's better to abort the whole
operation and restart.

## Problem III
`test_remote_failures` requires the PS to be compiled in testing mode,
but PS builds in dev/staging are compiled without this mode.

## Summary of changes III
Remove the restriction and also increase the number of total failures
allowed.

## Summary of changes IV
Remove the test on the PS getpage HTTP route.

---------

Co-authored-by: Chen Luo <chen.luo@databricks.com>
Co-authored-by: Yecheng Yang <carlton.yang@databricks.com>
Co-authored-by: Vlad Lazar <vlad@neon.tech>
2025-07-11 19:27:55 +00:00
Matthias van de Meent
4566b12a22 NEON: Finish Zenith->Neon rename (#12566)
Even though we're now part of Databricks, let's at least make this part
consistent.

## Summary of changes

- PG14: https://github.com/neondatabase/postgres/pull/669
- PG15: https://github.com/neondatabase/postgres/pull/670
- PG16: https://github.com/neondatabase/postgres/pull/671
- PG17: https://github.com/neondatabase/postgres/pull/672

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2025-07-11 18:56:39 +00:00
Alex Chi Z.
63ca084696 fix(pageserver): downgrade wal apply error during gc-compaction (#12518)
## Problem

close LKB-162

close https://github.com/neondatabase/cloud/issues/30665, related to
https://github.com/neondatabase/cloud/issues/29434

We see a lot of errors like:

```
2025-05-22T23:06:14.928959Z ERROR compaction_loop{tenant_id=? shard_id=0304}:run:gc_compact_timeline{timeline_id=?}: error applying 4 WAL records 35/DC0DF0B8..3B/E43188C0 (8119 bytes) to key 000000067F0000400500006027000000B9D0, from base image with LSN 0/0 to reconstruct page image at LSN 61/150B9B20 n_attempts=0: apply_wal_records

Caused by:
    0: read walredo stdout
    1: early eof
```

which is an acceptable form of error, so we downgrade it to a warning.

## Summary of changes

A walredo error during gc-compaction is expected when the data below the
GC horizon does not contain a full key history. This can happen in rare
cases where GC is only able to remove data in the middle of a key's
history, but not all of the earlier history, when a full keyspace gets
deleted.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-11 18:37:55 +00:00
Arpad Müller
379259bdd7 storcon: don't error log on timeline delete if tenant migration is in progress (#12523)
Fixes [LKB-61](https://databricks.atlassian.net/browse/LKB-61):
`test_timeline_archival_chaos` being flaky with storcon error `Requested
tenant is missing`.

When a tenant migration is ongoing and the attach request has been sent
to the new location, but the attach hasn't finished yet, it is possible
for the pageserver to return a 412 Precondition Failed HTTP error on
timeline deletion, because the deletion is already being sent to the new
location. Previously we would log that via something like:

```
ERROR request{method=DELETE path=/v1/tenant/1f544a11c90d1afd7af9b26e48985a4e/timeline/32818fb3ebf07cb7f06805429d7dee38 request_id=c493c04b-7f33-46d2-8a65-aac8a5516055}: Error processing HTTP request: InternalServerError(Error deleting timeline 32818fb3ebf07cb7f06805429d7dee38 on 1f544a11c90d1afd7af9b26e48985a4e on node 2 (localhost): pageserver API: Precondition failed: Requested tenant is missing
```

This patch changes that and makes us return a more reasonable resource
unavailable error. Not sure how scalable this is for tenants with a large
number of shards, but that's a different discussion (we'd probably need a
limited amount of per-storcon retries).

example
[link](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-12398/15981821532/index.html#/testresult/e7785dfb1238d92f).
2025-07-11 17:07:14 +00:00
Heikki Linnakangas
3300207523 Update working set size estimate without lock (#12570)
Update the WSS estimate before acquiring the lock, so that we don't need
to hold the lock for so long. That seems safe to me, see added comment.

I was planning to do this with the new rust-based communicator
implementation anyway, but it might help a little with the current C
implementation too. And more importantly, having this as a separate PR
gives us a chance to review this aspect independently.
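
A minimal sketch of the pattern under review (placeholder types, not the
communicator code): compute the estimate first, then take the lock only to
publish the result.

```rust
use std::sync::Mutex;
use std::sync::atomic::{AtomicU64, Ordering};

struct WssEstimator {
    // Updated lock-free on every page access.
    raw: AtomicU64,
    // Consumers read the published snapshot under a short-lived lock.
    published: Mutex<u64>,
}

impl WssEstimator {
    fn record_access(&self) {
        self.raw.fetch_add(1, Ordering::Relaxed);
    }

    fn publish(&self) {
        // Compute outside the lock...
        let estimate = self.raw.load(Ordering::Relaxed);
        // ...then hold the lock only for the brief store.
        *self.published.lock().unwrap() = estimate;
    }
}
```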
2025-07-11 16:05:22 +00:00
Tristan Partin
a0a7733b5a Use relative paths in submodule URL references (#12559)
This is a nifty trick from the hadron repo that seems to help with the
SSH key dance.

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
2025-07-11 15:57:50 +00:00
Conrad Ludgate
f4245403b3 [proxy] allow testing query cancellation locally (#12568)
## Problem

Cancellation requires Redis, and Redis required the control plane.

## Summary of changes

Make Redis-based cancellation not require the control plane.
Add instructions for setting up redis locally.
2025-07-11 15:13:36 +00:00
Heikki Linnakangas
a8db7ebffb Minor refactor of the SQL functions to get working set size estimate (#12550)
Split the functions into two: one internal function to calculate the
estimate, and another (two functions) to expose it as SQL functions.

This is in preparation of adding new communicator implementation. With
that, the SQL functions will dispatch the call to the old or new
implementation depending on which is being used.
2025-07-11 14:17:44 +00:00
Vlad Lazar
154f6dc59c pageserver: log only on final shard resolution failure (#12565)
This log is too noisy. Instead of warning on every retry, let's log only
on the final failure.
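
A sketch of the logging policy with placeholder types (not the pageserver
code): warn once, on the attempt that exhausts the retries, instead of on
every intermediate failure.

```rust
async fn resolve_with_retries(max_attempts: usize) -> Result<u32, String> {
    assert!(max_attempts >= 1);
    for attempt in 1..=max_attempts {
        match try_resolve().await {
            Ok(shard) => return Ok(shard),
            Err(err) if attempt == max_attempts => {
                // Only the final failure is logged.
                tracing::warn!("shard resolution failed after {attempt} attempts: {err}");
                return Err(err);
            }
            Err(_) => {} // Retry quietly.
        }
    }
    unreachable!()
}

async fn try_resolve() -> Result<u32, String> {
    // Placeholder for the actual shard resolution call.
    Ok(0)
}
```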
2025-07-11 13:25:25 +00:00
Vlad Lazar
15f633922a pageserver: use image consistent LSN for force image layer creation (#12547)
This is a no-op for the neon deployment

* Introduce the concept of the image consistent LSN: the largest LSN
below which all pages have been redone successfully (see the sketch after
this list)
* Use the image consistent LSN for forced image layer creations
* Optionally expose the image consistent LSN via the timeline describe
HTTP endpoint
* Add a sharded timeline describe endpoint to storcon
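
A hypothetical sketch of the cross-shard aggregation (not the actual endpoint
code): the timeline-level image consistent LSN can only be the minimum of the
per-shard values, since every shard must have redone all pages below the
reported LSN.

```rust
fn image_consistent_lsn(per_shard: &[Option<u64>]) -> Option<u64> {
    // If any shard has no value yet, the timeline-level LSN is unknown.
    per_shard
        .iter()
        .copied()
        .collect::<Option<Vec<u64>>>()
        .and_then(|lsns| lsns.into_iter().min())
}
```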

---------

Co-authored-by: Chen Luo <chen.luo@databricks.com>
2025-07-11 11:39:51 +00:00
Dmitrii Kovalkov
c34d36d8a2 storcon_cli: timeline-safekeeper-migrate and timeline-locate subcommands (#12548)
## Problem
We have a `safekeeper_migrate` handler, but no subcommand in
`storcon_cli`. Same for `/:timeline_id/locate` for identifying the
current set of safekeepers.

- Closes: https://github.com/neondatabase/neon/issues/12395

## Summary of changes
- Add `timeline-safekeeper-migrate` and `timeline-locate` subcommands to
`storcon_cli`
2025-07-11 10:49:37 +00:00
120 changed files with 3970 additions and 1493 deletions


@@ -27,4 +27,4 @@
!storage_controller/
!vendor/postgres-*/
!workspace_hack/
!build_tools/patches
!build-tools/patches


@@ -31,6 +31,7 @@ config-variables:
- NEON_PROD_AWS_ACCOUNT_ID
- PGREGRESS_PG16_PROJECT_ID
- PGREGRESS_PG17_PROJECT_ID
- PREWARM_PGBENCH_SIZE
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_CICD_CHANNEL_ID


@@ -176,7 +176,11 @@ runs:
fi
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
# We don't use code coverage for regression tests (the step is disabled),
# so there's no need to collect it.
# Ref https://github.com/neondatabase/neon/issues/4540
# cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
cov_prefix=()
else
cov_prefix=()
fi


@@ -150,7 +150,7 @@ jobs:
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
use-fallback: false
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
- name: Cache postgres v15 build
id: cache_pg_15
@@ -162,7 +162,7 @@ jobs:
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
use-fallback: false
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
- name: Cache postgres v16 build
id: cache_pg_16
@@ -174,7 +174,7 @@ jobs:
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
use-fallback: false
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
- name: Cache postgres v17 build
id: cache_pg_17
@@ -186,7 +186,7 @@ jobs:
secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
use-fallback: false
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
- name: Build all
# Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables


@@ -219,6 +219,7 @@ jobs:
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
--ignore test_runner/performance/test_perf_many_relations.py
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
--ignore test_runner/performance/test_lfc_prewarm.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,6 +411,77 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
prewarm-test:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 17
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
options: --init
steps:
- name: Harden the runner (Audit all outbound calls)
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Run prewarm benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_lfc_prewarm.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)


@@ -72,7 +72,7 @@ jobs:
ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
IMAGE_TAG: |
${{ hashFiles('build-tools.Dockerfile',
${{ hashFiles('build-tools/Dockerfile',
'.github/workflows/build-build-tools-image.yml') }}
run: |
echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT}
@@ -144,7 +144,7 @@ jobs:
- uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
with:
file: build-tools.Dockerfile
file: build-tools/Dockerfile
context: .
provenance: false
push: true

.gitmodules (vendored): 8 lines changed

@@ -1,16 +1,16 @@
[submodule "vendor/postgres-v14"]
path = vendor/postgres-v14
url = https://github.com/neondatabase/postgres.git
url = ../postgres.git
branch = REL_14_STABLE_neon
[submodule "vendor/postgres-v15"]
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git
url = ../postgres.git
branch = REL_15_STABLE_neon
[submodule "vendor/postgres-v16"]
path = vendor/postgres-v16
url = https://github.com/neondatabase/postgres.git
url = ../postgres.git
branch = REL_16_STABLE_neon
[submodule "vendor/postgres-v17"]
path = vendor/postgres-v17
url = https://github.com/neondatabase/postgres.git
url = ../postgres.git
branch = REL_17_STABLE_neon

Cargo.lock (generated): 5 lines changed

@@ -4296,6 +4296,7 @@ dependencies = [
"pageserver_client",
"pageserver_client_grpc",
"pageserver_page_api",
"pprof",
"rand 0.8.5",
"reqwest",
"serde",
@@ -5289,6 +5290,7 @@ dependencies = [
"async-trait",
"atomic-take",
"aws-config",
"aws-credential-types",
"aws-sdk-iam",
"aws-sigv4",
"base64 0.22.1",
@@ -5328,6 +5330,7 @@ dependencies = [
"itoa",
"jose-jwa",
"jose-jwk",
"json",
"lasso",
"measured",
"metrics",
@@ -6991,6 +6994,7 @@ dependencies = [
"pageserver_api",
"pageserver_client",
"reqwest",
"safekeeper_api",
"serde_json",
"storage_controller_client",
"tokio",
@@ -7560,6 +7564,7 @@ dependencies = [
"futures-core",
"pin-project-lite",
"tokio",
"tokio-util",
]
[[package]]


@@ -201,7 +201,7 @@ tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
tokio-stream = "0.1"
tokio-stream = { version = "0.1", features = ["sync"] }
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
toml = "0.8"


@@ -35,7 +35,7 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
set -e && \


@@ -9,7 +9,7 @@
#
# build-tools: This contains Rust compiler toolchain and other tools needed at compile
# time. This is also used for the storage builds. This image is defined in
# build-tools.Dockerfile.
# build-tools/Dockerfile.
#
# build-deps: Contains C compiler, other build tools, and compile-time dependencies
# needed to compile PostgreSQL and most extensions. (Some extensions need
@@ -115,7 +115,7 @@ ARG EXTENSIONS=all
FROM $BASE_IMAGE_SHA AS build-deps
ARG DEBIAN_VERSION
# Keep in sync with build-tools.Dockerfile
# Keep in sync with build-tools/Dockerfile
ENV PROTOC_VERSION=25.1
# Use strict mode for bash to catch errors early
@@ -1790,7 +1790,7 @@ RUN set -e \
#########################################################################################
FROM build-deps AS exporters
ARG TARGETARCH
# Keep sql_exporter version same as in build-tools.Dockerfile and
# Keep sql_exporter version same as in build-tools/Dockerfile and
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regarding `echo`, `-e` and `\n`
RUN if [ "$TARGETARCH" = "amd64" ]; then\


@@ -1040,6 +1040,8 @@ impl ComputeNode {
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
};
self.fix_zenith_signal_neon_signal()?;
let mut state = self.state.lock().unwrap();
state.metrics.pageserver_connect_micros =
connected.duration_since(started).as_micros() as u64;
@@ -1049,6 +1051,27 @@ impl ComputeNode {
Ok(())
}
/// Move the Zenith signal file to Neon signal file location.
/// This makes Compute compatible with older PageServers that don't yet
/// know about the Zenith->Neon rename.
fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
let datadir = Path::new(&self.params.pgdata);
let neonsig = datadir.join("neon.signal");
if neonsig.is_file() {
return Ok(());
}
let zenithsig = datadir.join("zenith.signal");
if zenithsig.is_file() {
fs::copy(zenithsig, neonsig)?;
}
Ok(())
}
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
/// the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
@@ -2464,7 +2487,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
serde_json::to_string(&extensions).expect("failed to serialize extensions list")
);
}
Err(err) => error!("could not get installed extensions: {err:?}"),
Err(err) => error!("could not get installed extensions: {err}"),
}
Ok(())
}


@@ -2,6 +2,7 @@ use std::collections::HashMap;
use anyhow::Result;
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use tokio_postgres::error::Error as PostgresError;
use tokio_postgres::{Client, Config, NoTls};
use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
/// and to make database listing query here more explicit.
///
/// Limit the number of databases to 500 to avoid excessive load.
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
// `pg_database.datconnlimit = -2` means that the database is in the
// invalid state
let databases = client
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
/// Same extension can be installed in multiple databases with different versions,
/// so we report a separate metric (number of databases where it is installed)
/// for each extension version.
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
pub async fn get_installed_extensions(
mut conf: Config,
) -> Result<InstalledExtensions, PostgresError> {
conf.application_name("compute_ctl:get_installed_extensions");
let databases: Vec<String> = {
let (mut client, connection) = conf.connect(NoTls).await?;


@@ -36,7 +36,7 @@ impl StorageBroker {
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
let broker = &self.env.broker;
print!("Starting neon broker at {}", broker.client_url());
println!("Starting neon broker at {}", broker.client_url());
let mut args = Vec::new();


@@ -32,7 +32,8 @@
//! config.json - passed to `compute_ctl`
//! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal
//! neon.signal
//! zenith.signal - copy of neon.signal, for backward compatibility
//! <other PostgreSQL files>
//! ```
//!


@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
pub posthog_config: Option<PostHogConfig>,
pub kick_secondary_downloads: Option<bool>,
#[serde(with = "humantime_serde")]
pub shard_split_request_timeout: Option<Duration>,
}
impl NeonStorageControllerConf {
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
timeline_safekeeper_count: None,
posthog_config: None,
kick_secondary_downloads: None,
shard_split_request_timeout: None,
}
}
}


@@ -303,7 +303,7 @@ impl PageServerNode {
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
println!(
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
self.conf.id,
self.pg_connection_config.raw_address(),


@@ -127,7 +127,7 @@ impl SafekeeperNode {
extra_opts: &[String],
retry_timeout: &Duration,
) -> anyhow::Result<()> {
print!(
println!(
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
self.pg_connection_config.raw_address(),
self.datadir_path().display(),


@@ -648,6 +648,13 @@ impl StorageController {
args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
}
if let Some(duration) = self.config.shard_split_request_timeout {
args.push(format!(
"--shard-split-request-timeout={}",
humantime::Duration::from(duration)
));
}
let mut envs = vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
@@ -660,7 +667,7 @@ impl StorageController {
));
}
println!("Starting storage controller");
println!("Starting storage controller at {scheme}://{host}:{listen_port}");
background_process::start_process(
COMMAND,


@@ -14,6 +14,7 @@ humantime.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
safekeeper_api.workspace=true
serde_json = { workspace = true, features = ["raw_value"] }
storage_controller_client.workspace = true
tokio.workspace = true


@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
};
use pageserver_api::models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
@@ -21,6 +21,7 @@ use pageserver_api::models::{
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::{self};
use reqwest::{Certificate, Method, StatusCode, Url};
use safekeeper_api::models::TimelineLocateResponse;
use storage_controller_client::control_api::Client;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -279,6 +280,23 @@ enum Command {
#[arg(long)]
concurrency: Option<usize>,
},
/// Locate safekeepers for a timeline from the storcon DB.
TimelineLocate {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
timeline_id: TimelineId,
},
/// Migrate a timeline to a new set of safekeepers
TimelineSafekeeperMigrate {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
timeline_id: TimelineId,
/// Example: --new-sk-set 1,2,3
#[arg(long, required = true, value_delimiter = ',')]
new_sk_set: Vec<NodeId>,
},
}
#[derive(Parser)]
@@ -458,6 +476,7 @@ async fn main() -> anyhow::Result<()> {
listen_http_port,
listen_https_port,
availability_zone_id: AvailabilityZone(availability_zone_id),
node_ip_addr: None,
}),
)
.await?;
@@ -1324,7 +1343,7 @@ async fn main() -> anyhow::Result<()> {
concurrency,
} => {
let mut path = format!(
"/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
"v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
);
if let Some(c) = concurrency {
@@ -1335,6 +1354,41 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<(), ()>(Method::POST, path, None)
.await?;
}
Command::TimelineLocate {
tenant_id,
timeline_id,
} => {
let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
let resp = storcon_client
.dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
.await?;
let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
let new_sk_set = resp
.new_sk_set
.as_ref()
.map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
println!("generation = {}", resp.generation);
println!("sk_set = {sk_set:?}");
println!("new_sk_set = {new_sk_set:?}");
}
Command::TimelineSafekeeperMigrate {
tenant_id,
timeline_id,
new_sk_set,
} => {
let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
storcon_client
.dispatch::<_, ()>(
Method::POST,
path,
Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
)
.await?;
}
}
Ok(())


@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
from WAL.
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup
code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without
any WAL redo.
### How to get rid of the patch


@@ -1,5 +1,6 @@
use std::collections::{HashMap, HashSet};
use std::fmt::Display;
use std::net::IpAddr;
use std::str::FromStr;
use std::time::{Duration, Instant};
@@ -10,7 +11,7 @@ use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
use crate::shard::{ShardStripeSize, TenantShardId};
#[derive(Serialize, Deserialize, Debug)]
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
pub listen_https_port: Option<u16>,
pub availability_zone_id: AvailabilityZone,
// Reachable IP address of the PS/SK registering, if known.
// Hadron Cluster Coordinator will update the DNS record of the registering node
// with this IP address.
pub node_ip_addr: Option<IpAddr>,
}
#[derive(Serialize, Deserialize)]
@@ -126,6 +132,13 @@ pub struct TenantDescribeResponse {
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantTimelineDescribeResponse {
pub shards: Vec<TimelineInfo>,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_consistent_lsn: Option<Lsn>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct NodeShardResponse {
pub node_id: NodeId,
@@ -538,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
pub scheduling_policy: SkSchedulingPolicy,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TimelineSafekeeperPeer {
pub node_id: NodeId,
pub listen_http_addr: String,
pub http_port: i32,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SCSafekeeperTimeline {
// SC does not know the tenant id.
pub timeline_id: TimelineId,
pub peers: Vec<NodeId>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SCSafekeeperTimelinesResponse {
pub timelines: Vec<SCSafekeeperTimeline>,
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SafekeeperTimeline {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub peers: Vec<NodeId>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SafekeeperTimelinesResponse {
pub timelines: Vec<SafekeeperTimeline>,
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,


@@ -1622,6 +1622,9 @@ pub struct TimelineInfo {
/// Whether the timeline is invisible in synthetic size calculations.
pub is_invisible: Option<bool>,
// HADRON: the largest LSN below which all page updates have been included in the image layers.
#[serde(skip_serializing_if = "Option::is_none")]
pub image_consistent_lsn: Option<Lsn>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]


@@ -31,6 +31,7 @@ pub struct UnreliableWrapper {
/* BEGIN_HADRON */
// This the probability of failure for each operation, ranged from [0, 100].
// The probability is default to 100, which means that all operations will fail.
// Storage will fail by probability up to attempts_to_fail times.
attempt_failure_probability: u64,
/* END_HADRON */
}


@@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
use utils::pageserver_feedback::PageserverFeedback;
use crate::membership::Configuration;
use crate::membership::{Configuration, SafekeeperGeneration};
use crate::{ServerInfo, Term};
#[derive(Debug, Serialize, Deserialize)]
@@ -311,3 +311,12 @@ pub struct PullTimelineResponse {
pub safekeeper_host: Option<String>,
// TODO: add more fields?
}
/// Response to a timeline locate request.
/// Storcon-only API.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TimelineLocateResponse {
pub generation: SafekeeperGeneration,
pub sk_set: Vec<NodeId>,
pub new_sk_set: Option<Vec<NodeId>>,
}


@@ -47,6 +47,7 @@ where
/* BEGIN_HADRON */
pub enum DeploymentMode {
Local,
Dev,
Staging,
Prod,
@@ -64,7 +65,7 @@ pub fn get_deployment_mode() -> Option<DeploymentMode> {
}
},
Err(_) => {
tracing::error!("DEPLOYMENT_MODE not set");
// tracing::error!("DEPLOYMENT_MODE not set");
None
}
}


@@ -0,0 +1,73 @@
use std::env::{VarError, var};
use std::error::Error;
use std::net::IpAddr;
use std::str::FromStr;
/// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this
/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration.
/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod
/// template).
pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS";
/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS.
/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template).
pub fn read_node_ip_addr_from_env() -> Result<Option<IpAddr>, Box<dyn Error>> {
match var(HADRON_NODE_IP_ADDRESS) {
Ok(v) => {
if let Ok(addr) = IpAddr::from_str(&v) {
Ok(Some(addr))
} else {
Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into())
}
}
Err(VarError::NotPresent) => Ok(None),
Err(e) => Err(e.into()),
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::env;
use std::net::{Ipv4Addr, Ipv6Addr};
#[test]
fn test_read_node_ip_addr_from_env() {
// SAFETY: test code
unsafe {
// Test with a valid IPv4 address
env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1");
let result = read_node_ip_addr_from_env().unwrap();
assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))));
// Test with a valid IPv6 address
env::set_var(
HADRON_NODE_IP_ADDRESS,
"2001:0db8:85a3:0000:0000:8a2e:0370:7334",
);
}
let result = read_node_ip_addr_from_env().unwrap();
assert_eq!(
result,
Some(IpAddr::V6(
Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap()
))
);
// Test with an invalid IP address
// SAFETY: test code
unsafe {
env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip");
}
let result = read_node_ip_addr_from_env();
assert!(result.is_err());
// Test with no environment variable set
// SAFETY: test code
unsafe {
env::remove_var(HADRON_NODE_IP_ADDRESS);
}
let result = read_node_ip_addr_from_env().unwrap();
assert_eq!(result, None);
}
}


@@ -26,6 +26,9 @@ pub mod auth;
// utility functions and helper traits for unified unique id generation/serialization etc.
pub mod id;
// utility functions to obtain reachable IP addresses in PS/SK nodes.
pub mod ip_address;
pub mod shard;
mod hex;


@@ -1,4 +1,5 @@
use std::future::Future;
use std::pin::Pin;
use std::str::FromStr;
use std::time::Duration;
@@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, VariantNames};
use tokio::time::Instant;
use tracing::info;
use tracing::{info, warn};
/// Logs a critical error, similarly to `tracing::error!`. This will:
///
@@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString {
///
/// TODO: consider upgrading this to a warning, but currently it fires too often.
#[inline]
pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
where
F: Future<Output = O>,
{
pub async fn log_slow<O>(
name: &str,
threshold: Duration,
f: Pin<&mut impl Future<Output = O>>,
) -> O {
monitor_slow_future(
threshold,
threshold, // period = threshold
@@ -394,16 +396,42 @@ where
if !is_slow {
return;
}
let elapsed = elapsed_total.as_secs_f64();
if ready {
info!(
"slow {name} completed after {:.3}s",
elapsed_total.as_secs_f64()
);
info!("slow {name} completed after {elapsed:.3}s");
} else {
info!(
"slow {name} still running after {:.3}s",
elapsed_total.as_secs_f64()
);
info!("slow {name} still running after {elapsed:.3}s");
}
},
)
.await
}
/// Logs a periodic warning if a future is slow to complete.
#[inline]
pub async fn warn_slow<O>(
name: &str,
threshold: Duration,
f: Pin<&mut impl Future<Output = O>>,
) -> O {
monitor_slow_future(
threshold,
threshold, // period = threshold
f,
|MonitorSlowFutureCallback {
ready,
is_slow,
elapsed_total,
elapsed_since_last_callback: _,
}| {
if !is_slow {
return;
}
let elapsed = elapsed_total.as_secs_f64();
if ready {
warn!("slow {name} completed after {elapsed:.3}s");
} else {
warn!("slow {name} still running after {elapsed:.3}s");
}
},
)
@@ -416,7 +444,7 @@ where
pub async fn monitor_slow_future<F, O>(
threshold: Duration,
period: Duration,
mut fut: std::pin::Pin<&mut F>,
mut fut: Pin<&mut F>,
mut cb: impl FnMut(MonitorSlowFutureCallback),
) -> O
where


@@ -1,13 +1,16 @@
use std::collections::HashMap;
use std::num::NonZero;
use std::pin::pin;
use std::sync::Arc;
use std::time::{Duration, Instant};
use anyhow::anyhow;
use arc_swap::ArcSwap;
use futures::stream::FuturesUnordered;
use futures::{FutureExt as _, StreamExt as _};
use tonic::codec::CompressionEncoding;
use tracing::instrument;
use tracing::{debug, instrument};
use utils::logging::warn_slow;
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
use crate::retry::Retry;
@@ -21,28 +24,40 @@ use utils::shard::{ShardCount, ShardIndex, ShardNumber};
/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
/// when full.
///
/// Normal requests are small, and we don't pipeline them, so we can afford a large number of
/// streams per connection.
///
/// TODO: tune all of these constants, and consider making them configurable.
/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
/// with only streams.
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(64).unwrap();
/// Max number of concurrent unary request clients per shard.
const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a
/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and
/// transmission delays. This also concentrates large window sizes on a smaller set of
/// streams/connections, presumably reducing memory use.
const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
/// The batch size threshold at which a GetPage request will use the bulk stream pool.
///
/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window
/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool.
const BULK_THRESHOLD_BATCH_SIZE: usize = 5;
/// Max number of pipelined requests per stream.
const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
/// The overall request call timeout, including retries and pool acquisition.
/// TODO: should we retry forever? Should the caller decide?
const CALL_TIMEOUT: Duration = Duration::from_secs(60);
/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
/// are more throughput-oriented, we have a smaller limit but higher queue depth.
const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
/// The per-request (retry attempt) timeout, including any lazy connection establishment.
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
/// get a larger queue depth.
const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
/// The initial request retry backoff duration. The first retry does not back off.
/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support.
const BASE_BACKOFF: Duration = Duration::from_millis(5);
/// The maximum request retry backoff duration.
const MAX_BACKOFF: Duration = Duration::from_secs(5);
/// Threshold and interval for warning about slow operation.
const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
/// basic `page_api::Client` gRPC client, and supports:
@@ -50,10 +65,19 @@ const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
/// * Sharded tenants across multiple Pageservers.
/// * Pooling of connections, clients, and streams for efficient resource use.
/// * Concurrent use by many callers.
/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
/// * Internal handling of GetPage bidirectional streams.
/// * Automatic retries.
/// * Observability.
///
/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These
/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all
/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly
/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000
/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with
/// one stream per backend, but without the TCP connection overhead. In the common case we expect
/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits,
/// read coalescing, sharding (backends typically only talk to one shard at a time), etc.
///
/// TODO: this client does not support base backups or LSN leases, as these are only used by
/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
pub struct PageserverClient {
@@ -67,8 +91,6 @@ pub struct PageserverClient {
compression: Option<CompressionEncoding>,
/// The shards for this tenant.
shards: ArcSwap<Shards>,
/// The retry configuration.
retry: Retry,
}
impl PageserverClient {
@@ -94,7 +116,6 @@ impl PageserverClient {
auth_token,
compression,
shards: ArcSwap::new(Arc::new(shards)),
retry: Retry,
})
}
@@ -142,13 +163,15 @@ impl PageserverClient {
&self,
req: page_api::CheckRelExistsRequest,
) -> tonic::Result<page_api::CheckRelExistsResponse> {
self.retry
.with(async |_| {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.check_rel_exists(req).await
})
.await
debug!("sending request: {req:?}");
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await
})
.await?;
debug!("received response: {resp:?}");
Ok(resp)
}
/// Returns the total size of a database, as # of bytes.
@@ -157,13 +180,15 @@ impl PageserverClient {
&self,
req: page_api::GetDbSizeRequest,
) -> tonic::Result<page_api::GetDbSizeResponse> {
self.retry
.with(async |_| {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.get_db_size(req).await
})
.await
debug!("sending request: {req:?}");
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
})
.await?;
debug!("received response: {resp:?}");
Ok(resp)
}
/// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
@@ -193,6 +218,8 @@ impl PageserverClient {
return Err(tonic::Status::invalid_argument("request attempt must be 0"));
}
debug!("sending request: {req:?}");
// The shards may change while we're fetching pages. We execute the request using a stable
// view of the shards (especially important for requests that span shards), but retry the
// top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
@@ -201,13 +228,16 @@ impl PageserverClient {
//
// TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
// once we figure out how to handle these.
self.retry
.with(async |attempt| {
let mut req = req.clone();
req.request_id.attempt = attempt as u32;
Self::get_page_with_shards(req, &self.shards.load_full()).await
})
.await
let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
let mut req = req.clone();
req.request_id.attempt = attempt as u32;
let shards = self.shards.load_full();
Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
})
.await?;
debug!("received response: {resp:?}");
Ok(resp)
}
/// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
@@ -246,7 +276,7 @@ impl PageserverClient {
req: page_api::GetPageRequest,
shard: &Shard,
) -> tonic::Result<page_api::GetPageResponse> {
let stream = shard.stream(req.request_class.is_bulk()).await;
let mut stream = shard.stream(Self::is_bulk(&req)).await?;
let resp = stream.send(req.clone()).await?;
// Convert per-request errors into a tonic::Status.
@@ -290,13 +320,15 @@ impl PageserverClient {
&self,
req: page_api::GetRelSizeRequest,
) -> tonic::Result<page_api::GetRelSizeResponse> {
self.retry
.with(async |_| {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.get_rel_size(req).await
})
.await
debug!("sending request: {req:?}");
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
})
.await?;
debug!("received response: {resp:?}");
Ok(resp)
}
/// Fetches an SLRU segment.
@@ -305,13 +337,50 @@ impl PageserverClient {
&self,
req: page_api::GetSlruSegmentRequest,
) -> tonic::Result<page_api::GetSlruSegmentResponse> {
self.retry
.with(async |_| {
// SLRU segments are only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.get_slru_segment(req).await
})
.await
debug!("sending request: {req:?}");
let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
// SLRU segments are only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await
})
.await?;
debug!("received response: {resp:?}");
Ok(resp)
}
/// Runs the given async closure with retries up to the given timeout. Only certain gRPC status
/// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout.
async fn with_retries<T, F, O>(timeout: Duration, f: F) -> tonic::Result<T>
where
F: FnMut(usize) -> O, // pass attempt number, starting at 0
O: Future<Output = tonic::Result<T>>,
{
Retry {
timeout: Some(timeout),
base_backoff: BASE_BACKOFF,
max_backoff: MAX_BACKOFF,
}
.with(f)
.await
}
/// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout.
async fn with_timeout<T>(
timeout: Duration,
f: impl Future<Output = tonic::Result<T>>,
) -> tonic::Result<T> {
let started = Instant::now();
tokio::time::timeout(timeout, f).await.map_err(|_| {
tonic::Status::deadline_exceeded(format!(
"request timed out after {:.3}s",
started.elapsed().as_secs_f64()
))
})?
}
/// Returns true if the request is considered a bulk request and should use the bulk pool.
fn is_bulk(req: &page_api::GetPageRequest) -> bool {
req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE
}
}
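
For illustration, a sketch of a caller issuing a GetPage request through this client. The field values are illustrative only (borrowed from the idle-streams benchmark later in this diff), and `client` is an assumed, already-constructed `PageserverClient`:

// Hypothetical request; retries, timeouts, and shard routing are handled
// internally by get_page() via with_retries()/with_timeout().
let req = page_api::GetPageRequest {
    request_id: 1.into(), // attempt must be 0 for new requests
    request_class: page_api::GetPageClass::Normal,
    read_lsn: page_api::ReadLsn {
        request_lsn: Lsn::MAX,
        not_modified_since_lsn: Some(Lsn(1)),
    },
    rel: page_api::RelTag { spcnode: 1664, dbnode: 0, relnode: 1262, forknum: 0 },
    block_numbers: vec![0],
};
let resp = client.get_page(req).await?;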
@@ -440,15 +509,23 @@ impl Shards {
}
}
/// A single shard. Uses dedicated resource pools with the following structure:
/// A single shard. Has dedicated resource pools with the following structure:
///
/// * Channel pool: unbounded.
/// * Unary client pool: MAX_UNARY_CLIENTS.
/// * Stream client pool: unbounded.
/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
/// * Bulk channel pool: unbounded.
/// * Channel pool: MAX_CLIENTS_PER_CHANNEL.
/// * Client pool: unbounded.
/// * Stream pool: unbounded.
/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL.
/// * Bulk client pool: unbounded.
/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
/// * Bulk stream pool: unbounded.
///
/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests.
/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a
/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools
/// is bounded, and neither pipelines requests, so the latency characteristics should be mostly
/// similar (except for TCP transmission time).
///
/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However,
/// the code is fairly trivial, so we may as well keep them around for now in case we need them.
struct Shard {
/// The shard ID.
id: ShardIndex,
@@ -456,7 +533,7 @@ struct Shard {
client_pool: Arc<ClientPool>,
/// GetPage stream pool.
stream_pool: Arc<StreamPool>,
/// GetPage stream pool for bulk requests, e.g. prefetches.
/// GetPage stream pool for bulk requests.
bulk_stream_pool: Arc<StreamPool>,
}
@@ -470,50 +547,30 @@ impl Shard {
auth_token: Option<String>,
compression: Option<CompressionEncoding>,
) -> anyhow::Result<Self> {
// Common channel pool for unary and stream requests. Bounded by client/stream pools.
let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
// Client pool for unary requests.
// Shard pools for unary requests and non-bulk GetPage requests.
let client_pool = ClientPool::new(
channel_pool.clone(),
ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?,
tenant_id,
timeline_id,
shard_id,
auth_token.clone(),
compression,
Some(MAX_UNARY_CLIENTS),
None, // unbounded
);
let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded
// GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
// but shares a channel pool with it (as it's unbounded).
let stream_pool = StreamPool::new(
ClientPool::new(
channel_pool.clone(),
tenant_id,
timeline_id,
shard_id,
auth_token.clone(),
compression,
None, // unbounded, limited by stream pool
),
Some(MAX_STREAMS),
MAX_STREAM_QUEUE_DEPTH,
);
// Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
// to avoid head-of-line blocking of latency-sensitive requests.
// Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.).
let bulk_stream_pool = StreamPool::new(
ClientPool::new(
ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?,
tenant_id,
timeline_id,
shard_id,
auth_token,
compression,
None, // unbounded, limited by stream pool
None, // unbounded
),
Some(MAX_BULK_STREAMS),
MAX_BULK_STREAM_QUEUE_DEPTH,
None, // unbounded
);
Ok(Self {
@@ -525,19 +582,23 @@ impl Shard {
}
/// Returns a pooled client for this shard.
#[instrument(skip_all)]
async fn client(&self) -> tonic::Result<ClientGuard> {
self.client_pool
.get()
.await
.map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
warn_slow(
"client pool acquisition",
SLOW_THRESHOLD,
pin!(self.client_pool.get()),
)
.await
}
/// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
/// pool (e.g. for prefetches).
async fn stream(&self, bulk: bool) -> StreamGuard {
match bulk {
false => self.stream_pool.get().await,
true => self.bulk_stream_pool.get().await,
}
/// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool.
#[instrument(skip_all, fields(bulk))]
async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
let pool = match bulk {
false => &self.stream_pool,
true => &self.bulk_stream_pool,
};
warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await
}
}

View File

@@ -9,19 +9,36 @@
//!
//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
//! per-channel client limit. Channels may be closed when they are no longer used by any clients.
//! per-channel client limit. Channels are closed immediately when empty, and indirectly rely on
//! client/stream idle timeouts.
//!
//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
//! from the pool after some time, to free up the channel.
//! single caller at a time, and is returned to the pool when dropped. Idle clients are removed
//! from the pool after a while to free up resources.
//!
//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
//! returns a guard that can be used to send a single request, to properly enforce queue depth and
//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
//! possibly pipelining multiple requests from multiple callers on the same stream (up to some
//! queue depth). Idle streams may be removed from the pool after a while to free up the client.
//! ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a
//! time, and is returned to the pool when dropped. Idle streams are removed from the pool after
//! a while to free up resources.
//!
//! The stream only supports sending a single, synchronous request at a time, and does not support
//! pipelining multiple requests from different callers onto the same stream -- instead, we scale
//! out concurrent streams to improve throughput. There are many reasons for this design choice:
//!
//! * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by
//! a single server task, which may block e.g. on layer downloads, LSN waits, etc.
//!
//! * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away
//! (e.g. because of a timeout), the request would still be processed by the server and block
//! requests behind it in the stream. It might even block its own timeout retry.
//!
//! * Stream scheduling becomes significantly simpler and cheaper.
//!
//! * Individual callers can still use client-side batching for pipelining.
//!
//! * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
//! per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
//!
//! Each channel corresponds to one TCP connection. Each client unary request and each stream
//! corresponds to one HTTP/2 stream and server task.
@@ -29,33 +46,31 @@
//! TODO: error handling (including custom error types).
//! TODO: observability.
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use std::num::NonZero;
use std::ops::{Deref, DerefMut};
use std::pin::Pin;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, Weak};
use std::time::{Duration, Instant};
use futures::StreamExt as _;
use tokio::sync::mpsc::{Receiver, Sender};
use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
use futures::{Stream, StreamExt as _};
use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch};
use tokio_stream::wrappers::WatchStream;
use tokio_util::sync::CancellationToken;
use tonic::codec::CompressionEncoding;
use tonic::transport::{Channel, Endpoint};
use tracing::{error, warn};
use pageserver_page_api as page_api;
use utils::id::{TenantId, TimelineId};
use utils::shard::ShardIndex;
/// Reap channels/clients/streams that have been idle for this long.
/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when
/// empty, and indirectly rely on the client/stream idle timeouts.
///
/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
/// channels, and/or stream pool clients.
/// A stream's client will be reaped after 2x the idle threshold (first the stream, then the client), but
/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to
/// keep its client around in the pool for a while.
const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
false => Duration::from_secs(180),
true => Duration::from_secs(1), // exercise reaping in tests
@@ -83,8 +98,6 @@ pub struct ChannelPool {
max_clients_per_channel: NonZero<usize>,
/// Open channels.
channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
/// Reaps idle channels.
idle_reaper: Reaper,
/// Channel ID generator.
next_channel_id: AtomicUsize,
}
@@ -96,9 +109,6 @@ struct ChannelEntry {
channel: Channel,
/// Number of clients using this channel.
clients: usize,
/// The channel has been idle (no clients) since this time. None if channel is in use.
/// INVARIANT: Some if clients == 0, otherwise None.
idle_since: Option<Instant>,
}
impl ChannelPool {
@@ -108,15 +118,12 @@ impl ChannelPool {
E: TryInto<Endpoint> + Send + Sync + 'static,
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
{
let pool = Arc::new(Self {
Ok(Arc::new(Self {
endpoint: endpoint.try_into()?,
max_clients_per_channel,
channels: Mutex::default(),
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
next_channel_id: AtomicUsize::default(),
});
pool.idle_reaper.spawn(&pool);
Ok(pool)
}))
}
/// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
@@ -137,22 +144,17 @@ impl ChannelPool {
let mut channels = self.channels.lock().unwrap();
// Try to find an existing channel with available capacity. We check entries in BTreeMap
// order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
// with lower-ordered channel IDs first. This will cluster clients in lower-ordered
// order, to fill up the lower-ordered channels first. The client/stream pools also prefer
// clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered
// channels, and free up higher-ordered channels such that they can be reaped.
for (&id, entry) in channels.iter_mut() {
assert!(
entry.clients <= self.max_clients_per_channel.get(),
"channel overflow"
);
assert_eq!(
entry.idle_since.is_some(),
entry.clients == 0,
"incorrect channel idle state"
);
assert_ne!(entry.clients, 0, "empty channel not reaped");
if entry.clients < self.max_clients_per_channel.get() {
entry.clients += 1;
entry.idle_since = None;
return ChannelGuard {
pool: Arc::downgrade(self),
id,
@@ -169,7 +171,6 @@ impl ChannelPool {
let entry = ChannelEntry {
channel: channel.clone(),
clients: 1, // account for the guard below
idle_since: None,
};
channels.insert(id, entry);
@@ -181,20 +182,6 @@ impl ChannelPool {
}
}
impl Reapable for ChannelPool {
/// Reaps channels that have been idle since before the cutoff.
fn reap_idle(&self, cutoff: Instant) {
self.channels.lock().unwrap().retain(|_, entry| {
let Some(idle_since) = entry.idle_since else {
assert_ne!(entry.clients, 0, "empty channel not marked idle");
return true;
};
assert_eq!(entry.clients, 0, "idle channel has clients");
idle_since >= cutoff
})
}
}
/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
/// since the gRPC client requires an owned `Channel`.
pub struct ChannelGuard {
@@ -211,7 +198,7 @@ impl ChannelGuard {
}
}
/// Returns the channel to the pool.
/// Returns the channel to the pool. The channel is closed when empty.
impl Drop for ChannelGuard {
fn drop(&mut self) {
let Some(pool) = self.pool.upgrade() else {
@@ -220,11 +207,12 @@ impl Drop for ChannelGuard {
let mut channels = pool.channels.lock().unwrap();
let entry = channels.get_mut(&self.id).expect("unknown channel");
assert!(entry.idle_since.is_none(), "active channel marked idle");
assert!(entry.clients > 0, "channel underflow");
entry.clients -= 1;
// Reap empty channels immediately.
if entry.clients == 0 {
entry.idle_since = Some(Instant::now()); // mark channel as idle
channels.remove(&self.id);
}
}
}
@@ -253,8 +241,7 @@ pub struct ClientPool {
///
/// The first client in the map will be acquired next. The map is sorted by client ID, which in
/// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
/// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
/// clients are reaped.
/// lower-ordered channels. This allows us to free up and reap higher-ordered channels.
idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
/// Reaps idle clients.
idle_reaper: Reaper,
@@ -310,7 +297,7 @@ impl ClientPool {
/// This is moderately performance-sensitive. It is called for every unary request, but these
/// establish a new gRPC stream per request so they're already expensive. GetPage requests use
/// the `StreamPool` instead.
pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
pub async fn get(self: &Arc<Self>) -> tonic::Result<ClientGuard> {
// Acquire a permit if the pool is bounded.
let mut permit = None;
if let Some(limiter) = self.limiter.clone() {
@@ -328,7 +315,7 @@ impl ClientPool {
});
}
// Slow path: construct a new client.
// Construct a new client.
let mut channel_guard = self.channel_pool.get();
let client = page_api::Client::new(
channel_guard.take(),
@@ -337,7 +324,8 @@ impl ClientPool {
self.shard_id,
self.auth_token.clone(),
self.compression,
)?;
)
.map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?;
Ok(ClientGuard {
pool: Arc::downgrade(self),
@@ -407,287 +395,187 @@ impl Drop for ClientGuard {
/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
/// acquires a client from the inner `ClientPool` for the stream's lifetime.
///
/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
/// a single request and await the response. Internally, requests are multiplexed across streams and
/// channels. This allows proper queue depth enforcement and response routing.
/// Individual streams only send a single request at a time, and do not pipeline multiple callers
/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily
/// to eliminate head-of-line blocking. See the module documentation for more details.
///
/// TODO: consider making this generic over request and response types; not currently needed.
pub struct StreamPool {
/// The client pool to acquire clients from. Must be unbounded.
client_pool: Arc<ClientPool>,
/// All pooled streams.
/// Idle pooled streams. Acquired streams are removed from here and returned on drop.
///
/// Incoming requests will be sent over an existing stream with available capacity. If all
/// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
/// stream has an associated Tokio task that processes requests and responses.
streams: Mutex<HashMap<StreamID, StreamEntry>>,
/// The max number of concurrent streams, or None if unbounded.
max_streams: Option<NonZero<usize>>,
/// The max number of concurrent requests per stream.
max_queue_depth: NonZero<usize>,
/// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
/// None if the pool is unbounded.
/// The first stream in the map will be acquired next. The map is sorted by stream ID, which is
/// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer
/// acquiring idle streams from lower-ordered channels, which allows us to free up and reap
/// higher-ordered channels.
idle: Mutex<BTreeMap<StreamID, StreamEntry>>,
/// Limits the max number of concurrent streams. None if the pool is unbounded.
limiter: Option<Arc<Semaphore>>,
/// Reaps idle streams.
idle_reaper: Reaper,
/// Stream ID generator.
next_stream_id: AtomicUsize,
}
type StreamID = usize;
type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
/// The stream ID. Reuses the inner client ID.
type StreamID = ClientID;
/// A pooled stream.
struct StreamEntry {
/// Sends caller requests to the stream task. The stream task exits when this is dropped.
sender: RequestSender,
/// Number of in-flight requests on this stream.
queue_depth: usize,
/// The time when this stream went idle (queue_depth == 0).
/// INVARIANT: Some if queue_depth == 0, otherwise None.
idle_since: Option<Instant>,
/// The bidirectional stream.
stream: BiStream,
/// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`.
idle_since: Instant,
}
/// A bidirectional GetPage stream and its client. Can send requests and receive responses.
struct BiStream {
/// The owning client. Holds onto the channel slot while the stream is alive.
client: ClientGuard,
/// Stream for sending requests. Uses a watch channel, so it can only send a single request at a
/// time, and the caller must await the response before sending another request. This is
/// enforced by `StreamGuard::send`.
sender: watch::Sender<page_api::GetPageRequest>,
/// Stream for receiving responses.
receiver: Pin<Box<dyn Stream<Item = tonic::Result<page_api::GetPageResponse>> + Send>>,
}
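
As a minimal, self-contained sketch of the watch-channel pattern `BiStream` relies on (assuming only the tokio and tokio-stream crates; the values are illustrative):

use futures::StreamExt as _;
use tokio::sync::watch;
use tokio_stream::wrappers::WatchStream;

#[tokio::main]
async fn main() {
    // A watch channel stores only the latest value, so it naturally enforces
    // a single in-flight request: sending again before reading the response
    // would overwrite the previous request.
    let (tx, rx) = watch::channel(0u32);
    // from_changes skips the initial value and yields only subsequent sends.
    let mut updates = WatchStream::from_changes(rx);
    tx.send(42).unwrap();
    assert_eq!(updates.next().await, Some(42));
}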
impl StreamPool {
/// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
/// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
/// Creates a new stream pool, using the given client pool. It will use up to `max_streams`
/// concurrent streams.
///
/// The client pool must be unbounded. The stream pool will enforce its own limits, and because
/// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
/// The stream pool should generally have its own dedicated client pool (but it can share a
/// channel pool with others since these are always unbounded).
pub fn new(
client_pool: Arc<ClientPool>,
max_streams: Option<NonZero<usize>>,
max_queue_depth: NonZero<usize>,
) -> Arc<Self> {
pub fn new(client_pool: Arc<ClientPool>, max_streams: Option<NonZero<usize>>) -> Arc<Self> {
assert!(client_pool.limiter.is_none(), "bounded client pool");
let pool = Arc::new(Self {
client_pool,
streams: Mutex::default(),
limiter: max_streams.map(|max_streams| {
Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
}),
max_streams,
max_queue_depth,
idle: Mutex::default(),
limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
next_stream_id: AtomicUsize::default(),
});
pool.idle_reaper.spawn(&pool);
pool
}
/// Acquires an available stream from the pool, or spins up a new stream async if all streams
/// are full. Returns a guard that can be used to send a single request on the stream and await
/// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
/// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
/// Acquires an idle stream from the pool, or spins up a new stream if none are available.
/// Returns a guard that can be used to send requests and await the responses. Blocks if the
/// pool is at its stream limit.
///
/// This is very performance-sensitive, as it is on the GetPage hot path.
///
/// TODO: this must do something more sophisticated for performance. We want:
///
/// * Cheap, concurrent access in the common case where we can use a pooled stream.
/// * Quick acquisition of pooled streams with available capacity.
/// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
/// * Prefer filling up existing streams' queue depth before spinning up new streams.
/// * Don't hold a lock while spinning up new streams.
/// * Allow concurrent clients to join onto streams while they're spun up.
/// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
///
/// For now, we just do something simple but inefficient (linear scan under mutex).
pub async fn get(self: &Arc<Self>) -> StreamGuard {
/// TODO: is a `Mutex<BTreeMap>` performant enough? Will it become too contended? We can't
/// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
/// to free up higher-ordered channels.
pub async fn get(self: &Arc<Self>) -> tonic::Result<StreamGuard> {
// Acquire a permit if the pool is bounded.
let mut permit = None;
if let Some(limiter) = self.limiter.clone() {
permit = Some(limiter.acquire_owned().await.expect("never closed"));
}
let mut streams = self.streams.lock().unwrap();
// Look for a pooled stream with available capacity.
for (&id, entry) in streams.iter_mut() {
assert!(
entry.queue_depth <= self.max_queue_depth.get(),
"stream queue overflow"
);
assert_eq!(
entry.idle_since.is_some(),
entry.queue_depth == 0,
"incorrect stream idle state"
);
if entry.queue_depth < self.max_queue_depth.get() {
entry.queue_depth += 1;
entry.idle_since = None;
return StreamGuard {
pool: Arc::downgrade(self),
id,
sender: entry.sender.clone(),
permit,
};
}
// Fast path: acquire an idle stream from the pool.
if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
return Ok(StreamGuard {
pool: Arc::downgrade(self),
stream: Some(entry.stream),
can_reuse: true,
permit,
});
}
// No available stream, spin up a new one. We install the stream entry in the pool first and
// return the guard, while spinning up the stream task async. This allows other callers to
// join onto this stream and also create additional streams concurrently if this fills up.
let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
let entry = StreamEntry {
sender: req_tx.clone(),
queue_depth: 1, // reserve quota for this caller
idle_since: None,
};
streams.insert(id, entry);
// Spin up a new stream. Uses a watch channel to send a single request at a time, since
// `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
let mut client = self.client_pool.get().await?;
if let Some(max_streams) = self.max_streams {
assert!(streams.len() <= max_streams.get(), "stream overflow");
};
let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
let req_stream = WatchStream::from_changes(req_rx);
let resp_stream = client.get_pages(req_stream).await?;
let client_pool = self.client_pool.clone();
let pool = Arc::downgrade(self);
tokio::spawn(async move {
if let Err(err) = Self::run_stream(client_pool, req_rx).await {
error!("stream failed: {err}");
}
// Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
if let Some(pool) = pool.upgrade() {
let entry = pool.streams.lock().unwrap().remove(&id);
assert!(entry.is_some(), "unknown stream ID: {id}");
}
});
StreamGuard {
Ok(StreamGuard {
pool: Arc::downgrade(self),
id,
sender: req_tx,
stream: Some(BiStream {
client,
sender: req_tx,
receiver: Box::pin(resp_stream),
}),
can_reuse: true,
permit,
}
}
/// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
/// bidirectional GetPage stream, then forwards requests and responses between callers and the
/// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
/// atomic with pool stream acquisition.
///
/// The task exits when the request channel is closed, or on a stream error. The caller is
/// responsible for removing the stream from the pool on exit.
async fn run_stream(
client_pool: Arc<ClientPool>,
mut caller_rx: RequestReceiver,
) -> anyhow::Result<()> {
// Acquire a client from the pool and create a stream.
let mut client = client_pool.get().await?;
// NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
// theoretically deadlock if both the client and server block on sends (since we're not
// reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
// low queue depths, but it was seen to happen with the libpq protocol so better safe than
// sorry. It should never buffer more than the queue depth anyway, but using an unbounded
// channel guarantees that it will never block.
let (req_tx, req_rx) = mpsc::unbounded_channel();
let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
let mut resp_stream = client.get_pages(req_stream).await?;
// Track caller response channels by request ID. If the task returns early, these response
// channels will be dropped and the waiting callers will receive an error.
//
// NB: this will leak entries if the server doesn't respond to a request (by request ID).
// It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and
// block further use. But we could consider reaping closed channels after some time.
let mut callers = HashMap::new();
// Process requests and responses.
loop {
tokio::select! {
// Receive requests from callers and send them to the stream.
req = caller_rx.recv() => {
// Shut down if request channel is closed.
let Some((req, resp_tx)) = req else {
return Ok(());
};
// Store the response channel by request ID.
if callers.contains_key(&req.request_id) {
// Error on request ID duplicates. Ignore callers that went away.
_ = resp_tx.send(Err(tonic::Status::invalid_argument(
format!("duplicate request ID: {}", req.request_id),
)));
continue;
}
callers.insert(req.request_id, resp_tx);
// Send the request on the stream. Bail out if the stream is closed.
req_tx.send(req).map_err(|_| {
tonic::Status::unavailable("stream closed")
})?;
}
// Receive responses from the stream and send them to callers.
resp = resp_stream.next() => {
// Shut down if the stream is closed, and bail out on stream errors.
let Some(resp) = resp.transpose()? else {
return Ok(())
};
// Send the response to the caller. Ignore errors if the caller went away.
let Some(resp_tx) = callers.remove(&resp.request_id) else {
warn!("received response for unknown request ID: {}", resp.request_id);
continue;
};
_ = resp_tx.send(Ok(resp));
}
}
}
})
}
}
impl Reapable for StreamPool {
/// Reaps streams that have been idle since before the cutoff.
fn reap_idle(&self, cutoff: Instant) {
self.streams.lock().unwrap().retain(|_, entry| {
let Some(idle_since) = entry.idle_since else {
assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
return true;
};
assert_eq!(entry.queue_depth, 0, "idle stream has requests");
idle_since >= cutoff
});
self.idle
.lock()
.unwrap()
.retain(|_, entry| entry.idle_since >= cutoff);
}
}
/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
/// depth. Queue depth is already reserved and will be returned on drop.
/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
/// in-flight requests on the stream, or the stream failed.
pub struct StreamGuard {
pool: Weak<StreamPool>,
id: StreamID,
sender: RequestSender,
stream: Option<BiStream>, // Some until dropped
can_reuse: bool, // returned to pool if true
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
}
impl StreamGuard {
/// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
/// valid for a single request (to enforce queue depth). This also drops the guard on return and
/// returns the queue depth quota to the pool.
/// Sends a request on the stream and awaits the response. If the future is dropped before it
/// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
/// request and is not returned to the pool. The same is true if the stream errors, in which
/// case the caller can't send further requests on the stream.
///
/// The `GetPageRequest::request_id` must be unique across in-flight requests.
/// We only support sending a single request at a time, to eliminate head-of-line blocking. See
/// module documentation for details.
///
/// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
/// to avoid tearing down the stream for per-request errors. Callers must check this.
pub async fn send(
self,
&mut self,
req: page_api::GetPageRequest,
) -> tonic::Result<page_api::GetPageResponse> {
let (resp_tx, resp_rx) = oneshot::channel();
let req_id = req.request_id;
let stream = self.stream.as_mut().expect("not dropped");
self.sender
.send((req, resp_tx))
.await
// Mark the stream as not reusable while the request is in flight. We can't return the
// stream to the pool until we receive the response, to avoid head-of-line blocking and
// stale responses. Failed streams can't be reused either.
if !self.can_reuse {
return Err(tonic::Status::internal("stream can't be reused"));
}
self.can_reuse = false;
// Send the request and receive the response.
//
// NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
stream
.sender
.send(req)
.map_err(|_| tonic::Status::unavailable("stream closed"))?;
resp_rx
let resp = stream
.receiver
.next()
.await
.map_err(|_| tonic::Status::unavailable("stream closed"))?
.ok_or_else(|| tonic::Status::unavailable("stream closed"))??;
if resp.request_id != req_id {
return Err(tonic::Status::internal(format!(
"response ID {} does not match request ID {}",
resp.request_id, req_id
)));
}
// Success, mark the stream as reusable.
self.can_reuse = true;
Ok(resp)
}
}
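
A hypothetical caller sketch, mirroring `get_page_with_shard` above: acquire a stream from a `Shard`, send a single request, and check the per-request status code:

let mut stream = shard.stream(false).await?; // false: use the non-bulk pool
let resp = stream.send(req).await?;
// Per-request errors may arrive as a status code in the response rather
// than a tonic::Status, to avoid tearing down the stream.
if resp.status_code != page_api::GetPageStatusCode::Ok {
    return Err(tonic::Status::internal(format!("{} response", resp.status_code)));
}
// Dropping the guard returns the now-idle stream to the pool.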
@@ -697,26 +585,21 @@ impl Drop for StreamGuard {
return; // pool was dropped
};
// Release the queue depth reservation on drop. This can prematurely decrement it if dropped
// before the response is received, but that's okay.
//
// TODO: actually, it's probably not okay. Queue depth release should be moved into the
// stream task, such that it continues to account for the queue depth slot until the server
// responds. Otherwise, if a slow request times out and keeps blocking the stream, the
// server will keep waiting on it and we can pile on subsequent requests (including the
// timeout retry) in the same stream and get blocked. But we may also want to avoid blocking
// requests on e.g. LSN waits and layer downloads, instead returning early to free up the
// stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line
// blocking. TBD.
let mut streams = pool.streams.lock().unwrap();
let entry = streams.get_mut(&self.id).expect("unknown stream");
assert!(entry.idle_since.is_none(), "active stream marked idle");
assert!(entry.queue_depth > 0, "stream queue underflow");
entry.queue_depth -= 1;
if entry.queue_depth == 0 {
entry.idle_since = Some(Instant::now()); // mark stream as idle
// If the stream isn't reusable, it can't be returned to the pool.
if !self.can_reuse {
return;
}
// Place the idle stream back into the pool.
let entry = StreamEntry {
stream: self.stream.take().expect("dropped once"),
idle_since: Instant::now(),
};
pool.idle
.lock()
.unwrap()
.insert(entry.stream.client.id, entry);
_ = self.permit; // returned on drop, referenced for visibility
}
}

View File

@@ -1,5 +1,6 @@
use std::time::Duration;
use futures::future::pending;
use tokio::time::Instant;
use tracing::{error, info, warn};
@@ -8,60 +9,54 @@ use utils::backoff::exponential_backoff_duration;
/// A retry handler for Pageserver gRPC requests.
///
/// This is used instead of backoff::retry for better control and observability.
pub struct Retry;
pub struct Retry {
/// Timeout across all retry attempts. If None, retries forever.
pub timeout: Option<Duration>,
/// The initial backoff duration. The first retry does not use a backoff.
pub base_backoff: Duration,
/// The maximum backoff duration.
pub max_backoff: Duration,
}
impl Retry {
/// The per-request timeout.
// TODO: tune these, and/or make them configurable. Should we retry forever?
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
/// The total timeout across all attempts.
const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
/// The initial backoff duration.
const BASE_BACKOFF: Duration = Duration::from_millis(10);
/// The maximum backoff duration.
const MAX_BACKOFF: Duration = Duration::from_secs(10);
/// If true, log successful requests. For debugging.
const LOG_SUCCESS: bool = false;
/// Runs the given async closure with timeouts and retries (exponential backoff), passing the
/// attempt number starting at 0. Logs errors, using the current tracing span for context.
/// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
/// using the current tracing span for context.
///
/// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
/// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
/// Only certain gRPC status codes are retried, see [`Self::should_retry`].
pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
where
F: FnMut(usize) -> O, // takes attempt number, starting at 0
F: FnMut(usize) -> O, // pass attempt number, starting at 0
O: Future<Output = tonic::Result<T>>,
{
let started = Instant::now();
let deadline = started + Self::TOTAL_TIMEOUT;
let deadline = self.timeout.map(|timeout| started + timeout);
let mut last_error = None;
let mut retries = 0;
loop {
// Set up a future to wait for the backoff (if any) and run the request with a timeout.
// Set up a future to wait for the backoff, if any, and run the closure.
let backoff_and_try = async {
// NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
// https://github.com/tokio-rs/tokio/issues/6866
if let Some(backoff) = Self::backoff_duration(retries) {
if let Some(backoff) = self.backoff_duration(retries) {
tokio::time::sleep(backoff).await;
}
let request_started = Instant::now();
tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries))
.await
.map_err(|_| {
tonic::Status::deadline_exceeded(format!(
"request timed out after {:.3}s",
request_started.elapsed().as_secs_f64()
))
})?
f(retries).await
};
// Wait for the backoff and request, or bail out if the total timeout is exceeded.
// Set up a future for the timeout, if any.
let timeout = async {
match deadline {
Some(deadline) => tokio::time::sleep_until(deadline).await,
None => pending().await,
}
};
// Wait for the backoff and request, or bail out if the timeout is exceeded.
let result = tokio::select! {
result = backoff_and_try => result,
_ = tokio::time::sleep_until(deadline) => {
_ = timeout => {
let last_error = last_error.unwrap_or_else(|| {
tonic::Status::deadline_exceeded(format!(
"request timed out after {:.3}s",
@@ -79,7 +74,7 @@ impl Retry {
match result {
// Success, return the result.
Ok(result) => {
if retries > 0 || Self::LOG_SUCCESS {
if retries > 0 {
info!(
"request succeeded after {retries} retries in {:.3}s",
started.elapsed().as_secs_f64(),
@@ -112,12 +107,13 @@ impl Retry {
}
}
/// Returns the backoff duration for the given retry attempt, or None for no backoff.
fn backoff_duration(retry: usize) -> Option<Duration> {
/// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
/// attempt and the first retry never back off, so this returns None for 0 and 1 retries.
fn backoff_duration(&self, retries: usize) -> Option<Duration> {
let backoff = exponential_backoff_duration(
retry as u32,
Self::BASE_BACKOFF.as_secs_f64(),
Self::MAX_BACKOFF.as_secs_f64(),
(retries as u32).saturating_sub(1), // first retry does not back off
self.base_backoff.as_secs_f64(),
self.max_backoff.as_secs_f64(),
);
(!backoff.is_zero()).then_some(backoff)
}

View File

@@ -49,7 +49,7 @@ impl From<ProtocolError> for tonic::Status {
}
/// The LSN a request should read at.
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, Default)]
pub struct ReadLsn {
/// The request's read LSN.
pub request_lsn: Lsn,
@@ -329,7 +329,7 @@ impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
}
/// Requests one or more pages.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct GetPageRequest {
/// A request ID. Will be included in the response. Should be unique for in-flight requests on
/// the stream.
@@ -430,12 +430,13 @@ impl From<RequestID> for proto::RequestId {
}
/// A GetPage request class.
#[derive(Clone, Copy, Debug, strum_macros::Display)]
#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
pub enum GetPageClass {
/// Unknown class. For backwards compatibility: used when an older client version sends a class
/// that a newer server version has removed.
Unknown,
/// A normal request. This is the default.
#[default]
Normal,
/// A prefetch request. NB: can only be classified on pg < 18.
Prefetch,
@@ -443,19 +444,6 @@ pub enum GetPageClass {
Background,
}
impl GetPageClass {
/// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
/// latency-sensitive).
pub fn is_bulk(&self) -> bool {
match self {
Self::Unknown => false,
Self::Normal => false,
Self::Prefetch => true,
Self::Background => true,
}
}
}
impl From<proto::GetPageClass> for GetPageClass {
fn from(pb: proto::GetPageClass) -> Self {
match pb {

View File

@@ -16,6 +16,7 @@ futures.workspace = true
hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
pprof.workspace = true
rand.workspace = true
reqwest.workspace = true
serde.workspace = true

View File

@@ -0,0 +1,127 @@
use std::sync::Arc;
use anyhow::anyhow;
use futures::StreamExt;
use tonic::transport::Endpoint;
use tracing::info;
use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag};
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use utils::shard::ShardIndex;
/// Starts a large number of idle gRPC GetPage streams.
#[derive(clap::Parser)]
pub(crate) struct Args {
/// The Pageserver to connect to. Must use grpc://.
#[clap(long, default_value = "grpc://localhost:51051")]
server: String,
/// The Pageserver HTTP API.
#[clap(long, default_value = "http://localhost:9898")]
http_server: String,
/// The number of streams to open.
#[clap(long, default_value = "100000")]
count: usize,
/// Number of streams per connection.
#[clap(long, default_value = "100")]
per_connection: usize,
/// Send a single GetPage request on each stream.
#[clap(long, default_value_t = false)]
send_request: bool,
}
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()?;
rt.block_on(main_impl(args))
}
async fn main_impl(args: Args) -> anyhow::Result<()> {
// Discover a tenant and timeline to use.
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
reqwest::Client::new(),
args.http_server.clone(),
None,
));
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: Some(1),
targets: None,
},
)
.await?;
let ttid = timelines
.first()
.ok_or_else(|| anyhow!("no timelines found"))?;
// Set up the initial client.
let endpoint = Endpoint::from_shared(args.server.clone())?;
let connect = async || {
pageserver_page_api::Client::new(
endpoint.connect().await?,
ttid.tenant_id,
ttid.timeline_id,
ShardIndex::unsharded(),
None,
None,
)
};
let mut client = connect().await?;
let mut streams = Vec::with_capacity(args.count);
// Create streams.
for i in 0..args.count {
if i % 100 == 0 {
info!("opened {}/{} streams", i, args.count);
}
if i % args.per_connection == 0 && i > 0 {
client = connect().await?;
}
let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel();
let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
let mut resp_stream = client.get_pages(req_stream).await?;
// Send request if specified.
if args.send_request {
req_tx.send(GetPageRequest {
request_id: 1.into(),
request_class: GetPageClass::Normal,
read_lsn: ReadLsn {
request_lsn: Lsn::MAX,
not_modified_since_lsn: Some(Lsn(1)),
},
rel: RelTag {
spcnode: 1664, // pg_global
dbnode: 0, // shared database
relnode: 1262, // pg_authid
forknum: 0, // init
},
block_numbers: vec![0],
})?;
let resp = resp_stream
.next()
.await
.transpose()?
.ok_or_else(|| anyhow!("no response"))?;
if resp.status_code != GetPageStatusCode::Ok {
return Err(anyhow!("{} response", resp.status_code));
}
}
// Hold onto streams to avoid closing them.
streams.push((req_tx, resp_stream));
}
info!("opened {} streams, sleeping", args.count);
// Block forever, to hold the idle streams open for inspection.
futures::future::pending::<()>().await;
Ok(())
}

View File

@@ -1,4 +1,7 @@
use std::fs::File;
use clap::Parser;
use tracing::info;
use utils::logging;
/// Re-usable pieces of code that aren't CLI-specific.
@@ -17,38 +20,73 @@ mod cmd {
pub(super) mod aux_files;
pub(super) mod basebackup;
pub(super) mod getpage_latest_lsn;
pub(super) mod idle_streams;
pub(super) mod ondemand_download_churn;
pub(super) mod trigger_initial_size_calculation;
}
/// Component-level performance test for pageserver.
#[derive(clap::Parser)]
enum Args {
struct Args {
/// Writes a client CPU profile to profile.svg. The benchmark must exit cleanly before it's
/// written, e.g. via --runtime.
#[arg(long)]
profile: bool,
#[command(subcommand)]
subcommand: Subcommand,
}
#[derive(clap::Subcommand)]
enum Subcommand {
Basebackup(cmd::basebackup::Args),
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
AuxFiles(cmd::aux_files::Args),
IdleStreams(cmd::idle_streams::Args),
}
fn main() {
fn main() -> anyhow::Result<()> {
logging::init(
logging::LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stderr,
)
.unwrap();
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let args = Args::parse();
match args {
Args::Basebackup(args) => cmd::basebackup::main(args),
Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
Args::TriggerInitialSizeCalculation(args) => {
// Start a CPU profile if requested.
let mut profiler = None;
if args.profile {
profiler = Some(
pprof::ProfilerGuardBuilder::default()
.frequency(1000)
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
.build()?,
);
}
match args.subcommand {
Subcommand::Basebackup(args) => cmd::basebackup::main(args),
Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
Subcommand::TriggerInitialSizeCalculation(args) => {
cmd::trigger_initial_size_calculation::main(args)
}
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
Args::AuxFiles(args) => cmd::aux_files::main(args),
Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
Subcommand::AuxFiles(args) => cmd::aux_files::main(args),
Subcommand::IdleStreams(args) => cmd::idle_streams::main(args),
}?;
// Generate a CPU flamegraph if requested.
if let Some(profiler) = profiler {
let report = profiler.report().build()?;
drop(profiler); // stop profiling
let file = File::create("profile.svg")?;
report.flamegraph(file)?;
info!("wrote CPU profile flamegraph to profile.svg")
}
.unwrap()
Ok(())
}

View File

@@ -114,7 +114,7 @@ where
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
// "zenith.signal" file, so that postgres can read it during startup.
// "neon.signal" file, so that postgres can read it during startup.
//
// We don't keep full history of record boundaries in the page server,
// however, only the predecessor of the latest record on each
@@ -751,34 +751,39 @@ where
//
// Add generated pg_control file and bootstrap WAL segment.
// Also send zenith.signal file with extra bootstrap data.
// Also send neon.signal and zenith.signal files with extra bootstrap data.
//
async fn add_pgcontrol_file(
&mut self,
pg_control_bytes: Bytes,
system_identifier: u64,
) -> Result<(), BasebackupError> {
// add zenith.signal file
let mut zenith_signal = String::new();
// add neon.signal file
let mut neon_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.timeline.is_ancestor_lsn(self.lsn) {
write!(zenith_signal, "PREV LSN: none")
write!(neon_signal, "PREV LSN: none")
.map_err(|e| BasebackupError::Server(e.into()))?;
} else {
write!(zenith_signal, "PREV LSN: invalid")
write!(neon_signal, "PREV LSN: invalid")
.map_err(|e| BasebackupError::Server(e.into()))?;
}
} else {
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn)
.map_err(|e| BasebackupError::Server(e.into()))?;
}
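// For illustration, the payload is a single line such as
// "PREV LSN: 0/15D3DD8" (an example LSN), or one of the literal
// strings "PREV LSN: none" / "PREV LSN: invalid" written above.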
self.ar
.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
// TODO: Remove zenith.signal once all historical computes have been replaced
// ... and thus support the neon.signal file.
for signalfilename in ["neon.signal", "zenith.signal"] {
self.ar
.append(
&new_tar_header(signalfilename, neon_signal.len() as u64)?,
neon_signal.as_bytes(),
)
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?;
}
//send pg_control
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;

View File

@@ -917,11 +917,6 @@ async fn create_remote_storage_client(
// If `test_remote_failures` is non-zero, wrap the client with a
// wrapper that simulates failures.
if conf.test_remote_failures > 0 {
if !cfg!(feature = "testing") {
anyhow::bail!(
"test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"
);
}
info!(
"Simulating remote failures for first {} attempts of each op",
conf.test_remote_failures

View File

@@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
listen_http_port: m.http_port,
listen_https_port: m.https_port,
availability_zone_id: az_id.expect("Checked above"),
node_ip_addr: None,
})
}
Err(e) => {

View File

@@ -10,6 +10,7 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, Result, anyhow};
use bytes::Bytes;
use enumset::EnumSet;
use futures::future::join_all;
use futures::{StreamExt, TryFutureExt};
@@ -46,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId};
use postgres_ffi::PgMajorVersion;
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
use scopeguard::defer;
use serde::{Deserialize, Serialize};
use serde_json::json;
use tenant_size_model::svg::SvgBranchKind;
use tenant_size_model::{SizeResult, StorageModel};
@@ -57,6 +59,7 @@ use utils::auth::SwappableJwtAuth;
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use wal_decoder::models::record::NeonWalRecord;
use crate::config::PageServerConf;
use crate::context;
@@ -77,6 +80,7 @@ use crate::tenant::remote_timeline_client::{
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::ValuesReconstructState;
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
@@ -397,6 +401,7 @@ async fn build_timeline_info(
timeline: &Arc<Timeline>,
include_non_incremental_logical_size: bool,
force_await_initial_logical_size: bool,
include_image_consistent_lsn: bool,
ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -421,6 +426,10 @@ async fn build_timeline_info(
.await?,
);
}
// HADRON
if include_image_consistent_lsn {
info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?);
}
Ok(info)
}
@@ -510,6 +519,8 @@ async fn build_timeline_info_common(
is_invisible: Some(is_invisible),
walreceiver_status,
// HADRON
image_consistent_lsn: None,
};
Ok(info)
}
@@ -712,6 +723,8 @@ async fn timeline_list_handler(
parse_query_param(&request, "include-non-incremental-logical-size")?;
let force_await_initial_logical_size: Option<bool> =
parse_query_param(&request, "force-await-initial-logical-size")?;
let include_image_consistent_lsn: Option<bool> =
parse_query_param(&request, "include-image-consistent-lsn")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
@@ -732,6 +745,7 @@ async fn timeline_list_handler(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
include_image_consistent_lsn.unwrap_or(false),
&ctx,
)
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -760,6 +774,9 @@ async fn timeline_and_offloaded_list_handler(
parse_query_param(&request, "include-non-incremental-logical-size")?;
let force_await_initial_logical_size: Option<bool> =
parse_query_param(&request, "force-await-initial-logical-size")?;
let include_image_consistent_lsn: Option<bool> =
parse_query_param(&request, "include-image-consistent-lsn")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
@@ -780,6 +797,7 @@ async fn timeline_and_offloaded_list_handler(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
include_image_consistent_lsn.unwrap_or(false),
&ctx,
)
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -964,6 +982,9 @@ async fn timeline_detail_handler(
parse_query_param(&request, "include-non-incremental-logical-size")?;
let force_await_initial_logical_size: Option<bool> =
parse_query_param(&request, "force-await-initial-logical-size")?;
// HADRON
let include_image_consistent_lsn: Option<bool> =
parse_query_param(&request, "include-image-consistent-lsn")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
// Logical size calculation needs downloading.
@@ -984,6 +1005,7 @@ async fn timeline_detail_handler(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
include_image_consistent_lsn.unwrap_or(false),
ctx,
)
.await
@@ -2690,6 +2712,16 @@ async fn deletion_queue_flush(
}
}
/// Tests whether a `GetPage@Lsn` request succeeds; useful for manual debugging.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
struct GetPageResponse {
pub page: Bytes,
pub layers_visited: u32,
pub delta_layers_visited: u32,
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,
}
async fn getpage_at_lsn_handler(
request: Request<Body>,
cancel: CancellationToken,
@@ -2740,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner(
// Use last_record_lsn if no lsn is provided
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let page = timeline.get(key.0, lsn, &ctx).await?;
if touch {
json_response(StatusCode::OK, ())
} else {
Result::<_, ApiError>::Ok(
Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.unwrap(),
)
let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential());
let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?;
let response = GetPageResponse {
page,
layers_visited: reconstruct_state.get_layers_visited(),
delta_layers_visited: reconstruct_state.get_delta_layers_visited(),
records: reconstruct_state.debug_state.records.clone(),
img: reconstruct_state.debug_state.img.clone(),
};
json_response(StatusCode::OK, response)
}
}
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
}
@@ -3643,6 +3678,7 @@ async fn activate_post_import_handler(
let timeline_info = build_timeline_info(
&timeline, false, // include_non_incremental_logical_size,
false, // force_await_initial_logical_size
false, // include_image_consistent_lsn
&ctx,
)
.await
@@ -4164,7 +4200,7 @@ pub fn make_router(
})
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",

View File

@@ -610,13 +610,13 @@ async fn import_file(
debug!("imported twophase file");
} else if file_path.starts_with("pg_wal") {
debug!("found wal file in base section. ignore it");
} else if file_path.starts_with("zenith.signal") {
} else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") {
// Parse zenith signal file to set correct previous LSN
let bytes = read_all_bytes(reader).await?;
// zenith.signal format is "PREV LSN: prev_lsn"
// neon.signal format is "PREV LSN: prev_lsn"
// TODO write serialization and deserialization in the same place.
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
let prev_lsn = match zenith_signal {
let neon_signal = std::str::from_utf8(&bytes)?.trim();
let prev_lsn = match neon_signal {
"PREV LSN: none" => Lsn(0),
"PREV LSN: invalid" => Lsn(0),
other => {
@@ -624,17 +624,17 @@ async fn import_file(
split[1]
.trim()
.parse::<Lsn>()
.context("can't parse zenith.signal")?
.context("can't parse neon.signal")?
}
};
// zenith.signal is not necessarily the last file, that we handle
// neon.signal is not necessarily the last file that we handle,
// but it is ok to call `finish_write()`, because final `modification.commit()`
// will update lsn once more to the final one.
let writer = modification.tline.writer().await;
writer.finish_write(prev_lsn);
debug!("imported zenith signal {}", prev_lsn);
debug!("imported neon signal {}", prev_lsn);
} else if file_path.starts_with("pg_tblspc") {
// TODO Backups exported from neon won't have pg_tblspc, but we will need
// this to import arbitrary postgres databases.

View File

@@ -3393,7 +3393,13 @@ impl TenantShard {
.collect_vec();
for timeline in timelines {
timeline.maybe_freeze_ephemeral_layer().await;
// Include a span with the timeline ID. The parent span already has the tenant ID.
let span =
info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id);
timeline
.maybe_freeze_ephemeral_layer()
.instrument(span)
.await;
}
}
@@ -12816,6 +12822,40 @@ mod tests {
},
]
);
Ok(())
}
#[tokio::test]
async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
let tenant_conf = pageserver_api::models::TenantConfig {
pitr_interval: Some(Duration::from_secs(7 * 3600)),
image_layer_force_creation_period: Some(Duration::from_secs(3600)),
..Default::default()
};
let tenant_id = TenantId::generate();
let harness = TenantHarness::create_custom(
"test_get_force_image_creation_lsn",
tenant_conf,
tenant_id,
ShardIdentity::unsharded(),
Generation::new(1),
)
.await?;
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
{
let writer = timeline.writer().await;
writer.finish_write(Lsn(5000));
}
let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
assert_eq!(image_creation_lsn, Lsn(4300));
Ok(())
}
}

View File

@@ -46,10 +46,11 @@
mod historic_layer_coverage;
mod layer_coverage;
use std::collections::{HashMap, VecDeque};
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::iter::Peekable;
use std::ops::Range;
use std::sync::Arc;
use std::time::Instant;
use anyhow::Result;
use historic_layer_coverage::BufferedHistoricLayerCoverage;
@@ -904,6 +905,103 @@ impl LayerMap {
max_stacked_deltas
}
/* BEGIN_HADRON */
/**
 * Compute the image-consistent LSN: the largest LSN below which all pages have been redone successfully.
 * It works by first collecting the most recent image layers into a map. Then, for each delta layer, it
 * visits all overlapping image layers to potentially raise their effective image LSN where there are gaps
 * (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], the effective image
 * LSN can be raised to 149, one below the delta's start, because there is no WAL record in between).
 * Finally, the image-consistent LSN is the minimum of the effective LSNs across all image layers.
 */
pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn {
struct ImageLayerInfo {
// creation LSN of the image layer
image_lsn: Lsn,
// the current minimum LSN of newer delta layers with overlapping key ranges
min_delta_lsn: Lsn,
}
let started_at = Instant::now();
let min_l0_deltas_lsn = {
let l0_deltas = self.level0_deltas();
l0_deltas
.iter()
.map(|layer| layer.get_lsn_range().start)
.min()
.unwrap_or(disk_consistent_lsn)
};
let global_key_range = Key::MIN..Key::MAX;
// step 1: collect all most recent image layers into a map
// map: end key to image_layer_info
let mut image_map: BTreeMap<Key, ImageLayerInfo> = BTreeMap::new();
for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) {
let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0));
image_map.insert(
img_range.end,
ImageLayerInfo {
image_lsn: img_lsn,
min_delta_lsn: min_l0_deltas_lsn,
},
);
}
// step 2: go through all delta layers, and update the image layer info with overlapping
// key ranges
for layer in self.historic.iter() {
if !layer.is_delta {
continue;
}
let delta_key_range = layer.get_key_range();
let delta_lsn_range = layer.get_lsn_range();
for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) {
debug_assert!(img_end_key >= &delta_key_range.start);
if delta_lsn_range.end > img_info.image_lsn {
// the delta layer includes WAL records after the image
// it's possible that the delta layer's start LSN < image LSN; step 3 simply ignores that case
img_info.min_delta_lsn =
std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start);
}
if img_end_key >= &delta_key_range.end {
// we have fully processed all overlapping image layers
break;
}
}
}
// step 3, go through all image layers and find the image consistent LSN
let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap();
let mut prev_key = Key::MIN;
for (img_key, img_info) in image_map {
tracing::debug!(
"Image layer {:?}:{} has min delta lsn {}",
Range {
start: prev_key,
end: img_key,
},
img_info.image_lsn,
img_info.min_delta_lsn,
);
let image_lsn = std::cmp::max(
img_info.image_lsn,
img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)),
);
img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn);
prev_key = img_key;
}
tracing::info!(
"computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layrs in total.",
img_consistent_lsn,
disk_consistent_lsn,
started_at.elapsed().as_millis(),
self.historic.len()
);
img_consistent_lsn
}
/* END_HADRON */
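To make the gap rule concrete: the result is the minimum over all images of max(image_lsn, min_overlapping_delta_start - 1), capped at one below the earliest L0 delta start. A minimal standalone sketch of that fold on toy data, with plain u64 LSNs (hypothetical helper, not the pageserver API):

// Toy version of step 3 above: (image_lsn, min_delta_lsn) pairs in, consistent LSN out.
fn image_consistent_lsn(images: &[(u64, u64)], min_l0_start: u64) -> u64 {
    // Upper bound: one less than the earliest L0 delta start.
    let mut result = min_l0_start - 1;
    for &(image_lsn, min_delta_lsn) in images {
        // A gap between an image and the next overlapping delta lets us advance past the image LSN.
        let effective = image_lsn.max(min_delta_lsn.saturating_sub(1));
        result = result.min(effective);
    }
    result
}

fn main() {
    // Image at LSN 100 whose earliest newer delta starts at 150: effective LSN 149.
    // Image at LSN 200 with no newer overlapping delta: min_delta_lsn stays at the L0 floor (900).
    assert_eq!(image_consistent_lsn(&[(100, 150), (200, 900)], 900), 149);
}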
/// Return all L0 delta layers
pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
&self.l0_delta_layers
@@ -1579,6 +1677,138 @@ mod tests {
LayerVisibilityHint::Visible
));
}
/* BEGIN_HADRON */
#[test]
fn test_compute_image_consistent_lsn() {
let mut layer_map = LayerMap::default();
let disk_consistent_lsn = Lsn(1000);
// case 1: empty layer map
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(
disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(),
image_consistent_lsn
);
// case 2: only L0 delta layer
{
let mut updates = layer_map.batch_update();
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(0)..Key::from_i128(100),
Lsn(900)..Lsn(990),
true,
));
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(0)..Key::from_i128(100),
Lsn(850)..Lsn(899),
true,
));
}
// should use min L0 delta LSN - 1 as image consistent LSN
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(Lsn(849), image_consistent_lsn);
// case 3: 3 images, no L1 delta
{
let mut updates = layer_map.batch_update();
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(0)..Key::from_i128(40),
Lsn(100)..Lsn(100),
false,
));
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(40)..Key::from_i128(70),
Lsn(200)..Lsn(200),
false,
));
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(70)..Key::from_i128(100),
Lsn(150)..Lsn(150),
false,
));
}
// should use min L0 delta LSN - 1 as image consistent LSN
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(Lsn(849), image_consistent_lsn);
// case 4: 3 images with 1 L1 delta
{
let mut updates = layer_map.batch_update();
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(0)..Key::from_i128(50),
Lsn(300)..Lsn(350),
true,
));
}
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(Lsn(299), image_consistent_lsn);
// case 5: 3 images with 1 more L1 delta with smaller LSN
{
let mut updates = layer_map.batch_update();
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(50)..Key::from_i128(72),
Lsn(200)..Lsn(300),
true,
));
}
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(Lsn(199), image_consistent_lsn);
// case 6: 3 images with more newer L1 deltas (no impact on final results)
{
let mut updates = layer_map.batch_update();
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(0)..Key::from_i128(30),
Lsn(400)..Lsn(500),
true,
));
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(35)..Key::from_i128(100),
Lsn(450)..Lsn(600),
true,
));
}
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(Lsn(199), image_consistent_lsn);
// case 7: 3 images with more older L1 deltas (no impact on final results)
{
let mut updates = layer_map.batch_update();
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(0)..Key::from_i128(40),
Lsn(0)..Lsn(50),
true,
));
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(50)..Key::from_i128(100),
Lsn(10)..Lsn(60),
true,
));
}
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(Lsn(199), image_consistent_lsn);
// case 8: 3 images with one more L1 delta with overlapping LSN range
{
let mut updates = layer_map.batch_update();
updates.insert_historic(PersistentLayerDesc::new_test(
Key::from_i128(0)..Key::from_i128(50),
Lsn(50)..Lsn(250),
true,
));
}
let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
assert_eq!(Lsn(100), image_consistent_lsn);
}
/* END_HADRON */
}
#[cfg(test)]

View File

@@ -1678,6 +1678,8 @@ impl TenantManager {
// Phase 6: Release the InProgress on the parent shard
drop(parent_slot_guard);
utils::pausable_failpoint!("shard-split-post-finish-pause");
Ok(child_shards)
}

View File

@@ -75,7 +75,7 @@ where
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug, Default)]
#[derive(Debug, Default, Clone)]
pub(crate) struct ValueReconstructState {
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
pub(crate) img: Option<(Lsn, Bytes)>,
@@ -308,6 +308,9 @@ pub struct ValuesReconstructState {
layers_visited: u32,
delta_layers_visited: u32,
pub(crate) enable_debug: bool,
pub(crate) debug_state: ValueReconstructState,
pub(crate) io_concurrency: IoConcurrency,
num_active_ios: Arc<AtomicUsize>,
@@ -657,6 +660,23 @@ impl ValuesReconstructState {
layers_visited: 0,
delta_layers_visited: 0,
io_concurrency,
enable_debug: false,
debug_state: ValueReconstructState::default(),
num_active_ios: Arc::new(AtomicUsize::new(0)),
read_path: None,
}
}
pub(crate) fn new_with_debug(io_concurrency: IoConcurrency) -> Self {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),
keys_with_image_coverage: None,
layers_visited: 0,
delta_layers_visited: 0,
io_concurrency,
enable_debug: true,
debug_state: ValueReconstructState::default(),
num_active_ios: Arc::new(AtomicUsize::new(0)),
read_path: None,
}
@@ -670,6 +690,12 @@ impl ValuesReconstructState {
self.io_concurrency.spawn_io(fut).await;
}
pub(crate) fn set_debug_state(&mut self, debug_state: &ValueReconstructState) {
if self.enable_debug {
self.debug_state = debug_state.clone();
}
}
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
self.layers_visited += 1;
if let ReadableLayer::PersistentLayer(layer) = layer {

View File

@@ -351,13 +351,6 @@ pub struct Timeline {
last_image_layer_creation_check_at: AtomicLsn,
last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
// HADRON
/// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation
/// on this key range.
force_image_creation_lsn: AtomicLsn,
/// The last time instant when force_image_creation_lsn is computed.
force_image_creation_lsn_computed_at: std::sync::Mutex<Option<Instant>>,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: LogicalSize,
@@ -1260,6 +1253,57 @@ impl Timeline {
}
}
#[inline(always)]
pub(crate) async fn debug_get(
&self,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
reconstruct_state: &mut ValuesReconstructState,
) -> Result<Bytes, PageReconstructError> {
if !lsn.is_valid() {
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
}
// This check is debug-only because of the cost of hashing, and because it's a double-check: we
// already checked the key against the shard_identity when looking up the Timeline from
// page_service.
debug_assert!(!self.shard_identity.is_key_disposable(&key));
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
let vectored_res = self
.debug_get_vectored_impl(query, reconstruct_state, ctx)
.await;
let key_value = vectored_res?.pop_first();
match key_value {
Some((got_key, value)) => {
if got_key != key {
error!(
"Expected {}, but singular vectored get returned {}",
key, got_key
);
Err(PageReconstructError::Other(anyhow!(
"Singular vectored get returned wrong key"
)))
} else {
value
}
}
None => Err(PageReconstructError::MissingKey(Box::new(
MissingKeyError {
keyspace: KeySpace::single(key..key.next()),
shard: self.shard_identity.get_shard_number(&key),
original_hwm_lsn: lsn,
ancestor_lsn: None,
backtrace: None,
read_path: None,
query: None,
},
))),
}
}
pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;
/// Look up multiple page versions at a given LSN
@@ -1554,6 +1598,98 @@ impl Timeline {
Ok(results)
}
// A copy of the get_vectored_impl method, except that we store the image and WAL records into `reconstruct_state`.
// This is only used by the HTTP getpage endpoint for debugging purposes.
pub(super) async fn debug_get_vectored_impl(
&self,
query: VersionedKeySpaceQuery,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
if query.is_empty() {
return Ok(BTreeMap::default());
}
let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
Some(ReadPath::new(
query.total_keyspace(),
query.high_watermark_lsn()?,
))
} else {
None
};
reconstruct_state.read_path = read_path;
let traversal_res: Result<(), _> = self
.get_vectored_reconstruct_data(query.clone(), reconstruct_state, ctx)
.await;
if let Err(err) = traversal_res {
// Wait for all the spawned IOs to complete.
// See comments on `spawn_io` inside `storage_layer` for more details.
let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
.into_values()
.map(|state| state.collect_pending_ios())
.collect::<FuturesUnordered<_>>();
while collect_futs.next().await.is_some() {}
return Err(err);
};
let reconstruct_state = Arc::new(Mutex::new(reconstruct_state));
let futs = FuturesUnordered::new();
for (key, state) in std::mem::take(&mut reconstruct_state.lock().unwrap().keys) {
let req_lsn_for_key = query.map_key_to_lsn(&key);
futs.push({
let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
let rc_clone = Arc::clone(&reconstruct_state);
async move {
assert_eq!(state.situation, ValueReconstructSituation::Complete);
let converted = match state.collect_pending_ios().await {
Ok(ok) => ok,
Err(err) => {
return (key, Err(err));
}
};
DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);
// The walredo module expects the records to be descending in terms of LSN.
// We submit the IOs in that order, so there should be no need to sort here.
debug_assert!(
converted
.records
.is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)),
"{converted:?}"
);
{
let mut guard = rc_clone.lock().unwrap();
guard.set_debug_state(&converted);
}
(
key,
walredo_self
.reconstruct_value(
key,
req_lsn_for_key,
converted,
RedoAttemptType::ReadPage,
)
.await,
)
}
});
}
let results = futs
.collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
.await;
Ok(results)
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
pub(crate) fn get_last_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().last
@@ -1900,6 +2036,8 @@ impl Timeline {
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
// ephemeral layer bytes has been breached.
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
let Ok(mut write_guard) = self.write_lock.try_lock() else {
// If the write lock is held, there is an active wal receiver: rolling open layers
// is their responsibility while they hold this lock.
@@ -2854,7 +2992,7 @@ impl Timeline {
}
// HADRON
fn get_image_creation_timeout(&self) -> Option<Duration> {
fn get_image_layer_force_creation_period(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
@@ -3134,9 +3272,6 @@ impl Timeline {
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
last_image_layer_creation_check_instant: Mutex::new(None),
// HADRON
force_image_creation_lsn: AtomicLsn::new(0),
force_image_creation_lsn_computed_at: std::sync::Mutex::new(None),
last_received_wal: Mutex::new(None),
rel_size_latest_cache: RwLock::new(HashMap::new()),
rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
@@ -5381,13 +5516,16 @@ impl Timeline {
}
// HADRON
// for child timelines, we consider all pages up to the ancestor LSN to have been redone successfully by the parent timeline
min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn());
if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
info!(
"forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}",
"forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}, num deltas: {}",
partition.ranges[0].start,
partition.ranges[0].end,
min_image_lsn,
force_image_creation_lsn.unwrap()
force_image_creation_lsn.unwrap(),
max_deltas
);
return true;
}
@@ -5611,10 +5749,11 @@ impl Timeline {
/// Predicate function which indicates whether we should check if new image layers
/// are required. Since checking if new image layers are required is expensive in
/// terms of CPU, we only do it in the following cases:
/// 1. If the timeline has ingested sufficient WAL to justify the cost
/// 1. If the timeline has ingested sufficient WAL to justify the cost or ...
/// 2. If enough time has passed since the last check:
/// 1. For large tenants, we wish to perform the check more often since they
/// suffer from the lack of image layers
/// suffer from the lack of image layers. Note that we assume sharded tenants
/// to be large since non-zero shards do not track the logical size.
/// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;
@@ -5628,30 +5767,39 @@ impl Timeline {
let distance_based_decision = distance.0 >= min_distance;
let mut time_based_decision = false;
let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
let check_required_after =
if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
self.get_checkpoint_timeout()
} else {
Duration::from_secs(3600 * 48)
};
time_based_decision = match *last_check_instant {
Some(last_check) => {
let elapsed = last_check.elapsed();
elapsed >= check_required_after
let check_required_after = (|| {
if self.shard_identity.is_unsharded() {
if let CurrentLogicalSize::Exact(logical_size) =
self.current_logical_size.current_size()
{
if Some(Into::<u64>::into(&logical_size)) < large_timeline_threshold {
return Duration::from_secs(3600 * 48);
}
}
None => true,
};
}
}
self.get_checkpoint_timeout()
})();
let time_based_decision = match *last_check_instant {
Some(last_check) => {
let elapsed = last_check.elapsed();
elapsed >= check_required_after
}
None => true,
};
// Do the expensive delta layer counting only if this timeline has ingested sufficient
// WAL since the last check or a checkpoint timeout interval has elapsed since the last
// check.
let decision = distance_based_decision || time_based_decision;
tracing::info!(
"Decided to check image layers: {}. Distance-based decision: {}, time-based decision: {}",
decision,
distance_based_decision,
time_based_decision
);
if decision {
self.last_image_layer_creation_check_at.store(lsn);
*last_check_instant = Some(Instant::now());
@@ -7153,6 +7301,19 @@ impl Timeline {
.unwrap()
.clone()
}
/* BEGIN_HADRON */
pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result<Lsn> {
let guard = self
.layers
.read(LayerManagerLockHolder::ComputeImageConsistentLsn)
.await;
let layer_map = guard.layer_map()?;
let disk_consistent_lsn = self.get_disk_consistent_lsn();
Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn))
}
/* END_HADRON */
}
impl Timeline {

View File

@@ -8,7 +8,7 @@ use std::cmp::min;
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime};
use std::time::{Duration, Instant};
use super::layer_manager::LayerManagerLockHolder;
use super::{
@@ -34,7 +34,6 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
use pageserver_compaction::interface::*;
use postgres_ffi::to_pg_timestamp;
use serde::Serialize;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio_util::sync::CancellationToken;
@@ -47,7 +46,6 @@ use wal_decoder::models::value::Value;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
@@ -1271,10 +1269,7 @@ impl Timeline {
// Define partitioning schema if needed
// HADRON
let force_image_creation_lsn = self
.get_or_compute_force_image_creation_lsn(cancel, ctx)
.await
.map_err(CompactionError::Other)?;
let force_image_creation_lsn = self.get_force_image_creation_lsn();
// 1. L0 Compact
let l0_outcome = {
@@ -1484,59 +1479,37 @@ impl Timeline {
}
/* BEGIN_HADRON */
// Get the force image creation LSN. Compute it if the last computed LSN is too old.
async fn get_or_compute_force_image_creation_lsn(
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Option<Lsn>> {
const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes
let image_layer_force_creation_period = self.get_image_creation_timeout();
if image_layer_force_creation_period.is_none() {
return Ok(None);
// Get the force image creation LSN based on gc_cutoff_lsn.
// Note that this is an estimate and the workload rate may change suddenly. When that happens,
// forced image creation may come too early or too late, but it eventually catches up.
pub(crate) fn get_force_image_creation_lsn(self: &Arc<Self>) -> Option<Lsn> {
let image_creation_period = self.get_image_layer_force_creation_period()?;
let current_lsn = self.get_last_record_lsn();
let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?;
let pitr_interval = self.get_pitr_interval();
if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() {
tracing::warn!(
"pitr LSN/interval not found, skipping force image creation LSN calculation"
);
return None;
}
let image_layer_force_creation_period = image_layer_force_creation_period.unwrap();
let force_image_creation_lsn_computed_at =
*self.force_image_creation_lsn_computed_at.lock().unwrap();
if force_image_creation_lsn_computed_at.is_none()
|| force_image_creation_lsn_computed_at.unwrap().elapsed()
> FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL
{
let now: SystemTime = SystemTime::now();
let timestamp = now
.checked_sub(image_layer_force_creation_period)
.ok_or_else(|| {
anyhow::anyhow!(
"image creation timeout is too large: {image_layer_force_creation_period:?}"
)
})?;
let timestamp = to_pg_timestamp(timestamp);
let force_image_creation_lsn = match self
.find_lsn_for_timestamp(timestamp, cancel, ctx)
.await?
{
LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn,
_ => {
let gc_lsn = *self.get_applied_gc_cutoff_lsn();
tracing::info!(
"no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}",
gc_lsn
);
gc_lsn
}
};
self.force_image_creation_lsn
.store(force_image_creation_lsn);
*self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now());
tracing::info!(
"computed force image creation LSN: {}",
force_image_creation_lsn
);
Ok(Some(force_image_creation_lsn))
} else {
Ok(Some(self.force_image_creation_lsn.load()))
}
let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0
* image_creation_period.as_secs()
/ pitr_interval.as_secs();
let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0));
tracing::info!(
"Tenant shard {} computed force_image_creation_lsn: {}. Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}",
self.tenant_shard_id,
force_image_creation_lsn,
current_lsn,
image_creation_period,
pitr_lsn,
pitr_interval
);
Some(force_image_creation_lsn)
}
/* END_HADRON */
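The arithmetic scales the PITR window down to the force-creation period under a constant-WAL-rate assumption. A minimal sketch with plain u64 LSNs, using the same numbers as test_get_force_image_creation_lsn above:

// Sketch of the proportional rule: shrink the (current - pitr_cutoff) LSN span
// by period / pitr_interval and step back that far from the current LSN.
fn force_image_creation_lsn(current: u64, pitr_cutoff: u64, period_secs: u64, pitr_secs: u64) -> u64 {
    let delta = (current - pitr_cutoff) * period_secs / pitr_secs;
    current.saturating_sub(delta)
}

fn main() {
    // (5000 - 100) * 3600 / (7 * 3600) = 700, so 5000 - 700 = 4300.
    assert_eq!(force_image_creation_lsn(5000, 100, 3600, 7 * 3600), 4300);
}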

View File

@@ -359,14 +359,14 @@ impl<T: Types> Cache<T> {
Err(e) => {
// Retry on tenant manager error to handle tenant split more gracefully
if attempt < GET_MAX_RETRIES {
tracing::warn!(
"Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
attempt,
e
);
tokio::time::sleep(RETRY_BACKOFF).await;
continue;
} else {
tracing::warn!(
"Failed to resolve tenant shard after {} attempts: {:?}",
GET_MAX_RETRIES,
e
);
return Err(e);
}
}

View File

@@ -47,6 +47,7 @@ pub(crate) enum LayerManagerLockHolder {
ImportPgData,
DetachAncestor,
Eviction,
ComputeImageConsistentLsn,
#[cfg(test)]
Testing,
}

View File

@@ -147,6 +147,16 @@ pub enum RedoAttemptType {
GcCompaction,
}
impl std::fmt::Display for RedoAttemptType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
RedoAttemptType::ReadPage => write!(f, "read page"),
RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"),
RedoAttemptType::GcCompaction => write!(f, "gc compaction"),
}
}
}
///
/// Public interface of WAL redo manager
///
@@ -199,6 +209,7 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
max_retry_attempts,
redo_attempt_type,
)
.await
};
@@ -221,6 +232,7 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
max_retry_attempts,
redo_attempt_type,
)
.await
}
@@ -445,6 +457,7 @@ impl PostgresRedoManager {
wal_redo_timeout: Duration,
pg_version: PgMajorVersion,
max_retry_attempts: u32,
redo_attempt_type: RedoAttemptType,
) -> Result<Bytes, Error> {
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
@@ -485,17 +498,28 @@ impl PostgresRedoManager {
);
if let Err(e) = result.as_ref() {
error!(
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn,
n_attempts,
e,
);
macro_rules! message {
($level:tt) => {
$level!(
"error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
key,
redo_attempt_type,
base_img_lsn,
lsn,
n_attempts,
e,
)
}
}
match redo_attempt_type {
RedoAttemptType::ReadPage => message!(error),
RedoAttemptType::LegacyCompaction => message!(error),
RedoAttemptType::GcCompaction => message!(warn),
}
}
result.map_err(Error::Other)

View File

@@ -38,6 +38,8 @@ DATA = \
neon--1.3--1.4.sql \
neon--1.4--1.5.sql \
neon--1.5--1.6.sql \
neon--1.6--1.7.sql \
neon--1.7--1.6.sql \
neon--1.6--1.5.sql \
neon--1.5--1.4.sql \
neon--1.4--1.3.sql \

View File

@@ -54,6 +54,7 @@
*/
#include "postgres.h"
#include "access/twophase.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
#include "access/xlog_internal.h"
@@ -64,6 +65,7 @@
#include "miscadmin.h"
#include "port/pg_iovec.h"
#include "postmaster/interrupt.h"
#include "postmaster/postmaster.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "utils/timeout.h"
@@ -75,11 +77,18 @@
#include "neon_perf_counters.h"
#include "pagestore_client.h"
#if PG_VERSION_NUM >= 150000
#if PG_MAJORVERSION_NUM >= 17
#include "storage/procnumber.h"
#else
#define MyProcNumber MyProc->pgprocno
#endif
#if PG_MAJORVERSION_NUM >= 15
#include "access/xlogrecovery.h"
#endif
#if PG_VERSION_NUM < 160000
#if PG_MAJORVERSION_NUM < 16
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
@@ -294,6 +303,15 @@ static PrefetchState *MyPState;
static process_interrupts_callback_t prev_interrupt_cb;
/*
 * Array in shared memory; each cell holds the minimal in-flight request LSN sent to the PS by the backend whose
 * procno is used as the index into this array. The array is initially filled with InfiniteXLogRecPtr (UINT64_MAX),
 * so a backend that has sent no request to the PS does not affect the global minimum.
 *
 * We support only 64-bit platforms, so we assume that access to array elements is atomic and no synchronization is needed.
*/
static XLogRecPtr* minPrefetchLsn;
static bool compact_prefetch_buffers(void);
static void consume_prefetch_responses(void);
static uint64 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
@@ -316,6 +334,41 @@ pg_init_communicator(void)
ProcessInterruptsCallback = communicator_processinterrupts;
}
static Size
CommunicatorShmemSize(void)
{
#if PG_MAJORVERSION_NUM >= 15
Assert(MaxBackends != 0);
return (MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts) * sizeof(XLogRecPtr);
#else
return (MAX_BACKENDS + NUM_AUXILIARY_PROCS + max_prepared_xacts) * sizeof(XLogRecPtr);
#endif
}
void
CommunicatorShmemRequest(void)
{
RequestAddinShmemSpace(CommunicatorShmemSize());
}
void
CommunicatorShmemInit(void)
{
bool found;
minPrefetchLsn = (XLogRecPtr*)ShmemInitStruct("Communicator shared state",
CommunicatorShmemSize(),
&found);
if (!found)
{
/*
* Fill with InfiniteXLogRecPtr (UINT64_MAX).
* If a backend hasn't sent any requests to the PS, InfiniteXLogRecPtr doesn't affect the global minimum.
*/
memset(minPrefetchLsn, 0xFF, CommunicatorShmemSize());
}
}
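The 0xFF fill works because every array element is an unsigned 64-bit integer, so an all-ones byte pattern reads back as UINT64_MAX, i.e. InfiniteXLogRecPtr. A one-assert Rust check of that assumption:

fn main() {
    // memset(ptr, 0xFF, n) over u64 slots yields u64::MAX in every slot.
    assert_eq!(u64::from_ne_bytes([0xFF; 8]), u64::MAX);
}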
static bool
compact_prefetch_buffers(void)
{
@@ -421,7 +474,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
{
if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse)
{
neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld",
neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=" UINT64_FORMAT ", ring_flush=" UINT64_FORMAT ", ring_unused=" UINT64_FORMAT "",
resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused);
}
if (neon_protocol_version >= 3)
@@ -438,7 +491,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
getpage_resp->req.blkno != slot->buftag.blockNum)
{
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
"Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
"Receive unexpected getpage response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum);
}
@@ -447,13 +500,27 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
resp->lsn != slot->request_lsns.request_lsn ||
resp->not_modified_since != slot->request_lsns.not_modified_since)
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
}
}
}
/*
* Update min in-flight prefetch LSN for this backend.
*/
static void
update_min_prefetch_lsn(uint64 ring_index)
{
if (ring_index + 1 < MyPState->ring_unused)
{
PrefetchRequest* next_slot = GetPrfSlot(ring_index + 1);
Assert(minPrefetchLsn[MyProcNumber] <= next_slot->request_lsns.request_lsn);
minPrefetchLsn[MyProcNumber] = next_slot->request_lsns.request_lsn;
}
}
/*
* If there might be responses still in the TCP buffer, then we should try to
* use those, to reduce any TCP backpressure on the OS/PS side.
@@ -478,8 +545,9 @@ communicator_prefetch_pump_state(void)
NeonResponse *response;
PrefetchRequest *slot;
MemoryContext old;
uint64 my_ring_index = MyPState->ring_receive;
slot = GetPrfSlot(MyPState->ring_receive);
slot = GetPrfSlot(my_ring_index);
old = MemoryContextSwitchTo(MyPState->errctx);
response = page_server->try_receive(slot->shard_no);
@@ -488,17 +556,19 @@ communicator_prefetch_pump_state(void)
if (response == NULL)
break;
update_min_prefetch_lsn(my_ring_index);
check_getpage_response(slot, response);
/* The slot should still be valid */
if (slot->status != PRFS_REQUESTED ||
slot->response != NULL ||
slot->my_ring_index != MyPState->ring_receive)
slot->my_ring_index != my_ring_index)
{
neon_shard_log(slot->shard_no, PANIC,
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
"Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
slot->status, slot->response,
(long) slot->my_ring_index, (long) MyPState->ring_receive);
slot->my_ring_index, my_ring_index);
}
/* update prefetch state */
MyPState->n_responses_buffered += 1;
@@ -665,6 +735,9 @@ consume_prefetch_responses(void)
{
if (MyPState->ring_receive < MyPState->ring_unused)
prefetch_wait_for(MyPState->ring_unused - 1);
minPrefetchLsn[MyProcNumber] = InfiniteXLogRecPtr; /* No more in-flight prefetch requests from this backend */
/*
* We know for sure we're not working on any prefetch pages after
* this.
@@ -789,9 +862,9 @@ prefetch_read(PrefetchRequest *slot)
slot->my_ring_index != MyPState->ring_receive)
{
neon_shard_log(slot->shard_no, PANIC,
"Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
"Incorrect prefetch read: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
slot->status, slot->response,
(long)slot->my_ring_index, (long)MyPState->ring_receive);
slot->my_ring_index, MyPState->ring_receive);
}
/*
@@ -806,6 +879,9 @@ prefetch_read(PrefetchRequest *slot)
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive(shard_no);
MemoryContextSwitchTo(old);
update_min_prefetch_lsn(my_ring_index);
if (response)
{
check_getpage_response(slot, response);
@@ -816,9 +892,9 @@ prefetch_read(PrefetchRequest *slot)
slot->my_ring_index != MyPState->ring_receive)
{
neon_shard_log(shard_no, PANIC,
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
"Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
slot->status, slot->response,
(long) slot->my_ring_index, (long) MyPState->ring_receive);
slot->my_ring_index, MyPState->ring_receive);
}
/* update prefetch state */
@@ -852,8 +928,8 @@ prefetch_read(PrefetchRequest *slot)
* and the prefetch queue was flushed during the receive call
*/
neon_shard_log(shard_no, LOG,
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
(long) my_ring_index,
"No response from reading prefetch entry " UINT64_FORMAT ": %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
buftag.forkNum, buftag.blockNum);
return false;
@@ -924,6 +1000,8 @@ prefetch_on_ps_disconnect(void)
MyNeonCounters->getpage_prefetch_discards_total += 1;
}
minPrefetchLsn[MyProcNumber] = InfiniteXLogRecPtr; /* No more in-flight prefetch requests from this backend */
/*
* We can have gone into retry due to network error, so update stats with
* the latest available
@@ -1025,6 +1103,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
minPrefetchLsn[MyProcNumber] = Min(request.hdr.lsn, minPrefetchLsn[MyProcNumber]);
while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
{
Assert(mySlotNo == MyPState->ring_unused);
@@ -1045,6 +1125,23 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
Assert(!found);
}
/*
* Check that the returned page LSN is consistent with the request LSNs
*/
static void
check_page_lsn(NeonGetPageResponse* resp)
{
if (PageGetLSN(resp->page) > resp->req.hdr.not_modified_since)
neon_log(PANIC, "Invalid getpage response version: %X/%08X is higher than last modified LSN %X/%08X",
LSN_FORMAT_ARGS(PageGetLSN(resp->page)),
LSN_FORMAT_ARGS(resp->req.hdr.not_modified_since));
if (PageGetLSN(resp->page) > resp->req.hdr.lsn)
neon_log(PANIC, "Invalid getpage response version: %X/%08X is higher than request LSN %X/%08X",
LSN_FORMAT_ARGS(PageGetLSN(resp->page)),
LSN_FORMAT_ARGS(resp->req.hdr.lsn));
}
/*
 * Look up already-received prefetch requests. Only responses that have already been received and match the required LSNs are accepted.
 * Present pages are marked in the "mask" bitmap, and the total number of such pages is returned.
@@ -1068,7 +1165,7 @@ communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumbe
for (int i = 0; i < nblocks; i++)
{
PrfHashEntry *entry;
NeonGetPageResponse* resp;
hashkey.buftag.blockNum = blocknum + i;
entry = prfh_lookup(MyPState->prf_hash, &hashkey);
@@ -1101,8 +1198,9 @@ communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumbe
continue;
}
Assert(slot->response->tag == T_NeonGetPageResponse); /* checked by check_getpage_response when response was assigned to the slot */
memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
resp = (NeonGetPageResponse*)slot->response;
check_page_lsn(resp);
memcpy(buffers[i], resp->page, BLCKSZ);
/*
* With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
@@ -1453,6 +1551,7 @@ page_server_request(void const *req)
PG_TRY();
{
before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
minPrefetchLsn[MyProcNumber] = ((NeonRequest *)req)->lsn;
do
{
while (!page_server->send(shard_no, (NeonRequest *) req)
@@ -1464,10 +1563,12 @@ page_server_request(void const *req)
resp = page_server->receive(shard_no);
MyNeonCounters->pageserver_open_requests--;
} while (resp == NULL);
minPrefetchLsn[MyProcNumber] = InfiniteXLogRecPtr;
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
}
PG_CATCH();
{
minPrefetchLsn[MyProcNumber] = InfiniteXLogRecPtr;
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
/* Nothing should cancel disconnect: we should not leave connection in opaque state */
HOLD_INTERRUPTS();
@@ -1844,7 +1945,7 @@ nm_to_string(NeonMessage *msg)
NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
appendStringInfo(&s, ", \"db_size\": %ld}",
appendStringInfo(&s, ", \"db_size\": " INT64_FORMAT "}",
msg_resp->db_size);
appendStringInfoChar(&s, '}');
@@ -1888,7 +1989,7 @@ communicator_init(void)
* the check here. That's OK, we don't expect the logic to change in old
* releases.
*/
#if PG_VERSION_NUM>=150000
#if PG_MAJORVERSION_NUM >= 15
if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
elog(ERROR, "MyNeonCounters points past end of array");
#endif
@@ -1967,7 +2068,7 @@ neon_prefetch_response_usable(neon_request_lsns *request_lsns,
* Each request to the pageserver has three LSN values associated with it:
* `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
* `not_modified_since` and `request_lsn` are sent to the pageserver, but
* in the primary node, we always use UINT64_MAX as the `request_lsn`, so
* in the primary node, we always use InfiniteXLogRecPtr as the `request_lsn`, so
* we remember `effective_request_lsn` separately. In a primary,
* `effective_request_lsn` is the same as `not_modified_since`.
* See comments in neon_get_request_lsns why we can not use last flush WAL position here.
@@ -2045,7 +2146,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
exists_resp->req.forknum != request.forknum)
{
NEON_PANIC_CONNECTION_STATE(0, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
}
@@ -2058,14 +2159,14 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
resp->reqid,
RelFileInfoFmt(rinfo),
forkNum,
@@ -2227,6 +2328,7 @@ Retry:
case T_NeonGetPageResponse:
{
NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
check_page_lsn(getpage_resp);
memcpy(buffer, getpage_resp->page, BLCKSZ);
/*
@@ -2241,7 +2343,7 @@ Retry:
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg(NEON_TAG "[shard %d, reqid " UINT64_HEX_FORMAT "] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
errdetail("page server returned error: %s",
@@ -2294,7 +2396,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
relsize_resp->req.forknum != forknum)
{
NEON_PANIC_CONNECTION_STATE(0, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
}
@@ -2307,14 +2409,14 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
resp->reqid,
RelFileInfoFmt(rinfo),
forknum,
@@ -2364,7 +2466,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
dbsize_resp->req.dbNode != dbNode)
{
NEON_PANIC_CONNECTION_STATE(0, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
}
@@ -2377,14 +2479,14 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read db size of db %u from page server at lsn %X/%08X",
resp->reqid,
dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
errdetail("page server returned error: %s",
@@ -2424,15 +2526,18 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
PG_TRY();
{
before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
minPrefetchLsn[MyProcNumber] = request_lsns->request_lsn;
do
{
while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
resp = page_server->receive(shard_no);
} while (resp == NULL);
minPrefetchLsn[MyProcNumber] = InfiniteXLogRecPtr;
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
}
PG_CATCH();
{
minPrefetchLsn[MyProcNumber] = InfiniteXLogRecPtr;
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
/* Nothing should cancel disconnect: we should not leave connection in opaque state */
HOLD_INTERRUPTS();
@@ -2455,7 +2560,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
slru_resp->req.segno != segno)
{
NEON_PANIC_CONNECTION_STATE(0, PANIC,
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
}
@@ -2469,14 +2574,14 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
{
if (!equal_requests(resp, &request.hdr))
{
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
}
}
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X",
errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read SLRU %d segment %llu at lsn %X/%08X",
resp->reqid,
kind,
(unsigned long long) segno,
@@ -2577,3 +2682,19 @@ communicator_processinterrupts(void)
return prev_interrupt_cb();
}
PG_FUNCTION_INFO_V1(neon_communicator_min_inflight_request_lsn);
Datum
neon_communicator_min_inflight_request_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr min_lsn = RecoveryInProgress()
? GetXLogReplayRecPtr(NULL)
: InfiniteXLogRecPtr;
size_t n_procs = ProcGlobal->allProcCount;
for (size_t i = 0; i < n_procs; i++)
{
min_lsn = Min(min_lsn, minPrefetchLsn[i]);
}
PG_RETURN_INT64(min_lsn);
}
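Conceptually, the SQL-callable function is a lock-free fold over the per-backend slots: start from the replay LSN on a replica (or infinity on a primary) and take the minimum. A small Rust sketch of that aggregation, with AtomicU64 slots standing in for the shared-memory array (hypothetical names, not the extension's API):

use std::sync::atomic::{AtomicU64, Ordering};

const INFINITE_LSN: u64 = u64::MAX;

fn min_inflight_request_lsn(slots: &[AtomicU64], replay_lsn: Option<u64>) -> u64 {
    // Idle backends hold INFINITE_LSN, so they never lower the result.
    let mut min = replay_lsn.unwrap_or(INFINITE_LSN);
    for slot in slots {
        min = min.min(slot.load(Ordering::Relaxed));
    }
    min
}

fn main() {
    let slots = [AtomicU64::new(INFINITE_LSN), AtomicU64::new(0x1000)];
    assert_eq!(min_inflight_request_lsn(&slots, None), 0x1000);
}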

View File

@@ -46,5 +46,4 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
extern void communicator_reconfigure_timeout_if_needed(void);
extern void communicator_prefetch_pump_state(void);
#endif

View File

@@ -162,8 +162,34 @@ typedef struct FileCacheControl
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
dlist_head holes; /* double linked list of punched holes */
HyperLogLogState wss_estimation; /* estimation of working set size */
ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
/*
* Estimation of working set size.
*
* This is not guarded by the lock. No locking is needed because all the
* writes to the "registers" are simple 64-bit stores, to update a
* timestamp. We assume that:
*
* - 64-bit stores are atomic. We could enforce that by using
* pg_atomic_uint64 instead of TimestampTz as the datatype in hll.h, but
* for now we just rely on it implicitly.
*
* - Even if they're not, and there is a race between two stores, it
* doesn't matter much which one wins because they're both updating the
* register with the current timestamp. Or you have a race between
* resetting the register and updating it, in which case it also doesn't
* matter much which one wins.
*
* - If they're not atomic, you might get an occasional "torn write" if
* you're really unlucky, but we tolerate that too. It just means that
* the estimate will be a little off, until the register is updated
* again.
*/
HyperLogLogState wss_estimation;
/* Prewarmer state */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
size_t n_prewarm_workers;
size_t n_prewarm_entries;
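The race-tolerance argument in the comment above boils down to: a register only ever holds "some recent timestamp", so plain relaxed stores suffice. A minimal Rust illustration of that pattern, using AtomicU64 as the register type (the analogue of the pg_atomic_uint64 the comment mentions):

use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{SystemTime, UNIX_EPOCH};

fn touch_register(reg: &AtomicU64) {
    let now_us = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_micros() as u64;
    // If two writers race, either value is an acceptable "recent" timestamp,
    // so no read-modify-write or ordering stronger than Relaxed is needed.
    reg.store(now_us, Ordering::Relaxed);
}

fn main() {
    let reg = AtomicU64::new(0);
    touch_register(&reg);
    assert!(reg.load(Ordering::Relaxed) > 0);
}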
@@ -193,10 +219,6 @@ static char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
static bool lfc_do_prewarm;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
bool lfc_store_prefetch_result;
bool lfc_prewarm_update_ws_estimation;
@@ -205,6 +227,8 @@ bool AmPrewarmWorker;
#define LFC_ENABLED() (lfc_ctl->limit != 0)
PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
/*
* Close LFC file if opened.
* All backends should close their LFC files once LFC is disabled.
@@ -314,18 +338,14 @@ lfc_ensure_opened(void)
return true;
}
static void
lfc_shmem_startup(void)
void
LfcShmemInit(void)
{
bool found;
static HASHCTL info;
if (prev_shmem_startup_hook)
{
prev_shmem_startup_hook();
}
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
if (lfc_max_size <= 0)
return;
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
if (!found)
@@ -370,19 +390,16 @@ lfc_shmem_startup(void)
ConditionVariableInit(&lfc_ctl->cv[i]);
}
LWLockRelease(AddinShmemInitLock);
}
static void
lfc_shmem_request(void)
void
LfcShmemRequest(void)
{
#if PG_VERSION_NUM>=150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
RequestNamedLWLockTranche("lfc_lock", 1);
if (lfc_max_size > 0)
{
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
RequestNamedLWLockTranche("lfc_lock", 1);
}
}
static bool
@@ -614,18 +631,6 @@ lfc_init(void)
NULL,
NULL,
NULL);
if (lfc_max_size == 0)
return;
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = lfc_shmem_startup;
#if PG_VERSION_NUM>=150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = lfc_shmem_request;
#else
lfc_shmem_request();
#endif
}
FileCacheState*
@@ -1142,6 +1147,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
/* Update working set size estimate for the blocks */
for (int i = 0; i < nblocks; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
}
/*
* For every chunk that has blocks we're interested in, we
* 1. get the chunk header
@@ -1220,14 +1232,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
/* Approximate working set for the blocks assumed in this entry */
for (int i = 0; i < blocks_in_chunk; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
}
if (entry == NULL)
{
/* Pages are not cached */
@@ -1504,9 +1508,15 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
return false;
CopyNRelFileInfoToBufTag(tag, rinfo);
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
tag.forkNum = forknum;
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
/* Update working set size estimate for the blocks */
if (lfc_prewarm_update_ws_estimation)
{
tag.blockNum = blkno;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
}
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
@@ -1524,19 +1534,13 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
if (lwlsn > lsn)
{
elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
elog(DEBUG1, "Skip LFC write for %u because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn));
LWLockRelease(lfc_lock);
return false;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
if (lfc_prewarm_update_ws_estimation)
{
tag.blockNum = blkno;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
}
if (found)
{
state = GET_STATE(entry, chunk_offs);
@@ -1649,9 +1653,15 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
return;
CopyNRelFileInfoToBufTag(tag, rinfo);
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
tag.forkNum = forkNum;
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
/* Update working set size estimate for the blocks */
for (int i = 0; i < nblocks; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
}
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -1692,14 +1702,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
cv = &lfc_ctl->cv[hash % N_COND_VARS];
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
/* Approximate working set for the blocks assumed in this entry */
for (int i = 0; i < blocks_in_chunk; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
}
if (found)
{
/*
@@ -2135,40 +2137,23 @@ local_cache_pages(PG_FUNCTION_ARGS)
SRF_RETURN_DONE(funcctx);
}
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
Datum
approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
/*
* Internal implementation of the approximate_working_set_size_seconds()
* function.
*/
int32
lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
{
if (lfc_size_limit != 0)
{
int32 dc;
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
LWLockAcquire(lfc_lock, LW_SHARED);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
LWLockRelease(lfc_lock);
PG_RETURN_INT32(dc);
}
PG_RETURN_NULL();
}
int32 dc;
PG_FUNCTION_INFO_V1(approximate_working_set_size);
if (lfc_size_limit == 0)
return -1;
Datum
approximate_working_set_size(PG_FUNCTION_ARGS)
{
if (lfc_size_limit != 0)
{
int32 dc;
bool reset = PG_GETARG_BOOL(0);
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
if (reset)
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
LWLockRelease(lfc_lock);
PG_RETURN_INT32(dc);
}
PG_RETURN_NULL();
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
if (reset)
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
return dc;
}
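/*
 * A minimal sketch (hypothetical caller, not part of this change) of using
 * the new entry point directly from C; -1 means the LFC is disabled:
 *
 *     int32 dc = lfc_approximate_working_set_size_seconds(300, false);
 *
 *     if (dc >= 0)
 *         elog(LOG, "~%d distinct pages touched in the last 5 minutes", dc);
 */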
PG_FUNCTION_INFO_V1(get_local_cache_state);

View File

@@ -47,7 +47,8 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk
extern FileCacheState* lfc_get_state(size_t max_entries);
extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
static inline bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

View File

@@ -118,10 +118,6 @@ typedef struct
ShardMap shard_map;
} PagestoreShmemState;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
#endif
static shmem_startup_hook_type prev_shmem_startup_hook;
static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
@@ -1284,18 +1280,12 @@ check_neon_id(char **newval, void **extra, GucSource source)
return **newval == '\0' || HexDecodeString(id, *newval, 16);
}
static Size
PagestoreShmemSize(void)
{
return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
}
static bool
void
PagestoreShmemInit(void)
{
bool found;
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pagestore_shared = ShmemInitStruct("libpagestore shared state",
sizeof(PagestoreShmemState),
&found);
@@ -1306,44 +1296,12 @@ PagestoreShmemInit(void)
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
AssignPageserverConnstring(page_server_connstring, NULL);
}
NeonPerfCountersShmemInit();
LWLockRelease(AddinShmemInitLock);
return found;
}
static void
pagestore_shmem_startup_hook(void)
void
PagestoreShmemRequest(void)
{
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
PagestoreShmemInit();
}
static void
pagestore_shmem_request(void)
{
#if PG_VERSION_NUM >= 150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(PagestoreShmemSize());
}
static void
pagestore_prepare_shmem(void)
{
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = pagestore_shmem_request;
#else
pagestore_shmem_request();
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = pagestore_shmem_startup_hook;
RequestAddinShmemSpace(sizeof(PagestoreShmemState));
}
/*
@@ -1352,8 +1310,6 @@ pagestore_prepare_shmem(void)
void
pg_init_libpagestore(void)
{
pagestore_prepare_shmem();
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
NULL,
@@ -1504,8 +1460,6 @@ pg_init_libpagestore(void)
0,
NULL, NULL, NULL);
relsize_hash_init();
if (page_server != NULL)
neon_log(ERROR, "libpagestore already loaded");

View File

@@ -0,0 +1,3 @@
create function neon_communicator_min_inflight_request_lsn() returns pg_catalog.pg_lsn
AS 'MODULE_PATHNAME', 'neon_communicator_min_inflight_request_lsn'
LANGUAGE C;
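-- Hypothetical usage (not part of this migration): returns the minimum LSN
-- among still-in-flight prefetch requests, intended for replica leases:
--   SELECT neon_communicator_min_inflight_request_lsn();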

View File

@@ -0,0 +1 @@
drop function neon_communicator_min_inflight_request_lsn();

View File

@@ -22,6 +22,7 @@
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/proc.h"
#include "storage/ipc.h"
#include "funcapi.h"
#include "access/htup_details.h"
#include "utils/builtins.h"
@@ -59,11 +60,15 @@ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void neon_ExecutorEnd(QueryDesc *queryDesc);
#if PG_MAJORVERSION_NUM >= 16
static shmem_startup_hook_type prev_shmem_startup_hook;
static void neon_shmem_startup_hook(void);
static void neon_shmem_request_hook(void);
#if PG_MAJORVERSION_NUM >= 15
static shmem_request_hook_type prev_shmem_request_hook = NULL;
#endif
#if PG_MAJORVERSION_NUM >= 17
uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
uint32 WAIT_EVENT_NEON_LFC_READ;
@@ -450,15 +455,13 @@ _PG_init(void)
*/
#if PG_VERSION_NUM >= 160000
load_file("$libdir/neon_rmgr", false);
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = neon_shmem_startup_hook;
#endif
/* dummy call to a Rust function in the communicator library, to check that it works */
(void) communicator_dummy(123);
pg_init_libpagestore();
relsize_hash_init();
lfc_init();
pg_init_walproposer();
init_lwlsncache();
@@ -552,6 +555,16 @@ _PG_init(void)
ReportSearchPath();
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = neon_shmem_request_hook;
#else
neon_shmem_request_hook();
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = neon_shmem_startup_hook;
prev_ExecutorStart = ExecutorStart_hook;
ExecutorStart_hook = neon_ExecutorStart;
prev_ExecutorEnd = ExecutorEnd_hook;
@@ -561,6 +574,8 @@ _PG_init(void)
PG_FUNCTION_INFO_V1(pg_cluster_size);
PG_FUNCTION_INFO_V1(backpressure_lsns);
PG_FUNCTION_INFO_V1(backpressure_throttling_time);
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
PG_FUNCTION_INFO_V1(approximate_working_set_size);
Datum
pg_cluster_size(PG_FUNCTION_ARGS)
@@ -607,7 +622,52 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
PG_RETURN_UINT64(BackpressureThrottlingTime());
}
#if PG_MAJORVERSION_NUM >= 16
Datum
approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
{
time_t duration;
int32 dc;
duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
dc = lfc_approximate_working_set_size_seconds(duration, false);
if (dc < 0)
PG_RETURN_NULL();
else
PG_RETURN_INT32(dc);
}
Datum
approximate_working_set_size(PG_FUNCTION_ARGS)
{
bool reset = PG_GETARG_BOOL(0);
int32 dc;
dc = lfc_approximate_working_set_size_seconds(-1, reset);
if (dc < 0)
PG_RETURN_NULL();
else
PG_RETURN_INT32(dc);
}
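/*
 * Illustrative SQL mapping of the two wrappers above (a NULL result means
 * the LFC is disabled):
 *
 *     SELECT approximate_working_set_size_seconds(300);
 *     SELECT approximate_working_set_size(true);  -- estimate, then reset
 */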
static void
neon_shmem_request_hook(void)
{
#if PG_VERSION_NUM >= 150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
LfcShmemRequest();
NeonPerfCountersShmemRequest();
PagestoreShmemRequest();
RelsizeCacheShmemRequest();
CommunicatorShmemRequest();
WalproposerShmemRequest();
LwLsnCacheShmemRequest();
}
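/*
 * Ordering note: on PG15+ this runs from the shmem_request hook, i.e. in
 * the postmaster before shared memory is created; on older versions
 * _PG_init() calls it directly. Every module must request its space here,
 * because the matching *ShmemInit() call below can only attach to memory
 * that was requested.
 */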
static void
neon_shmem_startup_hook(void)
{
@@ -615,6 +675,16 @@ neon_shmem_startup_hook(void)
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
LfcShmemInit();
NeonPerfCountersShmemInit();
PagestoreShmemInit();
RelsizeCacheShmemInit();
CommunicatorShmemInit();
WalproposerShmemInit();
LwLsnCacheShmemInit();
#if PG_MAJORVERSION_NUM >= 17
WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");
WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read");
@@ -627,8 +697,9 @@ neon_shmem_startup_hook(void)
WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
#endif
LWLockRelease(AddinShmemInitLock);
}
#endif
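/*
 * A sketch (hypothetical module, following the convention above) of the
 * per-module pair this refactoring expects: request space in
 * *ShmemRequest(), then attach in *ShmemInit() while
 * neon_shmem_startup_hook() holds AddinShmemInitLock:
 *
 *     void
 *     ExampleShmemRequest(void)
 *     {
 *         RequestAddinShmemSpace(sizeof(ExampleShmemState));
 *     }
 *
 *     void
 *     ExampleShmemInit(void)
 *     {
 *         bool found;
 *
 *         example_shmem = ShmemInitStruct("example", sizeof(ExampleShmemState), &found);
 *         if (!found)
 *             memset(example_shmem, 0, sizeof(ExampleShmemState));
 *     }
 */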
/*
* ExecutorStart hook: start up tracking if needed

View File

@@ -58,6 +58,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL;
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
#define InfiniteXLogRecPtr UINT64_MAX
extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);
@@ -71,4 +72,21 @@ extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]);
extern PGDLLEXPORT void WalProposerMain(Datum main_arg);
extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
extern void LfcShmemRequest(void);
extern void PagestoreShmemRequest(void);
extern void RelsizeCacheShmemRequest(void);
extern void CommunicatorShmemRequest(void);
extern void WalproposerShmemRequest(void);
extern void LwLsnCacheShmemRequest(void);
extern void NeonPerfCountersShmemRequest(void);
extern void LfcShmemInit(void);
extern void PagestoreShmemInit(void);
extern void RelsizeCacheShmemInit(void);
extern void CommunicatorShmemInit(void);
extern void WalproposerShmemInit(void);
extern void LwLsnCacheShmemInit(void);
extern void NeonPerfCountersShmemInit(void);
#endif /* NEON_H */

View File

@@ -1,5 +1,6 @@
#include "postgres.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "miscadmin.h"
@@ -81,14 +82,6 @@ static set_max_lwlsn_hook_type prev_set_max_lwlsn_hook = NULL;
static set_lwlsn_relation_hook_type prev_set_lwlsn_relation_hook = NULL;
static set_lwlsn_db_hook_type prev_set_lwlsn_db_hook = NULL;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static void shmemrequest(void);
static void shmeminit(void);
static void neon_set_max_lwlsn(XLogRecPtr lsn);
void
@@ -99,16 +92,6 @@ init_lwlsncache(void)
lwlc_register_gucs();
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = shmeminit;
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = shmemrequest;
#else
shmemrequest();
#endif
prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook;
set_lwlsn_block_range_hook = neon_set_lwlsn_block_range;
prev_set_lwlsn_block_v_hook = set_lwlsn_block_v_hook;
@@ -124,20 +107,19 @@ init_lwlsncache(void)
}
static void shmemrequest(void) {
void
LwLsnCacheShmemRequest(void)
{
Size requested_size = sizeof(LwLsnCacheCtl);
requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry));
RequestAddinShmemSpace(requested_size);
#if PG_VERSION_NUM >= 150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
}
static void shmeminit(void) {
void
LwLsnCacheShmemInit(void)
{
static HASHCTL info;
bool found;
if (lwlsn_cache_size > 0)
@@ -157,9 +139,6 @@ static void shmeminit(void) {
}
dlist_init(&LwLsnCache->lastWrittenLsnLRU);
LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr();
if (prev_shmem_startup_hook) {
prev_shmem_startup_hook();
}
}
/*

View File

@@ -17,22 +17,21 @@
#include "storage/shmem.h"
#include "utils/builtins.h"
#include "neon.h"
#include "neon_perf_counters.h"
#include "neon_pgversioncompat.h"
neon_per_backend_counters *neon_per_backend_counters_shared;
Size
NeonPerfCountersShmemSize(void)
void
NeonPerfCountersShmemRequest(void)
{
Size size = 0;
size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
sizeof(neon_per_backend_counters)));
return size;
Size size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters));
RequestAddinShmemSpace(size);
}
void
NeonPerfCountersShmemInit(void)
{

View File

@@ -165,4 +165,8 @@ extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
extern TimeLineID GetWALInsertionTimeLine(void);
#endif
/* format codes not present in PG17 and earlier, but available in PG18+ */
#define INT64_HEX_FORMAT "%" INT64_MODIFIER "x"
#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "x"
#endif /* NEON_PGVERSIONCOMPAT_H */

View File

@@ -250,7 +250,6 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
extern void smgr_init_neon(void);
extern void readahead_buffer_resize(int newsize, void *extra);
/*
* LSN values associated with each request to the pageserver
*/

View File

@@ -675,7 +675,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
* always have that problem as it can always lag behind the
* primary, but for the primary we can avoid it by always
* requesting the latest page, by setting request LSN to
* UINT64_MAX.
* InfiniteXLogRecPtr.
*
* effective_request_lsn is used to check that received response is still valid.
* In case of primary node it is last written LSN. Originally we used flush_lsn here,
@@ -703,7 +703,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
* The problem can be fixed by calling GetFlushRecPtr() before checking if the page is in the buffer cache.
* But you can't do that within smgrprefetch(); you would need to modify the caller.
*/
result->request_lsn = UINT64_MAX;
result->request_lsn = InfiniteXLogRecPtr;
result->not_modified_since = last_written_lsn;
result->effective_request_lsn = last_written_lsn;
}
@@ -2158,7 +2158,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsn = nm_adjust_lsn(request_lsn);
}
else
request_lsn = UINT64_MAX;
request_lsn = InfiniteXLogRecPtr;
/*
* GetRedoStartLsn() returns the LSN of the basebackup. We know that the SLRU

View File

@@ -10,6 +10,7 @@
*/
#include "postgres.h"
#include "neon.h"
#include "neon_pgversioncompat.h"
#include "pagestore_client.h"
@@ -49,32 +50,23 @@ typedef struct
* algorithm */
} RelSizeHashControl;
static HTAB *relsize_hash;
static LWLockId relsize_lock;
static int relsize_hash_size;
static RelSizeHashControl* relsize_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void relsize_shmem_request(void);
#endif
/*
* Size of a cache entry is 36 bytes. So this default will take about 2.3 MB,
* which seems reasonable.
*/
#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
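/* Worked out: 64 * 1024 entries * 36 bytes/entry = 2,359,296 bytes ≈ 2.3 MB (2.25 MiB). */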
static void
neon_smgr_shmem_startup(void)
static HTAB *relsize_hash;
static LWLockId relsize_lock;
static int relsize_hash_size = DEFAULT_RELSIZE_HASH_SIZE;
static RelSizeHashControl* relsize_ctl;
void
RelsizeCacheShmemInit(void)
{
static HASHCTL info;
bool found;
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found);
if (!found)
{
@@ -85,7 +77,6 @@ neon_smgr_shmem_startup(void)
relsize_hash_size, relsize_hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
LWLockRelease(AddinShmemInitLock);
relsize_ctl->size = 0;
relsize_ctl->hits = 0;
relsize_ctl->misses = 0;
@@ -242,34 +233,15 @@ relsize_hash_init(void)
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
if (relsize_hash_size > 0)
{
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = relsize_shmem_request;
#else
RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
RequestNamedLWLockTranche("neon_relsize", 1);
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = neon_smgr_shmem_startup;
}
}
#if PG_VERSION_NUM >= 150000
/*
* shmem_request hook: request additional shared resources. We'll allocate or
* attach to the shared resources in neon_smgr_shmem_startup().
*/
static void
relsize_shmem_request(void)
void
RelsizeCacheShmemRequest(void)
{
if (prev_shmem_request_hook)
prev_shmem_request_hook();
RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
RequestNamedLWLockTranche("neon_relsize", 1);
}
#endif

View File

@@ -83,10 +83,8 @@ static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr;
static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr;
static HotStandbyFeedback agg_hs_feedback;
static void nwp_shmem_startup_hook(void);
static void nwp_register_gucs(void);
static void assign_neon_safekeepers(const char *newval, void *extra);
static void nwp_prepare_shmem(void);
static uint64 backpressure_lag_impl(void);
static uint64 startup_backpressure_wrap(void);
static bool backpressure_throttling_impl(void);
@@ -99,11 +97,6 @@ static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
static void walprop_pg_load_libpqwalreceiver(void);
static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL;
static shmem_startup_hook_type prev_shmem_startup_hook_type;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void walproposer_shmem_request(void);
#endif
static void WalproposerShmemInit_SyncSafekeeper(void);
@@ -193,8 +186,6 @@ pg_init_walproposer(void)
nwp_register_gucs();
nwp_prepare_shmem();
delay_backend_us = &startup_backpressure_wrap;
PrevProcessInterruptsCallback = ProcessInterruptsCallback;
ProcessInterruptsCallback = backpressure_throttling_impl;
@@ -482,12 +473,11 @@ WalproposerShmemSize(void)
return sizeof(WalproposerShmemState);
}
static bool
void
WalproposerShmemInit(void)
{
bool found;
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
walprop_shared = ShmemInitStruct("Walproposer shared state",
sizeof(WalproposerShmemState),
&found);
@@ -504,9 +494,6 @@ WalproposerShmemInit(void)
pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
/* END_HADRON */
}
LWLockRelease(AddinShmemInitLock);
return found;
}
static void
@@ -609,42 +596,15 @@ walprop_register_bgworker(void)
/* shmem handling */
static void
nwp_prepare_shmem(void)
{
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = walproposer_shmem_request;
#else
RequestAddinShmemSpace(WalproposerShmemSize());
#endif
prev_shmem_startup_hook_type = shmem_startup_hook;
shmem_startup_hook = nwp_shmem_startup_hook;
}
#if PG_VERSION_NUM >= 150000
/*
* shmem_request hook: request additional shared resources. We'll allocate or
* attach to the shared resources in nwp_shmem_startup_hook().
* attach to the shared resources in WalproposerShmemInit().
*/
static void
walproposer_shmem_request(void)
void
WalproposerShmemRequest(void)
{
if (prev_shmem_request_hook)
prev_shmem_request_hook();
RequestAddinShmemSpace(WalproposerShmemSize());
}
#endif
static void
nwp_shmem_startup_hook(void)
{
if (prev_shmem_startup_hook_type)
prev_shmem_startup_hook_type();
WalproposerShmemInit();
}
WalproposerShmemState *
GetWalpropShmemState(void)

View File

@@ -236,13 +236,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
bool save_neon_test_evict;
/*
* Temporarily set the zenith_test_evict GUC, so that when we pin and
* Temporarily set the neon_test_evict GUC, so that when we pin and
* unpin a buffer, the buffer is evicted. We use that hack to evict all
* buffers, as there is no explicit "evict this buffer" function in the
* buffer manager.
*/
save_neon_test_evict = zenith_test_evict;
zenith_test_evict = true;
save_neon_test_evict = neon_test_evict;
neon_test_evict = true;
PG_TRY();
{
/* Scan through all the buffers */
@@ -273,7 +273,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
/*
* Pin the buffer, and release it again. Because we have
* zenith_test_evict==true, this will evict the page from the
* neon_test_evict==true, this will evict the page from the
* buffer cache if no one else is holding a pin on it.
*/
if (isvalid)
@@ -286,7 +286,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
PG_FINALLY();
{
/* restore the GUC */
zenith_test_evict = save_neon_test_evict;
neon_test_evict = save_neon_test_evict;
}
PG_END_TRY();

View File

@@ -2953,17 +2953,17 @@ XmlTableBuilderData
YYLTYPE
YYSTYPE
YY_BUFFER_STATE
ZenithErrorResponse
ZenithExistsRequest
ZenithExistsResponse
ZenithGetPageRequest
ZenithGetPageResponse
ZenithMessage
ZenithMessageTag
ZenithNblocksRequest
ZenithNblocksResponse
ZenithRequest
ZenithResponse
NeonErrorResponse
NeonExistsRequest
NeonExistsResponse
NeonGetPageRequest
NeonGetPageResponse
NeonMessage
NeonMessageTag
NeonNblocksRequest
NeonNblocksResponse
NeonRequest
NeonResponse
_SPI_connection
_SPI_plan
__AssignProcessToJobObject

209
poetry.lock generated
View File

@@ -2,127 +2,123 @@
[[package]]
name = "aiohappyeyeballs"
version = "2.3.5"
version = "2.6.1"
description = "Happy Eyeballs for asyncio"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
{file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
{file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"},
{file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"},
]
[[package]]
name = "aiohttp"
version = "3.10.11"
version = "3.12.14"
description = "Async http client/server framework (asyncio)"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"},
{file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"},
{file = "aiohttp-3.10.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffbfde2443696345e23a3c597049b1dd43049bb65337837574205e7368472177"},
{file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20b3d9e416774d41813bc02fdc0663379c01817b0874b932b81c7f777f67b217"},
{file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b943011b45ee6bf74b22245c6faab736363678e910504dd7531a58c76c9015a"},
{file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48bc1d924490f0d0b3658fe5c4b081a4d56ebb58af80a6729d4bd13ea569797a"},
{file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e12eb3f4b1f72aaaf6acd27d045753b18101524f72ae071ae1c91c1cd44ef115"},
{file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f14ebc419a568c2eff3c1ed35f634435c24ead2fe19c07426af41e7adb68713a"},
{file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:72b191cdf35a518bfc7ca87d770d30941decc5aaf897ec8b484eb5cc8c7706f3"},
{file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5ab2328a61fdc86424ee540d0aeb8b73bbcad7351fb7cf7a6546fc0bcffa0038"},
{file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:aa93063d4af05c49276cf14e419550a3f45258b6b9d1f16403e777f1addf4519"},
{file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:30283f9d0ce420363c24c5c2421e71a738a2155f10adbb1a11a4d4d6d2715cfc"},
{file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e5358addc8044ee49143c546d2182c15b4ac3a60be01c3209374ace05af5733d"},
{file = "aiohttp-3.10.11-cp310-cp310-win32.whl", hash = "sha256:e1ffa713d3ea7cdcd4aea9cddccab41edf6882fa9552940344c44e59652e1120"},
{file = "aiohttp-3.10.11-cp310-cp310-win_amd64.whl", hash = "sha256:778cbd01f18ff78b5dd23c77eb82987ee4ba23408cbed233009fd570dda7e674"},
{file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:80ff08556c7f59a7972b1e8919f62e9c069c33566a6d28586771711e0eea4f07"},
{file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c8f96e9ee19f04c4914e4e7a42a60861066d3e1abf05c726f38d9d0a466e695"},
{file = "aiohttp-3.10.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fb8601394d537da9221947b5d6e62b064c9a43e88a1ecd7414d21a1a6fba9c24"},
{file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ea224cf7bc2d8856d6971cea73b1d50c9c51d36971faf1abc169a0d5f85a382"},
{file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db9503f79e12d5d80b3efd4d01312853565c05367493379df76d2674af881caa"},
{file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0f449a50cc33f0384f633894d8d3cd020e3ccef81879c6e6245c3c375c448625"},
{file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82052be3e6d9e0c123499127782a01a2b224b8af8c62ab46b3f6197035ad94e9"},
{file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20063c7acf1eec550c8eb098deb5ed9e1bb0521613b03bb93644b810986027ac"},
{file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:489cced07a4c11488f47aab1f00d0c572506883f877af100a38f1fedaa884c3a"},
{file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ea9b3bab329aeaa603ed3bf605f1e2a6f36496ad7e0e1aa42025f368ee2dc07b"},
{file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ca117819d8ad113413016cb29774b3f6d99ad23c220069789fc050267b786c16"},
{file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2dfb612dcbe70fb7cdcf3499e8d483079b89749c857a8f6e80263b021745c730"},
{file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9b615d3da0d60e7d53c62e22b4fd1c70f4ae5993a44687b011ea3a2e49051b8"},
{file = "aiohttp-3.10.11-cp311-cp311-win32.whl", hash = "sha256:29103f9099b6068bbdf44d6a3d090e0a0b2be6d3c9f16a070dd9d0d910ec08f9"},
{file = "aiohttp-3.10.11-cp311-cp311-win_amd64.whl", hash = "sha256:236b28ceb79532da85d59aa9b9bf873b364e27a0acb2ceaba475dc61cffb6f3f"},
{file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7480519f70e32bfb101d71fb9a1f330fbd291655a4c1c922232a48c458c52710"},
{file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f65267266c9aeb2287a6622ee2bb39490292552f9fbf851baabc04c9f84e048d"},
{file = "aiohttp-3.10.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7400a93d629a0608dc1d6c55f1e3d6e07f7375745aaa8bd7f085571e4d1cee97"},
{file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f34b97e4b11b8d4eb2c3a4f975be626cc8af99ff479da7de49ac2c6d02d35725"},
{file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e7b825da878464a252ccff2958838f9caa82f32a8dbc334eb9b34a026e2c636"},
{file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f92a344c50b9667827da308473005f34767b6a2a60d9acff56ae94f895f385"},
{file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc6f1ab987a27b83c5268a17218463c2ec08dbb754195113867a27b166cd6087"},
{file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1dc0f4ca54842173d03322793ebcf2c8cc2d34ae91cc762478e295d8e361e03f"},
{file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7ce6a51469bfaacff146e59e7fb61c9c23006495d11cc24c514a455032bcfa03"},
{file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:aad3cd91d484d065ede16f3cf15408254e2469e3f613b241a1db552c5eb7ab7d"},
{file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f4df4b8ca97f658c880fb4b90b1d1ec528315d4030af1ec763247ebfd33d8b9a"},
{file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2e4e18a0a2d03531edbc06c366954e40a3f8d2a88d2b936bbe78a0c75a3aab3e"},
{file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6ce66780fa1a20e45bc753cda2a149daa6dbf1561fc1289fa0c308391c7bc0a4"},
{file = "aiohttp-3.10.11-cp312-cp312-win32.whl", hash = "sha256:a919c8957695ea4c0e7a3e8d16494e3477b86f33067478f43106921c2fef15bb"},
{file = "aiohttp-3.10.11-cp312-cp312-win_amd64.whl", hash = "sha256:b5e29706e6389a2283a91611c91bf24f218962717c8f3b4e528ef529d112ee27"},
{file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:703938e22434d7d14ec22f9f310559331f455018389222eed132808cd8f44127"},
{file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9bc50b63648840854e00084c2b43035a62e033cb9b06d8c22b409d56eb098413"},
{file = "aiohttp-3.10.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5f0463bf8b0754bc744e1feb61590706823795041e63edf30118a6f0bf577461"},
{file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6c6dec398ac5a87cb3a407b068e1106b20ef001c344e34154616183fe684288"},
{file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bcaf2d79104d53d4dcf934f7ce76d3d155302d07dae24dff6c9fffd217568067"},
{file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:25fd5470922091b5a9aeeb7e75be609e16b4fba81cdeaf12981393fb240dd10e"},
{file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbde2ca67230923a42161b1f408c3992ae6e0be782dca0c44cb3206bf330dee1"},
{file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:249c8ff8d26a8b41a0f12f9df804e7c685ca35a207e2410adbd3e924217b9006"},
{file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:878ca6a931ee8c486a8f7b432b65431d095c522cbeb34892bee5be97b3481d0f"},
{file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8663f7777ce775f0413324be0d96d9730959b2ca73d9b7e2c2c90539139cbdd6"},
{file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6cd3f10b01f0c31481fba8d302b61603a2acb37b9d30e1d14e0f5a58b7b18a31"},
{file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e8d8aad9402d3aa02fdc5ca2fe68bcb9fdfe1f77b40b10410a94c7f408b664d"},
{file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:38e3c4f80196b4f6c3a85d134a534a56f52da9cb8d8e7af1b79a32eefee73a00"},
{file = "aiohttp-3.10.11-cp313-cp313-win32.whl", hash = "sha256:fc31820cfc3b2863c6e95e14fcf815dc7afe52480b4dc03393c4873bb5599f71"},
{file = "aiohttp-3.10.11-cp313-cp313-win_amd64.whl", hash = "sha256:4996ff1345704ffdd6d75fb06ed175938c133425af616142e7187f28dc75f14e"},
{file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:74baf1a7d948b3d640badeac333af581a367ab916b37e44cf90a0334157cdfd2"},
{file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:473aebc3b871646e1940c05268d451f2543a1d209f47035b594b9d4e91ce8339"},
{file = "aiohttp-3.10.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c2f746a6968c54ab2186574e15c3f14f3e7f67aef12b761e043b33b89c5b5f95"},
{file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d110cabad8360ffa0dec8f6ec60e43286e9d251e77db4763a87dcfe55b4adb92"},
{file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0099c7d5d7afff4202a0c670e5b723f7718810000b4abcbc96b064129e64bc7"},
{file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0316e624b754dbbf8c872b62fe6dcb395ef20c70e59890dfa0de9eafccd2849d"},
{file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a5f7ab8baf13314e6b2485965cbacb94afff1e93466ac4d06a47a81c50f9cca"},
{file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c891011e76041e6508cbfc469dd1a8ea09bc24e87e4c204e05f150c4c455a5fa"},
{file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9208299251370ee815473270c52cd3f7069ee9ed348d941d574d1457d2c73e8b"},
{file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:459f0f32c8356e8125f45eeff0ecf2b1cb6db1551304972702f34cd9e6c44658"},
{file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:14cdc8c1810bbd4b4b9f142eeee23cda528ae4e57ea0923551a9af4820980e39"},
{file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:971aa438a29701d4b34e4943e91b5e984c3ae6ccbf80dd9efaffb01bd0b243a9"},
{file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:9a309c5de392dfe0f32ee57fa43ed8fc6ddf9985425e84bd51ed66bb16bce3a7"},
{file = "aiohttp-3.10.11-cp38-cp38-win32.whl", hash = "sha256:9ec1628180241d906a0840b38f162a3215114b14541f1a8711c368a8739a9be4"},
{file = "aiohttp-3.10.11-cp38-cp38-win_amd64.whl", hash = "sha256:9c6e0ffd52c929f985c7258f83185d17c76d4275ad22e90aa29f38e211aacbec"},
{file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cdc493a2e5d8dc79b2df5bec9558425bcd39aff59fc949810cbd0832e294b106"},
{file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b3e70f24e7d0405be2348da9d5a7836936bf3a9b4fd210f8c37e8d48bc32eca6"},
{file = "aiohttp-3.10.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:968b8fb2a5eee2770eda9c7b5581587ef9b96fbdf8dcabc6b446d35ccc69df01"},
{file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deef4362af9493d1382ef86732ee2e4cbc0d7c005947bd54ad1a9a16dd59298e"},
{file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:686b03196976e327412a1b094f4120778c7c4b9cff9bce8d2fdfeca386b89829"},
{file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bf6d027d9d1d34e1c2e1645f18a6498c98d634f8e373395221121f1c258ace8"},
{file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:099fd126bf960f96d34a760e747a629c27fb3634da5d05c7ef4d35ef4ea519fc"},
{file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c73c4d3dae0b4644bc21e3de546530531d6cdc88659cdeb6579cd627d3c206aa"},
{file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0c5580f3c51eea91559db3facd45d72e7ec970b04528b4709b1f9c2555bd6d0b"},
{file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fdf6429f0caabfd8a30c4e2eaecb547b3c340e4730ebfe25139779b9815ba138"},
{file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d97187de3c276263db3564bb9d9fad9e15b51ea10a371ffa5947a5ba93ad6777"},
{file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0acafb350cfb2eba70eb5d271f55e08bd4502ec35e964e18ad3e7d34d71f7261"},
{file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c13ed0c779911c7998a58e7848954bd4d63df3e3575f591e321b19a2aec8df9f"},
{file = "aiohttp-3.10.11-cp39-cp39-win32.whl", hash = "sha256:22b7c540c55909140f63ab4f54ec2c20d2635c0289cdd8006da46f3327f971b9"},
{file = "aiohttp-3.10.11-cp39-cp39-win_amd64.whl", hash = "sha256:7b26b1551e481012575dab8e3727b16fe7dd27eb2711d2e63ced7368756268fb"},
{file = "aiohttp-3.10.11.tar.gz", hash = "sha256:9dc2b8f3dcab2e39e0fa309c8da50c3b55e6f34ab25f1a71d3288f24924d33a7"},
{file = "aiohttp-3.12.14-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:906d5075b5ba0dd1c66fcaaf60eb09926a9fef3ca92d912d2a0bbdbecf8b1248"},
{file = "aiohttp-3.12.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c875bf6fc2fd1a572aba0e02ef4e7a63694778c5646cdbda346ee24e630d30fb"},
{file = "aiohttp-3.12.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fbb284d15c6a45fab030740049d03c0ecd60edad9cd23b211d7e11d3be8d56fd"},
{file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38e360381e02e1a05d36b223ecab7bc4a6e7b5ab15760022dc92589ee1d4238c"},
{file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aaf90137b5e5d84a53632ad95ebee5c9e3e7468f0aab92ba3f608adcb914fa95"},
{file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e532a25e4a0a2685fa295a31acf65e027fbe2bea7a4b02cdfbbba8a064577663"},
{file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eab9762c4d1b08ae04a6c77474e6136da722e34fdc0e6d6eab5ee93ac29f35d1"},
{file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abe53c3812b2899889a7fca763cdfaeee725f5be68ea89905e4275476ffd7e61"},
{file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5760909b7080aa2ec1d320baee90d03b21745573780a072b66ce633eb77a8656"},
{file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:02fcd3f69051467bbaa7f84d7ec3267478c7df18d68b2e28279116e29d18d4f3"},
{file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:4dcd1172cd6794884c33e504d3da3c35648b8be9bfa946942d353b939d5f1288"},
{file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:224d0da41355b942b43ad08101b1b41ce633a654128ee07e36d75133443adcda"},
{file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e387668724f4d734e865c1776d841ed75b300ee61059aca0b05bce67061dcacc"},
{file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:dec9cde5b5a24171e0b0a4ca064b1414950904053fb77c707efd876a2da525d8"},
{file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bbad68a2af4877cc103cd94af9160e45676fc6f0c14abb88e6e092b945c2c8e3"},
{file = "aiohttp-3.12.14-cp310-cp310-win32.whl", hash = "sha256:ee580cb7c00bd857b3039ebca03c4448e84700dc1322f860cf7a500a6f62630c"},
{file = "aiohttp-3.12.14-cp310-cp310-win_amd64.whl", hash = "sha256:cf4f05b8cea571e2ccc3ca744e35ead24992d90a72ca2cf7ab7a2efbac6716db"},
{file = "aiohttp-3.12.14-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f4552ff7b18bcec18b60a90c6982049cdb9dac1dba48cf00b97934a06ce2e597"},
{file = "aiohttp-3.12.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8283f42181ff6ccbcf25acaae4e8ab2ff7e92b3ca4a4ced73b2c12d8cd971393"},
{file = "aiohttp-3.12.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:040afa180ea514495aaff7ad34ec3d27826eaa5d19812730fe9e529b04bb2179"},
{file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b413c12f14c1149f0ffd890f4141a7471ba4b41234fe4fd4a0ff82b1dc299dbb"},
{file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1d6f607ce2e1a93315414e3d448b831238f1874b9968e1195b06efaa5c87e245"},
{file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:565e70d03e924333004ed101599902bba09ebb14843c8ea39d657f037115201b"},
{file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4699979560728b168d5ab63c668a093c9570af2c7a78ea24ca5212c6cdc2b641"},
{file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad5fdf6af93ec6c99bf800eba3af9a43d8bfd66dce920ac905c817ef4a712afe"},
{file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ac76627c0b7ee0e80e871bde0d376a057916cb008a8f3ffc889570a838f5cc7"},
{file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:798204af1180885651b77bf03adc903743a86a39c7392c472891649610844635"},
{file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:4f1205f97de92c37dd71cf2d5bcfb65fdaed3c255d246172cce729a8d849b4da"},
{file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:76ae6f1dd041f85065d9df77c6bc9c9703da9b5c018479d20262acc3df97d419"},
{file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a194ace7bc43ce765338ca2dfb5661489317db216ea7ea700b0332878b392cab"},
{file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:16260e8e03744a6fe3fcb05259eeab8e08342c4c33decf96a9dad9f1187275d0"},
{file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8c779e5ebbf0e2e15334ea404fcce54009dc069210164a244d2eac8352a44b28"},
{file = "aiohttp-3.12.14-cp311-cp311-win32.whl", hash = "sha256:a289f50bf1bd5be227376c067927f78079a7bdeccf8daa6a9e65c38bae14324b"},
{file = "aiohttp-3.12.14-cp311-cp311-win_amd64.whl", hash = "sha256:0b8a69acaf06b17e9c54151a6c956339cf46db4ff72b3ac28516d0f7068f4ced"},
{file = "aiohttp-3.12.14-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a0ecbb32fc3e69bc25efcda7d28d38e987d007096cbbeed04f14a6662d0eee22"},
{file = "aiohttp-3.12.14-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0400f0ca9bb3e0b02f6466421f253797f6384e9845820c8b05e976398ac1d81a"},
{file = "aiohttp-3.12.14-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a56809fed4c8a830b5cae18454b7464e1529dbf66f71c4772e3cfa9cbec0a1ff"},
{file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f2e373276e4755691a963e5d11756d093e346119f0627c2d6518208483fb6d"},
{file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:ca39e433630e9a16281125ef57ece6817afd1d54c9f1bf32e901f38f16035869"},
{file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c748b3f8b14c77720132b2510a7d9907a03c20ba80f469e58d5dfd90c079a1c"},
{file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0a568abe1b15ce69d4cc37e23020720423f0728e3cb1f9bcd3f53420ec3bfe7"},
{file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9888e60c2c54eaf56704b17feb558c7ed6b7439bca1e07d4818ab878f2083660"},
{file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3006a1dc579b9156de01e7916d38c63dc1ea0679b14627a37edf6151bc530088"},
{file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aa8ec5c15ab80e5501a26719eb48a55f3c567da45c6ea5bb78c52c036b2655c7"},
{file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:39b94e50959aa07844c7fe2206b9f75d63cc3ad1c648aaa755aa257f6f2498a9"},
{file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:04c11907492f416dad9885d503fbfc5dcb6768d90cad8639a771922d584609d3"},
{file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:88167bd9ab69bb46cee91bd9761db6dfd45b6e76a0438c7e884c3f8160ff21eb"},
{file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:791504763f25e8f9f251e4688195e8b455f8820274320204f7eafc467e609425"},
{file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2785b112346e435dd3a1a67f67713a3fe692d288542f1347ad255683f066d8e0"},
{file = "aiohttp-3.12.14-cp312-cp312-win32.whl", hash = "sha256:15f5f4792c9c999a31d8decf444e79fcfd98497bf98e94284bf390a7bb8c1729"},
{file = "aiohttp-3.12.14-cp312-cp312-win_amd64.whl", hash = "sha256:3b66e1a182879f579b105a80d5c4bd448b91a57e8933564bf41665064796a338"},
{file = "aiohttp-3.12.14-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3143a7893d94dc82bc409f7308bc10d60285a3cd831a68faf1aa0836c5c3c767"},
{file = "aiohttp-3.12.14-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3d62ac3d506cef54b355bd34c2a7c230eb693880001dfcda0bf88b38f5d7af7e"},
{file = "aiohttp-3.12.14-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:48e43e075c6a438937c4de48ec30fa8ad8e6dfef122a038847456bfe7b947b63"},
{file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:077b4488411a9724cecc436cbc8c133e0d61e694995b8de51aaf351c7578949d"},
{file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d8c35632575653f297dcbc9546305b2c1133391089ab925a6a3706dfa775ccab"},
{file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b8ce87963f0035c6834b28f061df90cf525ff7c9b6283a8ac23acee6502afd4"},
{file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0a2cf66e32a2563bb0766eb24eae7e9a269ac0dc48db0aae90b575dc9583026"},
{file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdea089caf6d5cde975084a884c72d901e36ef9c2fd972c9f51efbbc64e96fbd"},
{file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a7865f27db67d49e81d463da64a59365ebd6b826e0e4847aa111056dcb9dc88"},
{file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0ab5b38a6a39781d77713ad930cb5e7feea6f253de656a5f9f281a8f5931b086"},
{file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9b3b15acee5c17e8848d90a4ebc27853f37077ba6aec4d8cb4dbbea56d156933"},
{file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e4c972b0bdaac167c1e53e16a16101b17c6d0ed7eac178e653a07b9f7fad7151"},
{file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7442488b0039257a3bdbc55f7209587911f143fca11df9869578db6c26feeeb8"},
{file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f68d3067eecb64c5e9bab4a26aa11bd676f4c70eea9ef6536b0a4e490639add3"},
{file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f88d3704c8b3d598a08ad17d06006cb1ca52a1182291f04979e305c8be6c9758"},
{file = "aiohttp-3.12.14-cp313-cp313-win32.whl", hash = "sha256:a3c99ab19c7bf375c4ae3debd91ca5d394b98b6089a03231d4c580ef3c2ae4c5"},
{file = "aiohttp-3.12.14-cp313-cp313-win_amd64.whl", hash = "sha256:3f8aad695e12edc9d571f878c62bedc91adf30c760c8632f09663e5f564f4baa"},
{file = "aiohttp-3.12.14-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b8cc6b05e94d837bcd71c6531e2344e1ff0fb87abe4ad78a9261d67ef5d83eae"},
{file = "aiohttp-3.12.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1dcb015ac6a3b8facd3677597edd5ff39d11d937456702f0bb2b762e390a21b"},
{file = "aiohttp-3.12.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3779ed96105cd70ee5e85ca4f457adbce3d9ff33ec3d0ebcdf6c5727f26b21b3"},
{file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:717a0680729b4ebd7569c1dcd718c46b09b360745fd8eb12317abc74b14d14d0"},
{file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b5dd3a2ef7c7e968dbbac8f5574ebeac4d2b813b247e8cec28174a2ba3627170"},
{file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4710f77598c0092239bc12c1fcc278a444e16c7032d91babf5abbf7166463f7b"},
{file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f3e9f75ae842a6c22a195d4a127263dbf87cbab729829e0bd7857fb1672400b2"},
{file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f9c8d55d6802086edd188e3a7d85a77787e50d56ce3eb4757a3205fa4657922"},
{file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79b29053ff3ad307880d94562cca80693c62062a098a5776ea8ef5ef4b28d140"},
{file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23e1332fff36bebd3183db0c7a547a1da9d3b4091509f6d818e098855f2f27d3"},
{file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:a564188ce831fd110ea76bcc97085dd6c625b427db3f1dbb14ca4baa1447dcbc"},
{file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a7a1b4302f70bb3ec40ca86de82def532c97a80db49cac6a6700af0de41af5ee"},
{file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:1b07ccef62950a2519f9bfc1e5b294de5dd84329f444ca0b329605ea787a3de5"},
{file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:938bd3ca6259e7e48b38d84f753d548bd863e0c222ed6ee6ace3fd6752768a84"},
{file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8bc784302b6b9f163b54c4e93d7a6f09563bd01ff2b841b29ed3ac126e5040bf"},
{file = "aiohttp-3.12.14-cp39-cp39-win32.whl", hash = "sha256:a3416f95961dd7d5393ecff99e3f41dc990fb72eda86c11f2a60308ac6dcd7a0"},
{file = "aiohttp-3.12.14-cp39-cp39-win_amd64.whl", hash = "sha256:196858b8820d7f60578f8b47e5669b3195c21d8ab261e39b1d705346458f445f"},
{file = "aiohttp-3.12.14.tar.gz", hash = "sha256:6e06e120e34d93100de448fd941522e11dafa78ef1a893c179901b7d66aa29f2"},
]
[package.dependencies]
aiohappyeyeballs = ">=2.3.0"
aiosignal = ">=1.1.2"
aiohappyeyeballs = ">=2.5.0"
aiosignal = ">=1.4.0"
attrs = ">=17.3.0"
frozenlist = ">=1.1.1"
multidict = ">=4.5,<7.0"
yarl = ">=1.12.0,<2.0"
propcache = ">=0.2.0"
yarl = ">=1.17.0,<2.0"
[package.extras]
speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""]
speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""]
[[package]]
name = "aiopg"
@@ -145,18 +141,19 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
[[package]]
name = "aiosignal"
version = "1.3.1"
version = "1.4.0"
description = "aiosignal: a list of registered asynchronous callbacks"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"},
{file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"},
{file = "aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e"},
{file = "aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7"},
]
[package.dependencies]
frozenlist = ">=1.1.0"
typing-extensions = {version = ">=4.2", markers = "python_version < \"3.13\""}
[[package]]
name = "allure-pytest"
@@ -3847,4 +3844,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "^3.11"
content-hash = "bd93313f110110aa53b24a3ed47ba2d7f60e2c658a79cdff7320fed1bb1b57b5"
content-hash = "6a1e8ba06b8194bf28d87fd5e184e2ddc2b4a19dffcbe3953b26da3d55c9212f"

View File

@@ -16,6 +16,7 @@ async-compression.workspace = true
async-trait.workspace = true
atomic-take.workspace = true
aws-config.workspace = true
aws-credential-types.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
base64.workspace = true
@@ -48,6 +49,7 @@ indexmap = { workspace = true, features = ["serde"] }
ipnet.workspace = true
itertools.workspace = true
itoa.workspace = true
json = { path = "../libs/proxy/json" }
lasso = { workspace = true, features = ["multi-threaded"] }
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
@@ -127,4 +129,4 @@ rstest.workspace = true
walkdir.workspace = true
rand_distr = "0.4"
tokio-postgres.workspace = true
tracing-test = "0.2"
tracing-test = "0.2"

View File

@@ -123,6 +123,11 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl
docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';"
```
If you want to test query cancellation, Redis is also required:
```sh
docker run --detach --name proxy-redis --publish 6379:6379 redis:7.0
```
Let's create a self-signed certificate by running:
```sh
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build"
@@ -130,7 +135,10 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key
Then we need to build the proxy with the 'testing' feature and run it, e.g.:
```sh
RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- \
--auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' \
--redis-auth-type="plain" --redis-plain="redis://127.0.0.1:6379" \
-c server.crt -k server.key
```
Now from the client you can start a new session:

View File

@@ -7,13 +7,17 @@ use std::pin::pin;
use std::sync::Mutex;
use scopeguard::ScopeGuard;
use tokio::sync::oneshot;
use tokio::sync::oneshot::error::TryRecvError;
use crate::ext::LockExt;
type ProcResult<P> = Result<<P as QueueProcessing>::Res, <P as QueueProcessing>::Err>;
pub trait QueueProcessing: Send + 'static {
type Req: Send + 'static;
type Res: Send;
type Err: Send + Clone;
/// Get the desired batch size.
fn batch_size(&self, queue_size: usize) -> usize;
@@ -24,7 +28,18 @@ pub trait QueueProcessing: Send + 'static {
/// If this apply can error, it's expected that errors be forwarded to each Self::Res.
///
/// Batching does not need to happen atomically.
fn apply(&mut self, req: Vec<Self::Req>) -> impl Future<Output = Vec<Self::Res>> + Send;
fn apply(
&mut self,
req: Vec<Self::Req>,
) -> impl Future<Output = Result<Vec<Self::Res>, Self::Err>> + Send;
}
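// A sketch (hypothetical implementor, assuming only the trait as shown
// above) of the now-fallible `apply`: on success, one response per request;
// on error, a single cloneable error that the queue fans out to every
// waiter:
//
//     struct Doubler;
//
//     impl QueueProcessing for Doubler {
//         type Req = u64;
//         type Res = u64;
//         type Err = std::convert::Infallible; // must be Send + Clone
//
//         fn batch_size(&self, queue_size: usize) -> usize { queue_size.min(32) }
//
//         async fn apply(&mut self, req: Vec<u64>) -> Result<Vec<u64>, Self::Err> {
//             Ok(req.into_iter().map(|x| x * 2).collect())
//         }
//     }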
#[derive(thiserror::Error)]
pub enum BatchQueueError<E: Clone, C> {
#[error(transparent)]
Result(E),
#[error(transparent)]
Cancelled(C),
}
pub struct BatchQueue<P: QueueProcessing> {
@@ -34,7 +49,7 @@ pub struct BatchQueue<P: QueueProcessing> {
struct BatchJob<P: QueueProcessing> {
req: P::Req,
res: tokio::sync::oneshot::Sender<P::Res>,
res: tokio::sync::oneshot::Sender<Result<P::Res, P::Err>>,
}
impl<P: QueueProcessing> BatchQueue<P> {
@@ -55,11 +70,11 @@ impl<P: QueueProcessing> BatchQueue<P> {
&self,
req: P::Req,
cancelled: impl Future<Output = R>,
) -> Result<P::Res, R> {
) -> Result<P::Res, BatchQueueError<P::Err, R>> {
let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
let mut cancelled = pin!(cancelled);
let resp = loop {
let resp: Option<Result<P::Res, P::Err>> = loop {
// try become the leader, or try wait for success.
let mut processor = tokio::select! {
// try become leader.
@@ -72,7 +87,7 @@ impl<P: QueueProcessing> BatchQueue<P> {
if inner.queue.remove(&id).is_some() {
tracing::warn!("batched task cancelled before completion");
}
return Err(cancel);
return Err(BatchQueueError::Cancelled(cancel));
},
};
@@ -96,18 +111,30 @@ impl<P: QueueProcessing> BatchQueue<P> {
// good: we didn't get cancelled.
ScopeGuard::into_inner(cancel_safety);
if values.len() != resps.len() {
tracing::error!(
"batch: invalid response size, expected={}, got={}",
resps.len(),
values.len()
);
}
match values {
Ok(values) => {
if values.len() != resps.len() {
tracing::error!(
"batch: invalid response size, expected={}, got={}",
resps.len(),
values.len()
);
}
// send response values.
for (tx, value) in std::iter::zip(resps, values) {
if tx.send(value).is_err() {
// receiver hung up but that's fine.
// send response values.
for (tx, value) in std::iter::zip(resps, values) {
if tx.send(Ok(value)).is_err() {
// receiver hung up but that's fine.
}
}
}
Err(err) => {
for tx in resps {
if tx.send(Err(err.clone())).is_err() {
// receiver hung up but that's fine.
}
}
}
}
@@ -129,7 +156,8 @@ impl<P: QueueProcessing> BatchQueue<P> {
tracing::debug!(id, "batch: job completed");
Ok(resp.expect("no response found. batch processor should not panic"))
resp.expect("no response found. batch processor should not panic")
.map_err(BatchQueueError::Result)
}
}
@@ -139,8 +167,8 @@ struct BatchQueueInner<P: QueueProcessing> {
}
impl<P: QueueProcessing> BatchQueueInner<P> {
fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver<P::Res>) {
let (tx, rx) = tokio::sync::oneshot::channel();
fn register_job(&mut self, req: P::Req) -> (u64, oneshot::Receiver<ProcResult<P>>) {
let (tx, rx) = oneshot::channel();
let id = self.version;
@@ -158,7 +186,7 @@ impl<P: QueueProcessing> BatchQueueInner<P> {
(id, rx)
}
fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<tokio::sync::oneshot::Sender<P::Res>>) {
fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<oneshot::Sender<ProcResult<P>>>) {
let batch_size = p.batch_size(self.queue.len());
let mut reqs = Vec::with_capacity(batch_size);
let mut resps = Vec::with_capacity(batch_size);
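To make the new contract concrete, here is a minimal sketch of an implementor under the fallible `apply` signature. The trait is restated in simplified form and `EchoProcessor` is hypothetical, not part of this change; the key point is that a single `Err` is cloned out to every waiter in the batch.

```rust
// Simplified restatement of the QueueProcessing contract after this change.
trait QueueProcessing: Send + 'static {
    type Req: Send + 'static;
    type Res: Send;
    type Err: Send + Clone;

    fn batch_size(&self, queue_size: usize) -> usize;

    // On success: exactly one Res per Req, in order.
    // On failure: one cloneable error, broadcast to every waiter.
    async fn apply(&mut self, req: Vec<Self::Req>) -> Result<Vec<Self::Res>, Self::Err>;
}

// Hypothetical implementor used only for illustration.
struct EchoProcessor {
    fail: bool,
}

impl QueueProcessing for EchoProcessor {
    type Req = String;
    type Res = String;
    type Err = String; // Clone, so the queue can fan it out to each request

    fn batch_size(&self, queue_size: usize) -> usize {
        queue_size.min(16)
    }

    async fn apply(&mut self, req: Vec<String>) -> Result<Vec<String>, String> {
        if self.fail {
            return Err("backend unavailable".to_owned());
        }
        Ok(req)
    }
}
```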

View File

@@ -522,15 +522,7 @@ pub async fn run() -> anyhow::Result<()> {
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
}
if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
&& let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
&& let Some(client) = redis_client
{
// project info cache and invalidation of that cache.
let cache = api.caches.project_info.clone();
maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone()));
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
if let Some(client) = redis_client {
// Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
// This prevents immediate exit and pod restart,
// which can cause hammering of the redis in case of connection issues.
@@ -560,6 +552,16 @@ pub async fn run() -> anyhow::Result<()> {
}
}
}
#[allow(irrefutable_let_patterns)]
if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
&& let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
{
// project info cache and invalidation of that cache.
let cache = api.caches.project_info.clone();
maintenance_tasks.spawn(notifications::task_main(client, cache.clone()));
maintenance_tasks.spawn(async move { cache.gc_worker().await });
}
}
let maintenance = loop {

View File

@@ -14,8 +14,8 @@ use std::time::{Duration, Instant};
use hashlink::{LruCache, linked_hash_map::RawEntryMut};
use tracing::debug;
use super::Cache;
use super::common::Cached;
use super::{Cache, timed_lru};
/// An implementation of timed LRU cache with fixed capacity.
/// Key properties:
@@ -30,7 +30,7 @@ use super::{Cache, timed_lru};
///
/// * There's an API for immediate invalidation (removal) of a cache entry;
/// It's useful in case we know for sure that the entry is no longer correct.
/// See [`timed_lru::Cached`] for more information.
/// See [`Cached`] for more information.
///
/// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
/// or by a successful lookup (i.e. the entry hasn't expired yet).
@@ -217,15 +217,18 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
}
impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
/// Retrieve a cached entry in convenient wrapper.
pub(crate) fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
/// Retrieve a cached entry in convenient wrapper, alongside timing information.
pub(crate) fn get_with_created_at<Q>(
&self,
key: &Q,
) -> Option<Cached<&Self, (<Self as Cache>::Value, Instant)>>
where
K: Borrow<Q> + Clone,
Q: Hash + Eq + ?Sized,
{
self.get_raw(key, |key, entry| Cached {
token: Some((self, key.clone())),
value: entry.value.clone(),
value: (entry.value.clone(), entry.created_at),
})
}
}

View File

@@ -4,12 +4,11 @@ use std::pin::pin;
use std::sync::{Arc, OnceLock};
use std::time::Duration;
use anyhow::anyhow;
use futures::FutureExt;
use ipnet::{IpNet, Ipv4Net, Ipv6Net};
use postgres_client::RawCancelToken;
use postgres_client::tls::MakeTlsConnect;
use redis::{Cmd, FromRedisValue, Value};
use redis::{Cmd, FromRedisValue, SetExpiry, SetOptions, Value};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::net::TcpStream;
@@ -18,7 +17,7 @@ use tracing::{debug, error, info};
use crate::auth::AuthError;
use crate::auth::backend::ComputeUserInfo;
use crate::batch::{BatchQueue, QueueProcessing};
use crate::batch::{BatchQueue, BatchQueueError, QueueProcessing};
use crate::config::ComputeConfig;
use crate::context::RequestContext;
use crate::control_plane::ControlPlaneApi;
@@ -28,23 +27,39 @@ use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, Redis
use crate::pqproto::CancelKeyData;
use crate::rate_limiter::LeakyBucketRateLimiter;
use crate::redis::keys::KeyPrefix;
use crate::redis::kv_ops::RedisKVClient;
use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError};
use crate::util::run_until;
type IpSubnetKey = IpNet;
const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600);
const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570);
const CANCEL_KEY_TTL: Duration = Duration::from_secs(600);
const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570);
// Message types for sending through mpsc channel
pub enum CancelKeyOp {
StoreCancelKey {
Store {
key: CancelKeyData,
value: Box<str>,
expire: std::time::Duration,
expire: Duration,
},
GetCancelData {
Refresh {
key: CancelKeyData,
expire: Duration,
},
Get {
key: CancelKeyData,
},
GetOld {
key: CancelKeyData,
},
}
#[derive(thiserror::Error, Debug, Clone)]
pub enum PipelineError {
#[error("could not send cmd to redis: {0}")]
RedisKVClient(Arc<RedisKVClientError>),
#[error("incorrect number of responses from redis")]
IncorrectNumberOfResponses,
}
pub struct Pipeline {
@@ -60,7 +75,7 @@ impl Pipeline {
}
}
async fn execute(self, client: &mut RedisKVClient) -> Vec<anyhow::Result<Value>> {
async fn execute(self, client: &mut RedisKVClient) -> Result<Vec<Value>, PipelineError> {
let responses = self.replies;
let batch_size = self.inner.len();
@@ -78,43 +93,44 @@ impl Pipeline {
batch_size,
responses, "successfully completed cancellation jobs",
);
values.into_iter().map(Ok).collect()
Ok(values.into_iter().collect())
}
Ok(value) => {
error!(batch_size, ?value, "unexpected redis return value");
std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis")))
.take(responses)
.collect()
}
Err(err) => {
std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}")))
.take(responses)
.collect()
Err(PipelineError::IncorrectNumberOfResponses)
}
Err(err) => Err(PipelineError::RedisKVClient(Arc::new(err))),
}
}
fn add_command_with_reply(&mut self, cmd: Cmd) {
fn add_command(&mut self, cmd: Cmd) {
self.inner.add_command(cmd);
self.replies += 1;
}
fn add_command_no_reply(&mut self, cmd: Cmd) {
self.inner.add_command(cmd).ignore();
}
}
impl CancelKeyOp {
fn register(&self, pipe: &mut Pipeline) {
match self {
CancelKeyOp::StoreCancelKey { key, value, expire } => {
CancelKeyOp::Store { key, value, expire } => {
let key = KeyPrefix::Cancel(*key).build_redis_key();
pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value));
pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64));
pipe.add_command(Cmd::set_options(
&key,
&**value,
SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())),
));
}
CancelKeyOp::GetCancelData { key } => {
CancelKeyOp::Refresh { key, expire } => {
let key = KeyPrefix::Cancel(*key).build_redis_key();
pipe.add_command_with_reply(Cmd::hget(key, "data"));
pipe.add_command(Cmd::expire(&key, expire.as_secs() as i64));
}
CancelKeyOp::GetOld { key } => {
let key = KeyPrefix::Cancel(*key).build_redis_key();
pipe.add_command(Cmd::hget(key, "data"));
}
CancelKeyOp::Get { key } => {
let key = KeyPrefix::Cancel(*key).build_redis_key();
pipe.add_command(Cmd::get(key));
}
}
}
@@ -127,13 +143,14 @@ pub struct CancellationProcessor {
impl QueueProcessing for CancellationProcessor {
type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp);
type Res = anyhow::Result<redis::Value>;
type Res = redis::Value;
type Err = PipelineError;
fn batch_size(&self, _queue_size: usize) -> usize {
self.batch_size
}
async fn apply(&mut self, batch: Vec<Self::Req>) -> Vec<Self::Res> {
async fn apply(&mut self, batch: Vec<Self::Req>) -> Result<Vec<Self::Res>, Self::Err> {
if !self.client.credentials_refreshed() {
// this will cause a timeout for cancellation operations
tracing::debug!(
@@ -244,18 +261,18 @@ impl CancellationHandler {
&self,
key: CancelKeyData,
) -> Result<Option<CancelClosure>, CancelError> {
let guard = Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::HGet);
let op = CancelKeyOp::GetCancelData { key };
const TIMEOUT: Duration = Duration::from_secs(5);
let Some(tx) = self.tx.get() else {
tracing::warn!("cancellation handler is not available");
return Err(CancelError::InternalError);
};
const TIMEOUT: Duration = Duration::from_secs(5);
let guard = Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::Get);
let op = CancelKeyOp::Get { key };
let result = timeout(
TIMEOUT,
tx.call((guard, op), std::future::pending::<Infallible>()),
@@ -264,10 +281,37 @@ impl CancellationHandler {
.map_err(|_| {
tracing::warn!("timed out waiting to receive GetCancelData response");
CancelError::RateLimit
})?
// cannot be cancelled
.unwrap_or_else(|x| match x {})
.map_err(|e| {
})?;
// We may still have cancel keys set with HSET <key> "data".
// Check error type and retry with HGET.
// TODO: remove this fallback once HSET-stored keys are no longer in use.
let result = if let Err(err) = result.as_ref()
&& let BatchQueueError::Result(err) = err
&& let PipelineError::RedisKVClient(err) = err
&& let RedisKVClientError::Redis(err) = &**err
&& let Some(errcode) = err.code()
&& errcode == "WRONGTYPE"
{
let guard = Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::HGet);
let op = CancelKeyOp::GetOld { key };
timeout(
TIMEOUT,
tx.call((guard, op), std::future::pending::<Infallible>()),
)
.await
.map_err(|_| {
tracing::warn!("timed out waiting to receive GetCancelData response");
CancelError::RateLimit
})?
} else {
result
};
let result = result.map_err(|e| {
tracing::warn!("failed to receive GetCancelData response: {e}");
CancelError::InternalError
})?;
@@ -438,39 +482,94 @@ impl Session {
let mut cancel = pin!(cancel);
enum State {
Set,
Refresh,
}
let mut state = State::Set;
loop {
let guard = Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::HSet);
let op = CancelKeyOp::StoreCancelKey {
key: self.key,
value: closure_json.clone(),
expire: CANCEL_KEY_TTL,
let guard_op = match state {
State::Set => {
let guard = Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::Set);
let op = CancelKeyOp::Store {
key: self.key,
value: closure_json.clone(),
expire: CANCEL_KEY_TTL,
};
tracing::debug!(
src=%self.key,
dest=?cancel_closure.cancel_token,
"registering cancellation key"
);
(guard, op)
}
State::Refresh => {
let guard = Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::Expire);
let op = CancelKeyOp::Refresh {
key: self.key,
expire: CANCEL_KEY_TTL,
};
tracing::debug!(
src=%self.key,
dest=?cancel_closure.cancel_token,
"refreshing cancellation key"
);
(guard, op)
}
};
tracing::debug!(
src=%self.key,
dest=?cancel_closure.cancel_token,
"registering cancellation key"
);
match tx.call((guard, op), cancel.as_mut()).await {
Ok(Ok(_)) => {
match tx.call(guard_op, cancel.as_mut()).await {
// SET returns OK
Ok(Value::Okay) => {
tracing::debug!(
src=%self.key,
dest=?cancel_closure.cancel_token,
"registered cancellation key"
);
state = State::Refresh;
}
// wait before continuing.
tokio::time::sleep(CANCEL_KEY_REFRESH).await;
// EXPIRE returns 1
Ok(Value::Int(1)) => {
tracing::debug!(
src=%self.key,
dest=?cancel_closure.cancel_token,
"refreshed cancellation key"
);
}
Ok(_) => {
// Any other response likely means the key expired.
tracing::warn!(src=%self.key, "refreshing cancellation key failed");
// Re-enter the SET loop to repush full data.
state = State::Set;
}
// retry immediately.
Ok(Err(error)) => {
tracing::warn!(?error, "error registering cancellation key");
Err(BatchQueueError::Result(error)) => {
tracing::warn!(?error, "error refreshing cancellation key");
// Small delay to prevent a busy loop with high CPU usage and log spam.
tokio::time::sleep(Duration::from_millis(10)).await;
continue;
}
Err(Err(_cancelled)) => break,
Err(BatchQueueError::Cancelled(Err(_cancelled))) => break,
}
// wait before continuing. break immediately if cancelled.
if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut())
.await
.is_err()
{
break;
}
}
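The loop above amounts to a two-state machine: a full `SET key value EX 600` on (re)registration, then a cheap `EXPIRE key 600` every 570 seconds, dropping back to `SET` whenever the reply suggests the key is gone. A self-contained sketch of just the transition logic, assuming the `redis` crate's reply values (`Okay` for SET, `Int(1)` for a successful EXPIRE):

```rust
use redis::Value;

enum State {
    Set,     // push the full cancellation payload with a TTL
    Refresh, // just bump the TTL of the existing key
}

// Pick the next operation based on the reply to the previous one.
fn next_state(current: State, reply: &Value) -> State {
    match (current, reply) {
        // SET succeeded: switch to cheap TTL refreshes.
        (State::Set, Value::Okay) => State::Refresh,
        // EXPIRE found the key and bumped its TTL: keep refreshing.
        (State::Refresh, Value::Int(1)) => State::Refresh,
        // Anything else (e.g. EXPIRE returned 0) means the key is gone:
        // re-push the full payload.
        _ => State::Set,
    }
}
```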

View File

@@ -23,12 +23,13 @@ use crate::control_plane::errors::{
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse};
use crate::control_plane::{
AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
RoleAccessControl,
};
use crate::metrics::Metrics;
use crate::proxy::retry::CouldRetry;
use crate::rate_limiter::WakeComputeRateLimiter;
use crate::types::{EndpointCacheKey, EndpointId, RoleName};
use crate::{compute, http, scram};
@@ -382,16 +383,31 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
macro_rules! check_cache {
() => {
if let Some(cached) = self.caches.node_info.get(&key) {
let (cached, info) = cached.take_value();
let info = info.map_err(|c| {
info!(key = &*key, "found cached wake_compute error");
WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
})?;
if let Some(cached) = self.caches.node_info.get_with_created_at(&key) {
let (cached, (info, created_at)) = cached.take_value();
return match info {
Err(mut msg) => {
info!(key = &*key, "found cached wake_compute error");
debug!(key = &*key, "found cached compute node info");
ctx.set_project(info.aux.clone());
return Ok(cached.map(|()| info));
// if retry_delay_ms is set, reduce it by the amount of time it spent in cache
if let Some(status) = &mut msg.status {
if let Some(retry_info) = &mut status.details.retry_info {
retry_info.retry_delay_ms = retry_info
.retry_delay_ms
.saturating_sub(created_at.elapsed().as_millis() as u64)
}
}
Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
msg,
)))
}
Ok(info) => {
debug!(key = &*key, "found cached compute node info");
ctx.set_project(info.aux.clone());
Ok(cached.map(|()| info))
}
};
}
};
}
@@ -434,42 +450,29 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
Ok(cached.map(|()| node))
}
Err(err) => match err {
WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
let Some(status) = &err.status else {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
};
WakeComputeError::ControlPlane(ControlPlaneError::Message(ref msg)) => {
let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info);
let reason = status
.details
.error_info
.map_or(Reason::Unknown, |x| x.reason);
// if we can retry this error, do not cache it.
if reason.can_retry() {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
// If we can retry this error, do not cache it,
// unless we were given a retry delay.
if msg.could_retry() && retry_info.is_none() {
return Err(err);
}
// at this point, we should only have quota errors.
debug!(
key = &*key,
"created a cache entry for the wake compute error"
);
self.caches.node_info.insert_ttl(
key,
Err(err.clone()),
Duration::from_secs(30),
);
let ttl = retry_info.map_or(Duration::from_secs(30), |r| {
Duration::from_millis(r.retry_delay_ms)
});
Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)))
self.caches.node_info.insert_ttl(key, Err(msg.clone()), ttl);
Err(err)
}
err => return Err(err),
err => Err(err),
},
}
}
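Since the cache entry now carries its insertion time, the retry hint handed back to the client can be decayed by the time the error has already spent in cache. A self-contained sketch of that arithmetic:

```rust
use std::time::Instant;

// Remaining retry delay after the cached error has aged for a while;
// saturating_sub keeps it from underflowing past zero.
fn remaining_retry_delay_ms(original_ms: u64, created_at: Instant) -> u64 {
    original_ms.saturating_sub(created_at.elapsed().as_millis() as u64)
}

fn main() {
    let created_at = Instant::now();
    // Freshly cached: (almost) the whole delay remains.
    assert!(remaining_retry_delay_ms(1500, created_at) <= 1500);
}
```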

View File

@@ -43,28 +43,35 @@ impl UserFacingError for ControlPlaneError {
}
impl ReportableError for ControlPlaneError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
fn get_error_kind(&self) -> ErrorKind {
match self {
ControlPlaneError::Message(e) => match e.get_reason() {
Reason::RoleProtected => ErrorKind::User,
Reason::ResourceNotFound => ErrorKind::User,
Reason::ProjectNotFound => ErrorKind::User,
Reason::EndpointNotFound => ErrorKind::User,
Reason::BranchNotFound => ErrorKind::User,
Reason::RoleProtected
| Reason::ResourceNotFound
| Reason::ProjectNotFound
| Reason::EndpointNotFound
| Reason::EndpointDisabled
| Reason::BranchNotFound
| Reason::InvalidEphemeralEndpointOptions => ErrorKind::User,
Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
Reason::RunningOperations => ErrorKind::ControlPlane,
Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
Reason::Unknown => ErrorKind::ControlPlane,
Reason::NonDefaultBranchComputeTimeExceeded
| Reason::ActiveTimeQuotaExceeded
| Reason::ComputeTimeQuotaExceeded
| Reason::WrittenDataQuotaExceeded
| Reason::DataTransferQuotaExceeded
| Reason::LogicalSizeQuotaExceeded
| Reason::ActiveEndpointsLimitExceeded => ErrorKind::Quota,
Reason::ConcurrencyLimitReached
| Reason::LockAlreadyTaken
| Reason::RunningOperations
| Reason::EndpointIdle
| Reason::ProjectUnderMaintenance
| Reason::Unknown => ErrorKind::ControlPlane,
},
ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane,
ControlPlaneError::Transport(_) => ErrorKind::ControlPlane,
}
}
}
@@ -120,10 +127,10 @@ impl UserFacingError for GetAuthInfoError {
}
impl ReportableError for GetAuthInfoError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
fn get_error_kind(&self) -> ErrorKind {
match self {
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
Self::BadSecret => ErrorKind::ControlPlane,
Self::ApiError(_) => ErrorKind::ControlPlane,
}
}
}

View File

@@ -126,10 +126,16 @@ pub(crate) enum Reason {
/// or that the subject doesn't have enough permissions to access the requested endpoint.
#[serde(rename = "ENDPOINT_NOT_FOUND")]
EndpointNotFound,
/// EndpointDisabled indicates that the endpoint has been disabled and does not accept connections.
#[serde(rename = "ENDPOINT_DISABLED")]
EndpointDisabled,
/// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct,
/// or that the subject doesn't have enough permissions to access the requested branch.
#[serde(rename = "BRANCH_NOT_FOUND")]
BranchNotFound,
/// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong.
#[serde(rename = "INVALID_EPHEMERAL_OPTIONS")]
InvalidEphemeralEndpointOptions,
/// RateLimitExceeded indicates that the rate limit for the operation has been exceeded.
#[serde(rename = "RATE_LIMIT_EXCEEDED")]
RateLimitExceeded,
@@ -152,6 +158,9 @@ pub(crate) enum Reason {
/// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded.
#[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
LogicalSizeQuotaExceeded,
/// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
#[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
ActiveEndpointsLimitExceeded,
/// RunningOperations indicates that the project already has some running operations
/// and scheduling of new ones is prohibited.
#[serde(rename = "RUNNING_OPERATIONS")]
@@ -162,9 +171,13 @@ pub(crate) enum Reason {
/// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken.
#[serde(rename = "LOCK_ALREADY_TAKEN")]
LockAlreadyTaken,
/// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
#[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
ActiveEndpointsLimitExceeded,
/// EndpointIdle indicates that the endpoint cannot become active, because it's idle.
#[serde(rename = "ENDPOINT_IDLE")]
EndpointIdle,
/// ProjectUnderMaintenance indicates that the project is currently ongoing maintenance,
/// and thus cannot accept connections.
#[serde(rename = "PROJECT_UNDER_MAINTENANCE")]
ProjectUnderMaintenance,
#[default]
#[serde(other)]
Unknown,
@@ -184,13 +197,15 @@ impl Reason {
pub(crate) fn can_retry(self) -> bool {
match self {
// do not retry role protected errors
// not a transitive error
// not a transient error
Reason::RoleProtected => false,
// on retry, it will still not be found
// on retry, it will still not be found or valid
Reason::ResourceNotFound
| Reason::ProjectNotFound
| Reason::EndpointNotFound
| Reason::BranchNotFound => false,
| Reason::EndpointDisabled
| Reason::BranchNotFound
| Reason::InvalidEphemeralEndpointOptions => false,
// we were asked to go away
Reason::RateLimitExceeded
| Reason::NonDefaultBranchComputeTimeExceeded
@@ -200,11 +215,13 @@ impl Reason {
| Reason::DataTransferQuotaExceeded
| Reason::LogicalSizeQuotaExceeded
| Reason::ActiveEndpointsLimitExceeded => false,
// transitive error. control plane is currently busy
// transient error. control plane is currently busy
// but might be ready soon
Reason::RunningOperations
| Reason::ConcurrencyLimitReached
| Reason::LockAlreadyTaken => true,
| Reason::LockAlreadyTaken
| Reason::EndpointIdle
| Reason::ProjectUnderMaintenance => true,
// unknown error. better not retry it.
Reason::Unknown => false,
}
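The `#[serde(other)]` fallback on `Unknown` is what lets the proxy tolerate reason codes it has never seen. A standalone sketch of the pattern with a toy enum (not the real `Reason`):

```rust
use serde::Deserialize;

#[derive(Debug, PartialEq, Default, Deserialize)]
enum ToyReason {
    #[serde(rename = "ENDPOINT_DISABLED")]
    EndpointDisabled,
    // Must be last; catches any string not matched above.
    #[default]
    #[serde(other)]
    Unknown,
}

fn main() {
    let known: ToyReason = serde_json::from_str("\"ENDPOINT_DISABLED\"").unwrap();
    assert_eq!(known, ToyReason::EndpointDisabled);
    // A reason this build has never heard of degrades to Unknown instead of erroring.
    let future: ToyReason = serde_json::from_str("\"SOMETHING_NEW\"").unwrap();
    assert_eq!(future, ToyReason::Unknown);
}
```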

View File

@@ -374,11 +374,10 @@ pub enum Waiting {
#[label(singleton = "kind")]
#[allow(clippy::enum_variant_names)]
pub enum RedisMsgKind {
HSet,
HSetMultiple,
Set,
Get,
Expire,
HGet,
HGetAll,
HDel,
}
#[derive(Default, Clone)]

View File

@@ -195,15 +195,18 @@ impl NeonOptions {
// proxy options:
/// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute.
pub const PARAMS_COMPAT: &str = "proxy_params_compat";
pub const PARAMS_COMPAT: &'static str = "proxy_params_compat";
// cplane options:
/// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN.
const LSN: &str = "lsn";
const LSN: &'static str = "lsn";
/// `TIMESTAMP` allows provisioning an ephemeral compute with time-travel to the provided timestamp.
const TIMESTAMP: &'static str = "timestamp";
/// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write.
const ENDPOINT_TYPE: &str = "endpoint_type";
const ENDPOINT_TYPE: &'static str = "endpoint_type";
pub(crate) fn parse_params(params: &StartupMessageParams) -> Self {
params
@@ -228,6 +231,7 @@ impl NeonOptions {
// This is not a cplane option, we know it does not create ephemeral computes.
Self::PARAMS_COMPAT => false,
Self::LSN => true,
Self::TIMESTAMP => true,
Self::ENDPOINT_TYPE => true,
// err on the side of caution. any cplane options we don't know about
// might lead to ephemeral computes.

View File

@@ -4,11 +4,12 @@ use std::time::Duration;
use futures::FutureExt;
use redis::aio::{ConnectionLike, MultiplexedConnection};
use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult};
use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisError, RedisResult};
use tokio::task::AbortHandle;
use tracing::{error, info, warn};
use super::elasticache::CredentialsProvider;
use crate::redis::elasticache::CredentialsProviderError;
enum Credentials {
Static(ConnectionInfo),
@@ -26,6 +27,14 @@ impl Clone for Credentials {
}
}
#[derive(thiserror::Error, Debug)]
pub enum ConnectionProviderError {
#[error(transparent)]
Redis(#[from] RedisError),
#[error(transparent)]
CredentialsProvider(#[from] CredentialsProviderError),
}
/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token.
/// Provides PubSub connection without credentials refresh.
pub struct ConnectionWithCredentialsProvider {
@@ -86,15 +95,18 @@ impl ConnectionWithCredentialsProvider {
}
}
async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> {
redis::cmd("PING").query_async(con).await
async fn ping(con: &mut MultiplexedConnection) -> Result<(), ConnectionProviderError> {
redis::cmd("PING")
.query_async(con)
.await
.map_err(Into::into)
}
pub(crate) fn credentials_refreshed(&self) -> bool {
self.credentials_refreshed.load(Ordering::Relaxed)
}
pub(crate) async fn connect(&mut self) -> anyhow::Result<()> {
pub(crate) async fn connect(&mut self) -> Result<(), ConnectionProviderError> {
let _guard = self.mutex.lock().await;
if let Some(con) = self.con.as_mut() {
match Self::ping(con).await {
@@ -141,7 +153,7 @@ impl ConnectionWithCredentialsProvider {
Ok(())
}
async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
async fn get_connection_info(&self) -> Result<ConnectionInfo, ConnectionProviderError> {
match &self.credentials {
Credentials::Static(info) => Ok(info.clone()),
Credentials::Dynamic(provider, addr) => {
@@ -160,7 +172,7 @@ impl ConnectionWithCredentialsProvider {
}
}
async fn get_client(&self) -> anyhow::Result<redis::Client> {
async fn get_client(&self) -> Result<redis::Client, ConnectionProviderError> {
let client = redis::Client::open(self.get_connection_info().await?)?;
self.credentials_refreshed.store(true, Ordering::Relaxed);
Ok(client)

View File

@@ -9,10 +9,12 @@ use aws_config::meta::region::RegionProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use aws_credential_types::provider::error::CredentialsError;
use aws_sdk_iam::config::ProvideCredentials;
use aws_sigv4::http_request::{
self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
self, SignableBody, SignableRequest, SignatureLocation, SigningError, SigningSettings,
};
use aws_sigv4::sign::v4::signing_params::BuildError;
use tracing::info;
#[derive(Debug)]
@@ -40,6 +42,18 @@ impl AWSIRSAConfig {
}
}
#[derive(thiserror::Error, Debug)]
pub enum CredentialsProviderError {
#[error(transparent)]
AwsCredentials(#[from] CredentialsError),
#[error(transparent)]
AwsSigv4Build(#[from] BuildError),
#[error(transparent)]
AwsSigv4Signing(#[from] SigningError),
#[error(transparent)]
Http(#[from] http::Error),
}
/// Credentials provider for AWS elasticache authentication.
///
/// Official documentation:
@@ -92,7 +106,9 @@ impl CredentialsProvider {
})
}
pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
pub(crate) async fn provide_credentials(
&self,
) -> Result<(String, String), CredentialsProviderError> {
let aws_credentials = self
.credentials_provider
.provide_credentials()

View File

@@ -2,9 +2,18 @@ use std::time::Duration;
use futures::FutureExt;
use redis::aio::ConnectionLike;
use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
use redis::{Cmd, FromRedisValue, Pipeline, RedisError, RedisResult};
use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use crate::redis::connection_with_credentials_provider::ConnectionProviderError;
#[derive(thiserror::Error, Debug)]
pub enum RedisKVClientError {
#[error(transparent)]
Redis(#[from] RedisError),
#[error(transparent)]
ConnectionProvider(#[from] ConnectionProviderError),
}
pub struct RedisKVClient {
client: ConnectionWithCredentialsProvider,
@@ -32,12 +41,13 @@ impl RedisKVClient {
Self { client }
}
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
pub async fn try_connect(&mut self) -> Result<(), RedisKVClientError> {
self.client
.connect()
.boxed()
.await
.inspect_err(|e| tracing::error!("failed to connect to redis: {e}"))
.map_err(Into::into)
}
pub(crate) fn credentials_refreshed(&self) -> bool {
@@ -47,7 +57,7 @@ impl RedisKVClient {
pub(crate) async fn query<T: FromRedisValue>(
&mut self,
q: &impl Queryable,
) -> anyhow::Result<T> {
) -> Result<T, RedisKVClientError> {
let e = match q.query(&mut self.client).await {
Ok(t) => return Ok(t),
Err(e) => e,

View File

@@ -1,6 +1,7 @@
use json::{ListSer, ObjectSer, ValueSer};
use postgres_client::Row;
use postgres_client::types::{Kind, Type};
use serde_json::{Map, Value};
use serde_json::Value;
//
// Convert json non-string types to strings, so that they can be passed to Postgres
@@ -74,44 +75,40 @@ pub(crate) enum JsonConversionError {
UnbalancedString,
}
enum OutputMode {
Array(Vec<Value>),
Object(Map<String, Value>),
enum OutputMode<'a> {
Array(ListSer<'a>),
Object(ObjectSer<'a>),
}
impl OutputMode {
fn key(&mut self, key: &str) -> &mut Value {
impl OutputMode<'_> {
fn key(&mut self, key: &str) -> ValueSer<'_> {
match self {
OutputMode::Array(values) => push_entry(values, Value::Null),
OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null),
OutputMode::Array(values) => values.entry(),
OutputMode::Object(map) => map.key(key),
}
}
fn finish(self) -> Value {
fn finish(self) {
match self {
OutputMode::Array(values) => Value::Array(values),
OutputMode::Object(map) => Value::Object(map),
OutputMode::Array(values) => values.finish(),
OutputMode::Object(map) => map.finish(),
}
}
}
fn push_entry<T>(arr: &mut Vec<T>, t: T) -> &mut T {
arr.push(t);
arr.last_mut().expect("a value was just inserted")
}
//
// Convert postgres row with text-encoded values to JSON object
//
pub(crate) fn pg_text_row_to_json(
output: ValueSer,
row: &Row,
raw_output: bool,
array_mode: bool,
) -> Result<Value, JsonConversionError> {
) -> Result<(), JsonConversionError> {
let mut entries = if array_mode {
OutputMode::Array(Vec::with_capacity(row.columns().len()))
OutputMode::Array(output.list())
} else {
OutputMode::Object(Map::with_capacity(row.columns().len()))
OutputMode::Object(output.object())
};
for (i, column) in row.columns().iter().enumerate() {
@@ -120,53 +117,48 @@ pub(crate) fn pg_text_row_to_json(
let value = entries.key(column.name());
match pg_value {
Some(v) if raw_output => *value = Value::String(v.to_string()),
Some(v) if raw_output => value.value(v),
Some(v) => pg_text_to_json(value, v, column.type_())?,
None => *value = Value::Null,
None => value.value(json::Null),
}
}
Ok(entries.finish())
entries.finish();
Ok(())
}
//
// Convert postgres text-encoded value to JSON value
//
fn pg_text_to_json(
output: &mut Value,
val: &str,
pg_type: &Type,
) -> Result<(), JsonConversionError> {
fn pg_text_to_json(output: ValueSer, val: &str, pg_type: &Type) -> Result<(), JsonConversionError> {
if let Kind::Array(elem_type) = pg_type.kind() {
// todo: we should fetch this from postgres.
let delimiter = ',';
let mut array = vec![];
pg_array_parse(&mut array, val, elem_type, delimiter)?;
*output = Value::Array(array);
json::value_as_list!(|output| pg_array_parse(output, val, elem_type, delimiter)?);
return Ok(());
}
match *pg_type {
Type::BOOL => *output = Value::Bool(val == "t"),
Type::BOOL => output.value(val == "t"),
Type::INT2 | Type::INT4 => {
let val = val.parse::<i32>()?;
*output = Value::Number(serde_json::Number::from(val));
output.value(val);
}
Type::FLOAT4 | Type::FLOAT8 => {
let fval = val.parse::<f64>()?;
let num = serde_json::Number::from_f64(fval);
if let Some(num) = num {
*output = Value::Number(num);
if fval.is_finite() {
output.value(fval);
} else {
// Pass NaN, Inf, -Inf as strings.
// JS JSON.stringify() converts them to null, but we
// want to preserve them, so we pass them as strings.
*output = Value::String(val.to_string());
output.value(val);
}
}
Type::JSON | Type::JSONB => *output = serde_json::from_str(val)?,
_ => *output = Value::String(val.to_string()),
// we assume that the string value is valid json.
Type::JSON | Type::JSONB => output.write_raw_json(val.as_bytes()),
_ => output.value(val),
}
Ok(())
@@ -192,7 +184,7 @@ fn pg_text_to_json(
/// gets its own level of curly braces, and delimiters must be written between adjacent
/// curly-braced entities of the same level.
fn pg_array_parse(
elements: &mut Vec<Value>,
elements: &mut ListSer,
mut pg_array: &str,
elem: &Type,
delim: char,
@@ -221,7 +213,7 @@ fn pg_array_parse(
/// reads a single array from the `pg_array` string and pushes each values to `elements`.
/// returns the rest of the `pg_array` string that was not read.
fn pg_array_parse_inner<'a>(
elements: &mut Vec<Value>,
elements: &mut ListSer,
mut pg_array: &'a str,
elem: &Type,
delim: char,
@@ -234,7 +226,7 @@ fn pg_array_parse_inner<'a>(
let mut q = String::new();
loop {
let value = push_entry(elements, Value::Null);
let value = elements.entry();
pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?;
// check for separator.
@@ -260,7 +252,7 @@ fn pg_array_parse_inner<'a>(
///
/// `quoted` is a scratch allocation that has no defined output.
fn pg_array_parse_item<'a>(
output: &mut Value,
output: ValueSer,
quoted: &mut String,
mut pg_array: &'a str,
elem: &Type,
@@ -276,9 +268,8 @@ fn pg_array_parse_item<'a>(
if pg_array.starts_with('{') {
// nested array.
let mut nested = vec![];
pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?;
*output = Value::Array(nested);
pg_array =
json::value_as_list!(|output| pg_array_parse_inner(output, pg_array, elem, delim))?;
return Ok(pg_array);
}
@@ -306,7 +297,7 @@ fn pg_array_parse_item<'a>(
// we might have an item string:
// check for null
if item == "NULL" {
*output = Value::Null;
output.value(json::Null);
} else {
pg_text_to_json(output, item, elem)?;
}
@@ -440,15 +431,15 @@ mod tests {
}
fn pg_text_to_json(val: &str, pg_type: &Type) -> Value {
let mut v = Value::Null;
super::pg_text_to_json(&mut v, val, pg_type).unwrap();
v
let output = json::value_to_string!(|v| super::pg_text_to_json(v, val, pg_type).unwrap());
serde_json::from_str(&output).unwrap()
}
fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value {
let mut array = vec![];
super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap();
Value::Array(array)
let output = json::value_to_string!(|v| json::value_as_list!(|v| {
super::pg_array_parse(v, pg_array, pg_type, ',').unwrap();
}));
serde_json::from_str(&output).unwrap()
}
#[test]

View File

@@ -14,10 +14,7 @@ use hyper::http::{HeaderName, HeaderValue};
use hyper::{Request, Response, StatusCode, header};
use indexmap::IndexMap;
use postgres_client::error::{DbError, ErrorPosition, SqlState};
use postgres_client::{
GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction,
};
use serde::Serialize;
use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
use serde_json::Value;
use serde_json::value::RawValue;
use tokio::time::{self, Instant};
@@ -687,32 +684,21 @@ impl QueryData {
let (inner, mut discard) = client.inner();
let cancel_token = inner.cancel_token();
match select(
let mut json_buf = vec![];
let batch_result = match select(
pin!(query_to_json(
config,
&mut *inner,
self,
&mut 0,
json::ValueSer::new(&mut json_buf),
parsed_headers
)),
pin!(cancel.cancelled()),
)
.await
{
// The query successfully completed.
Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
discard.check_idle(status);
let json_output =
serde_json::to_string(&results).expect("json serialization should not fail");
Ok(json_output)
}
// The query failed with an error
Either::Left((Err(e), __not_yet_cancelled)) => {
discard.discard();
Err(e)
}
// The query was cancelled.
Either::Left((res, __not_yet_cancelled)) => res,
Either::Right((_cancelled, query)) => {
tracing::info!("cancelling query");
if let Err(err) = cancel_token.cancel_query(NoTls).await {
@@ -721,13 +707,7 @@ impl QueryData {
// wait for the query cancellation
match time::timeout(time::Duration::from_millis(100), query).await {
// query successed before it was cancelled.
Ok(Ok((status, results))) => {
discard.check_idle(status);
let json_output = serde_json::to_string(&results)
.expect("json serialization should not fail");
Ok(json_output)
}
Ok(Ok(status)) => Ok(status),
// query failed or was cancelled.
Ok(Err(error)) => {
let db_error = match &error {
@@ -743,14 +723,29 @@ impl QueryData {
discard.discard();
}
Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
}
Err(_timeout) => {
discard.discard();
Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
}
}
}
};
match batch_result {
// The query successfully completed.
Ok(status) => {
discard.check_idle(status);
let json_output = String::from_utf8(json_buf).expect("json should be valid utf8");
Ok(json_output)
}
// The query failed with an error
Err(e) => {
discard.discard();
Err(e)
}
}
}
}
@@ -787,7 +782,7 @@ impl BatchQueryData {
})
.map_err(SqlOverHttpError::Postgres)?;
let json_output = match query_batch(
let json_output = match query_batch_to_json(
config,
cancel.child_token(),
&mut transaction,
@@ -845,24 +840,21 @@ async fn query_batch(
transaction: &mut Transaction<'_>,
queries: BatchQueryData,
parsed_headers: HttpHeaders,
) -> Result<String, SqlOverHttpError> {
let mut results = Vec::with_capacity(queries.queries.len());
let mut current_size = 0;
results: &mut json::ListSer<'_>,
) -> Result<(), SqlOverHttpError> {
for stmt in queries.queries {
let query = pin!(query_to_json(
config,
transaction,
stmt,
&mut current_size,
results.entry(),
parsed_headers,
));
let cancelled = pin!(cancel.cancelled());
let res = select(query, cancelled).await;
match res {
// TODO: maybe we should check that the transaction bit is set here
Either::Left((Ok((_, values)), _cancelled)) => {
results.push(values);
}
Either::Left((Ok(_), _cancelled)) => {}
Either::Left((Err(e), _cancelled)) => {
return Err(e);
}
@@ -872,8 +864,22 @@ async fn query_batch(
}
}
let results = json!({ "results": results });
let json_output = serde_json::to_string(&results).expect("json serialization should not fail");
Ok(())
}
async fn query_batch_to_json(
config: &'static HttpConfig,
cancel: CancellationToken,
tx: &mut Transaction<'_>,
queries: BatchQueryData,
headers: HttpHeaders,
) -> Result<String, SqlOverHttpError> {
let json_output = json::value_to_string!(|obj| json::value_as_object!(|obj| {
let results = obj.key("results");
json::value_as_list!(|results| {
query_batch(config, cancel, tx, queries, headers, results).await?;
});
}));
Ok(json_output)
}
@@ -882,54 +888,54 @@ async fn query_to_json<T: GenericClient>(
config: &'static HttpConfig,
client: &mut T,
data: QueryData,
current_size: &mut usize,
output: json::ValueSer<'_>,
parsed_headers: HttpHeaders,
) -> Result<(ReadyForQueryStatus, impl Serialize + use<T>), SqlOverHttpError> {
) -> Result<ReadyForQueryStatus, SqlOverHttpError> {
let query_start = Instant::now();
let query_params = data.params;
let mut output = json::ObjectSer::new(output);
let mut row_stream = client
.query_raw_txt(&data.query, query_params)
.query_raw_txt(&data.query, data.params)
.await
.map_err(SqlOverHttpError::Postgres)?;
let query_acknowledged = Instant::now();
let columns_len = row_stream.statement.columns().len();
let mut fields = Vec::with_capacity(columns_len);
let mut json_fields = output.key("fields").list();
for c in row_stream.statement.columns() {
fields.push(json!({
"name": c.name().to_owned(),
"dataTypeID": c.type_().oid(),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
"dataTypeModifier": c.type_modifier(),
"format": "text",
}));
let json_field = json_fields.entry();
json::value_as_object!(|json_field| {
json_field.entry("name", c.name());
json_field.entry("dataTypeID", c.type_().oid());
json_field.entry("tableID", c.table_oid());
json_field.entry("columnID", c.column_id());
json_field.entry("dataTypeSize", c.type_size());
json_field.entry("dataTypeModifier", c.type_modifier());
json_field.entry("format", "text");
});
}
json_fields.finish();
let raw_output = parsed_headers.raw_output;
let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
let raw_output = parsed_headers.raw_output;
// Manually drain the stream, writing each row into the JSON output, and
// leave row_stream around so we can read the command tag afterwards. Also
// check that the response is not too big.
let mut rows = Vec::new();
let mut rows = 0;
let mut json_rows = output.key("rows").list();
while let Some(row) = row_stream.next().await {
let row = row.map_err(SqlOverHttpError::Postgres)?;
*current_size += row.body_len();
// we don't have a streaming response support yet so this is to prevent OOM
// from a malicious query (eg a cross join)
if *current_size > config.max_response_size_bytes {
if json_rows.as_buffer().len() > config.max_response_size_bytes {
return Err(SqlOverHttpError::ResponseTooLarge(
config.max_response_size_bytes,
));
}
let row = pg_text_row_to_json(&row, raw_output, array_mode)?;
rows.push(row);
pg_text_row_to_json(json_rows.entry(), &row, raw_output, array_mode)?;
rows += 1;
// assumption: parsing pg text and converting to json takes CPU time.
// let's assume it is slightly expensive, so we should consume some cooperative budget.
@@ -937,16 +943,14 @@ async fn query_to_json<T: GenericClient>(
// of rows and never hit the tokio mpsc for a long time (although unlikely).
tokio::task::consume_budget().await;
}
json_rows.finish();
let query_resp_end = Instant::now();
let RowStream {
command_tag,
status: ready,
..
} = row_stream;
let ready = row_stream.status;
// grab the command tag and number of rows affected
let command_tag = command_tag.unwrap_or_default();
let command_tag = row_stream.command_tag.unwrap_or_default();
let mut command_tag_split = command_tag.split(' ');
let command_tag_name = command_tag_split.next().unwrap_or_default();
let command_tag_count = if command_tag_name == "INSERT" {
@@ -959,7 +963,7 @@ async fn query_to_json<T: GenericClient>(
.and_then(|s| s.parse::<i64>().ok());
info!(
rows = rows.len(),
rows,
?ready,
command_tag,
acknowledgement = ?(query_acknowledged - query_start),
@@ -967,16 +971,12 @@ async fn query_to_json<T: GenericClient>(
"finished executing query"
);
// Resulting JSON format is based on the format of node-postgres result.
let results = json!({
"command": command_tag_name.to_string(),
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
});
output.entry("command", command_tag_name);
output.entry("rowCount", command_tag_count);
output.entry("rowAsArray", array_mode);
Ok((ready, results))
output.finish();
Ok(ready)
}
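The shape of the rewrite: instead of accumulating a `serde_json::Value` tree and serializing at the end, each field and row is written straight into the output buffer as it is produced, so the size check can run against the buffer while streaming. A rough sketch of the writer pattern, using the repo-internal `json` crate as it appears above (`value_to_string!`, `value_as_object!`, `key`, `entry`, `list`, `finish`); exact signatures are the internal crate's, not a published API:

```rust
// Sketch only: mirrors the usage visible in this diff.
fn render_sketch() -> String {
    json::value_to_string!(|root| json::value_as_object!(|root| {
        let mut rows = root.key("rows").list();
        for i in 0..3 {
            // Each entry is serialized into the byte buffer immediately,
            // so no intermediate Value tree is ever built.
            rows.entry().value(i);
        }
        rows.finish();
        root.entry("rowCount", 3);
        root.entry("rowAsArray", false);
    }))
}
```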
enum Client {

View File

@@ -7,8 +7,16 @@ pub async fn run_until_cancelled<F: Future>(
f: F,
cancellation_token: &CancellationToken,
) -> Option<F::Output> {
match select(pin!(f), pin!(cancellation_token.cancelled())).await {
Either::Left((f, _)) => Some(f),
Either::Right(((), _)) => None,
run_until(f, cancellation_token.cancelled()).await.ok()
}
/// Runs the future `f` unless interrupted by future `condition`.
pub async fn run_until<F1: Future, F2: Future>(
f: F1,
condition: F2,
) -> Result<F1::Output, F2::Output> {
match select(pin!(f), pin!(condition)).await {
Either::Left((f1, _)) => Ok(f1),
Either::Right((f2, _)) => Err(f2),
}
}
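A small usage sketch: the refresh loop above uses exactly this pattern to sleep until either the timer fires (`Ok`) or cancellation wins (`Err`). Assumes `run_until` from this module plus `tokio` and `tokio_util`:

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Returns true if the tick elapsed, false if we were cancelled first.
async fn refresh_tick(cancel: &CancellationToken) -> bool {
    match run_until(tokio::time::sleep(Duration::from_secs(570)), cancel.cancelled()).await {
        Ok(()) => true,   // timer elapsed: go refresh the key
        Err(()) => false, // cancelled: caller should break its loop
    }
}
```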

View File

@@ -32,7 +32,7 @@ psutil = "^5.9.4"
types-psutil = "^5.9.5.12"
types-toml = "^0.10.8.6"
pytest-httpserver = "^1.0.8"
aiohttp = "3.10.11"
aiohttp = "3.12.14"
pytest-rerunfailures = "^15.0"
types-pytest-lazy-fixture = "^0.6.3.3"
pytest-split = "^0.8.1"

View File

@@ -6,10 +6,10 @@
use std::error::Error as _;
use http_utils::error::HttpErrorBody;
use reqwest::{IntoUrl, Method, StatusCode};
use reqwest::{IntoUrl, Method, Response, StatusCode};
use safekeeper_api::models::{
self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization,
TimelineCreateRequest, TimelineStatus,
TimelineCreateRequest,
};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::logging::SecretString;
@@ -161,13 +161,12 @@ impl Client {
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<TimelineStatus> {
) -> Result<Response> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}",
self.mgmt_api_endpoint, tenant_id, timeline_id
);
let resp = self.get(&uri).await?;
resp.json().await.map_err(Error::ReceiveBody)
self.get(&uri).await
}
pub async fn snapshot(

View File

@@ -23,6 +23,7 @@ use safekeeper::defaults::{
DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
};
use safekeeper::hadron;
use safekeeper::wal_backup::WalBackup;
use safekeeper::{
BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
@@ -252,6 +253,10 @@ struct Args {
/// Run in development mode (disables security checks)
#[arg(long, help = "Run in development mode (disables security checks)")]
dev: bool,
/* BEGIN_HADRON */
#[arg(long)]
enable_pull_timeline_on_startup: bool,
/* END_HADRON */
}
// Like PathBufValueParser, but allows empty string.
@@ -435,6 +440,11 @@ async fn main() -> anyhow::Result<()> {
use_https_safekeeper_api: args.use_https_safekeeper_api,
enable_tls_wal_service_api: args.enable_tls_wal_service_api,
force_metric_collection_on_scrape: args.force_metric_collection_on_scrape,
/* BEGIN_HADRON */
advertise_pg_addr_tenant_only: None,
enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup,
hcc_base_url: None,
/* END_HADRON */
});
// initialize sentry if SENTRY_DSN is provided
@@ -529,6 +539,20 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
// Load all timelines from disk to memory.
global_timelines.init().await?;
/* BEGIN_HADRON */
if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 {
match hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await {
Ok(_) => {
info!("Successfully pulled all timelines from peer safekeepers");
}
Err(e) => {
error!("Failed to pull timelines from peer safekeepers: {:?}", e);
return Err(e);
}
}
}
/* END_HADRON */
// Run everything in current thread rt, if asked.
if conf.current_thread_runtime {
info!("running in current thread runtime");

safekeeper/src/hadron.rs (new file, 388 lines)
View File

@@ -0,0 +1,388 @@
use pem::Pem;
use safekeeper_api::models::PullTimelineRequest;
use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration};
use tokio::time::sleep;
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, id::TenantTimelineId, ip_address};
use anyhow::Result;
use pageserver_api::controller_api::{
AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse,
};
use crate::{
GlobalTimelines, SafeKeeperConf,
metrics::{
SK_RECOVERY_PULL_TIMELINE_ERRORS, SK_RECOVERY_PULL_TIMELINE_OKS,
SK_RECOVERY_PULL_TIMELINE_SECONDS, SK_RECOVERY_PULL_TIMELINES_SECONDS,
},
pull_timeline,
timelines_global_map::DeleteOrExclude,
};
// Extract information in the SafeKeeperConf to build a NodeRegisterRequest used to register the safekeeper with the HCC.
fn build_node_registeration_request(
conf: &SafeKeeperConf,
node_ip_addr: Option<IpAddr>,
) -> Result<NodeRegisterRequest> {
let advertise_pg_addr_with_port = conf
.advertise_pg_addr_tenant_only
.as_deref()
.expect("advertise_pg_addr_tenant_only is required to register with HCC");
// Extract host/port from the string.
let (advertise_host_addr, pg_port_str) = advertise_pg_addr_with_port.split_at(
advertise_pg_addr_with_port
.rfind(':')
.ok_or(anyhow::anyhow!("Invalid advertise_pg_addr"))?,
);
// Need the `[1..]` to remove the leading ':'.
let pg_port = pg_port_str[1..]
.parse::<u16>()
.map_err(|e| anyhow::anyhow!("Cannot parse PG port: {}", e))?;
let (_, http_port_str) = conf.listen_http_addr.split_at(
conf.listen_http_addr
.rfind(':')
.ok_or(anyhow::anyhow!("Invalid listen_http_addr"))?,
);
let http_port = http_port_str[1..]
.parse::<u16>()
.map_err(|e| anyhow::anyhow!("Cannot parse HTTP port: {}", e))?;
Ok(NodeRegisterRequest {
node_id: conf.my_id,
listen_pg_addr: advertise_host_addr.to_string(),
listen_pg_port: pg_port,
listen_http_addr: advertise_host_addr.to_string(),
listen_http_port: http_port,
node_ip_addr,
availability_zone_id: AvailabilityZone("todo".to_string()),
listen_grpc_addr: None,
listen_grpc_port: None,
listen_https_port: None,
})
}
// Retrieve the JWT token used for authenticating with HCC from the environment variable.
// Returns None if the token cannot be retrieved.
fn get_hcc_auth_token() -> Option<String> {
match std::env::var("HCC_AUTH_TOKEN") {
Ok(v) => {
tracing::info!("Loaded JWT token for authentication with HCC");
Some(v)
}
Err(VarError::NotPresent) => {
tracing::info!("No JWT token for authentication with HCC detected");
None
}
Err(_) => {
tracing::info!(
"Failed to either load to detect non-present HCC_AUTH_TOKEN environment variable"
);
None
}
}
}
async fn send_safekeeper_register_request(
request_url: &Url,
auth_token: &Option<String>,
request: &NodeRegisterRequest,
) -> Result<()> {
let client = reqwest::Client::new();
let mut req_builder = client
.post(request_url.clone())
.header("Content-Type", "application/json");
if let Some(token) = auth_token {
req_builder = req_builder.bearer_auth(token);
}
req_builder
.json(&request)
.send()
.await?
.error_for_status()?;
Ok(())
}
/// Registers this safekeeper with the HCC.
pub async fn register(conf: &SafeKeeperConf) -> Result<()> {
match conf.hcc_base_url.as_ref() {
None => {
tracing::info!("HCC base URL is not set, skipping registration");
Ok(())
}
Some(hcc_base_url) => {
// Acquiring the auth token and the node IP address below both read environment
// variables. That's fine for now, as this `register()` function is only called once
// during startup. If we start to talk to the HCC more regularly from the safekeeper,
// we should consider refactoring this into a "HadronClusterCoordinatorClient" struct.
let auth_token = get_hcc_auth_token();
let node_ip_addr =
ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address.");
let request = build_node_registration_request(conf, node_ip_addr)?;
let cancel = CancellationToken::new();
let request_url = hcc_base_url.clone().join("/hadron-internal/v1/sk")?;
backoff::retry(
|| async {
send_safekeeper_register_request(&request_url, &auth_token, &request).await
},
|_| false,
3,
u32::MAX,
"Calling the HCC safekeeper register API",
&cancel,
)
.await
.ok_or(anyhow::anyhow!(
"Error in forever retry loop. This error should never be surfaced."
))?
}
}
}
async fn safekeeper_list_timelines_request(
conf: &SafeKeeperConf,
) -> Result<pageserver_api::controller_api::SafekeeperTimelinesResponse> {
if conf.hcc_base_url.is_none() {
tracing::info!("HCC base URL is not set, skipping registration");
return Err(anyhow::anyhow!("HCC base URL is not set"));
}
// Acquiring the auth token reads environment variables. That's fine for now, as this
// function is only called once during startup. If we start to talk to the HCC more
// regularly from the safekeeper, we should consider refactoring this into a
// "HadronClusterCoordinatorClient" struct.
let auth_token = get_hcc_auth_token();
let method = format!("/control/v1/safekeeper/{}/timelines", conf.my_id.0);
let request_url = conf.hcc_base_url.as_ref().unwrap().clone().join(&method)?;
let client = reqwest::Client::new();
let mut req_builder = client
.get(request_url.clone())
.header("Content-Type", "application/json")
.query(&[("id", conf.my_id.0)]);
if let Some(token) = auth_token {
req_builder = req_builder.bearer_auth(token);
}
let response = req_builder
.send()
.await?
.error_for_status()?
.json::<pageserver_api::controller_api::SafekeeperTimelinesResponse>()
.await?;
Ok(response)
}
// Returns true on success, false otherwise.
pub async fn hcc_pull_timeline(
timeline: SafekeeperTimeline,
conf: &SafeKeeperConf,
global_timelines: Arc<GlobalTimelines>,
nodeid_http: &HashMap<u64, String>,
) -> bool {
let mut request = PullTimelineRequest {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
http_hosts: Vec::new(),
ignore_tombstone: None,
};
for host in timeline.peers {
if host.0 == conf.my_id.0 {
continue;
}
if let Some(http_host) = nodeid_http.get(&host.0) {
request.http_hosts.push(http_host.clone());
}
}
let ca_certs = match conf
.ssl_ca_certs
.iter()
.map(Pem::contents)
.map(reqwest::Certificate::from_der)
.collect::<Result<Vec<_>, _>>()
{
Ok(result) => result,
Err(_) => {
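// Failed to parse the configured CA certs into client certificates;
// we cannot talk to peer safekeepers, so report failure.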
return false;
}
};
match pull_timeline::handle_request(
request,
conf.sk_auth_token.clone(),
ca_certs,
global_timelines.clone(),
true,
)
.await
{
Ok(resp) => {
tracing::info!(
"Completed pulling tenant {} timeline {} from SK {:?}",
timeline.tenant_id,
timeline.timeline_id,
resp.safekeeper_host
);
return true;
}
Err(e) => {
tracing::error!(
"Failed to pull tenant {} timeline {} from SK {}",
timeline.tenant_id,
timeline.timeline_id,
e
);
let ttid = TenantTimelineId {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
};
// Revert the failed timeline pull.
// Note that deleting a timeline that does not exist also returns OK.
match global_timelines
.delete_or_exclude(&ttid, DeleteOrExclude::DeleteLocal)
.await
{
Ok(dr) => {
tracing::info!(
"Deleted tenant {} timeline {} DirExists: {}",
timeline.tenant_id,
timeline.timeline_id,
dr.dir_existed,
);
}
Err(e) => {
tracing::error!(
"Failed to delete tenant {} timeline {} from global_timelines: {}",
timeline.tenant_id,
timeline.timeline_id,
e
);
}
}
}
}
false
}
pub async fn hcc_pull_timeline_till_success(
timeline: SafekeeperTimeline,
conf: &SafeKeeperConf,
global_timelines: Arc<GlobalTimelines>,
nodeid_http: &HashMap<u64, String>,
) {
const MAX_PULL_TIMELINE_RETRIES: u64 = 100;
for i in 0..MAX_PULL_TIMELINE_RETRIES {
if hcc_pull_timeline(
timeline.clone(),
conf,
global_timelines.clone(),
nodeid_http,
)
.await
{
SK_RECOVERY_PULL_TIMELINE_OKS.inc();
return;
}
tracing::error!(
"Failed to pull timeline {} from SK peers, retrying {}/{}",
timeline.timeline_id,
i + 1,
MAX_PULL_TIMELINE_RETRIES
);
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
SK_RECOVERY_PULL_TIMELINE_ERRORS.inc();
}
pub async fn hcc_pull_timelines(
conf: &SafeKeeperConf,
global_timelines: Arc<GlobalTimelines>,
) -> Result<()> {
let _timer = SK_RECOVERY_PULL_TIMELINES_SECONDS.start_timer();
tracing::info!("Start pulling timelines from SK peers");
let mut response = SafekeeperTimelinesResponse {
timelines: Vec::new(),
safekeeper_peers: Vec::new(),
};
// Retry listing timelines from the HCC, up to 100 attempts spaced 100ms apart.
for i in 0..100 {
match safekeeper_list_timelines_request(conf).await {
Ok(timelines) => {
response = timelines;
break;
}
Err(e) => {
tracing::error!("Failed to list timelines from HCC: {}", e);
if i == 99 {
return Err(e);
}
}
}
sleep(Duration::from_millis(100)).await;
}
let mut nodeid_http = HashMap::new();
for sk in response.safekeeper_peers {
nodeid_http.insert(
sk.node_id.0,
format!("http://{}:{}", sk.listen_http_addr, sk.http_port),
);
}
tracing::info!("Received {} timelines from HCC", response.timelines.len());
for timeline in response.timelines {
let _timer = SK_RECOVERY_PULL_TIMELINE_SECONDS
.with_label_values(&[
&timeline.tenant_id.to_string(),
&timeline.timeline_id.to_string(),
])
.start_timer();
hcc_pull_timeline_till_success(timeline, conf, global_timelines.clone(), &nodeid_http)
.await;
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;
#[test]
fn test_build_node_registration_request() {
// Test that:
// 1. We always extract the host name and port used to register with the HCC from the
// `advertise_pg_addr` if it is set.
// 2. The correct ports are extracted from `advertise_pg_addr` and `listen_http_addr`.
let mut conf = SafeKeeperConf::dummy();
conf.my_id = NodeId(1);
conf.advertise_pg_addr_tenant_only =
Some("safe-keeper-1.safe-keeper.hadron.svc.cluster.local:5454".to_string());
// `listen_pg_addr` and `listen_pg_addr_tenant_only` are not used for node registration. Set them to a different
// host and port values and make sure that they don't show up in the node registration request.
conf.listen_pg_addr = "0.0.0.0:5456".to_string();
conf.listen_pg_addr_tenant_only = Some("0.0.0.0:5456".to_string());
conf.listen_http_addr = "0.0.0.0:7676".to_string();
let node_ip_addr: Option<IpAddr> = Some("127.0.0.1".parse().unwrap());
let request = build_node_registration_request(&conf, node_ip_addr).unwrap();
assert_eq!(request.node_id, NodeId(1));
assert_eq!(
request.listen_pg_addr,
"safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
);
assert_eq!(request.listen_pg_port, 5454);
assert_eq!(
request.listen_http_addr,
"safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
);
assert_eq!(request.listen_http_port, 7676);
assert_eq!(
request.node_ip_addr,
Some(IpAddr::V4("127.0.0.1".parse().unwrap()))
);
}
}

View File

@@ -241,9 +241,14 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
ApiError::InternalServerError(anyhow::anyhow!("failed to parse CA certs: {e}"))
})?;
let resp =
pull_timeline::handle_request(data, conf.sk_auth_token.clone(), ca_certs, global_timelines)
.await?;
let resp = pull_timeline::handle_request(
data,
conf.sk_auth_token.clone(),
ca_certs,
global_timelines,
false,
)
.await?;
json_response(StatusCode::OK, resp)
}
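With the extra boolean, this HTTP handler opts out of waiting for peer timeline status. A caller that must hear from every peer (e.g. startup recovery) would pass `true` instead; a sketch assuming the same `handle_request` signature shown in pull_timeline.rs further below:

let resp = pull_timeline::handle_request(
    data,
    conf.sk_auth_token.clone(),
    ca_certs,
    global_timelines,
    true, // wait_for_peer_timeline_status: retry until every peer reports
)
.await?;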

View File

@@ -10,6 +10,7 @@ use pem::Pem;
use remote_storage::RemoteStorageConfig;
use storage_broker::Uri;
use tokio::runtime::Runtime;
use url::Url;
use utils::auth::SwappableJwtAuth;
use utils::id::NodeId;
use utils::logging::SecretString;
@@ -20,6 +21,7 @@ pub mod control_file;
pub mod control_file_upgrade;
pub mod copy_timeline;
pub mod debug_dump;
pub mod hadron;
pub mod handler;
pub mod http;
pub mod metrics;
@@ -100,6 +102,11 @@ pub struct SafeKeeperConf {
pub advertise_pg_addr: Option<String>,
pub availability_zone: Option<String>,
pub no_sync: bool,
/* BEGIN_HADRON */
pub advertise_pg_addr_tenant_only: Option<String>,
pub enable_pull_timeline_on_startup: bool,
pub hcc_base_url: Option<Url>,
/* END_HADRON */
pub broker_endpoint: Uri,
pub broker_keepalive_interval: Duration,
pub heartbeat_timeout: Duration,
@@ -185,6 +192,11 @@ impl SafeKeeperConf {
use_https_safekeeper_api: false,
enable_tls_wal_service_api: false,
force_metric_collection_on_scrape: true,
/* BEGIN_HADRON */
advertise_pg_addr_tenant_only: None,
enable_pull_timeline_on_startup: false,
hcc_base_url: None,
/* END_HADRON */
}
}
}
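Because `hcc_base_url` is a parsed `url::Url` rather than a raw string, a malformed endpoint fails at configuration time instead of at request time. A small sketch of populating the new Hadron fields (all values are placeholders):

use url::Url;

let mut conf = SafeKeeperConf::dummy();
conf.advertise_pg_addr_tenant_only = Some("sk-0.example.internal:5454".to_string());
conf.enable_pull_timeline_on_startup = true;
conf.hcc_base_url = Some(Url::parse("http://hcc.example.internal:1234").expect("valid URL"));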

View File

@@ -85,6 +85,43 @@ pub static WAL_STORAGE_LIMIT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_wal_storage_limit_errors counter")
});
pub static SK_RECOVERY_PULL_TIMELINE_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_recovery_pull_timeline_errors",
concat!(
"Number of pull_timeline errors during SK lost-disk recovery. ",
"An increase in this metric indicates that pull_timeline runs are failing."
)
)
.expect("Failed to register safekeeper_recovery_pull_timeline_errors counter")
});
pub static SK_RECOVERY_PULL_TIMELINE_OKS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_recovery_pull_timeline_oks",
concat!(
"Number of successful pull_timeline runs during SK lost-disk recovery. ",
"An increase in this metric indicates that pull_timeline runs are succeeding."
)
)
.expect("Failed to register safekeeper_recovery_pull_timeline_oks counter")
});
pub static SK_RECOVERY_PULL_TIMELINES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"safekeeper_recovery_pull_timelines_seconds",
"Seconds to pull timelines",
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_recovery_pull_timelines_seconds histogram")
});
pub static SK_RECOVERY_PULL_TIMELINE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"safekeeper_recovery_pull_timeline_seconds",
"Seconds to pull timeline",
&["tenant_id", "timeline_id"],
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_recovery_pull_timeline_seconds histogram vec")
});
/* END_HADRON */
pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(

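The `start_timer()` calls paired with these histograms rely on prometheus's guard-based timing: `start_timer()` returns a `HistogramTimer` that observes the elapsed seconds into the histogram when it is dropped. A self-contained sketch of the pattern (the metric name and workload are illustrative only):

use once_cell::sync::Lazy;
use prometheus::{Histogram, register_histogram};

static DEMO_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!("demo_seconds", "Seconds spent in demo work").unwrap()
});

fn do_work() {
    let _timer = DEMO_SECONDS.start_timer(); // timing starts here
    // ... the work being measured ...
} // the guard drops here and records the duration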
View File

@@ -8,6 +8,7 @@ use bytes::Bytes;
use camino::Utf8PathBuf;
use chrono::{DateTime, Utc};
use futures::{SinkExt, StreamExt, TryStreamExt};
use http::StatusCode;
use http_utils::error::ApiError;
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
use remote_storage::GenericRemoteStorage;
@@ -21,10 +22,11 @@ use tokio::fs::OpenOptions;
use tokio::io::AsyncWrite;
use tokio::sync::mpsc;
use tokio::task;
use tokio::time::sleep;
use tokio_tar::{Archive, Builder, Header};
use tokio_util::io::{CopyToBytes, SinkWriter};
use tokio_util::sync::PollSender;
use tracing::{error, info, instrument};
use tracing::{error, info, instrument, warn};
use utils::crashsafe::fsync_async_opt;
use utils::id::{NodeId, TenantTimelineId};
use utils::logging::SecretString;
@@ -449,6 +451,7 @@ pub async fn handle_request(
sk_auth_token: Option<SecretString>,
ssl_ca_certs: Vec<Certificate>,
global_timelines: Arc<GlobalTimelines>,
wait_for_peer_timeline_status: bool,
) -> Result<PullTimelineResponse, ApiError> {
let existing_tli = global_timelines.get(TenantTimelineId::new(
request.tenant_id,
@@ -472,37 +475,100 @@ pub async fn handle_request(
let http_hosts = request.http_hosts.clone();
// Figure out statuses of potential donors.
let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
futures::future::join_all(http_hosts.iter().map(|url| async {
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
let info = cclient
.timeline_status(request.tenant_id, request.timeline_id)
.await?;
Ok(info)
}))
.await;
let mut statuses = Vec::new();
for (i, response) in responses.into_iter().enumerate() {
match response {
Ok(status) => {
statuses.push((status, i));
}
Err(e) => {
info!("error fetching status from {}: {e}", http_hosts[i]);
if !wait_for_peer_timeline_status {
let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
futures::future::join_all(http_hosts.iter().map(|url| async {
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
let resp = cclient
.timeline_status(request.tenant_id, request.timeline_id)
.await?;
let info: TimelineStatus = resp
.json()
.await
.context("Failed to deserialize timeline status")
.map_err(|e| mgmt_api::Error::ReceiveErrorBody(e.to_string()))?;
Ok(info)
}))
.await;
for (i, response) in responses.into_iter().enumerate() {
match response {
Ok(status) => {
statuses.push((status, i));
}
Err(e) => {
info!("error fetching status from {}: {e}", http_hosts[i]);
}
}
}
}
// Allow missing responses from up to one safekeeper (say due to downtime),
// e.g. if we created a timeline on SKs A and B, with C being offline. Then B goes
// offline and C comes online. Then we want a pull on C with A and B as hosts to work.
let min_required_successful = (http_hosts.len() - 1).max(1);
if statuses.len() < min_required_successful {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"only got {} successful status responses. required: {min_required_successful}",
statuses.len()
)));
// Allow missing responses from up to one safekeeper (say due to downtime),
// e.g. if we created a timeline on SKs A and B, with C being offline. Then B goes
// offline and C comes online. Then we want a pull on C with A and B as hosts to work.
let min_required_successful = (http_hosts.len() - 1).max(1);
if statuses.len() < min_required_successful {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"only got {} successful status responses. required: {min_required_successful}",
statuses.len()
)));
}
} else {
let mut retry = true;
// We must get status from all other peers.
// Otherwise, we may run into a split-brain scenario.
while retry {
statuses.clear();
retry = false;
for (i, url) in http_hosts.iter().enumerate() {
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
match cclient
.timeline_status(request.tenant_id, request.timeline_id)
.await
{
Ok(resp) => {
if resp.status() == StatusCode::NOT_FOUND {
warn!(
"Timeline {} not found on peer SK {}, no need to pull it",
TenantTimelineId::new(request.tenant_id, request.timeline_id),
url
);
return Ok(PullTimelineResponse {
safekeeper_host: None,
});
}
let info: TimelineStatus = resp
.json()
.await
.context("Failed to deserialize timeline status")
.map_err(ApiError::InternalServerError)?;
statuses.push((info, i));
}
Err(e) => {
match e {
// A 404 means the timeline doesn't exist on this safekeeper,
// so there is nothing to pull from it.
mgmt_api::Error::ApiError(status, _)
if status == StatusCode::NOT_FOUND =>
{
warn!(
"Timeline {} not found on peer SK {}, no need to pull it",
TenantTimelineId::new(request.tenant_id, request.timeline_id),
url
);
return Ok(PullTimelineResponse {
safekeeper_host: None,
});
}
_ => {}
}
retry = true;
error!("Failed to get timeline status from {}: {:#}", url, e);
}
}
}
if retry {
sleep(std::time::Duration::from_millis(100)).await;
}
}
}
// Find the most advanced safekeeper
@@ -511,6 +577,12 @@ pub async fn handle_request(
.max_by_key(|(status, _)| {
(
status.acceptor_state.epoch,
/* BEGIN_HADRON */
// We need to pull from the SK with the highest term. Otherwise, another compute
// could come online and vote for the same highest term again on the other two SKs,
// leaving two computes running on the same term.
status.acceptor_state.term,
/* END_HADRON */
status.flush_lsn,
status.commit_lsn,
)
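The donor selection relies on Rust's lexicographic tuple ordering: `term` is only consulted when epochs tie, and it takes precedence over `flush_lsn`. A tiny illustration with made-up values:

// (epoch, term, flush_lsn, commit_lsn), all values hypothetical
let a = (5u64, 3u64, 100u64, 100u64);
let b = (5u64, 4u64, 80u64, 80u64);
// Equal epochs, so the higher term wins even though `b` has the lower flush_lsn.
assert!(b > a);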

View File

@@ -191,6 +191,11 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
use_https_safekeeper_api: false,
enable_tls_wal_service_api: false,
force_metric_collection_on_scrape: true,
/* BEGIN_HADRON */
enable_pull_timeline_on_startup: false,
advertise_pg_addr_tenant_only: None,
hcc_base_url: None,
/* END_HADRON */
};
let mut global = GlobalMap::new(disk, conf.clone())?;

View File

@@ -850,6 +850,31 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
/* BEGIN_HADRON */
async fn handle_tenant_timeline_describe(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Scrubber)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(_req) => {}
};
json_response(
StatusCode::OK,
service
.tenant_timeline_describe(tenant_id, timeline_id)
.await?,
)
}
/* END_HADRON */
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
@@ -2480,6 +2505,13 @@ pub fn make_router(
)
})
// Timeline operations
.get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_describe,
RequestName("v1_tenant_timeline_describe"),
)
})
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(
r,
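The new describe route can be exercised with any HTTP client; a hedged sketch using reqwest, where the base URL, IDs, and token are placeholders and `Scope::Scrubber`-level credentials are assumed:

// Run inside a tokio runtime; assumes the reqwest crate.
async fn describe_timeline(
    base: &str, // e.g. "http://storage-controller:1234" (placeholder)
    tenant_id: &str,
    timeline_id: &str,
    token: &str,
) -> reqwest::Result<String> {
    let url = format!("{base}/control/v1/tenant/{tenant_id}/timeline/{timeline_id}");
    reqwest::Client::new()
        .get(url)
        .bearer_auth(token)
        .send()
        .await?
        .text()
        .await
}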

View File

@@ -222,6 +222,9 @@ struct Cli {
/// Primarily useful for testing to reduce test execution time.
#[arg(long, default_value = "false", action=ArgAction::Set)]
kick_secondary_downloads: bool,
/// Timeout applied to shard split requests. If unset, effectively no timeout
/// (Duration::MAX) is used.
#[arg(long)]
shard_split_request_timeout: Option<humantime::Duration>,
}
enum StrictMode {
@@ -470,6 +473,10 @@ async fn async_main() -> anyhow::Result<()> {
timeline_safekeeper_count: args.timeline_safekeeper_count,
posthog_config: posthog_config.clone(),
kick_secondary_downloads: args.kick_secondary_downloads,
shard_split_request_timeout: args
.shard_split_request_timeout
.map(humantime::Duration::into)
.unwrap_or(Duration::MAX),
};
// Validate that we can connect to the database
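`humantime::Duration` implements `FromStr`, which is what lets clap parse the flag from strings such as `--shard-split-request-timeout 30s`; a minimal sketch of the same parse-then-convert chain:

let parsed: humantime::Duration = "30s".parse().expect("valid humantime duration");
let timeout: std::time::Duration = parsed.into();
assert_eq!(timeout, std::time::Duration::from_secs(30));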

Some files were not shown because too many files have changed in this diff.