Resolve merge conflicts

Fix rust formatting
Make it possible to control lazy_slru_download through tenant config
2026-05-23 16:10:37 +00:00 · 2025-04-16 08:20:46 +03:00 · 2025-04-16 07:49:47 +03:00 · 2025-04-16 07:49:47 +03:00 · 2025-04-16 07:49:46 +03:00 · 2025-04-16 07:49:44 +03:00
113 changed files with 4831 additions and 1713 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -6,6 +6,7 @@ self-hosted-runner:
    - small
    - small-metal
    - small-arm64
+    - unit-perf
    - us-east-2
 config-variables:
  - AWS_ECR_REGION
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -70,6 +70,7 @@ runs:

    - name: Install Allure
      shell: bash -euxo pipefail {0}
+      working-directory: /tmp
      run: |
        if ! which allure; then
          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
--- a/.github/workflows/_create-release-pr.yml
+++ b/.github/workflows/_create-release-pr.yml
@@ -53,10 +53,13 @@ jobs:
            || inputs.component-name == 'Compute' && 'release-compute'
          }}
      run: |
-        today=$(date +'%Y-%m-%d')
-        echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
-        echo "rc-branch=rc/${RELEASE_BRANCH}/${today}"  | tee -a ${GITHUB_OUTPUT}
-        echo "release-branch=${RELEASE_BRANCH}"         | tee -a ${GITHUB_OUTPUT}
+        now_date=$(date -u +'%Y-%m-%d')
+        now_time=$(date -u +'%H-%M-%Z')
+        {
+          echo "title=${COMPONENT_NAME} release ${now_date}"
+          echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
+          echo "release-branch=${RELEASE_BRANCH}"
+        } | tee -a ${GITHUB_OUTPUT}

    - name: Configure git
      run: |
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -284,7 +284,7 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, small-metal ]
+    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
@@ -1271,7 +1271,7 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ]
    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
    if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
    permissions:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1416,6 +1416,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "base64 0.13.1",
 "camino",
 "clap",
 "comfy-table",
@@ -1425,10 +1426,13 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
+ "jsonwebtoken",
 "nix 0.27.1",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
+ "pem",
+ "pkcs8 0.10.2",
 "postgres_backend",
 "postgres_connection",
 "regex",
@@ -1437,6 +1441,7 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
+ "sha2",
 "storage_broker",
 "thiserror 1.0.69",
 "tokio",
@@ -2817,6 +2822,7 @@ dependencies = [
 "hyper 0.14.30",
 "itertools 0.10.5",
 "jemalloc_pprof",
+ "jsonwebtoken",
 "metrics",
 "once_cell",
 "pprof",
@@ -2837,6 +2843,7 @@ dependencies = [
 "utils",
 "uuid",
 "workspace_hack",
+ "x509-cert",
 ]

 [[package]]
@@ -4268,6 +4275,7 @@ dependencies = [
 "hyper 0.14.30",
 "indoc",
 "itertools 0.10.5",
+ "jsonwebtoken",
 "md5",
 "metrics",
 "nix 0.27.1",
@@ -5684,9 +5692,9 @@ dependencies = [

 [[package]]
 name = "ring"
-version = "0.17.13"
+version = "0.17.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
 dependencies = [
 "cc",
 "cfg-if",
@@ -5987,6 +5995,7 @@ dependencies = [
 "humantime",
 "hyper 0.14.30",
 "itertools 0.10.5",
+ "jsonwebtoken",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -7871,6 +7880,7 @@ dependencies = [
 "metrics",
 "nix 0.27.1",
 "once_cell",
+ "pem",
 "pin-project-lite",
 "postgres_connection",
 "pprof",
@@ -8459,6 +8469,7 @@ dependencies = [
 "once_cell",
 "p256 0.13.2",
 "parquet",
+ "pkcs8 0.10.2",
 "prettyplease",
 "proc-macro2",
 "prost 0.13.3",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -141,7 +141,9 @@ parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
+pem = "3.0.3"
 pin-project-lite = "0.2"
+pkcs8 = "0.10.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -29,13 +29,12 @@
 //! ```sh
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
-//!             -S /var/db/postgres/specs/current.json \
+//!             -c /var/db/postgres/configs/config.json \
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
 use std::ffi::OsString;
 use std::fs::File;
-use std::path::Path;
 use std::process::exit;
 use std::sync::mpsc;
 use std::thread;
@@ -43,8 +42,7 @@ use std::time::Duration;

 use anyhow::{Context, Result};
 use clap::Parser;
-use compute_api::responses::ComputeCtlConfig;
-use compute_api::spec::ComputeSpec;
+use compute_api::responses::ComputeConfig;
 use compute_tools::compute::{
    BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
 };
@@ -118,8 +116,10 @@ struct Cli {
    #[arg(long)]
    pub set_disk_quota_for_fs: Option<String>,

-    #[arg(short = 'S', long, group = "spec-path")]
-    pub spec_path: Option<OsString>,
+    // TODO(tristan957): remove alias after compatibility tests are no longer
+    // an issue
+    #[arg(short = 'c', long, alias = "spec-path")]
+    pub config: Option<OsString>,

    #[arg(short = 'i', long, group = "compute-id")]
    pub compute_id: String,
@@ -127,8 +127,9 @@ struct Cli {
    #[arg(
        short = 'p',
        long,
-        conflicts_with = "spec-path",
-        value_name = "CONTROL_PLANE_API_BASE_URL"
+        conflicts_with = "config",
+        value_name = "CONTROL_PLANE_API_BASE_URL",
+        requires = "compute-id"
    )]
    pub control_plane_uri: Option<String>,
 }
@@ -138,7 +139,7 @@ fn main() -> Result<()> {

    let scenario = failpoint_support::init();

-    // For historical reasons, the main thread that processes the spec and launches postgres
+    // For historical reasons, the main thread that processes the config and launches postgres
    // is synchronous, but we always have this tokio runtime available and we "enter" it so
    // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
    // from all parts of compute_ctl.
@@ -154,7 +155,7 @@ fn main() -> Result<()> {

    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;

-    let cli_spec = try_spec_from_cli(&cli)?;
+    let config = get_config(&cli)?;

    let compute_node = ComputeNode::new(
        ComputeNodeParams {
@@ -175,8 +176,7 @@ fn main() -> Result<()> {
            #[cfg(target_os = "linux")]
            vm_monitor_addr: cli.vm_monitor_addr,
        },
-        cli_spec.spec,
-        cli_spec.compute_ctl_config,
+        config,
    )?;

    let exit_code = compute_node.run()?;
@@ -201,27 +201,17 @@ async fn init() -> Result<()> {
    Ok(())
 }

-fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
-    // First, read spec from the path if provided
-    if let Some(ref spec_path) = cli.spec_path {
-        let file = File::open(Path::new(spec_path))?;
-        return Ok(CliSpecParams {
-            spec: Some(serde_json::from_reader(file)?),
-            compute_ctl_config: ComputeCtlConfig::default(),
-        });
+fn get_config(cli: &Cli) -> Result<ComputeConfig> {
+    // First, read the config from the path if provided
+    if let Some(ref config) = cli.config {
+        let file = File::open(config)?;
+        return Ok(serde_json::from_reader(&file)?);
    }

-    if cli.control_plane_uri.is_none() {
-        panic!("must specify --control-plane-uri");
-    };
-
-    // If the spec wasn't provided in the CLI arguments, then retrieve it from
+    // If the config wasn't provided in the CLI arguments, then retrieve it from
    // the control plane
-    match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
-        Ok(resp) => Ok(CliSpecParams {
-            spec: resp.0,
-            compute_ctl_config: resp.1,
-        }),
+    match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
+        Ok(config) => Ok(config),
        Err(e) => {
            error!(
                "cannot get response from control plane: {}\n\
@@ -233,13 +223,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
    }
 }

-struct CliSpecParams {
-    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
-    spec: Option<ComputeSpec>,
-    #[allow(dead_code)]
-    compute_ctl_config: ComputeCtlConfig,
-}
-
 fn deinit_and_exit(exit_code: Option<i32>) -> ! {
    // Shutdown trace pipeline gracefully, so that it has a chance to send any
    // pending traces before we exit. Shutting down OTEL tracing provider may
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,7 +11,7 @@ use std::{env, fs};
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
-use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
+use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
 use compute_api::spec::{
    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
 };
@@ -303,11 +303,7 @@ struct StartVmMonitorResult {
 }

 impl ComputeNode {
-    pub fn new(
-        params: ComputeNodeParams,
-        cli_spec: Option<ComputeSpec>,
-        compute_ctl_config: ComputeCtlConfig,
-    ) -> Result<Self> {
+    pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
        let connstr = params.connstr.as_str();
        let conn_conf = postgres::config::Config::from_str(connstr)
            .context("cannot build postgres config from connstr")?;
@@ -315,8 +311,8 @@ impl ComputeNode {
            .context("cannot build tokio postgres config from connstr")?;

        let mut new_state = ComputeState::new();
-        if let Some(cli_spec) = cli_spec {
-            let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        if let Some(spec) = config.spec {
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -327,7 +323,7 @@ impl ComputeNode {
            state: Mutex::new(new_state),
            state_changed: Condvar::new(),
            ext_download_progress: RwLock::new(HashMap::new()),
-            compute_ctl_config,
+            compute_ctl_config: config.compute_ctl_config,
        })
    }

@@ -523,11 +519,14 @@ impl ComputeNode {

        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
            pspec.tenant_id,
            pspec.timeline_id,
+            pspec.spec.project_id.as_deref().unwrap_or("None"),
+            pspec.spec.branch_id.as_deref().unwrap_or("None"),
+            pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
            pspec.spec.features,
            pspec.spec.remote_extensions,
        );
@@ -631,19 +630,23 @@ impl ComputeNode {
            });
        }

-        // Configure and start rsyslog for HIPAA if necessary
-        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
-            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-            if remote_endpoint.is_empty() {
-                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+        // Configure and start rsyslog for compliance audit logging
+        match pspec.spec.audit_log_level {
+            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
+                let remote_endpoint =
+                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
+                if remote_endpoint.is_empty() {
+                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+                }
+
+                let log_directory_path = Path::new(&self.params.pgdata).join("log");
+                let log_directory_path = log_directory_path.to_string_lossy().to_string();
+                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Launch a background task to clean up the audit logs
+                launch_pgaudit_gc(log_directory_path);
            }
-
-            let log_directory_path = Path::new(&self.params.pgdata).join("log");
-            let log_directory_path = log_directory_path.to_string_lossy().to_string();
-            configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
-
-            // Launch a background task to clean up the audit logs
-            launch_pgaudit_gc(log_directory_path);
+            _ => {}
        }

        // Configure and start rsyslog for Postgres logs export
@@ -894,32 +897,28 @@ impl ComputeNode {
        let mut client = config.connect(NoTls)?;
        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;

-        let basebackup_cmd = match lsn {
-            Lsn(0) => {
-                if spec.spec.mode != ComputeMode::Primary {
-                    format!(
-                        "basebackup {} {} --gzip --replica",
-                        spec.tenant_id, spec.timeline_id
-                    )
-                } else {
-                    format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
-                }
-            }
-            _ => {
-                if spec.spec.mode != ComputeMode::Primary {
-                    format!(
-                        "basebackup {} {} {} --gzip --replica",
-                        spec.tenant_id, spec.timeline_id, lsn
-                    )
-                } else {
-                    format!(
-                        "basebackup {} {} {} --gzip",
-                        spec.tenant_id, spec.timeline_id, lsn
-                    )
-                }
-            }
-        };
-
+        let tenant_id = spec.tenant_id.to_string();
+        let timeline_id = spec.timeline_id.to_string();
+        let lsn_str = lsn.to_string();
+        let mut cmd = Vec::new();
+        cmd.push("basebackup");
+        cmd.push(&tenant_id);
+        cmd.push(&timeline_id);
+        if lsn != Lsn::INVALID {
+            cmd.push(&lsn_str);
+        }
+        cmd.push("--gzip");
+        if spec.spec.mode != ComputeMode::Primary {
+            cmd.push("--replica");
+        }
+        if spec
+            .spec
+            .features
+            .contains(&ComputeFeature::LazySlruDownload)
+        {
+            cmd.push("--lazy-slru-download")
+        }
+        let basebackup_cmd = cmd.join(" ");
        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
        let mut measured_reader = MeasuredReader::new(copyreader);
        let mut bufreader = std::io::BufReader::new(&mut measured_reader);
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
            escape_conf_value(&s.to_string())
        )?;
    }
+    if let Some(s) = &spec.project_id {
+        writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.branch_id {
+        writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.endpoint_id {
+        writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
+    }

    // tls
    if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
    // and don't allow the user or the control plane admin to change them.
    match spec.audit_log_level {
        ComputeAudit::Disabled => {}
-        ComputeAudit::Log => {
+        ComputeAudit::Log | ComputeAudit::Base => {
            writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
            writeln!(file, "pgaudit.log='ddl,role'")?;
            // Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
            }
            writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
        }
-        ComputeAudit::Hipaa => {
+        ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
            writeln!(
                file,
                "# Managed by compute_ctl compliance audit settings: begin"
            )?;
-            // This log level is very verbose
-            // but this is necessary for HIPAA compliance.
-            // Exclude 'misc' category, because it doesn't contain anythig relevant.
-            writeln!(file, "pgaudit.log='all, -misc'")?;
-            writeln!(file, "pgaudit.log_parameter=on")?;
+            // Enable logging of parameters.
+            // This is very verbose and may contain sensitive data.
+            if spec.audit_log_level == ComputeAudit::Full {
+                writeln!(file, "pgaudit.log_parameter=on")?;
+                writeln!(file, "pgaudit.log='all'")?;
+            } else {
+                writeln!(file, "pgaudit.log_parameter=off")?;
+                writeln!(file, "pgaudit.log='all, -misc'")?;
+            }
            // Disable logging of catalog queries
            // The catalog doesn't contain sensitive data, so we don't need to audit it.
            writeln!(file, "pgaudit.log_catalog=off")?;
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -11,7 +11,7 @@ use futures::future::BoxFuture;
 use http::{Request, Response, StatusCode};
 use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
 use tower_http::auth::AsyncAuthorizeRequest;
-use tracing::warn;
+use tracing::{debug, warn};

 use crate::http::{JsonResponse, extract::RequestId};

@@ -54,8 +54,8 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
        Box::pin(async move {
            let request_id = request.extract_parts::<RequestId>().await.unwrap();

-            // TODO: Remove this stanza after teaching neon_local and the
-            // regression tests to use a JWT + JWKS.
+            // TODO(tristan957): Remove this stanza after teaching neon_local
+            // and the regression tests to use a JWT + JWKS.
            //
            // https://github.com/neondatabase/neon/issues/11316
            if cfg!(feature = "testing") {
@@ -92,7 +92,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
            if data.claims.compute_id != compute_id {
                return Err(JsonResponse::error(
                    StatusCode::UNAUTHORIZED,
-                    "invalid claims in authorization token",
+                    "invalid compute ID in authorization token claims",
                ));
            }

@@ -112,12 +112,16 @@ impl Authorize {
        token: &str,
        validation: &Validation,
    ) -> Result<TokenData<ComputeClaims>> {
+        debug_assert!(!jwks.keys.is_empty());
+
+        debug!("verifying token {}", token);
+
        for jwk in jwks.keys.iter() {
            let decoding_key = match DecodingKey::from_jwk(jwk) {
                Ok(key) => key,
                Err(e) => {
                    warn!(
-                        "Failed to construct decoding key from {}: {}",
+                        "failed to construct decoding key from {}: {}",
                        jwk.common.key_id.as_ref().unwrap(),
                        e
                    );
@@ -130,7 +134,7 @@ impl Authorize {
                Ok(data) => return Ok(data),
                Err(e) => {
                    warn!(
-                        "Failed to decode authorization token using {}: {}",
+                        "failed to decode authorization token using {}: {}",
                        jwk.common.key_id.as_ref().unwrap(),
                        e
                    );
@@ -140,6 +144,6 @@ impl Authorize {
            }
        }

-        Err(anyhow!("Failed to verify authorization token"))
+        Err(anyhow!("failed to verify authorization token"))
    }
 }
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
 // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
 // And it's fair to call it a 'RPC' (Remote Procedure Call).
 pub enum CPlaneRequestRPC {
-    GetSpec,
+    GetConfig,
 }

 impl CPlaneRequestRPC {
    pub fn as_str(&self) -> &str {
        match self {
-            CPlaneRequestRPC::GetSpec => "GetSpec",
+            CPlaneRequestRPC::GetConfig => "GetConfig",
        }
    }
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -3,9 +3,8 @@ use std::path::Path;

 use anyhow::{Result, anyhow, bail};
 use compute_api::responses::{
-    ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
+    ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
 };
-use compute_api::spec::ComputeSpec;
 use reqwest::StatusCode;
 use tokio_postgres::Client;
 use tracing::{error, info, instrument};
@@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5;
 fn do_control_plane_request(
    uri: &str,
    jwt: &str,
-) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
+) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
        .header("Authorization", format!("Bearer {}", jwt))
@@ -29,14 +28,14 @@ fn do_control_plane_request(
        .map_err(|e| {
            (
                true,
-                format!("could not perform spec request to control plane: {:?}", e),
+                format!("could not perform request to control plane: {:?}", e),
                UNKNOWN_HTTP_STATUS.to_string(),
            )
        })?;

    let status = resp.status();
    match status {
-        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+        StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
            Ok(spec_resp) => Ok(spec_resp),
            Err(e) => Err((
                true,
@@ -69,40 +68,35 @@ fn do_control_plane_request(
    }
 }

-/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
-/// env variable is set, it will be used for authorization.
-pub fn get_spec_from_control_plane(
-    base_uri: &str,
-    compute_id: &str,
-) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
+/// Request config from the control-plane by compute_id. If
+/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
+/// authorization.
+pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
-    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
-        Ok(v) => v,
-        Err(_) => "".to_string(),
-    };
+    let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
    let mut attempt = 1;

-    info!("getting spec from control plane: {}", cp_uri);
+    info!("getting config from control plane: {}", cp_uri);

    // Do 3 attempts to get spec from the control plane using the following logic:
    // - network error -> then retry
    // - compute id is unknown or any other error -> bail out
    // - no spec for compute yet (Empty state) -> return Ok(None)
-    // - got spec -> return Ok(Some(spec))
+    // - got config -> return Ok(config)
    while attempt < 4 {
        let result = match do_control_plane_request(&cp_uri, &jwt) {
-            Ok(spec_resp) => {
+            Ok(config_resp) => {
                CPLANE_REQUESTS_TOTAL
                    .with_label_values(&[
-                        CPlaneRequestRPC::GetSpec.as_str(),
+                        CPlaneRequestRPC::GetConfig.as_str(),
                        &StatusCode::OK.to_string(),
                    ])
                    .inc();
-                match spec_resp.status {
-                    ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
+                match config_resp.status {
+                    ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
                    ControlPlaneComputeStatus::Attached => {
-                        if let Some(spec) = spec_resp.spec {
-                            Ok((Some(spec), spec_resp.compute_ctl_config))
+                        if config_resp.spec.is_some() {
+                            Ok(config_resp.into())
                        } else {
                            bail!("compute is attached, but spec is empty")
                        }
@@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane(
            }
            Err((retry, msg, status)) => {
                CPLANE_REQUESTS_TOTAL
-                    .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
+                    .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
                    .inc();
                if retry {
                    Err(anyhow!(msg))
@@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane(
        };

        if let Err(e) = &result {
-            error!("attempt {} to get spec failed with: {}", attempt, e);
+            error!("attempt {} to get config failed with: {}", attempt, e);
        } else {
            return result;
        }
@@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane(

    // All attempts failed, return error.
    Err(anyhow::anyhow!(
-        "Exhausted all attempts to retrieve the spec from the control plane"
+        "Exhausted all attempts to retrieve the config from the control plane"
    ))
 }

 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
    let pghba_path = pgdata_path.join("pg_hba.conf");

    if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
@@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {

 /// Create a standby.signal file
 pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
    let signalfile = pgdata_path.join("standby.signal");

    if !signalfile.exists() {
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -278,12 +278,12 @@ impl ComputeNode {
            // so that all config operations are audit logged.
            match spec.audit_log_level
            {
-                ComputeAudit::Hipaa => {
+                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                    phases.push(CreatePgauditExtension);
                    phases.push(CreatePgauditlogtofileExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
-                ComputeAudit::Log => {
+                ComputeAudit::Log | ComputeAudit::Base => {
                    phases.push(CreatePgauditExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,13 +6,17 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+base64.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 humantime.workspace = true
+jsonwebtoken.workspace = true
 nix.workspace = true
 once_cell.workspace = true
+pem.workspace = true
+pkcs8.workspace = true
 humantime-serde.workspace = true
 hyper0.workspace = true
 regex.workspace = true
@@ -20,6 +24,7 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+sha2.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -552,6 +552,7 @@ enum EndpointCmd {
    Start(EndpointStartCmdArgs),
    Reconfigure(EndpointReconfigureCmdArgs),
    Stop(EndpointStopCmdArgs),
+    GenerateJwt(EndpointGenerateJwtCmdArgs),
 }

 #[derive(clap::Args)]
@@ -699,6 +700,13 @@ struct EndpointStopCmdArgs {
    mode: String,
 }

+#[derive(clap::Args)]
+#[clap(about = "Generate a JWT for an endpoint")]
+struct EndpointGenerateJwtCmdArgs {
+    #[clap(help = "Postgres endpoint id")]
+    endpoint_id: String,
+}
+
 #[derive(clap::Subcommand)]
 #[clap(about = "Manage neon_local branch name mappings")]
 enum MappingsCmd {
@@ -1528,6 +1536,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
            endpoint.stop(&args.mode, args.destroy)?;
        }
+        EndpointCmd::GenerateJwt(args) => {
+            let endpoint_id = &args.endpoint_id;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id)
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            let jwt = endpoint.generate_jwt()?;
+
+            println!("{jwt}");
+        }
    }

    Ok(())
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -29,7 +29,7 @@
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
 //!     postgresql.conf           - postgresql settings
-//!     spec.json                 - passed to `compute_ctl`
+//!     config.json               - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
 //!         zenith.signal
@@ -42,20 +42,29 @@ use std::path::PathBuf;
 use std::process::Command;
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+use std::time::{Duration, Instant};

 use anyhow::{Context, Result, anyhow, bail};
-use compute_api::requests::ConfigurationRequest;
-use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
+use compute_api::requests::{ComputeClaims, ConfigurationRequest};
+use compute_api::responses::{
+    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
+};
 use compute_api::spec::{
    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
    RemoteExtSpec, Role,
 };
+use jsonwebtoken::jwk::{
+    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
+    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
+};
 use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
+use pem::Pem;
+use pkcs8::der::Decode;
 use reqwest::header::CONTENT_TYPE;
 use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
 use tracing::debug;
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -80,6 +89,7 @@ pub struct EndpointConf {
    drop_subscriptions_before_start: bool,
    features: Vec<ComputeFeature>,
    cluster: Option<Cluster>,
+    compute_ctl_config: ComputeCtlConfig,
 }

 //
@@ -135,6 +145,36 @@ impl ComputeControlPlane {
            .unwrap_or(self.base_port)
    }

+    /// Create a JSON Web Key Set. This ideally matches the way we create a JWKS
+    /// from the production control plane.
+    fn create_jwks_from_pem(pem: Pem) -> Result<JwkSet> {
+        let document = pkcs8::Document::from_der(&pem.into_contents())?;
+        let key_hash = Sha256::digest(&document);

+        // RFC 8037: `x` is the raw 32-byte key, which ends the Ed25519 SPKI DER.
+        let spki = document.as_bytes();
+        let public_key = &spki[spki.len().saturating_sub(32)..];
+        Ok(JwkSet {
+            keys: vec![Jwk {
+                common: CommonParameters {
+                    public_key_use: Some(PublicKeyUse::Signature),
+                    key_operations: Some(vec![KeyOperations::Verify]),
+                    key_algorithm: Some(KeyAlgorithm::EdDSA),
+                    key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)),
+                    x509_url: None::<String>,
+                    x509_chain: None::<Vec<String>>,
+                    x509_sha1_fingerprint: None::<String>,
+                    x509_sha256_fingerprint: None::<String>,
+                },
+                algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
+                    key_type: OctetKeyPairType::OctetKeyPair,
+                    curve: EllipticCurve::Ed25519,
+                    x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
+                }),
+            }],
+        })
+    }
+
    #[allow(clippy::too_many_arguments)]
    pub fn new_endpoint(
        &mut self,
@@ -152,6 +192,10 @@ impl ComputeControlPlane {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
        let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
+        let compute_ctl_config = ComputeCtlConfig {
+            jwks: Self::create_jwks_from_pem(self.env.read_public_key()?)?,
+            tls: None::<TlsConfig>,
+        };
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
@@ -179,6 +223,7 @@ impl ComputeControlPlane {
            reconfigure_concurrency: 1,
            features: vec![],
            cluster: None,
+            compute_ctl_config: compute_ctl_config.clone(),
        });

        ep.create_endpoint_dir()?;
@@ -198,6 +243,7 @@ impl ComputeControlPlane {
                reconfigure_concurrency: 1,
                features: vec![],
                cluster: None,
+                compute_ctl_config,
            })?,
        )?;
        std::fs::write(
@@ -240,7 +286,6 @@ impl ComputeControlPlane {

 ///////////////////////////////////////////////////////////////////////////////

-#[derive(Debug)]
 pub struct Endpoint {
    /// used as the directory name
    endpoint_id: String,
@@ -269,6 +314,9 @@ pub struct Endpoint {
    features: Vec<ComputeFeature>,
    // Cluster settings
    cluster: Option<Cluster>,
+
+    /// The compute_ctl config for the endpoint's compute.
+    compute_ctl_config: ComputeCtlConfig,
 }

 #[derive(PartialEq, Eq)]
@@ -331,6 +379,7 @@ impl Endpoint {
            drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
            features: conf.features,
            cluster: conf.cluster,
+            compute_ctl_config: conf.compute_ctl_config,
        })
    }

@@ -578,6 +627,13 @@ impl Endpoint {
        Ok(safekeeper_connstrings)
    }

+    /// Issue a JWT whose claims carry this endpoint's id as the compute id.
+    pub fn generate_jwt(&self) -> Result<String> {
+        let compute_id = self.endpoint_id.clone();
+        let claims = ComputeClaims { compute_id };
+        self.env.generate_auth_token(&claims)
+    }
+
    #[allow(clippy::too_many_arguments)]
    pub async fn start(
        &self,
@@ -619,87 +675,101 @@ impl Endpoint {
            remote_extensions = None;
        };

-        // Create spec file
-        let mut spec = ComputeSpec {
-            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
-            format_version: 1.0,
-            operation_uuid: None,
-            features: self.features.clone(),
-            swap_size_bytes: None,
-            disk_quota_bytes: None,
-            disable_lfc_resizing: None,
-            cluster: Cluster {
-                cluster_id: None, // project ID: not used
-                name: None,       // project name: not used
-                state: None,
-                roles: if create_test_user {
-                    vec![Role {
+        // Create config file
+        let config = {
+            let mut spec = ComputeSpec {
+                skip_pg_catalog_updates: self.skip_pg_catalog_updates,
+                format_version: 1.0,
+                operation_uuid: None,
+                features: self.features.clone(),
+                swap_size_bytes: None,
+                disk_quota_bytes: None,
+                disable_lfc_resizing: None,
+                cluster: Cluster {
+                    cluster_id: None, // project ID: not used
+                    name: None,       // project name: not used
+                    state: None,
+                    roles: if create_test_user {
+                        vec![Role {
+                            name: PgIdent::from_str("test").unwrap(),
+                            encrypted_password: None,
+                            options: None,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    databases: if create_test_user {
+                        vec![Database {
+                            name: PgIdent::from_str("neondb").unwrap(),
+                            owner: PgIdent::from_str("test").unwrap(),
+                            options: None,
+                            restrict_conn: false,
+                            invalid: false,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    settings: None,
+                    postgresql_conf: Some(postgresql_conf.clone()),
+                },
+                delta_operations: None,
+                tenant_id: Some(self.tenant_id),
+                timeline_id: Some(self.timeline_id),
+                project_id: None,
+                branch_id: None,
+                endpoint_id: Some(self.endpoint_id.clone()),
+                mode: self.mode,
+                pageserver_connstring: Some(pageserver_connstring),
+                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
+                safekeeper_connstrings,
+                storage_auth_token: auth_token.clone(),
+                remote_extensions,
+                pgbouncer_settings: None,
+                shard_stripe_size: Some(shard_stripe_size),
+                local_proxy_config: None,
+                reconfigure_concurrency: self.reconfigure_concurrency,
+                drop_subscriptions_before_start: self.drop_subscriptions_before_start,
+                audit_log_level: ComputeAudit::Disabled,
+                logs_export_host: None::<String>,
+            };
+
+            // this strange code is needed to support respec() in tests
+            if self.cluster.is_some() {
+                debug!("Cluster is already set in the endpoint spec, using it");
+                spec.cluster = self.cluster.clone().unwrap();
+
+                debug!("spec.cluster {:?}", spec.cluster);
+
+                // fill missing fields again
+                if create_test_user {
+                    spec.cluster.roles.push(Role {
                        name: PgIdent::from_str("test").unwrap(),
                        encrypted_password: None,
                        options: None,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                databases: if create_test_user {
-                    vec![Database {
+                    });
+                    spec.cluster.databases.push(Database {
                        name: PgIdent::from_str("neondb").unwrap(),
                        owner: PgIdent::from_str("test").unwrap(),
                        options: None,
                        restrict_conn: false,
                        invalid: false,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                settings: None,
-                postgresql_conf: Some(postgresql_conf.clone()),
-            },
-            delta_operations: None,
-            tenant_id: Some(self.tenant_id),
-            timeline_id: Some(self.timeline_id),
-            mode: self.mode,
-            pageserver_connstring: Some(pageserver_connstring),
-            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
-            safekeeper_connstrings,
-            storage_auth_token: auth_token.clone(),
-            remote_extensions,
-            pgbouncer_settings: None,
-            shard_stripe_size: Some(shard_stripe_size),
-            local_proxy_config: None,
-            reconfigure_concurrency: self.reconfigure_concurrency,
-            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
-            audit_log_level: ComputeAudit::Disabled,
-            logs_export_host: None::<String>,
+                    });
+                }
+                spec.cluster.postgresql_conf = Some(postgresql_conf);
+            }
+
+            ComputeConfig {
+                spec: Some(spec),
+                compute_ctl_config: self.compute_ctl_config.clone(),
+            }
        };

-        // this strange code is needed to support respec() in tests
-        if self.cluster.is_some() {
-            debug!("Cluster is already set in the endpoint spec, using it");
-            spec.cluster = self.cluster.clone().unwrap();
-
-            debug!("spec.cluster {:?}", spec.cluster);
-
-            // fill missing fields again
-            if create_test_user {
-                spec.cluster.roles.push(Role {
-                    name: PgIdent::from_str("test").unwrap(),
-                    encrypted_password: None,
-                    options: None,
-                });
-                spec.cluster.databases.push(Database {
-                    name: PgIdent::from_str("neondb").unwrap(),
-                    owner: PgIdent::from_str("test").unwrap(),
-                    options: None,
-                    restrict_conn: false,
-                    invalid: false,
-                });
-            }
-            spec.cluster.postgresql_conf = Some(postgresql_conf);
-        }
-
+        // TODO(tristan957): Remove the write to spec.json after compatibility
+        // tests work themselves out
        let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
+        std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
+        let config_path = self.endpoint_path().join("config.json");
+        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;

        // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
        let logfile = std::fs::OpenOptions::new()
@@ -707,6 +777,16 @@ impl Endpoint {
            .append(true)
            .open(self.endpoint_path().join("compute.log"))?;

+        // TODO(tristan957): Remove when compatibility tests are no longer an
+        // issue
+        let old_compute_ctl = {
+            let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
+            let help_output = cmd.arg("--help").output()?;
+            let help_output = String::from_utf8_lossy(&help_output.stdout);
+
+            !help_output.contains("--config")
+        };
+
        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{}'", conn_str);
@@ -725,9 +805,18 @@ impl Endpoint {
        ])
        .args(["--pgdata", self.pgdata().to_str().unwrap()])
        .args(["--connstr", &conn_str])
+        // TODO(tristan957): Change this to --config when compatibility tests
+        // are no longer an issue
        .args([
            "--spec-path",
-            self.endpoint_path().join("spec.json").to_str().unwrap(),
+            self.endpoint_path()
+                .join(if old_compute_ctl {
+                    "spec.json"
+                } else {
+                    "config.json"
+                })
+                .to_str()
+                .unwrap(),
        ])
        .args([
            "--pgbin",
@@ -739,16 +828,7 @@ impl Endpoint {
        ])
        // TODO: It would be nice if we generated compute IDs with the same
        // algorithm as the real control plane.
-        .args([
-            "--compute-id",
-            &format!(
-                "compute-{}",
-                SystemTime::now()
-                    .duration_since(UNIX_EPOCH)
-                    .unwrap()
-                    .as_secs()
-            ),
-        ])
+        .args(["--compute-id", &self.endpoint_id])
        .stdin(std::process::Stdio::null())
        .stderr(logfile.try_clone()?)
        .stdout(logfile);
@@ -846,6 +926,7 @@ impl Endpoint {
                    self.external_http_address.port()
                ),
            )
+            .bearer_auth(self.generate_jwt()?)
            .send()
            .await?;

@@ -870,10 +951,12 @@ impl Endpoint {
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
    ) -> Result<()> {
-        let mut spec: ComputeSpec = {
-            let spec_path = self.endpoint_path().join("spec.json");
-            let file = std::fs::File::open(spec_path)?;
-            serde_json::from_reader(file)?
+        let (mut spec, compute_ctl_config) = {
+            let config_path = self.endpoint_path().join("config.json");
+            let file = std::fs::File::open(config_path)?;
+            let config: ComputeConfig = serde_json::from_reader(file)?;
+
+            (config.spec.unwrap(), config.compute_ctl_config)
        };

        let postgresql_conf = self.read_postgresql_conf()?;
@@ -920,10 +1003,11 @@ impl Endpoint {
                self.external_http_address.port()
            ))
            .header(CONTENT_TYPE.as_str(), "application/json")
+            .bearer_auth(self.generate_jwt()?)
            .body(
                serde_json::to_string(&ConfigurationRequest {
                    spec,
-                    compute_ctl_config: ComputeCtlConfig::default(),
+                    compute_ctl_config,
                })
                .unwrap(),
            )
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -12,6 +12,7 @@ use std::{env, fs};

 use anyhow::{Context, bail};
 use clap::ValueEnum;
+use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
@@ -56,6 +57,7 @@ pub struct LocalEnv {

    // used to issue tokens during e.g pg start
    pub private_key_path: PathBuf,
+    /// Path to environment's public key
    pub public_key_path: PathBuf,

    pub broker: NeonBroker,
@@ -758,11 +760,11 @@ impl LocalEnv {

    // this function is used only for testing purposes in CLI e g generate tokens during init
    pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
-        let private_key_path = self.get_private_key_path();
-        let key_data = fs::read(private_key_path)?;
-        encode_from_key_file(claims, &key_data)
+        let key = self.read_private_key()?;
+        encode_from_key_file(claims, &key)
    }

+    /// Get the path to the private key.
    pub fn get_private_key_path(&self) -> PathBuf {
        if self.private_key_path.is_absolute() {
            self.private_key_path.to_path_buf()
@@ -771,6 +773,29 @@ impl LocalEnv {
        }
    }

+    /// Resolve the public key path, anchoring a relative one at `base_data_dir`.
+    pub fn get_public_key_path(&self) -> PathBuf {
+        let configured = &self.public_key_path;
+        if configured.is_relative() {
+            return self.base_data_dir.join(configured);
+        }
+        configured.to_path_buf()
+    }
+
+    /// Read the private key file and parse it as PEM; errors name the file.
+    pub fn read_private_key(&self) -> anyhow::Result<Pem> {
+        let path = self.get_private_key_path();
+        let raw = fs::read(&path).with_context(|| format!("failed to read {path:?}"))?;
+        pem::parse(raw).with_context(|| format!("failed to parse {path:?} as PEM"))
+    }
+
+    /// Read the public key file and parse it as PEM; errors name the file.
+    pub fn read_public_key(&self) -> anyhow::Result<Pem> {
+        let path = self.get_public_key_path();
+        let raw = fs::read(&path).with_context(|| format!("failed to read {path:?}"))?;
+        pem::parse(raw).with_context(|| format!("failed to parse {path:?} as PEM"))
+    }
+
    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
        let base_path = base_path();
@@ -956,6 +981,7 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }
+
    // Extract the public key from the private key file
    //
    // openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
@@ -972,6 +998,7 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }
+
    Ok(())
 }

@@ -980,7 +1007,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
    // -out rootCA.crt -keyout rootCA.key
    let keygen_output = Command::new("openssl")
        .args([
-            "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
+            "req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
        ])
        .args(["-subj", "/CN=Neon Local CA"])
        .args(["-out", cert_path.to_str().unwrap()])
@@ -1010,7 +1037,7 @@ fn generate_ssl_cert(
    // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
    let keygen_output = Command::new("openssl")
        .args(["req", "-new", "-nodes"])
-        .args(["-newkey", "rsa:2048"])
+        .args(["-newkey", "ed25519"])
        .args(["-subj", "/CN=localhost"])
        .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
        .args(["-keyout", key_path.to_str().unwrap()])
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -318,7 +318,7 @@ impl PageServerNode {
                self.conf.id, datadir,
            )
        })?;
-        let args = vec!["-D", datadir_path_str, "--dev"];
+        let args = vec!["-D", datadir_path_str];

        background_process::start_process(
            "pageserver",
@@ -535,6 +535,11 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'gc_compaction_enabled' as bool")?,
+            gc_compaction_verification: settings
+                .remove("gc_compaction_verification")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_verification' as bool")?,
            gc_compaction_initial_threshold_kb: settings
                .remove("gc_compaction_initial_threshold_kb")
                .map(|x| x.parse::<u64>())
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -162,7 +162,6 @@ impl SafekeeperNode {
            listen_http,
            "--availability-zone".to_owned(),
            availability_zone,
-            "--dev".to_owned(),
        ];
        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
            let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -13,9 +13,12 @@ use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
    TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
+use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Certificate, Method};
 use serde::de::DeserializeOwned;
@@ -32,8 +35,8 @@ use crate::local_env::{LocalEnv, NeonStorageControllerConf};

 pub struct StorageController {
    env: LocalEnv,
-    private_key: Option<Vec<u8>>,
-    public_key: Option<String>,
+    private_key: Option<Pem>,
+    public_key: Option<Pem>,
    client: reqwest::Client,
    config: NeonStorageControllerConf,

@@ -82,7 +85,8 @@ impl NeonStorageControllerStopArgs {
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
    pub node_id: Option<NodeId>,
-    pub generation_override: Option<i32>,
+    pub generation_override: Option<i32>, // only new tenants
+    pub config: Option<TenantConfig>,     // only new tenants
 }

 #[derive(Serialize, Deserialize)]
@@ -113,7 +117,9 @@ impl StorageController {
            AuthType::Trust => (None, None),
            AuthType::NeonJWT => {
                let private_key_path = env.get_private_key_path();
-                let private_key = fs::read(private_key_path).expect("failed to read private key");
+                let private_key =
+                    pem::parse(fs::read(private_key_path).expect("failed to read private key"))
+                        .expect("failed to parse PEM file");

                // If pageserver auth is enabled, this implicitly enables auth for this service,
                // using the same credentials.
@@ -135,9 +141,13 @@ impl StorageController {
                        .expect("Empty key dir")
                        .expect("Error reading key dir");

-                    std::fs::read_to_string(dent.path()).expect("Can't read public key")
+                    pem::parse(std::fs::read_to_string(dent.path()).expect("Can't read public key"))
+                        .expect("Failed to parse PEM file")
                } else {
-                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
+                    pem::parse(
+                        std::fs::read_to_string(&public_key_path).expect("Can't read public key"),
+                    )
+                    .expect("Failed to parse PEM file")
                };
                (Some(private_key), Some(public_key))
            }
@@ -805,6 +815,7 @@ impl StorageController {
            tenant_shard_id,
            node_id: Some(pageserver_id),
            generation_override: None,
+            config: None,
        };

        let response = self
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -11,8 +11,8 @@ generate_id() {

 PG_VERSION=${PG_VERSION:-14}

-SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
-SPEC_FILE=/tmp/spec.json
+CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
+CONFIG_FILE=/tmp/config.json

 echo "Waiting pageserver become ready."
 while ! nc -z pageserver 6400; do
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
 done
 echo "Page server is ready."

-cp ${SPEC_FILE_ORG} ${SPEC_FILE}
+cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}

 if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
   tenant_id=${TENANT_ID}
@@ -73,17 +73,27 @@ else
  ulid_extension=ulid
 fi
 echo "Adding pgx_ulid"
-shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
-sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
+shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
+sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
 echo "Overwrite tenant id and timeline id in spec file"
-sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
-sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
+sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
+sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}

-cat ${SPEC_FILE}
+cat ${CONFIG_FILE}
+
+# TODO(tristan957): Remove these workarounds for backwards compatibility after
+# the next compute release. That includes these next few lines and the
+# --spec-path in the compute_ctl invocation.
+if /usr/local/bin/compute_ctl --help | grep --quiet -- '--config'; then
+  SPEC_PATH="$CONFIG_FILE"
+else
+  jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
+  SPEC_PATH=/tmp/spec.json
+fi

 echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
     -b /usr/local/bin/postgres                              \
     --compute-id "compute-$RANDOM"                          \
-     -S ${SPEC_FILE}
+     --spec-path "$SPEC_PATH"
--- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
@@ -0,0 +1,148 @@
+{
+    "spec": {
+        "format_version": 1.0,
+
+        "timestamp": "2022-10-12T18:00:00.000Z",
+        "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
+
+        "cluster": {
+            "cluster_id": "docker_compose",
+            "name": "docker_compose_test",
+            "state": "restarted",
+            "roles": [
+                {
+                    "name": "cloud_admin",
+                    "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
+                    "options": null
+                }
+            ],
+            "databases": [
+            ],
+            "settings": [
+                {
+                    "name": "fsync",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "wal_level",
+                    "value": "logical",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "wal_log_hints",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "log_connections",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "port",
+                    "value": "55433",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "shared_buffers",
+                    "value": "1MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_connections",
+                    "value": "100",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "listen_addresses",
+                    "value": "0.0.0.0",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_wal_senders",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "max_replication_slots",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "wal_sender_timeout",
+                    "value": "5s",
+                    "vartype": "string"
+                },
+                {
+                    "name": "wal_keep_size",
+                    "value": "0",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "password_encryption",
+                    "value": "md5",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "restart_after_crash",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "synchronous_standby_names",
+                    "value": "walproposer",
+                    "vartype": "string"
+                },
+                {
+                    "name": "shared_preload_libraries",
+                    "value": "neon,pg_cron,timescaledb,pg_stat_statements",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.safekeepers",
+                    "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.timeline_id",
+                    "value": "TIMELINE_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.tenant_id",
+                    "value": "TENANT_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.pageserver_connstring",
+                    "value": "host=pageserver port=6400",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_write_lag",
+                    "value": "500MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_flush_lag",
+                    "value": "10GB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "cron.database",
+                    "value": "postgres",
+                    "vartype": "string"
+                }
+            ]
+        },
+
+        "delta_operations": [
+        ]
+    },
+    "compute_ctl_config": {
+        "jwks": {
+            "keys": []
+        }
+    }
+}
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -1,141 +0,0 @@
-{
-    "format_version": 1.0,
-
-    "timestamp": "2022-10-12T18:00:00.000Z",
-    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
-
-    "cluster": {
-        "cluster_id": "docker_compose",
-        "name": "docker_compose_test",
-        "state": "restarted",
-        "roles": [
-            {
-                "name": "cloud_admin",
-                "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
-                "options": null
-            }
-        ],
-        "databases": [
-        ],
-        "settings": [
-            {
-                "name": "fsync",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "wal_level",
-                "value": "logical",
-                "vartype": "enum"
-            },
-            {
-                "name": "wal_log_hints",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "log_connections",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "port",
-                "value": "55433",
-                "vartype": "integer"
-            },
-            {
-                "name": "shared_buffers",
-                "value": "1MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_connections",
-                "value": "100",
-                "vartype": "integer"
-            },
-            {
-                "name": "listen_addresses",
-                "value": "0.0.0.0",
-                "vartype": "string"
-            },
-            {
-                "name": "max_wal_senders",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_replication_slots",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "wal_sender_timeout",
-                "value": "5s",
-                "vartype": "string"
-            },
-            {
-                "name": "wal_keep_size",
-                "value": "0",
-                "vartype": "integer"
-            },
-            {
-                "name": "password_encryption",
-                "value": "md5",
-                "vartype": "enum"
-            },
-            {
-                "name": "restart_after_crash",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "synchronous_standby_names",
-                "value": "walproposer",
-                "vartype": "string"
-            },
-            {
-                "name": "shared_preload_libraries",
-                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.safekeepers",
-                "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.timeline_id",
-                "value": "TIMELINE_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.tenant_id",
-                "value": "TENANT_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.pageserver_connstring",
-                "value": "host=pageserver port=6400",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_write_lag",
-                "value": "500MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_flush_lag",
-                "value": "10GB",
-                "vartype": "string"
-            },
-            {
-                "name": "cron.database",
-                "value": "postgres",
-                "vartype": "string"
-            }
-        ]
-    },
-
-    "delta_operations": [
-    ]
-}
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -159,7 +159,7 @@ services:
      #- RUST_BACKTRACE=1
    # Mount the test files directly, for faster editing cycle.
    volumes:
-      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
      - ./compute_wrapper/shell/:/shell/
    ports:
      - 55433:55433 # pg protocol handler
--- a/docker-compose/ext-src/pg_jsonschema-src/Makefile
+++ b/docker-compose/ext-src/pg_jsonschema-src/Makefile
@@ -0,0 +1,8 @@
+EXTENSION = pg_jsonschema
+DATA = pg_jsonschema--1.0.sql
+REGRESS = jsonschema_valid_api  jsonschema_edge_cases
+REGRESS_OPTS = --load-extension=pg_jsonschema
+
+PG_CONFIG ?= pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
--- a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out
+++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out
@@ -0,0 +1,87 @@
+-- Schema with enums, nulls, extra properties disallowed
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+    "email": { "type": ["string", "null"], "format": "email" }
+  },
+  "required": ["status"],
+  "additionalProperties": false
+}'::json);
+ jsonschema_is_valid 
+---------------------
+ t
+(1 row)
+
+-- Valid enum and null email
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": null}'::json
+);
+ jsonschema_validation_errors 
+------------------------------
+ {}
+(1 row)
+
+-- Invalid enum value
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "disabled", "email": null}'::json
+);
+                     jsonschema_validation_errors                     
+----------------------------------------------------------------------
+ {"\"disabled\" is not one of [\"active\",\"inactive\",\"pending\"]"}
+(1 row)
+
+-- Invalid email format (assuming format is validated)
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": "not-an-email"}'::json
+);
+      jsonschema_validation_errors       
+-----------------------------------------
+ {"\"not-an-email\" is not a \"email\""}
+(1 row)
+
+-- Extra property not allowed
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "extra": "should not be here"}'::json
+);
+                    jsonschema_validation_errors                    
+--------------------------------------------------------------------
+ {"Additional properties are not allowed ('extra' was unexpected)"}
+(1 row)
+
--- a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out
+++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out
@@ -0,0 +1,65 @@
+-- Define schema
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "username": { "type": "string" },
+    "age": { "type": "integer" }
+  },
+  "required": ["username"]
+}'::json);
+ jsonschema_is_valid 
+---------------------
+ t
+(1 row)
+
+-- Valid instance
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "alice", "age": 25}'::json
+);
+ jsonschema_validation_errors 
+------------------------------
+ {}
+(1 row)
+
+-- Invalid instance: missing required "username"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"age": 25}'::json
+);
+      jsonschema_validation_errors       
+-----------------------------------------
+ {"\"username\" is a required property"}
+(1 row)
+
+-- Invalid instance: wrong type for "age"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "bob", "age": "twenty"}'::json
+);
+       jsonschema_validation_errors        
+-------------------------------------------
+ {"\"twenty\" is not of type \"integer\""}
+(1 row)
+
--- a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql
+++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql
@@ -0,0 +1,66 @@
+-- Schema with enums, nulls, extra properties disallowed
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+    "email": { "type": ["string", "null"], "format": "email" }
+  },
+  "required": ["status"],
+  "additionalProperties": false
+}'::json);
+
+-- Valid enum and null email
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": null}'::json
+);
+
+-- Invalid enum value
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "disabled", "email": null}'::json
+);
+
+-- Invalid email format (assuming format is validated)
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": "not-an-email"}'::json
+);
+
+-- Extra property not allowed
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "extra": "should not be here"}'::json
+);
--- a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql
+++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql
@@ -0,0 +1,48 @@
+-- Define schema
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "username": { "type": "string" },
+    "age": { "type": "integer" }
+  },
+  "required": ["username"]
+}'::json);
+
+-- Valid instance
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "alice", "age": 25}'::json
+);
+
+-- Invalid instance: missing required "username"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"age": 25}'::json
+);
+
+-- Invalid instance: wrong type for "age"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "bob", "age": "twenty"}'::json
+);
--- a/docker-compose/ext-src/pg_session_jwt-src/Makefile
+++ b/docker-compose/ext-src/pg_session_jwt-src/Makefile
@@ -0,0 +1,9 @@
+EXTENSION = pg_session_jwt
+
+REGRESS = basic_functions
+REGRESS_OPTS = --load-extension=$(EXTENSION)
+export PGOPTIONS = -c pg_session_jwt.jwk={"crv":"Ed25519","kty":"OKP","x":"R_Abz-63zJ00l-IraL5fQhwkhGVZCSooQFV5ntC3C7M"}
+
+PG_CONFIG ?= pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
--- a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out
+++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out
@@ -0,0 +1,35 @@
+-- Basic functionality tests for pg_session_jwt
+-- Test auth.init() function
+SELECT auth.init();
+ init 
+------
+ 
+(1 row)
+
+-- Test an invalid JWT
+SELECT auth.jwt_session_init('INVALID-JWT');
+ERROR:  invalid JWT encoding
+-- Test creating a session with an expired JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
+ERROR:  Token used after it has expired
+-- Test creating a session with a valid JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
+ jwt_session_init 
+------------------
+ 
+(1 row)
+
+-- Test auth.session() function
+SELECT auth.session();
+                                 session                                 
+-------------------------------------------------------------------------
+ {"exp": 4896164252, "iat": 1742564252, "jti": 434343, "sub": "user123"}
+(1 row)
+
+-- Test auth.user_id() function
+SELECT auth.user_id() AS user_id;
+ user_id 
+---------
+ user123
+(1 row)
+
--- a/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql
+++ b/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql
@@ -0,0 +1,19 @@
+-- Basic functionality tests for pg_session_jwt
+
+-- Test auth.init() function
+SELECT auth.init();
+
+-- Test an invalid JWT
+SELECT auth.jwt_session_init('INVALID-JWT');
+
+-- Test creating a session with an expired JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
+
+-- Test creating a session with a valid JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
+
+-- Test auth.session() function
+SELECT auth.session();
+
+-- Test auth.user_id() function
+SELECT auth.user_id() AS user_id;
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -14,6 +14,32 @@ pub struct GenericAPIError {
    pub error: String,
 }

+/// All configuration parameters necessary for a compute. When
+/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
+/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
+/// and contains parameters necessary for operating `compute_ctl` independently
+/// of whether a tenant is attached to the compute or not.
+///
+/// This also happens to be the body of `compute_ctl`'s `/configure` request.
+#[derive(Debug, Deserialize, Serialize)]
+pub struct ComputeConfig {
+    /// The compute spec; provided when the compute is attached to a tenant.
+    pub spec: Option<ComputeSpec>,
+
+    /// The `compute_ctl` configuration; always provided.
+    #[allow(dead_code)]
+    pub compute_ctl_config: ComputeCtlConfig,
+}
+
+impl From<ControlPlaneConfigResponse> for ComputeConfig {
+    fn from(value: ControlPlaneConfigResponse) -> Self {
+        Self {
+            spec: value.spec,
+            compute_ctl_config: value.compute_ctl_config,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize)]
 pub struct ExtensionInstallResponse {
    pub extension: PgIdent,
@@ -134,7 +160,7 @@ pub struct CatalogObjects {
    pub databases: Vec<Database>,
 }

-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
 pub struct ComputeCtlConfig {
    /// Set of JSON web keys that the compute can use to authenticate
    /// communication from the control plane.
@@ -153,7 +179,7 @@ impl Default for ComputeCtlConfig {
    }
 }

-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
 pub struct TlsConfig {
    pub key_path: String,
    pub cert_path: String,
@@ -161,7 +187,7 @@ pub struct TlsConfig {

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 #[derive(Deserialize, Debug)]
-pub struct ControlPlaneSpecResponse {
+pub struct ControlPlaneConfigResponse {
    pub spec: Option<ComputeSpec>,
    pub status: ControlPlaneComputeStatus,
    pub compute_ctl_config: ComputeCtlConfig,
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -1,8 +1,8 @@
-//! `ComputeSpec` represents the contents of the spec.json file.
-//!
-//! The spec.json file is used to pass information to 'compute_ctl'. It contains
-//! all the information needed to start up the right version of PostgreSQL,
-//! and connect it to the storage nodes.
+//! The ComputeSpec contains all the information needed to start up
+//! the right version of PostgreSQL, and connect it to the storage nodes.
+//! It can be passed as part of the `config.json`, or the control plane can
+//! provide it by calling the compute_ctl's `/configure` endpoint, or
+//! compute_ctl can fetch it by calling the control plane's API.
 use std::collections::HashMap;

 use indexmap::IndexMap;
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
    pub timeline_id: Option<TimelineId>,
    pub pageserver_connstring: Option<String>,

+    // More neon ids that we expose to the compute_ctl
+    // and to postgres as neon extension GUCs.
+    pub project_id: Option<String>,
+    pub branch_id: Option<String>,
+    pub endpoint_id: Option<String>,
+
    /// Safekeeper membership config generation. It is put in
    /// neon.safekeepers GUC and serves two purposes:
    /// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,13 +165,7 @@ pub struct ComputeSpec {
    #[serde(default)] // Default false
    pub drop_subscriptions_before_start: bool,

-    /// Log level for audit logging:
-    ///
-    /// Disabled - no audit logging. This is the default.
-    /// log - log masked statements to the postgres log using pgaudit extension
-    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
-    ///
-    /// Extensions should be present in shared_preload_libraries
+    /// Log level for compute audit logging
    #[serde(default)]
    pub audit_log_level: ComputeAudit,

@@ -183,6 +183,9 @@ pub enum ComputeFeature {
    /// track short-lived connections as user activity.
    ActivityMonitorExperimental,

+    /// Download all SLRU files on demand
+    LazySlruDownload,
+
    /// This is a special feature flag that is used to represent unknown feature flags.
    /// Basically all unknown to enum flags are represented as this one. See unit test
    /// `parse_unknown_features()` for more details.
@@ -289,14 +292,25 @@ impl ComputeMode {
 }

 /// Log level for audit logging
-/// Disabled, log, hipaa
-/// Default is Disabled
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
     #[default]
     Disabled,
+    /// Deprecated, use `Base` instead.
     Log,
+    /// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
+    /// logged to the standard postgresql log stream.
     Base,
+    /// Deprecated, use `Full` or `Extended` instead.
     Hipaa,
+    /// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
+    /// logged to separate files collected by rsyslog
+    /// into dedicated log storage with strict access.
     Extended,
+    /// (pgaudit.log='all', pgaudit.log_parameter='on'),
+    /// logged to separate files collected by rsyslog
+    /// into dedicated log storage with strict access.
     Full,
 }

 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -14,6 +14,7 @@ futures.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
 jemalloc_pprof.workspace = true
+jsonwebtoken.workspace = true
 once_cell.workspace = true
 pprof.workspace = true
 regex.workspace = true
@@ -30,6 +31,7 @@ tokio.workspace = true
 tracing.workspace = true
 url.workspace = true
 uuid.workspace = true
+x509-cert.workspace = true

 # to use tokio channels as streams, this is faster to compile than async_stream
 # why is it only here? no other crate should use it, streams are rarely needed.
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -8,6 +8,7 @@ use bytes::{Bytes, BytesMut};
 use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
 use hyper::http::HeaderValue;
 use hyper::{Body, Method, Request, Response};
+use jsonwebtoken::TokenData;
 use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
 use once_cell::sync::Lazy;
 use pprof::ProfilerGuardBuilder;
@@ -618,7 +619,7 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth.decode(token).map_err(|err| {
+                    let data: TokenData<Claims> = auth.decode(token).map_err(|err| {
                        warn!("Authentication error: {err}");
                        // Rely on From<AuthError> for ApiError impl
                        err
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -4,6 +4,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use hyper0::Body;
 use hyper0::server::conn::Http;
+use metrics::{IntCounterVec, register_int_counter_vec};
+use once_cell::sync::Lazy;
 use routerify::{RequestService, RequestServiceBuilder};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
    tls_acceptor: Option<TlsAcceptor>,
 }

+static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_started_total",
+        "Number of established http/https connections",
+        &["scheme"] // "http" or "https"
+    )
+    .expect("failed to define a metric")
+});
+
+static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_errors_total",
+        "Number of occurred connection errors by type",
+        &["type"] // e.g. "tcp", "tls", "http", "https", "panic"
+    )
+    .expect("failed to define a metric")
+});
+
 impl Server {
    pub fn new(
        request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
            false
        }

+        let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
+        let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
+        let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
+        let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
+        let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
+
+        let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
+        let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
+
        let mut connections = FuturesUnordered::new();
        loop {
            tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
                    let (tcp_stream, remote_addr) = match stream {
                        Ok(stream) => stream,
                        Err(err) => {
+                            tcp_error_cnt.inc();
                            if !suppress_io_error(&err) {
                                info!("Failed to accept TCP connection: {err:#}");
                            }
@@ -78,11 +108,18 @@ impl Server {
                    let tls_acceptor = self.tls_acceptor.clone();
                    let cancel = cancel.clone();

+                    let tls_error_cnt = tls_error_cnt.clone();
+                    let http_error_cnt = http_error_cnt.clone();
+                    let https_error_cnt = https_error_cnt.clone();
+                    let http_connection_cnt = http_connection_cnt.clone();
+                    let https_connection_cnt = https_connection_cnt.clone();
+
                    connections.push(tokio::spawn(
                        async move {
                            match tls_acceptor {
                                Some(tls_acceptor) => {
                                    // Handle HTTPS connection.
+                                    https_connection_cnt.inc();
                                    let tls_stream = tokio::select! {
                                        tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
                                        _ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
                                    let tls_stream = match tls_stream {
                                        Ok(tls_stream) => tls_stream,
                                        Err(err) => {
+                                            tls_error_cnt.inc();
                                            if !suppress_io_error(&err) {
                                                info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                            }
@@ -97,6 +135,7 @@ impl Server {
                                        }
                                    };
                                    if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
+                                        https_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                        }
@@ -104,7 +143,9 @@ impl Server {
                                }
                                None => {
                                    // Handle HTTP connection.
+                                    http_connection_cnt.inc();
                                    if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
+                                        http_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                        }
@@ -115,6 +156,7 @@ impl Server {
                 }
                Some(conn) = connections.next() => {
                    if let Err(err) = conn {
+                        panic_error_cnt.inc();
                        error!("Connection panicked: {err:#}");
                    }
                }
@@ -122,6 +164,7 @@ impl Server {
                    // Wait for graceful shutdown of all connections.
                    while let Some(conn) = connections.next().await {
                        if let Err(err) = conn {
+                            panic_error_cnt.inc();
                            error!("Connection panicked: {err:#}");
                        }
                    }
--- a/libs/http-utils/src/tls_certs.rs
+++ b/libs/http-utils/src/tls_certs.rs
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
 use anyhow::Context;
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
+use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
+use once_cell::sync::Lazy;
 use rustls::{
-    pki_types::{CertificateDer, PrivateKeyDer},
+    pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
    server::{ClientHello, ResolvesServerCert},
    sign::CertifiedKey,
 };
+use x509_cert::der::Reader;

 pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
    let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
    Ok(certified_key)
 }

+/// rustls's `CertifiedKey` plus its end-entity certificate's expiry, kept for metrics.
+struct ParsedCertifiedKey {
+    certified_key: CertifiedKey,
+    expiration_time: UnixTime,
+}
+
+/// Parse expiration time from an X509 certificate.
+///
+/// Decodes `cert` as a DER-encoded `x509_cert::Certificate` and returns its
+/// `validity.not_after` field as a [`UnixTime`]. Returns an error if the DER
+/// reader cannot be created or the bytes do not decode as a certificate.
+fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
+    let parsed_cert = x509_cert::der::SliceReader::new(cert)
+        .context("Failed to create DER reader for certificate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse certificate")?;
+
+    // `not_after` is the end of the certificate's validity period.
+    let not_after = parsed_cert.tbs_certificate.validity.not_after;
+    Ok(UnixTime::since_unix_epoch(not_after.to_unix_duration()))
+}
+
+async fn load_and_parse_certified_key(
+    key_filename: &Utf8Path,
+    cert_filename: &Utf8Path,
+) -> anyhow::Result<ParsedCertifiedKey> {
+    let certified_key = load_certified_key(key_filename, cert_filename).await?;
+    let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
+    Ok(ParsedCertifiedKey {
+        certified_key,
+        expiration_time,
+    })
+}
+
+static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "tls_certs_expiration_time_seconds",
+        "Expiration time of the loaded certificate since unix epoch in seconds",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_started_total",
+        "Number of certificate reload loop iterations started",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_updated_total",
+        "Number of times the certificate was updated to the new one",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_failed_total",
+        "Number of times the certificate reload failed",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
 /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
 /// the disk periodically.
 #[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
 impl ReloadingCertificateResolver {
    /// Creates a new Resolver by loading certificate and private key from FS and
    /// creating tokio::task to reload them with provided reload_period.
+    /// resolver_name is used as metric's label.
    pub async fn new(
+        resolver_name: &str,
        key_filename: &Utf8Path,
        cert_filename: &Utf8Path,
        reload_period: Duration,
    ) -> anyhow::Result<Arc<Self>> {
+        // Create metrics for current resolver.
+        let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
+        let cert_reload_started_counter =
+            CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_updated_counter =
+            CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_failed_counter =
+            CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
+
+        let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
+
        let this = Arc::new(Self {
-            certified_key: ArcSwap::from_pointee(
-                load_certified_key(key_filename, cert_filename).await?,
-            ),
+            certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
        });
+        cert_expiration_time.set(parsed_key.expiration_time.as_secs());

        tokio::spawn({
            let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
                        Some(this) => this,
                        None => break, // Resolver has been destroyed, exit.
                    };
-                    match load_certified_key(&key_filename, &cert_filename).await {
-                        Ok(new_certified_key) => {
-                            if new_certified_key.cert == this.certified_key.load().cert {
+                    cert_reload_started_counter.inc();
+
+                    match load_and_parse_certified_key(&key_filename, &cert_filename).await {
+                        Ok(parsed_key) => {
+                            if parsed_key.certified_key.cert == this.certified_key.load().cert {
                                tracing::debug!("Certificate has not changed since last reloading");
                            } else {
                                tracing::info!("Certificate has been reloaded");
-                                this.certified_key.store(Arc::new(new_certified_key));
+                                this.certified_key.store(Arc::new(parsed_key.certified_key));
+                                cert_expiration_time.set(parsed_key.expiration_time.as_secs());
+                                cert_reload_updated_counter.inc();
                            }
                            last_reload_failed = false;
                        }
                        Err(err) => {
+                            cert_reload_failed_counter.inc();
                            // Note: Reloading certs may fail if it conflicts with the script updating
                            // the files at the same time. Warn only if the error is persistent.
                            if last_reload_failed {
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -180,6 +180,7 @@ pub struct ConfigToml {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
+    pub enable_tls_page_service_api: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -206,6 +207,10 @@ pub struct PageServicePipeliningConfigPipelined {
    /// Causes runtime errors if larger than max get_vectored batch size.
    pub max_batch_size: NonZeroUsize,
    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
+    // The default below is such that new versions of the software can start
+    // with the old configuration.
+    #[serde(default)]
+    pub batching: PageServiceProtocolPipelinedBatchingStrategy,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -215,6 +220,19 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
    Tasks,
 }

+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum PageServiceProtocolPipelinedBatchingStrategy {
+    /// All get page requests in a batch will be at the same LSN
+    #[default]
+    UniformLsn,
+    /// Get page requests in a batch may be at different LSN
+    ///
+    /// One key cannot be present more than once at different LSNs in
+    /// the same batch.
+    ScatteredLsn,
+}
+
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
@@ -451,6 +469,8 @@ pub struct TenantConfigToml {
    // gc-compaction related configs
    /// Enable automatic gc-compaction trigger on this tenant.
    pub gc_compaction_enabled: bool,
+    /// Enable verification of gc-compaction results.
+    pub gc_compaction_verification: bool,
    /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
    /// gc-compaction will be triggered.
    pub gc_compaction_initial_threshold_kb: u64,
@@ -612,9 +632,12 @@ impl Default for ConfigToml {
            page_service_pipelining: if !cfg!(test) {
                PageServicePipeliningConfig::Serial
            } else {
+                // Do not turn this into the default until scattered reads have been
+                // validated and rolled-out fully.
                PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
                    max_batch_size: NonZeroUsize::new(32).unwrap(),
                    execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
+                    batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
                })
            },
            get_vectored_concurrent_io: if !cfg!(test) {
@@ -631,6 +654,7 @@ impl Default for ConfigToml {
            load_previous_heatmap: None,
            generate_unarchival_heatmap: None,
            tracing: None,
+            enable_tls_page_service_api: false,
        }
    }
 }
@@ -690,6 +714,7 @@ pub mod tenant_conf_defaults {
    // image layers should be created.
    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
 }
@@ -744,6 +769,7 @@ impl Default for TenantConfigToml {
            wal_receiver_protocol_override: None,
            rel_size_v2_enabled: false,
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
+            gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
            sampling_ratio: None,
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;

 use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
 use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
    pub scheduling_policy: SkSchedulingPolicy,
 }

+/// Import request for safekeeper timelines.
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TimelineImportRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub start_lsn: Lsn,
+    pub sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
    use serde_json;
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -927,7 +927,7 @@ impl Key {

    /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
    #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
+    pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
        Ok(match self.field1 {
            0x00 => (
                RelTag {
@@ -938,7 +938,7 @@ impl Key {
                },
                self.field6,
            ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+            _ => return Err(ToRelBlockError(self.field1)),
        })
    }
 }
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
    }
 }

+#[derive(Debug)]
+pub struct ToRelBlockError(u8); // the `field1` byte that `Key::to_rel_block` rejected
+
+impl fmt::Display for ToRelBlockError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "unexpected value kind 0x{:02x}", self.0)
+    }
+}
+
+impl std::error::Error for ToRelBlockError {} // no source/backtrace; `Display` is the message
+
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -576,6 +576,8 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_enabled: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_verification: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -696,6 +698,9 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_enabled: Option<bool>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_verification: Option<bool>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_initial_threshold_kb: Option<u64>,

@@ -744,6 +749,7 @@ impl TenantConfig {
            mut wal_receiver_protocol_override,
            mut rel_size_v2_enabled,
            mut gc_compaction_enabled,
+            mut gc_compaction_verification,
            mut gc_compaction_initial_threshold_kb,
            mut gc_compaction_ratio_percent,
            mut sampling_ratio,
@@ -835,6 +841,9 @@ impl TenantConfig {
        patch
            .gc_compaction_enabled
            .apply(&mut gc_compaction_enabled);
+        patch
+            .gc_compaction_verification
+            .apply(&mut gc_compaction_verification);
        patch
            .gc_compaction_initial_threshold_kb
            .apply(&mut gc_compaction_initial_threshold_kb);
@@ -876,6 +885,7 @@ impl TenantConfig {
            wal_receiver_protocol_override,
            rel_size_v2_enabled,
            gc_compaction_enabled,
+            gc_compaction_verification,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
            sampling_ratio,
@@ -974,6 +984,9 @@ impl TenantConfig {
            gc_compaction_enabled: self
                .gc_compaction_enabled
                .unwrap_or(global_conf.gc_compaction_enabled),
+            gc_compaction_verification: self
+                .gc_compaction_verification
+                .unwrap_or(global_conf.gc_compaction_verification),
            gc_compaction_initial_threshold_kb: self
                .gc_compaction_initial_threshold_kb
                .unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -29,6 +29,7 @@ futures = { workspace = true }
 jsonwebtoken.workspace = true
 nix = { workspace = true, features = ["ioctl"] }
 once_cell.workspace = true
+pem.workspace = true
 pin-project-lite.workspace = true
 regex.workspace = true
 serde.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -11,7 +11,8 @@ use camino::Utf8Path;
 use jsonwebtoken::{
    Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode,
 };
-use serde::{Deserialize, Serialize};
+use pem::Pem;
+use serde::{Deserialize, Serialize, de::DeserializeOwned};

 use crate::id::TenantId;

@@ -73,7 +74,10 @@ impl SwappableJwtAuth {
    pub fn swap(&self, jwt_auth: JwtAuth) {
        self.0.swap(Arc::new(jwt_auth));
    }
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+    pub fn decode<D: DeserializeOwned>(
+        &self,
+        token: &str,
+    ) -> std::result::Result<TokenData<D>, AuthError> {
        self.0.load().decode(token)
    }
 }
@@ -148,7 +152,10 @@ impl JwtAuth {
    /// The function tries the stored decoding keys in succession,
    /// and returns the first yielding a successful result.
    /// If there is no working decoding key, it returns the last error.
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+    pub fn decode<D: DeserializeOwned>(
+        &self,
+        token: &str,
+    ) -> std::result::Result<TokenData<D>, AuthError> {
        let mut res = None;
        for decoding_key in &self.decoding_keys {
            res = Some(decode(token, decoding_key, &self.validation));
@@ -173,8 +180,8 @@ impl std::fmt::Debug for JwtAuth {
 }

 // this function is used only for testing purposes in CLI e g generate tokens during init
-pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
-    let key = EncodingKey::from_ed_pem(key_data)?;
+pub fn encode_from_key_file<S: Serialize>(claims: &S, pem: &Pem) -> Result<String> {
+    let key = EncodingKey::from_ed_der(pem.contents());
    Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
 }

@@ -188,13 +195,13 @@ mod tests {
    //
    // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
    // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
-    const TEST_PUB_KEY_ED25519: &[u8] = br#"
+    const TEST_PUB_KEY_ED25519: &str = r#"
 -----BEGIN PUBLIC KEY-----
 MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
 -----END PUBLIC KEY-----
 "#;

-    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
+    const TEST_PRIV_KEY_ED25519: &str = r#"
 -----BEGIN PRIVATE KEY-----
 MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 -----END PRIVATE KEY-----
@@ -222,9 +229,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH

        // Check it can be validated with the public key
        let auth = JwtAuth::new(vec![
-            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
+            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
        ]);
-        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
+        let claims_from_token: Claims = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
    }

@@ -235,13 +242,14 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
+        let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap();
+        let encoded = encode_from_key_file(&claims, &pem).unwrap();

        // decode it back
        let auth = JwtAuth::new(vec![
-            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
+            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
        ]);
-        let decoded = auth.decode(&encoded).unwrap();
+        let decoded: TokenData<Claims> = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
    }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -10,6 +10,8 @@ default = []
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]

+fuzz-read-path = ["testing"]
+
 [dependencies]
 anyhow.workspace = true
 arc-swap.workspace = true
@@ -33,6 +35,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
+jsonwebtoken.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -126,7 +126,7 @@ async fn ingest(
            max_concurrency: NonZeroUsize::new(1).unwrap(),
        });
        let (_desc, path) = layer
-            .write_to_disk(&ctx, None, l0_flush_state.inner())
+            .write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
            .await?
            .unwrap();
        tokio::fs::remove_file(path).await?;
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
 use crate::tenant::{PageReconstructError, Timeline};

 #[derive(Debug, thiserror::Error)]
@@ -73,6 +73,7 @@ impl From<GetVectoredError> for BasebackupError {
 ///  * When working without safekeepers. In this situation it is important to match the lsn
 ///    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
 ///    to start the replication.
+#[allow(clippy::too_many_arguments)]
 pub async fn send_basebackup_tarball<'a, W>(
    write: &'a mut W,
    timeline: &'a Timeline,
@@ -80,6 +81,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    replica: bool,
+    lazy_slru_download_enabled: bool,
    ctx: &'a RequestContext,
 ) -> Result<(), BasebackupError>
 where
@@ -131,8 +133,8 @@ where
    };

    info!(
-        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
-        backup_lsn, prev_lsn, full_backup, replica
+        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={}, lazy_slru_download_enabled={})",
+        backup_lsn, prev_lsn, full_backup, replica, lazy_slru_download_enabled
    );

    let basebackup = Basebackup {
@@ -142,6 +144,7 @@ where
        prev_record_lsn: prev_lsn,
        full_backup,
        replica,
+        lazy_slru_download_enabled,
        ctx,
        io_concurrency: IoConcurrency::spawn_from_conf(
            timeline.conf,
@@ -170,6 +173,7 @@ where
    prev_record_lsn: Lsn,
    full_backup: bool,
    replica: bool,
+    lazy_slru_download_enabled: bool,
    ctx: &'a RequestContext,
    io_concurrency: IoConcurrency,
 }
@@ -308,7 +312,10 @@ where
                self.timeline.pg_version,
            )?;

-        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
+        let lazy_slru_download = self
+            .timeline
+            .get_lazy_slru_download(self.lazy_slru_download_enabled)
+            && !self.full_backup;

        let pgversion = self.timeline.pg_version;
        let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);
@@ -353,9 +360,10 @@ where
            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

            for part in slru_partitions.parts {
+                let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
                let blocks = self
                    .timeline
-                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
+                    .get_vectored(query, self.io_concurrency.clone(), self.ctx)
                    .await?;

                for (key, block) in blocks {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,7 +9,7 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{Context, anyhow, bail};
+use anyhow::{Context, anyhow};
 use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};
 use http_utils::tls_certs::ReloadingCertificateResolver;
@@ -79,8 +79,6 @@ fn main() -> anyhow::Result<()> {
        return Ok(());
    }

-    let dev_mode = arg_matches.get_flag("dev");
-
    // Initialize up failpoints support
    let scenario = failpoint_support::init();

@@ -101,20 +99,6 @@ fn main() -> anyhow::Result<()> {

    let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;

-    if !dev_mode {
-        if matches!(conf.http_auth_type, AuthType::Trust)
-            || matches!(conf.pg_auth_type, AuthType::Trust)
-        {
-            bail!(
-                "Pageserver refuses to start with HTTP or PostgreSQL API authentication disabled.\n\
-                  Run with --dev to allow running without authentication.\n\
-                  This is insecure and should only be used in development environments."
-            );
-        }
-    } else {
-        warn!("Starting in dev mode: this may be an insecure configuration.");
-    }
-
    // Initialize logging.
    //
    // It must be initialized before the custom panic hook is installed below.
@@ -468,6 +452,24 @@ fn start_pageserver(
    info!("Using auth for http API: {:#?}", conf.http_auth_type);
    info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);

+    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
+    {
+        let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            "main",
+            &conf.ssl_key_file,
+            &conf.ssl_cert_file,
+            conf.ssl_cert_reload_period,
+        ))?;
+
+        let server_config = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_cert_resolver(resolver);
+
+        Some(Arc::new(server_config))
+    } else {
+        None
+    };
+
    match var("NEON_AUTH_TOKEN") {
        Ok(v) => {
            info!("Loaded JWT token for authentication with Safekeeper");
@@ -686,17 +688,11 @@ fn start_pageserver(

        let https_task = match https_listener {
            Some(https_listener) => {
-                let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
-                    &conf.ssl_key_file,
-                    &conf.ssl_cert_file,
-                    conf.ssl_cert_reload_period,
-                ))?;
+                let tls_server_config = tls_server_config
+                    .clone()
+                    .expect("tls_server_config is set earlier if https is enabled");

-                let server_config = rustls::ServerConfig::builder()
-                    .with_no_client_auth()
-                    .with_cert_resolver(resolver);
-
-                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);

                let server =
                    http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -752,6 +748,11 @@ fn start_pageserver(
            tokio::net::TcpListener::from_std(pageserver_listener)
                .context("create tokio listener")?
        },
+        if conf.enable_tls_page_service_api {
+            tls_server_config
+        } else {
+            None
+        },
    );

    // All started up! Now just sit and wait for shutdown signal.
@@ -832,12 +833,6 @@ fn cli() -> Command {
                .action(ArgAction::SetTrue)
                .help("Show enabled compile time features"),
        )
-        .arg(
-            Arg::new("dev")
-                .long("dev")
-                .action(ArgAction::SetTrue)
-                .help("Run in development mode (disables security checks)"),
-        )
 }

 #[test]
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -219,6 +219,11 @@ pub struct PageServerConf {
    pub generate_unarchival_heatmap: bool,

    pub tracing: Option<pageserver_api::config::Tracing>,
+
+    /// Enable TLS in page service API.
+    /// Does not force TLS: the client negotiates TLS usage during the handshake.
+    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
+    pub enable_tls_page_service_api: bool,
 }

 /// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
            load_previous_heatmap,
            generate_unarchival_heatmap,
            tracing,
+            enable_tls_page_service_api,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
            page_service_pipelining,
            get_vectored_concurrent_io,
            tracing,
+            enable_tls_page_service_api,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
        modification
            .put_file(&fname, content.as_bytes(), &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
    }
    modification
        .commit(&ctx)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
-use crate::walingest::WalIngest;
+use crate::walingest::{WalIngest, WalIngestErrorKind};

 // Returns checkpoint LSN from controlfile
 pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
        .put_rel_creation(rel, nblocks as u32, ctx)
        .await
    {
-        match e {
-            RelationError::AlreadyExists => {
-                debug!("Relation {} already exist. We must be extending it.", rel)
+        match e.kind {
+            WalIngestErrorKind::RelationAlreadyExists(rel) => {
+                debug!("Relation {rel} already exists. We must be extending it.")
            }
            _ => return Err(e.into()),
        }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -17,7 +17,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::config::{
    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -1714,6 +1714,28 @@ pub enum SmgrQueryType {
    Test,
 }

+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    IntoStaticStr,
+    strum_macros::EnumCount,
+    strum_macros::EnumIter,
+    strum_macros::FromRepr,
+    enum_map::Enum,
+)]
+#[strum(serialize_all = "snake_case")] // `reason` metric label text, e.g. "batch_full"
+pub enum GetPageBatchBreakReason {
+    BatchFull,
+    NonBatchableRequest,
+    NonUniformLsn,
+    SamePageAtDifferentLsn,
+    NonUniformTimeline,
+    ExecutorSteal,
+    #[cfg(feature = "testing")]
+    NonUniformKey,
+}
+
 pub(crate) struct SmgrQueryTimePerTimeline {
    global_started: [IntCounter; SmgrQueryType::COUNT],
    global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1725,6 +1747,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
    per_timeline_flush_in_progress_micros: IntCounter,
    global_batch_wait_time: Histogram,
    per_timeline_batch_wait_time: Histogram,
+    global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
+    per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
    throttling: Arc<tenant_throttling::Pagestream>,
 }

@@ -1858,12 +1882,55 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::n
    .expect("failed to define a metric")
 });

+static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // Global counter: labelled only by `reason`, with no tenant/shard/timeline labels.
+        "pageserver_page_service_batch_break_reason_global",
+        "Reason for breaking batches of get page requests",
+        &["reason"],
+    )
+    .expect("failed to define a metric")
+});
+
+struct GetPageBatchBreakReasonTimelineMetrics {
+    map: EnumMap<GetPageBatchBreakReason, IntCounter>, // one counter per break reason
+}
+
+impl GetPageBatchBreakReasonTimelineMetrics {
+    fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
+        GetPageBatchBreakReasonTimelineMetrics {
+            map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
+                let reason = GetPageBatchBreakReason::from_usize(reason_idx); // index -> variant
+                PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
+                    tenant_id,
+                    shard_slug,
+                    timeline_id,
+                    reason.into(), // `&'static str` via the `IntoStaticStr` derive
+                ])
+            })),
+        }
+    }
+
+    fn inc(&self, reason: GetPageBatchBreakReason) {
+        self.map[reason].inc()
+    }
+}
+
+static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_page_service_batch_break_reason",
+        "Reason for breaking batches of get page requests",
+        &["tenant_id", "shard_id", "timeline_id", "reason"],
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_page_service_config_max_batch_size",
        "Configured maximum batch size for the server-side batching functionality of page_service. \
         Labels expose more of the configuration parameters.",
-        &["mode", "execution"]
+        &["mode", "execution", "batching"]
    )
    .expect("failed to define a metric")
 });
@@ -1871,10 +1938,11 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
 fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
    PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
    let (label_values, value) = match conf {
-        PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
+        PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
        PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
            max_batch_size,
            execution,
+            batching,
        }) => {
            let mode = "pipelined";
            let execution = match execution {
@@ -1883,7 +1951,12 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
                }
                PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
            };
-            ([mode, execution], max_batch_size.get())
+            let batching = match batching {
+                PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
+                PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
+            };
+
+            ([mode, execution, batching], max_batch_size.get())
        }
    };
    PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
@@ -1979,6 +2052,15 @@ impl SmgrQueryTimePerTimeline {
            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
            .unwrap();

+        let global_batch_break_reason = std::array::from_fn(|i| {
+            let reason = GetPageBatchBreakReason::from_usize(i);
+            PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
+                .get_metric_with_label_values(&[reason.into()])
+                .unwrap()
+        });
+        let per_timeline_batch_break_reason =
+            GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
+
        let global_flush_in_progress_micros =
            PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
        let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -1996,6 +2078,8 @@ impl SmgrQueryTimePerTimeline {
            per_timeline_flush_in_progress_micros,
            global_batch_wait_time,
            per_timeline_batch_wait_time,
+            global_batch_break_reason,
+            per_timeline_batch_break_reason,
            throttling: pagestream_throttle_metrics,
        }
    }
@@ -2024,9 +2108,16 @@ impl SmgrQueryTimePerTimeline {
    }

    /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
-    pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
+    pub(crate) fn observe_getpage_batch_start(
+        &self,
+        batch_size: usize,
+        break_reason: GetPageBatchBreakReason,
+    ) {
        self.global_batch_size.observe(batch_size as f64);
        self.per_timeline_batch_size.observe(batch_size as f64);
+
+        self.global_batch_break_reason[break_reason.into_usize()].inc();
+        self.per_timeline_batch_break_reason.inc(break_reason);
    }
 }

@@ -3392,6 +3483,15 @@ impl TimelineMetrics {
            shard_id,
            timeline_id,
        ]);
+
+        for reason in GetPageBatchBreakReason::iter() {
+            let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+                reason.into(),
+            ]);
+        }
    }
 }

@@ -4270,6 +4370,7 @@ pub fn preinitialize_metrics(
    [
        &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
        &SMGR_QUERY_STARTED_GLOBAL,
+        &PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
    ]
    .into_iter()
    .for_each(|c| {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -15,10 +15,11 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use futures::FutureExt;
 use itertools::Itertools;
+use jsonwebtoken::TokenData;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::models::{
@@ -58,8 +59,8 @@ use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::metrics::{
-    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
-    TimelineMetrics,
+    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
+    SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -105,6 +106,7 @@ pub fn spawn(
    pg_auth: Option<Arc<SwappableJwtAuth>>,
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +126,7 @@ pub fn spawn(
            perf_trace_dispatch,
            tcp_listener,
            conf.pg_auth_type,
+            tls_config,
            conf.page_service_pipelining.clone(),
            libpq_ctx,
            cancel.clone(),
@@ -181,6 +184,7 @@ pub async fn libpq_listener_main(
    perf_trace_dispatch: Option<Dispatch>,
    listener: tokio::net::TcpListener,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
@@ -223,6 +227,7 @@ pub async fn libpq_listener_main(
                    local_auth,
                    socket,
                    auth_type,
+                    tls_config.clone(),
                    pipelining_config.clone(),
                    connection_ctx,
                    connections_cancel.child_token(),
@@ -264,6 +269,7 @@ async fn page_service_conn_main(
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
@@ -334,7 +340,8 @@ async fn page_service_conn_main(
        cancel.clone(),
        gate_guard,
    );
-    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
+    let pgbackend =
+        PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;

    match pgbackend.run(&mut conn_handler, &cancel).await {
        Ok(()) => {
@@ -635,6 +642,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
    timer: SmgrOpTimer,
+    effective_request_lsn: Lsn,
    ctx: RequestContext,
 }

@@ -664,8 +672,8 @@ enum BatchedFeMessage {
    GetPage {
        span: Span,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
-        effective_request_lsn: Lsn,
        pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
+        batch_break_reason: GetPageBatchBreakReason,
    },
    DbSize {
        span: Span,
@@ -718,6 +726,119 @@ impl BatchedFeMessage {
            BatchedFeMessage::RespondError { .. } => {}
        }
    }
+
+    fn should_break_batch(
+        &self,
+        other: &BatchedFeMessage,
+        max_batch_size: NonZeroUsize,
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
+    ) -> Option<GetPageBatchBreakReason> {
+        match (self, other) {
+            (
+                BatchedFeMessage::GetPage {
+                    shard: accum_shard,
+                    pages: accum_pages,
+                    ..
+                },
+                BatchedFeMessage::GetPage {
+                    shard: this_shard,
+                    pages: this_pages,
+                    ..
+                },
+            ) => {
+                assert_eq!(this_pages.len(), 1);
+                if accum_pages.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_pages.len(), max_batch_size.get());
+
+                    return Some(GetPageBatchBreakReason::BatchFull);
+                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard separately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+
+                match batching_strategy {
+                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
+                        if let Some(last_in_batch) = accum_pages.last() {
+                            if last_in_batch.effective_request_lsn
+                                != this_pages[0].effective_request_lsn
+                            {
+                                trace!(
+                                    accum_lsn = %last_in_batch.effective_request_lsn,
+                                    this_lsn = %this_pages[0].effective_request_lsn,
+                                    "stopping batching because LSN changed"
+                                );
+
+                                return Some(GetPageBatchBreakReason::NonUniformLsn);
+                            }
+                        }
+                    }
+                    PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
+                        // The read path doesn't currently support serving the same page at different LSNs.
+                        // While technically possible, it's uncertain if the complexity is worth it.
+                        // Break the batch if such a case is encountered.
+                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
+                            batched.req.rel == this_pages[0].req.rel
+                                && batched.req.blkno == this_pages[0].req.blkno
+                                && batched.effective_request_lsn
+                                    != this_pages[0].effective_request_lsn
+                        });
+
+                        if same_page_different_lsn {
+                            trace!(
+                                rel=%this_pages[0].req.rel,
+                                blkno=%this_pages[0].req.blkno,
+                                lsn=%this_pages[0].effective_request_lsn,
+                                "stopping batching because same page was requested at different LSNs"
+                            );
+
+                            return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
+                        }
+                    }
+                }
+
+                None
+            }
+            #[cfg(feature = "testing")]
+            (
+                BatchedFeMessage::Test {
+                    shard: accum_shard,
+                    requests: accum_requests,
+                    ..
+                },
+                BatchedFeMessage::Test {
+                    shard: this_shard,
+                    requests: this_requests,
+                    ..
+                },
+            ) => {
+                assert!(this_requests.len() == 1);
+                if accum_requests.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_requests.len(), max_batch_size.get());
+                    return Some(GetPageBatchBreakReason::BatchFull);
+                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard separately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+                let this_batch_key = this_requests[0].req.batch_key;
+                let accum_batch_key = accum_requests[0].req.batch_key;
+                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
+                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
+                    return Some(GetPageBatchBreakReason::NonUniformKey);
+                }
+                None
+            }
+            (_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
+        }
+    }
 }

 impl PageServerHandler {
@@ -1019,34 +1140,32 @@ impl PageServerHandler {
                .await?;

                // We're holding the Handle
-                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                let res = Self::wait_or_get_last_lsn(
+                let effective_request_lsn = match Self::effective_request_lsn(
                    &shard,
+                    shard.get_last_record_lsn(),
                    req.hdr.request_lsn,
                    req.hdr.not_modified_since,
                    &shard.get_applied_gc_cutoff_lsn(),
-                    &ctx,
-                )
-                .maybe_perf_instrument(&ctx, |current_perf_span| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        parent: current_perf_span,
-                        "WAIT_LSN",
-                    )
-                })
-                .await;
-
-                let effective_request_lsn = match res {
+                ) {
                    Ok(lsn) => lsn,
                    Err(e) => {
                        return respond_error!(span, e);
                    }
                };
+
                BatchedFeMessage::GetPage {
                    span,
                    shard: shard.downgrade(),
-                    effective_request_lsn,
-                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
+                    pages: smallvec::smallvec![BatchedGetPageRequest {
+                        req,
+                        timer,
+                        effective_request_lsn,
+                        ctx,
+                    }],
+                    // The executor grabs the batch when it becomes idle.
+                    // Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
+                    // default reason for breaking the batch.
+                    batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
                }
            }
            #[cfg(feature = "testing")]
@@ -1072,6 +1191,7 @@ impl PageServerHandler {
    #[instrument(skip_all, level = tracing::Level::TRACE)]
    #[allow(clippy::boxed_local)]
    fn pagestream_do_batch(
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
        max_batch_size: NonZeroUsize,
        batch: &mut Result<BatchedFeMessage, QueryError>,
        this_msg: Result<BatchedFeMessage, QueryError>,
@@ -1083,90 +1203,59 @@ impl PageServerHandler {
            Err(e) => return Err(Err(e)),
        };

-        match (&mut *batch, this_msg) {
-            // something batched already, let's see if we can add this message to the batch
-            (
-                Ok(BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: accum_shard,
-                    pages: accum_pages,
-                    effective_request_lsn: accum_lsn,
-                }),
-                BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: this_shard,
-                    pages: this_pages,
-                    effective_request_lsn: this_lsn,
-                },
-            ) if (|| {
-                assert_eq!(this_pages.len(), 1);
-                if accum_pages.len() >= max_batch_size.get() {
-                    trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_pages.len(), max_batch_size.get());
-                    return false;
-                }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                // the vectored get currently only supports a single LSN, so, bounce as soon
-                // as the effective request_lsn changes
-                if *accum_lsn != this_lsn {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
-                    return false;
-                }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_pages.extend(this_pages);
-                Ok(())
+        let eligible_batch = match batch {
+            Ok(b) => b,
+            Err(_) => {
+                return Err(Ok(this_msg));
            }
-            #[cfg(feature = "testing")]
-            (
-                Ok(BatchedFeMessage::Test {
-                    shard: accum_shard,
-                    requests: accum_requests,
-                    ..
-                }),
-                BatchedFeMessage::Test {
-                    shard: this_shard,
-                    requests: this_requests,
-                    ..
-                },
-            ) if (|| {
-                assert!(this_requests.len() == 1);
-                if accum_requests.len() >= max_batch_size.get() {
-                    trace!(%max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_requests.len(), max_batch_size.get());
-                    return false;
+        };
+
+        let batch_break =
+            eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
+
+        match batch_break {
+            Some(reason) => {
+                if let BatchedFeMessage::GetPage {
+                    batch_break_reason, ..
+                } = eligible_batch
+                {
+                    *batch_break_reason = reason;
                }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!("stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                let this_batch_key = this_requests[0].req.batch_key;
-                let accum_batch_key = accum_requests[0].req.batch_key;
-                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
-                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
-                    return false;
-                }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_requests.extend(this_requests);
-                Ok(())
-            }
-            // something batched already but this message is unbatchable
-            (_, this_msg) => {
-                // by default, don't continue batching
+
                Err(Ok(this_msg))
            }
+            None => {
+                // ok to batch
+                match (eligible_batch, this_msg) {
+                    (
+                        BatchedFeMessage::GetPage {
+                            pages: accum_pages, ..
+                        },
+                        BatchedFeMessage::GetPage {
+                            pages: this_pages, ..
+                        },
+                    ) => {
+                        accum_pages.extend(this_pages);
+                        Ok(())
+                    }
+                    #[cfg(feature = "testing")]
+                    (
+                        BatchedFeMessage::Test {
+                            requests: accum_requests,
+                            ..
+                        },
+                        BatchedFeMessage::Test {
+                            requests: this_requests,
+                            ..
+                        },
+                    ) => {
+                        accum_requests.extend(this_requests);
+                        Ok(())
+                    }
+                    // Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
+                    _ => unreachable!(),
+                }
+            }
        }
    }

@@ -1387,8 +1476,8 @@ impl PageServerHandler {
            BatchedFeMessage::GetPage {
                span,
                shard,
-                effective_request_lsn,
                pages,
+                batch_break_reason,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::getpage");
                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
@@ -1399,9 +1488,9 @@ impl PageServerHandler {
                        let res = self
                            .handle_get_page_at_lsn_request_batched(
                                &shard,
-                                effective_request_lsn,
                                pages,
                                io_concurrency,
+                                batch_break_reason,
                                &ctx,
                            )
                            .instrument(span.clone())
@@ -1718,6 +1807,7 @@ impl PageServerHandler {
        let PageServicePipeliningConfigPipelined {
            max_batch_size,
            execution,
+            batching: batching_strategy,
        } = pipelining_config;

        // Macro to _define_ a pipeline stage.
@@ -1769,7 +1859,7 @@ impl PageServerHandler {
                    exit |= read_res.is_err();
                    let could_send = batch_tx
                        .send(read_res, |batch, res| {
-                            Self::pagestream_do_batch(max_batch_size, batch, res)
+                            Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
                        })
                        .await;
                    exit |= could_send.is_err();
@@ -1865,7 +1955,39 @@ impl PageServerHandler {
        ctx: &RequestContext,
    ) -> Result<Lsn, PageStreamError> {
        let last_record_lsn = timeline.get_last_record_lsn();
+        let effective_request_lsn = Self::effective_request_lsn(
+            timeline,
+            last_record_lsn,
+            request_lsn,
+            not_modified_since,
+            latest_gc_cutoff_lsn,
+        )?;

+        if effective_request_lsn > last_record_lsn {
+            timeline
+                .wait_lsn(
+                    not_modified_since,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    ctx,
+                )
+                .await?;
+
+            // Since we waited for 'effective_request_lsn' to arrive, that is now the last
+            // record LSN. (Or close enough for our purposes; the last-record LSN can
+            // advance immediately after we return anyway)
+        }
+
+        Ok(effective_request_lsn)
+    }
+
+    fn effective_request_lsn(
+        timeline: &Timeline,
+        last_record_lsn: Lsn,
+        request_lsn: Lsn,
+        not_modified_since: Lsn,
+        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
+    ) -> Result<Lsn, PageStreamError> {
        // Sanity check the request
        if request_lsn < not_modified_since {
            return Err(PageStreamError::BadRequest(
@@ -1900,19 +2022,7 @@ impl PageServerHandler {
            }
        }

-        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
        if not_modified_since > last_record_lsn {
-            timeline
-                .wait_lsn(
-                    not_modified_since,
-                    crate::tenant::timeline::WaitLsnWaiter::PageService,
-                    timeline::WaitLsnTimeout::Default,
-                    ctx,
-                )
-                .await?;
-            // Since we waited for 'not_modified_since' to arrive, that is now the last
-            // record LSN. (Or close enough for our purposes; the last-record LSN can
-            // advance immediately after we return anyway)
            Ok(not_modified_since)
        } else {
            // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
@@ -2067,16 +2177,16 @@ impl PageServerHandler {
    async fn handle_get_page_at_lsn_request_batched(
        &mut self,
        timeline: &Timeline,
-        effective_lsn: Lsn,
        requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
        io_concurrency: IoConcurrency,
+        batch_break_reason: GetPageBatchBreakReason,
        ctx: &RequestContext,
    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        debug_assert_current_span_has_tenant_and_timeline_id();

        timeline
            .query_metrics
-            .observe_getpage_batch_start(requests.len());
+            .observe_getpage_batch_start(requests.len(), batch_break_reason);

        // If a page trace is running, submit an event for this request.
        if let Some(page_trace) = timeline.page_trace.load().as_ref() {
@@ -2086,20 +2196,81 @@ impl PageServerHandler {
                // Ignore error (trace buffer may be full or tracer may have disconnected).
                _ = page_trace.try_send(PageTraceEvent {
                    key,
-                    effective_lsn,
+                    effective_lsn: batch.effective_request_lsn,
                    time,
                });
            }
        }

+        // If any request in the batch needs to wait for LSN, then do so now.
+        let mut perf_instrument = false;
+        let max_effective_lsn = requests
+            .iter()
+            .map(|req| {
+                if req.ctx.has_perf_span() {
+                    perf_instrument = true;
+                }
+
+                req.effective_request_lsn
+            })
+            .max()
+            .expect("batch is never empty");
+
+        let ctx = match perf_instrument {
+            true => RequestContextBuilder::from(ctx)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "GET_VECTORED",
+                        tenant_id = %timeline.tenant_shard_id.tenant_id,
+                        timeline_id = %timeline.timeline_id,
+                        shard = %timeline.tenant_shard_id.shard_slug(),
+                        %max_effective_lsn
+                    )
+                })
+                .attached_child(),
+            false => ctx.attached_child(),
+        };
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if max_effective_lsn > last_record_lsn {
+            if let Err(e) = timeline
+                .wait_lsn(
+                    max_effective_lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    &ctx,
+                )
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "WAIT_LSN",
+                    )
+                })
+                .await
+            {
+                return Vec::from_iter(requests.into_iter().map(|req| {
+                    Err(BatchedPageStreamError {
+                        err: PageStreamError::from(e.clone()),
+                        req: req.req.hdr,
+                    })
+                }));
+            }
+        }
+
        let results = timeline
            .get_rel_page_at_lsn_batched(
-                requests
-                    .iter()
-                    .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
-                effective_lsn,
+                requests.iter().map(|p| {
+                    (
+                        &p.req.rel,
+                        &p.req.blkno,
+                        p.effective_request_lsn,
+                        p.ctx.attached_child(),
+                    )
+                }),
                io_concurrency,
-                ctx,
+                &ctx,
            )
            .await;
        assert_eq!(results.len(), requests.len());
@@ -2223,6 +2394,7 @@ impl PageServerHandler {
        full_backup: bool,
        gzip: bool,
        replica: bool,
+        lazy_slru_download: bool,
        ctx: &RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -2290,6 +2462,7 @@ impl PageServerHandler {
                prev_lsn,
                full_backup,
                replica,
+                lazy_slru_download,
                &ctx,
            )
            .await
@@ -2313,6 +2486,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
+                    lazy_slru_download,
                    &ctx,
                )
                .await
@@ -2330,6 +2504,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
+                    lazy_slru_download,
                    &ctx,
                )
                .await
@@ -2379,7 +2554,7 @@ impl PageServerHandler {
    }
 }

-/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
+/// `basebackup tenant timeline [lsn] [--gzip] [--replica] [--lazy-slru-download]`
 #[derive(Debug, Clone, Eq, PartialEq)]
 struct BaseBackupCmd {
    tenant_id: TenantId,
@@ -2387,6 +2562,7 @@ struct BaseBackupCmd {
    lsn: Option<Lsn>,
    gzip: bool,
    replica: bool,
+    lazy_slru_download: bool,
 }

 /// `fullbackup tenant timeline [lsn] [prev_lsn]`
@@ -2519,6 +2695,7 @@ impl BaseBackupCmd {

        let mut gzip = false;
        let mut replica = false;
+        let mut lazy_slru_download = false;

        for &param in &parameters[flags_parse_from..] {
            match param {
@@ -2534,6 +2711,12 @@ impl BaseBackupCmd {
                    }
                    replica = true
                }
+                "--lazy-slru-download" => {
+                    if lazy_slru_download {
+                        bail!("duplicate parameter for basebackup command: {param}")
+                    }
+                    lazy_slru_download = true
+                }
                _ => bail!("invalid parameter for basebackup command: {param}"),
            }
        }
@@ -2543,6 +2726,7 @@ impl BaseBackupCmd {
            lsn,
            gzip,
            replica,
+            lazy_slru_download,
        })
    }
 }
@@ -2667,7 +2851,7 @@ where
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
        // which requires auth to be present
-        let data = self
+        let data: TokenData<Claims> = self
            .auth
            .as_ref()
            .unwrap()
@@ -2757,6 +2941,7 @@ where
                lsn,
                gzip,
                replica,
+                lazy_slru_download,
            }) => {
                tracing::Span::current()
                    .record("tenant_id", field::display(tenant_id))
@@ -2778,6 +2963,7 @@ where
                        false,
                        gzip,
                        replica,
+                        lazy_slru_download,
                        &ctx,
                    )
                    .await?;
@@ -2815,6 +3001,7 @@ where
                    true,
                    false,
                    false,
+                    false,
                    &ctx,
                )
                .await?;
@@ -2949,7 +3136,8 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: false,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd =
@@ -2961,7 +3149,8 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: true,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd =
@@ -2973,7 +3162,8 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: false,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
@@ -2985,7 +3175,8 @@ mod tests {
                timeline_id,
                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
                gzip: false,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!(
@@ -2999,7 +3190,23 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: true,
-                replica: true
+                replica: true,
+                lazy_slru_download: false
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!(
+            "basebackup {tenant_id} {timeline_id} --replica --gzip --lazy-slru-download"
+        ))
+        .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: None,
+                gzip: true,
+                replica: true,
+                lazy_slru_download: true
            })
        );
        let cmd = PageServiceCmd::parse(&format!(
@@ -3013,7 +3220,8 @@ mod tests {
                timeline_id,
                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
                gzip: true,
-                replica: true
+                replica: true,
+                lazy_slru_download: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -6,14 +6,14 @@
 //! walingest.rs handles a few things like implicit relation creation and extension.
 //! Clarify that)
 //!
-use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
+use std::collections::{HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};

-use crate::PERF_TRACE_TARGET;
-use anyhow::{Context, ensure};
+use crate::walingest::{WalIngestError, WalIngestErrorKind};
+use crate::{PERF_TRACE_TARGET, ensure_walingest};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
    TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
@@ -21,7 +21,7 @@ use pageserver_api::key::{
    repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
-use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
-use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
+use crate::context::{PerfInstrumentFutureExt, RequestContext};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
@@ -50,7 +50,7 @@ use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
 };
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -136,12 +136,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {

 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
-    #[error("Relation Already Exists")]
-    AlreadyExists,
    #[error("invalid relnode")]
    InvalidRelnode,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
 }

 ///
@@ -210,10 +206,9 @@ impl Timeline {
                let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                let res = self
                    .get_rel_page_at_lsn_batched(
-                        pages
-                            .iter()
-                            .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
-                        effective_lsn,
+                        pages.iter().map(|(tag, blknum)| {
+                            (tag, blknum, effective_lsn, ctx.attached_child())
+                        }),
                        io_concurrency.clone(),
                        ctx,
                    )
@@ -251,8 +246,7 @@ impl Timeline {
    /// The ordering of the returned vec corresponds to the ordering of `pages`.
    pub(crate) async fn get_rel_page_at_lsn_batched(
        &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
-        effective_lsn: Lsn,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
        io_concurrency: IoConcurrency,
        ctx: &RequestContext,
    ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -265,11 +259,13 @@ impl Timeline {
        let mut result = Vec::with_capacity(pages.len());
        let result_slots = result.spare_capacity_mut();

-        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
-            BTreeMap::default();
+        let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
+            HashMap::with_capacity(pages.len());

-        let mut perf_instrument = false;
-        for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
+        let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
+            HashMap::with_capacity(pages.len());
+
+        for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
            if tag.relnode == 0 {
                result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
@@ -280,14 +276,14 @@ impl Timeline {
            }

            let nblocks = match self
-                .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
+                .get_rel_size(*tag, Version::Lsn(lsn), &ctx)
                .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                    info_span!(
                        target: PERF_TRACE_TARGET,
                        parent: crnt_perf_span,
                        "GET_REL_SIZE",
                        reltag=%tag,
-                        lsn=%effective_lsn,
+                        lsn=%lsn,
                    )
                })
                .await
@@ -303,7 +299,7 @@ impl Timeline {
            if *blknum >= nblocks {
                debug!(
                    "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                    tag, blknum, effective_lsn, nblocks
+                    tag, blknum, lsn, nblocks
                );
                result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
                slots_filled += 1;
@@ -312,46 +308,29 @@ impl Timeline {

            let key = rel_block_to_key(*tag, *blknum);

-            if ctx.has_perf_span() {
-                perf_instrument = true;
-            }
-
            let key_slots = keys_slots.entry(key).or_default();
            key_slots.push((response_slot_idx, ctx));
+
+            let acc = req_keyspaces.entry(lsn).or_default();
+            acc.add_key(key);
        }

-        let keyspace = {
-            // add_key requires monotonicity
-            let mut acc = KeySpaceAccum::new();
-            for key in keys_slots
-                .keys()
-                // in fact it requires strong monotonicity
-                .dedup()
-            {
-                acc.add_key(*key);
-            }
-            acc.to_keyspace()
-        };
-
-        let ctx = match perf_instrument {
-            true => RequestContextBuilder::from(ctx)
-                .root_perf_span(|| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        "GET_VECTORED",
-                        tenant_id = %self.tenant_shard_id.tenant_id,
-                        timeline_id = %self.timeline_id,
-                        lsn = %effective_lsn,
-                        shard = %self.tenant_shard_id.shard_slug(),
-                    )
-                })
-                .attached_child(),
-            false => ctx.attached_child(),
-        };
+        let query: Vec<(Lsn, KeySpace)> = req_keyspaces
+            .into_iter()
+            .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
+            .collect();

+        let query = VersionedKeySpaceQuery::scattered(query);
        let res = self
-            .get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
-            .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
+            .get_vectored(query, io_concurrency, ctx)
+            .maybe_perf_instrument(ctx, |current_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: current_perf_span,
+                    "GET_BATCH",
+                    batch_size = %page_count,
+                )
+            })
            .await;

        match res {
@@ -381,12 +360,12 @@ impl Timeline {
                        // There is no standardized way to express that the batched span followed from N request spans.
                        // So, abuse the system and mark the request contexts as follows_from the batch span, so we get
                        // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
-                        req_ctx.perf_follows_from(&ctx);
+                        req_ctx.perf_follows_from(ctx);
                        slots_filled += 1;
                    }

                    result_slots[first_slot].write(res);
-                    first_req_ctx.perf_follows_from(&ctx);
+                    first_req_ctx.perf_follows_from(ctx);
                    slots_filled += 1;
                }
            }
@@ -425,7 +404,7 @@ impl Timeline {
                        }
                    };

-                    req_ctx.perf_follows_from(&ctx);
+                    req_ctx.perf_follows_from(ctx);
                    result_slots[*slot].write(err);
                }

@@ -664,8 +643,9 @@ impl Timeline {

        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
        for batch in batches.parts {
+            let query = VersionedKeySpaceQuery::uniform(batch, lsn);
            let blocks = self
-                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .get_vectored(query, io_concurrency.clone(), ctx)
                .await?;

            for (_key, block) in blocks {
@@ -902,8 +882,9 @@ impl Timeline {
            );

            for batch in batches.parts.into_iter().rev() {
+                let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
                let blocks = self
-                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
+                    .get_vectored(query, io_concurrency.clone(), ctx)
                    .await?;

                for (_key, clog_page) in blocks.into_iter().rev() {
@@ -1478,8 +1459,8 @@ impl DatadirModification<'_> {
    }

    /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
+        ensure_walingest!(
            lsn >= self.lsn,
            "setting an older lsn {} than {} is not allowed",
            lsn,
@@ -1578,7 +1559,7 @@ impl DatadirModification<'_> {
        &mut self,
        rel: RelTag,
        ctx: &RequestContext,
-    ) -> Result<u32, PageReconstructError> {
+    ) -> Result<u32, WalIngestError> {
        // Get current size and put rel creation if rel doesn't exist
        //
        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1574,13 @@ impl DatadirModification<'_> {
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
-            self.put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
+            self.put_rel_creation(rel, 0, ctx).await?;
            Ok(0)
        } else {
-            self.tline
+            Ok(self
+                .tline
                .get_rel_size(rel, Version::Modified(self), ctx)
-                .await
+                .await?)
        }
    }

@@ -1637,11 +1617,14 @@ impl DatadirModification<'_> {
        // TODO(vlad): remove this argument and replace the shard check with is_key_local
        shard: &ShardIdentity,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let mut gaps_at_lsns = Vec::default();

        for meta in batch.metadata.iter() {
-            let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
+            let key = Key::from_compact(meta.key());
+            let (rel, blkno) = key
+                .to_rel_block()
+                .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
            let new_nblocks = blkno + 1;

            let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1666,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -1696,7 +1679,7 @@ impl DatadirModification<'_> {
        segno: u32,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        if !self.tline.tenant_shard_id.is_shard_zero() {
            return Ok(());
        }
@@ -1714,14 +1697,11 @@ impl DatadirModification<'_> {
        rel: RelTag,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        let key = rel_block_to_key(rel, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
@@ -1733,15 +1713,12 @@ impl DatadirModification<'_> {
        segno: u32,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        let key = slru_block_to_key(kind, segno, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }
        self.put(key, Value::Image(img));
        Ok(())
@@ -1751,15 +1728,11 @@ impl DatadirModification<'_> {
        &mut self,
        rel: RelTag,
        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        let key = rel_block_to_key(rel, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }

        let batch = self
@@ -1776,15 +1749,11 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());
        let key = slru_block_to_key(kind, segno, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }

        let batch = self
@@ -1832,8 +1801,10 @@ impl DatadirModification<'_> {
        dbnode: Oid,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

        // Add it to the directory (if it doesn't exist already)
        let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1845,13 @@ impl DatadirModification<'_> {
        xid: u64,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Add it to the directory entry
        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
        let newdirbuf = if self.tline.pg_version >= 17 {
            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
            }
            self.pending_directory_entries.push((
                DirectoryKind::TwoPhase,
@@ -1891,7 +1862,7 @@ impl DatadirModification<'_> {
            let xid = xid as u32;
            let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
            }
            self.pending_directory_entries.push((
                DirectoryKind::TwoPhase,
@@ -1909,22 +1880,22 @@ impl DatadirModification<'_> {
        &mut self,
        origin_id: RepOriginId,
        origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let key = repl_origin_key(origin_id);
        self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
        Ok(())
    }

-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
+    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
        self.set_replorigin(origin_id, Lsn::INVALID).await
    }

-    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
        self.put(CONTROLFILE_KEY, Value::Image(img));
        Ok(())
    }

-    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
        self.put(CHECKPOINT_KEY, Value::Image(img));
        Ok(())
    }
@@ -1934,7 +1905,7 @@ impl DatadirModification<'_> {
        spcnode: Oid,
        dbnode: Oid,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let total_blocks = self
            .tline
            .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1944,21 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> Result<(), RelationError> {
+    ) -> Result<(), WalIngestError> {
        if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
+            Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
+                "invalid relnode"
+            )))?;
        }
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
-            .context("deserialize db")?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;

        let dbdir_exists =
            if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                // Didn't exist. Update dbdir
                e.insert(false);
-                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+                let buf = DbDirectory::ser(&dbdir)?;
                self.pending_directory_entries.push((
                    DirectoryKind::Db,
                    MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1975,25 @@ impl DatadirModification<'_> {
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                .context("deserialize db")?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
        };

-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

        if v2_enabled {
            if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            let sparse_rel_dir_key =
                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
            // check if the rel_dir_key exists in v2
-            let val = self
-                .sparse_get(sparse_rel_dir_key, ctx)
-                .await
-                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
            let val = RelDirExists::decode_option(val)
-                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
            if val == RelDirExists::Exists {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            self.put(
                sparse_rel_dir_key,
@@ -2039,9 +2009,7 @@ impl DatadirModification<'_> {
                // will be key not found errors if we don't create an empty one for rel_size_v2.
                self.put(
                    rel_dir_key,
-                    Value::Image(Bytes::from(
-                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
-                    )),
+                    Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
                );
            }
            self.pending_directory_entries
@@ -2049,7 +2017,7 @@ impl DatadirModification<'_> {
        } else {
            // Add the new relation to the rel directory entry, and write it back
            if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            if !dbdir_exists {
                self.pending_directory_entries
@@ -2059,9 +2027,7 @@ impl DatadirModification<'_> {
                .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
            self.put(
                rel_dir_key,
-                Value::Image(Bytes::from(
-                    RelDirectory::ser(&rel_dir).context("serialize")?,
-                )),
+                Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
            );
        }

@@ -2086,8 +2052,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        if self
            .tline
            .get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2083,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);

        // Put size
        let size_key = rel_size_to_key(rel);
@@ -2142,8 +2108,10 @@ impl DatadirModification<'_> {
        &mut self,
        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
        for ((spc_node, db_node), rel_tags) in drop_relations {
            let dir_key = rel_dir_to_key(spc_node, db_node);
            let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2131,7 @@ impl DatadirModification<'_> {
                    let key =
                        rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
                    let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
-                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                        .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(key))?;
                    if val == RelDirExists::Exists {
                        self.pending_directory_entries
                            .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2174,7 @@ impl DatadirModification<'_> {
        segno: u32,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        // Add it to the directory entry
@@ -2215,7 +2183,7 @@ impl DatadirModification<'_> {
        let mut dir = SlruSegmentDirectory::des(&buf)?;

        if !dir.segments.insert(segno) {
-            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
+            Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
        }
        self.pending_directory_entries.push((
            DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2210,7 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        nblocks: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        // Put size
@@ -2258,7 +2226,7 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let dir_key = slru_dir_to_key(kind);
        let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2251,7 @@ impl DatadirModification<'_> {
    }

    /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
        // TODO
        Ok(())
    }
@@ -2293,7 +2261,7 @@ impl DatadirModification<'_> {
        &mut self,
        xid: u64,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
        let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2276,8 @@ impl DatadirModification<'_> {
            ));
            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
        } else {
-            let xid: u32 = u32::try_from(xid)?;
+            let xid: u32 = u32::try_from(xid)
+                .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
            let mut dir = TwoPhaseDirectory::des(&buf)?;

            if !dir.xids.remove(&xid) {
@@ -2333,7 +2302,7 @@ impl DatadirModification<'_> {
        path: &str,
        content: &[u8],
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let key = aux_file::encode_aux_file_key(path);
        // retrieve the key from the engine
        let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2311,7 @@ impl DatadirModification<'_> {
            Err(e) => return Err(e.into()),
        };
        let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-            aux_file::decode_file_value(old_val)?
+            aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
        } else {
            Vec::new()
        };
@@ -2387,7 +2356,8 @@ impl DatadirModification<'_> {
            }
            (None, true) => warn!("removing non-existing aux file: {}", path),
        }
-        let new_val = aux_file::encode_file_value(&new_files)?;
+        let new_val = aux_file::encode_file_value(&new_files)
+            .map_err(WalIngestErrorKind::EncodeAuxFileError)?;
        self.put(key, Value::Image(new_val.into()));

        Ok(())
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5933,12 +5933,20 @@ mod tests {
    use models::CompactLsnRange;
    use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
    use pageserver_api::keyspace::KeySpace;
+    #[cfg(feature = "testing")]
+    use pageserver_api::keyspace::KeySpaceRandomAccum;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
    #[cfg(feature = "testing")]
    use pageserver_api::record::NeonWalRecord;
    use pageserver_api::value::Value;
    use pageserver_compaction::helpers::overlaps_with;
+    #[cfg(feature = "testing")]
+    use rand::SeedableRng;
+    #[cfg(feature = "testing")]
+    use rand::rngs::StdRng;
    use rand::{Rng, thread_rng};
+    #[cfg(feature = "testing")]
+    use std::ops::Range;
    use storage_layer::{IoConcurrency, PersistentLayerKey};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -5948,7 +5956,7 @@ mod tests {
    use timeline::InMemoryLayerTestDesc;
    #[cfg(feature = "testing")]
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
-    use timeline::{CompactOptions, DeltaLayerTestDesc};
+    use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
    use utils::id::TenantId;

    use super::*;
@@ -5960,6 +5968,318 @@ mod tests {
    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

+    /// Describes the timeline that `randomize_timeline` should build: the fixed
+    /// shape of every layer plus the knobs that control the random fill.
+    #[cfg(feature = "testing")]
+    struct TestTimelineSpecification {
+        /// Start LSN passed to `create_test_timeline_with_layers`. It is also the
+        /// lowest LSN that `Storage::get` walks down to when resolving a read.
+        start_lsn: Lsn,
+        /// Passed as the last argument of `create_test_timeline_with_layers` and
+        /// always included in the list of LSNs worth querying.
+        last_record_lsn: Lsn,
+
+        /// (key range, LSN range) of every in-memory layer; both ranges are half-open.
+        in_memory_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
+        /// (key range, LSN range) of every delta layer; both ranges are half-open.
+        delta_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
+        /// (key range, LSN) of every image layer; the key range is half-open.
+        image_layers_shape: Vec<(Range<Key>, Lsn)>,
+
+        /// Percentage compared against a `1..=100` dice roll to decide whether a
+        /// (key, LSN) point of a delta or in-memory layer is treated as a gap.
+        gap_chance: u8,
+        /// Percentage compared against a `1..=100` dice roll to decide whether a
+        /// generated WAL record is a `wal_init` record rather than a `wal_append` one.
+        will_init_chance: u8,
+    }
+
+    /// In-memory model of a generated timeline, used as the source of truth
+    /// when validating values returned by the read path.
+    #[cfg(feature = "testing")]
+    struct Storage {
+        /// Every generated value, keyed by the exact (key, LSN) it was written at.
+        storage: HashMap<(Key, Lsn), Value>,
+        /// Lowest LSN that `get` inspects while searching for a base value.
+        start_lsn: Lsn,
+    }
+
+    #[cfg(feature = "testing")]
+    impl Storage {
+        /// Computes the value expected for `key` when read at `lsn`.
+        ///
+        /// Walks the LSNs downwards from `lsn` to `start_lsn`, collecting every value
+        /// stored for `key` until a base is found: either an image or a test WAL record
+        /// with `will_init` set. The collected values are then applied oldest first by
+        /// concatenating image bytes and the `append` payload of each record.
+        ///
+        /// Panics if no base value exists for `key` at or below `lsn`.
+        fn get(&self, key: Key, lsn: Lsn) -> Bytes {
+            use bytes::BufMut;
+
+            let mut crnt_lsn = lsn;
+            let mut got_base = false;
+
+            // Values found so far, newest first.
+            let mut acc = Vec::new();
+
+            while crnt_lsn >= self.start_lsn {
+                if let Some(value) = self.storage.get(&(key, crnt_lsn)) {
+                    acc.push(value.clone());
+
+                    match value {
+                        Value::WalRecord(NeonWalRecord::Test { will_init, .. }) => {
+                            if *will_init {
+                                got_base = true;
+                                break;
+                            }
+                        }
+                        Value::Image(_) => {
+                            got_base = true;
+                            break;
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+
+                // Stop at LSN 0 instead of unwrapping the underflow, so that a missing
+                // base is reported by the descriptive assertion below.
+                crnt_lsn = match crnt_lsn.checked_sub(1u64) {
+                    Some(prev_lsn) => prev_lsn,
+                    None => break,
+                };
+            }
+
+            assert!(
+                got_base,
+                "Input data was incorrect. No base image for {key}@{lsn}"
+            );
+
+            tracing::debug!("Wal redo depth for {key}@{lsn} is {}", acc.len());
+
+            // Replay from the base (collected last) up to the newest record.
+            let mut blob = BytesMut::new();
+            for value in acc.into_iter().rev() {
+                match value {
+                    Value::WalRecord(NeonWalRecord::Test { append, .. }) => {
+                        blob.extend_from_slice(append.as_bytes());
+                    }
+                    Value::Image(img) => {
+                        blob.put(img);
+                    }
+                    _ => unreachable!(),
+                }
+            }
+
+            blob.into()
+        }
+    }
+
+    /// Creates a test timeline whose layers follow `spec` and fills it with random data.
+    ///
+    /// For every (key, LSN) point covered by an in-memory or delta layer shape the
+    /// dice are rolled once: the point is either left empty (a gap) or receives a
+    /// `wal_init` / `wal_append` test record. Every key of an image layer shape gets
+    /// an image at the layer's LSN.
+    ///
+    /// Returns the created timeline, the in-memory model of everything that was
+    /// written, and a list of LSNs at and around layer boundaries (plus
+    /// `spec.last_record_lsn`) that are worth querying.
+    #[cfg(feature = "testing")]
+    #[allow(clippy::too_many_arguments)]
+    async fn randomize_timeline(
+        tenant: &Arc<Tenant>,
+        new_timeline_id: TimelineId,
+        pg_version: u32,
+        spec: TestTimelineSpecification,
+        random: &mut rand::rngs::StdRng,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(Arc<Timeline>, Storage, Vec<Lsn>)> {
+        /// Randomly fills the (key, LSN) rectangle with test WAL records.
+        fn fill_wal_records(
+            storage: &mut HashMap<(Key, Lsn), Value>,
+            key_range: &Range<Key>,
+            lsn_range: &Range<Lsn>,
+            spec: &TestTimelineSpecification,
+            random: &mut rand::rngs::StdRng,
+        ) {
+            let mut lsn = lsn_range.start;
+            while lsn < lsn_range.end {
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    let gap = random.gen_range(1..=100) <= spec.gap_chance;
+                    let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
+
+                    // A gap leaves this point empty. The key must still advance:
+                    // skipping the increment would re-roll the same key until a
+                    // record is produced (or forever when `gap_chance` is 100).
+                    if !gap {
+                        let record = if will_init {
+                            Value::WalRecord(NeonWalRecord::wal_init(format!(
+                                "[wil_init {key}@{lsn}]"
+                            )))
+                        } else {
+                            Value::WalRecord(NeonWalRecord::wal_append(format!(
+                                "[delta {key}@{lsn}]"
+                            )))
+                        };
+
+                        storage.insert((key, lsn), record);
+                    }
+
+                    key = key.next();
+                }
+                lsn = Lsn(lsn.0 + 1);
+            }
+        }
+
+        /// Stashes `anchor` plus the LSNs 5 and 100 below and above it. LSNs below
+        /// `start_lsn` (or below zero) are dropped.
+        fn stash_interesting_lsns(anchor: Lsn, start_lsn: Lsn, interesting_lsns: &mut Vec<Lsn>) {
+            interesting_lsns.push(anchor);
+
+            for offset in [5u64, 100u64] {
+                let below = anchor.checked_sub(offset).filter(|v| *v >= start_lsn);
+                if let Some(v) = below {
+                    interesting_lsns.push(v);
+                }
+
+                interesting_lsns.push(Lsn(anchor.0 + offset));
+            }
+        }
+
+        /// Collects the records stored inside the (key, LSN) rectangle, ordered by
+        /// LSN first and key second.
+        fn collect_wal_records(
+            storage: &HashMap<(Key, Lsn), Value>,
+            key_range: &Range<Key>,
+            lsn_range: &Range<Lsn>,
+        ) -> Vec<(Key, Lsn, Value)> {
+            let mut data = Vec::new();
+
+            let mut lsn = lsn_range.start;
+            while lsn < lsn_range.end {
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    if let Some(record) = storage.get(&(key, lsn)) {
+                        data.push((key, lsn, record.clone()));
+                    }
+
+                    key = key.next();
+                }
+                lsn = Lsn(lsn.0 + 1);
+            }
+
+            data
+        }
+
+        let mut storage: HashMap<(Key, Lsn), Value> = HashMap::default();
+        let mut interesting_lsns = vec![spec.last_record_lsn];
+
+        // In-memory layers are filled first, then delta layers.
+        let wal_layer_shapes = spec
+            .in_memory_layers_shape
+            .iter()
+            .chain(spec.delta_layers_shape.iter());
+        for (key_range, lsn_range) in wal_layer_shapes {
+            fill_wal_records(&mut storage, key_range, lsn_range, &spec, random);
+            stash_interesting_lsns(lsn_range.start, spec.start_lsn, &mut interesting_lsns);
+        }
+
+        // Image layers are dense: every key in the range gets an image.
+        for (key_range, lsn) in spec.image_layers_shape.iter() {
+            let mut key = key_range.start;
+            while key < key_range.end {
+                let blob = Bytes::from(format!("[image {key}@{lsn}]"));
+                storage.insert((key, *lsn), Value::Image(blob));
+
+                key = key.next();
+            }
+
+            stash_interesting_lsns(*lsn, spec.start_lsn, &mut interesting_lsns);
+        }
+
+        let in_memory_test_layers: Vec<_> = spec
+            .in_memory_layers_shape
+            .iter()
+            .map(|(key_range, lsn_range)| InMemoryLayerTestDesc {
+                data: collect_wal_records(&storage, key_range, lsn_range),
+                lsn_range: lsn_range.clone(),
+                is_open: false,
+            })
+            .collect();
+
+        let delta_test_layers: Vec<_> = spec
+            .delta_layers_shape
+            .iter()
+            .map(|(key_range, lsn_range)| DeltaLayerTestDesc {
+                data: collect_wal_records(&storage, key_range, lsn_range),
+                lsn_range: lsn_range.clone(),
+                key_range: key_range.clone(),
+            })
+            .collect();
+
+        let image_test_layers: Vec<_> = spec
+            .image_layers_shape
+            .iter()
+            .map(|(key_range, lsn)| {
+                let mut data = Vec::new();
+
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    if let Some(record) = storage.get(&(key, *lsn)) {
+                        let blob = match record {
+                            Value::Image(blob) => blob.clone(),
+                            _ => unreachable!(),
+                        };
+
+                        data.push((key, blob));
+                    }
+
+                    key = key.next();
+                }
+
+                (*lsn, data)
+            })
+            .collect();
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                new_timeline_id,
+                spec.start_lsn,
+                pg_version,
+                ctx,
+                in_memory_test_layers,
+                delta_test_layers,
+                image_test_layers,
+                spec.last_record_lsn,
+            )
+            .await?;
+
+        Ok((
+            tline,
+            Storage {
+                storage,
+                start_lsn: spec.start_lsn,
+            },
+            interesting_lsns,
+        ))
+    }
+
    #[tokio::test]
    async fn test_basic() -> anyhow::Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
@@ -6786,10 +7106,11 @@ mod tests {
        for read in reads {
            info!("Doing vectored read on {:?}", read);

+            let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn);
+
            let vectored_res = tline
                .get_vectored_impl(
-                    read.clone(),
-                    reads_lsn,
+                    query,
                    &mut ValuesReconstructState::new(io_concurrency.clone()),
                    &ctx,
                )
@@ -6868,10 +7189,11 @@ mod tests {
        };
        let read_lsn = child_timeline.get_last_record_lsn();

+        let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn);
+
        let vectored_res = child_timeline
            .get_vectored_impl(
-                aux_keyspace.clone(),
-                read_lsn,
+                query,
                &mut ValuesReconstructState::new(io_concurrency.clone()),
                &ctx,
            )
@@ -7017,10 +7339,12 @@ mod tests {
        let read = KeySpace {
            ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
        };
+
+        let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn);
+
        let results = child_timeline
            .get_vectored_impl(
-                read.clone(),
-                current_lsn,
+                query,
                &mut ValuesReconstructState::new(io_concurrency.clone()),
                &ctx,
            )
@@ -7151,12 +7475,16 @@ mod tests {
        }

        for query_lsn in query_lsns {
+            let query = VersionedKeySpaceQuery::uniform(
+                KeySpace {
+                    ranges: vec![child_gap_at_key..child_gap_at_key.next()],
+                },
+                query_lsn,
+            );
+
            let results = child_timeline
                .get_vectored_impl(
-                    KeySpace {
-                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
-                    },
-                    query_lsn,
+                    query,
                    &mut ValuesReconstructState::new(io_concurrency.clone()),
                    &ctx,
                )
@@ -7655,10 +7983,11 @@ mod tests {
            }

            let mut cnt = 0;
+            let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
+
            for (key, value) in tline
                .get_vectored_impl(
-                    keyspace.clone(),
-                    lsn,
+                    query,
                    &mut ValuesReconstructState::new(io_concurrency.clone()),
                    &ctx,
                )
@@ -7865,8 +8194,9 @@ mod tests {
            io_concurrency: IoConcurrency,
        ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
            let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+            let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
            let res = tline
-                .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                .get_vectored_impl(query, &mut reconstruct_state, ctx)
                .await?;
            Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
        }
@@ -8163,13 +8493,10 @@ mod tests {

        // test vectored scan on parent timeline
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+        let query =
+            VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
        let res = tline
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, &ctx)
            .await?;

        assert_eq!(
@@ -8189,13 +8516,10 @@ mod tests {

        // test vectored scan on child timeline
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+        let query =
+            VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
        let res = child
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, &ctx)
            .await?;

        assert_eq!(
@@ -8229,13 +8553,9 @@ mod tests {
        let io_concurrency =
            IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
        let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, ctx)
            .await?;
        Ok(res.pop_last().map(|(k, v)| {
            assert_eq!(k, key);
@@ -9257,6 +9577,7 @@ mod tests {
                &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                3,
                None,
+                true,
            )
            .await
            .unwrap();
@@ -9381,7 +9702,15 @@ mod tests {
            ),
        ];
        let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[Lsn(0x40), Lsn(0x50)],
+                3,
+                None,
+                true,
+            )
            .await
            .unwrap();
        let expected_res = KeyHistoryRetention {
@@ -9460,6 +9789,7 @@ mod tests {
                &[],
                3,
                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
            )
            .await
            .unwrap();
@@ -9508,6 +9838,7 @@ mod tests {
                &[Lsn(0x30)],
                3,
                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
            )
            .await
            .unwrap();
@@ -10358,14 +10689,13 @@ mod tests {
            )
            .await?;

-        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let query = VersionedKeySpaceQuery::uniform(
+            KeySpace::single(get_key(0)..get_key(10)),
+            delta_layer_end_lsn,
+        );
+
        let results = tline
-            .get_vectored(
-                keyspace,
-                delta_layer_end_lsn,
-                IoConcurrency::sequential(),
-                &ctx,
-            )
+            .get_vectored(query, IoConcurrency::sequential(), &ctx)
            .await
            .expect("No vectored errors");
        for (key, res) in results {
@@ -10513,9 +10843,13 @@ mod tests {
            )
            .await?;

-        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let query = VersionedKeySpaceQuery::uniform(
+            KeySpace::single(get_key(0)..get_key(10)),
+            last_record_lsn,
+        );
+
        let results = tline
-            .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
+            .get_vectored(query, IoConcurrency::sequential(), &ctx)
            .await
            .expect("No vectored errors");
        for (key, res) in results {
@@ -10529,6 +10863,214 @@ mod tests {
        Ok(())
    }

+    // A randomized read path test. Generates a layer map according to a deterministic
+    // specification. Fills the (key, LSN) space in a random manner and then performs
+    // random scattered queries, validating the results against in-memory storage.
+    //
+    // See this internal Notion page for a diagram of the layer map:
+    // https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4
+    //
+    // A fuzzing mode is also supported. In this mode, the test will use a random
+    // seed instead of a hardcoded one. Use it in conjunction with `cargo stress`
+    // to run multiple instances in parallel:
+    //
+    // $ RUST_BACKTRACE=1 RUST_LOG=INFO \
+    //   cargo stress --package=pageserver --features=testing,fuzz-read-path --release -- test_read_path
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_read_path() -> anyhow::Result<()> {
+        use rand::seq::SliceRandom;
+
+        let seed: u64 = if cfg!(feature = "fuzz-read-path") {
+            thread_rng().r#gen()
+        } else {
+            // Use a hard-coded seed when not in fuzzing mode.
+            // Note that with the current approach results are not reproducible
+            // across platforms and Rust releases.
+            const SEED: u64 = 0;
+            SEED
+        };
+
+        let mut random = StdRng::seed_from_u64(seed);
+
+        let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") {
+            const QUERIES: u64 = 5000;
+            let will_init_chance: u8 = random.gen_range(0..=10);
+            let gap_chance: u8 = random.gen_range(0..=50);
+
+            (QUERIES, will_init_chance, gap_chance)
+        } else {
+            const QUERIES: u64 = 1000;
+            const WILL_INIT_CHANCE: u8 = 1;
+            const GAP_CHANCE: u8 = 5;
+
+            (QUERIES, WILL_INIT_CHANCE, GAP_CHANCE)
+        };
+
+        let harness = TenantHarness::create("test_read_path").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        tracing::info!("Using random seed: {seed}");
+        tracing::info!(%will_init_chance, %gap_chance, "Fill params");
+
+        // Define the layer map shape. Note that this part is not randomized.
+
+        const KEY_DIMENSION_SIZE: u32 = 99;
+        let start_key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+        let end_key = start_key.add(KEY_DIMENSION_SIZE);
+        let total_key_range = start_key..end_key;
+        let total_key_range_size = end_key.to_i128() - start_key.to_i128();
+        let total_start_lsn = Lsn(104);
+        let last_record_lsn = Lsn(504);
+
+        // The delta layers below split the key range into three equal parts.
+        assert!(total_key_range_size % 3 == 0);
+
+        let in_memory_layers_shape = vec![
+            (total_key_range.clone(), Lsn(304)..Lsn(400)),
+            (total_key_range.clone(), Lsn(400)..last_record_lsn),
+        ];
+
+        let delta_layers_shape = vec![
+            (
+                start_key..(start_key.add((total_key_range_size / 3) as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+            (
+                (start_key.add((total_key_range_size / 3) as u32))
+                    ..(start_key.add((total_key_range_size * 2 / 3) as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+            (
+                (start_key.add((total_key_range_size * 2 / 3) as u32))
+                    ..(start_key.add(total_key_range_size as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+        ];
+
+        let image_layers_shape = vec![
+            (
+                start_key.add((total_key_range_size * 2 / 3 - 10) as u32)
+                    ..start_key.add((total_key_range_size * 2 / 3 + 10) as u32),
+                Lsn(456),
+            ),
+            (
+                start_key.add((total_key_range_size / 3 - 10) as u32)
+                    ..start_key.add((total_key_range_size / 3 + 10) as u32),
+                Lsn(256),
+            ),
+            (total_key_range.clone(), total_start_lsn),
+        ];
+
+        let specification = TestTimelineSpecification {
+            start_lsn: total_start_lsn,
+            last_record_lsn,
+            in_memory_layers_shape,
+            delta_layers_shape,
+            image_layers_shape,
+            gap_chance,
+            will_init_chance,
+        };
+
+        // Create and randomly fill in the layers according to the specification
+        let (tline, storage, interesting_lsns) = randomize_timeline(
+            &tenant,
+            TIMELINE_ID,
+            DEFAULT_PG_VERSION,
+            specification,
+            &mut random,
+            &ctx,
+        )
+        .await?;
+
+        // Now generate queries based on the interesting lsns that we've collected.
+        //
+        // While there's still room in the query, pick an interesting LSN and a random
+        // key. Then roll the dice to see if the next key should also be included in
+        // the query. When the roll fails, break the "batch" and pick another point in the
+        // (key, LSN) space.
+
+        const PICK_NEXT_CHANCE: u8 = 50;
+        for _ in 0..queries {
+            let query = {
+                let mut keyspaces_at_lsn: HashMap<Lsn, KeySpaceRandomAccum> = HashMap::default();
+                let mut used_keys: HashSet<Key> = HashSet::default();
+
+                while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                    let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
+                    let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));
+
+                    while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                        if used_keys.contains(&selected_key)
+                            || selected_key >= start_key.add(KEY_DIMENSION_SIZE)
+                        {
+                            break;
+                        }
+
+                        keyspaces_at_lsn
+                            .entry(*selected_lsn)
+                            .or_default()
+                            .add_key(selected_key);
+                        used_keys.insert(selected_key);
+
+                        // Roll in 1..=100 (like the fill dice) so that the chance is
+                        // exactly PICK_NEXT_CHANCE percent.
+                        let pick_next = random.gen_range(1..=100) <= PICK_NEXT_CHANCE;
+                        if pick_next {
+                            selected_key = selected_key.next();
+                        } else {
+                            break;
+                        }
+                    }
+                }
+
+                VersionedKeySpaceQuery::scattered(
+                    keyspaces_at_lsn
+                        .into_iter()
+                        .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
+                        .collect(),
+                )
+            };
+
+            // Run the query and validate the results
+
+            let results = tline
+                .get_vectored(query.clone(), IoConcurrency::Sequential, &ctx)
+                .await;
+
+            let blobs = match results {
+                Ok(ok) => ok,
+                Err(err) => {
+                    panic!("seed={seed} Error returned for query {query}: {err}");
+                }
+            };
+
+            for (key, key_res) in blobs.into_iter() {
+                let requested_at_lsn = query.map_key_to_lsn(&key);
+
+                match key_res {
+                    Ok(blob) => {
+                        let expected = storage.get(key, requested_at_lsn);
+
+                        // Log the seed and query before asserting so that a failure
+                        // can be reproduced.
+                        if blob != expected {
+                            tracing::error!(
+                                "seed={seed} Mismatch for {key}@{requested_at_lsn} from query: {query}"
+                            );
+                        }
+
+                        assert_eq!(blob, expected);
+                    }
+                    Err(err) => {
+                        panic!(
+                            "seed={seed} Error returned for {key}@{requested_at_lsn} from query {query}: {err}"
+                        );
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
    fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
        (
            k1.is_delta,
@@ -11571,6 +12113,99 @@ mod tests {
        Ok(())
    }

+    /// Runs gc-compaction over a delta layer containing a WAL record that cannot be
+    /// redone and checks that the compaction job returns an error.
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
+        // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        // Base images for keys 0..10 at LSN 0x10.
+        let img_layer: Vec<_> = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect();
+
+        // All delta records target key 1.
+        let record_at =
+            |lsn: u64, record: NeonWalRecord| (get_key(1), Lsn(lsn), Value::WalRecord(record));
+        let delta1 = vec![
+            record_at(0x20, NeonWalRecord::wal_append("@0x20")),
+            record_at(0x24, NeonWalRecord::wal_append("@0x24")),
+            // This record will fail to redo
+            record_at(
+                0x28,
+                NeonWalRecord::wal_append_conditional("@0x28", "???"),
+            ),
+        ];
+
+        let in_memory_layers = vec![];
+        let delta_layers = vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x20)..Lsn(0x30),
+            delta1,
+        )];
+        let image_layers = vec![(Lsn(0x10), img_layer)];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                in_memory_layers,
+                delta_layers,
+                image_layers,
+                Lsn(0x50),
+            )
+            .await?;
+
+        let gc_cutoff = Lsn(0x30);
+        tline
+            .applied_gc_cutoff_lsn
+            .lock_for_write()
+            .store_and_unlock(gc_cutoff)
+            .wait()
+            .await;
+        // Update GC info. The write guard is a temporary that is released at the
+        // end of the statement.
+        *tline.gc_info.write().unwrap() = GcInfo {
+            retain_lsns: vec![],
+            cutoffs: GcCutoffs {
+                time: gc_cutoff,
+                space: gc_cutoff,
+            },
+            leases: Default::default(),
+            within_ancestor_pitr: false,
+        };
+
+        let cancel = CancellationToken::new();
+        let options = CompactOptions {
+            compact_key_range: None,
+            compact_lsn_range: None,
+            ..Default::default()
+        };
+
+        // Compaction will fail, but should not fire any critical error.
+        // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
+        // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
+        // compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
+        let res = tline.compact_with_gc(&cancel, options, &ctx).await;
+        assert!(res.is_err());
+
+        Ok(())
+    }
+
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -22,6 +22,7 @@ use bytes::{BufMut, BytesMut};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tokio_util::sync::CancellationToken;
 use tracing::warn;

 use crate::context::RequestContext;
@@ -169,7 +170,13 @@ pub struct BlobWriter<const BUFFERED: bool> {
 }

 impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
-    pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
+    pub fn new(
+        inner: VirtualFile,
+        start_offset: u64,
+        _gate: &utils::sync::gate::Gate,
+        _cancel: CancellationToken,
+        _ctx: &RequestContext,
+    ) -> Self {
        Self {
            inner,
            offset: start_offset,
@@ -432,12 +439,14 @@ pub(crate) mod tests {
    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
+        let gate = utils::sync::gate::Gate::default();
+        let cancel = CancellationToken::new();

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
-            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
+            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
            for blob in blobs.iter() {
                let (_, res) = if compression {
                    let res = wtr
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -714,7 +714,7 @@ impl LayerMap {
        true
    }

-    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
+    pub fn iter_historic_layers(&self) -> impl ExactSizeIterator<Item = Arc<PersistentLayerDesc>> {
        self.historic.iter()
    }

--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -504,7 +504,7 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
    }

    /// Iterate all the layers
-    pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
+    pub fn iter(&self) -> impl ExactSizeIterator<Item = Value> {
        // NOTE we can actually perform this without rebuilding,
        //      but it's not necessary for now.
        if !self.buffer.is_empty() {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -715,13 +715,34 @@ pub(crate) enum LayerId {
 }

 /// Uniquely identify a layer visit by the layer
-/// and LSN floor (or start LSN) of the reads.
-/// The layer itself is not enough since we may
-/// have different LSN lower bounds for delta layer reads.
+/// and LSN range of the reads. Note that the end of the range is exclusive.
+///
+/// The layer itself is not enough since we may have different LSN lower
+/// bounds for delta layer reads. Scenarios where this can happen are:
+///
+/// 1. Layer overlaps: imagine an image layer inside an in-memory layer
+///    and a query that only partially hits the image layer. Part of the query
+///    needs to read the whole in-memory layer and the other part needs to read
+///    only up to the image layer. Hence, they'll have different LSN floor values
+///    for the read.
+///
+/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
+///    the start LSN for one range is inside a layer and the start LSN for another range
+///    is above the layer (includes all of it). Both ranges need to read the layer all the
+///    way to the end but starting at different points. Hence, they'll have different LSN
+///    ceil values.
+///
+/// The implication is that we might visit the same layer multiple times
+/// in order to read different LSN ranges from it. In practice, this isn't very concerning
+/// because:
+/// 1. Layer overlaps are rare and generally not intended
+/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
+///    are grouped tightly enough (likely the case).
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 struct LayerToVisitId {
    layer_id: LayerId,
    lsn_floor: Lsn,
+    lsn_ceil: Lsn,
 }

 #[derive(Debug, PartialEq, Eq, Hash)]
@@ -805,6 +826,7 @@ impl LayerFringe {
        let layer_to_visit_id = LayerToVisitId {
            layer_id: layer.id(),
            lsn_floor: lsn_range.start,
+            lsn_ceil: lsn_range.end,
        };

        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;
 use bytes::Bytes;
 use pageserver_api::key::{KEY_SIZE, Key};
 use pageserver_api::value::Value;
+use tokio_util::sync::CancellationToken;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use utils::shard::TenantShardId;
@@ -179,7 +180,7 @@ impl BatchLayerWriter {

 /// An image writer that takes images and produces multiple image layers.
 #[must_use]
-pub struct SplitImageLayerWriter {
+pub struct SplitImageLayerWriter<'a> {
    inner: ImageLayerWriter,
    target_layer_size: u64,
    lsn: Lsn,
@@ -188,9 +189,12 @@ pub struct SplitImageLayerWriter {
    tenant_shard_id: TenantShardId,
    batches: BatchLayerWriter,
    start_key: Key,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }

-impl SplitImageLayerWriter {
+impl<'a> SplitImageLayerWriter<'a> {
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -198,6 +202,8 @@ impl SplitImageLayerWriter {
        start_key: Key,
        lsn: Lsn,
        target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        Ok(Self {
@@ -208,6 +214,8 @@ impl SplitImageLayerWriter {
                tenant_shard_id,
                &(start_key..Key::MAX),
                lsn,
+                gate,
+                cancel.clone(),
                ctx,
            )
            .await?,
@@ -217,6 +225,8 @@ impl SplitImageLayerWriter {
            batches: BatchLayerWriter::new(conf).await?,
            lsn,
            start_key,
+            gate,
+            cancel,
        })
    }

@@ -239,6 +249,8 @@ impl SplitImageLayerWriter {
                self.tenant_shard_id,
                &(key..Key::MAX),
                self.lsn,
+                self.gate,
+                self.cancel.clone(),
                ctx,
            )
            .await?;
@@ -291,7 +303,7 @@ impl SplitImageLayerWriter {
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
 /// will split them into multiple files based on size.
 #[must_use]
-pub struct SplitDeltaLayerWriter {
+pub struct SplitDeltaLayerWriter<'a> {
    inner: Option<(Key, DeltaLayerWriter)>,
    target_layer_size: u64,
    conf: &'static PageServerConf,
@@ -300,15 +312,19 @@ pub struct SplitDeltaLayerWriter {
    lsn_range: Range<Lsn>,
    last_key_written: Key,
    batches: BatchLayerWriter,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }

-impl SplitDeltaLayerWriter {
+impl<'a> SplitDeltaLayerWriter<'a> {
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        lsn_range: Range<Lsn>,
        target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            target_layer_size,
@@ -319,6 +335,8 @@ impl SplitDeltaLayerWriter {
            lsn_range,
            last_key_written: Key::MIN,
            batches: BatchLayerWriter::new(conf).await?,
+            gate,
+            cancel,
        })
    }

@@ -344,6 +362,8 @@ impl SplitDeltaLayerWriter {
                    self.tenant_shard_id,
                    key,
                    self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                    ctx,
                )
                .await?,
@@ -362,6 +382,8 @@ impl SplitDeltaLayerWriter {
                    self.tenant_shard_id,
                    key,
                    self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                    ctx,
                )
                .await?;
@@ -469,6 +491,8 @@ mod tests {
            get_key(0),
            Lsn(0x18),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
            &ctx,
        )
        .await
@@ -480,6 +504,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
@@ -546,6 +572,8 @@ mod tests {
            get_key(0),
            Lsn(0x18),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
            &ctx,
        )
        .await
@@ -556,6 +584,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
@@ -643,6 +673,8 @@ mod tests {
            get_key(0),
            Lsn(0x18),
            4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
            &ctx,
        )
        .await
@@ -654,6 +686,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x18)..Lsn(0x20),
            4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
@@ -730,6 +764,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -50,6 +50,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_epoll_uring::IoBuf;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -400,12 +401,15 @@ impl DeltaLayerWriterInner {
    ///
    /// Start building a new delta layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename. We don't know
@@ -420,7 +424,7 @@ impl DeltaLayerWriterInner {
        let mut file = VirtualFile::create(&path, ctx).await?;
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);

        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
@@ -628,12 +632,15 @@ impl DeltaLayerWriter {
    ///
    /// Start building a new delta layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        Ok(Self {
@@ -644,6 +651,8 @@ impl DeltaLayerWriter {
                    tenant_shard_id,
                    key_start,
                    lsn_range,
+                    gate,
+                    cancel,
                    ctx,
                )
                .await?,
@@ -1885,6 +1894,8 @@ pub(crate) mod test {
            harness.tenant_shard_id,
            entries_meta.key_range.start,
            entries_meta.lsn_range.clone(),
+            &timeline.gate,
+            timeline.cancel.clone(),
            &ctx,
        )
        .await?;
@@ -2079,6 +2090,8 @@ pub(crate) mod test {
                tenant.tenant_shard_id,
                Key::MIN,
                Lsn(0x11)..truncate_at,
+                &branch.gate,
+                branch.cancel.clone(),
                ctx,
            )
            .await
@@ -2213,6 +2226,8 @@ pub(crate) mod test {
            tenant.tenant_shard_id,
            *key_start,
            (*lsn_min)..lsn_end,
+            &tline.gate,
+            tline.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -48,6 +48,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -748,12 +749,15 @@ impl ImageLayerWriterInner {
    ///
    /// Start building a new image layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
@@ -780,7 +784,7 @@ impl ImageLayerWriterInner {
        };
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);

        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
@@ -988,18 +992,30 @@ impl ImageLayerWriter {
    ///
    /// Start building a new image layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(
-                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
-                    .await?,
+                ImageLayerWriterInner::new(
+                    conf,
+                    timeline_id,
+                    tenant_shard_id,
+                    key_range,
+                    lsn,
+                    gate,
+                    cancel,
+                    ctx,
+                )
+                .await?,
            ),
        })
    }
@@ -1192,7 +1208,7 @@ mod test {

        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
        let range = input_start..input_end;

        // Build an image layer to filter
@@ -1203,6 +1219,8 @@ mod test {
                harness.tenant_shard_id,
                &range,
                lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                &ctx,
            )
            .await
@@ -1235,7 +1253,7 @@ mod test {
            let shard_identity = ShardIdentity::new(
                ShardNumber(shard_number),
                shard_count,
-                ShardStripeSize(0x8000),
+                ShardStripeSize(0x800),
            )
            .unwrap();
            let harness = TenantHarness::create_custom(
@@ -1268,6 +1286,8 @@ mod test {
                harness.tenant_shard_id,
                &range,
                lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                &ctx,
            )
            .await
@@ -1287,12 +1307,12 @@ mod test {

            // This exact size and those below will need updating as/when the layer encoding changes, but
            // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
+            assert_eq!(original_size, 122880);

            match shard_number {
                0 => {
                    // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();

                    // We should have dropped some of the data
@@ -1300,7 +1320,7 @@ mod test {
                    assert!(replacement.metadata().file_size > 0);

                    // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                1 => {
                    // Shard 1 has no keys in our input range
@@ -1309,19 +1329,19 @@ mod test {
                }
                2 => {
                    // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                3 => {
                    // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
+                    assert_eq!(wrote_keys, 0x1000);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
+                    assert_eq!(replacement.metadata().file_size, 73728);
                }
                _ => unreachable!(),
            }
@@ -1346,6 +1366,8 @@ mod test {
            tenant.tenant_shard_id,
            &key_range,
            lsn,
+            &tline.gate,
+            tline.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -719,6 +719,8 @@ impl InMemoryLayer {
        ctx: &RequestContext,
        key_range: Option<Range<Key>>,
        l0_flush_global_state: &l0_flush::Inner,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
@@ -759,6 +761,8 @@ impl InMemoryLayer {
            self.tenant_shard_id,
            Key::MIN,
            self.start_lsn..end_lsn,
+            gate,
+            cancel,
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -585,7 +585,7 @@ pub(crate) enum PageReconstructError {
    WalRedo(anyhow::Error),

    #[error("{0}")]
-    MissingKey(MissingKeyError),
+    MissingKey(Box<MissingKeyError>),
 }

 impl From<anyhow::Error> for PageReconstructError {
@@ -690,16 +690,23 @@ impl std::fmt::Display for ReadPath {

 #[derive(thiserror::Error)]
 pub struct MissingKeyError {
-    key: Key,
+    keyspace: KeySpace,
    shard: ShardNumber,
-    cont_lsn: Lsn,
-    request_lsn: Lsn,
+    query: Option<VersionedKeySpaceQuery>,
+    // This is the largest request LSN from the get page request batch
+    original_hwm_lsn: Lsn,
    ancestor_lsn: Option<Lsn>,
    /// Debug information about the read path if there's an error
    read_path: Option<ReadPath>,
    backtrace: Option<std::backtrace::Backtrace>,
 }

+impl MissingKeyError {
+    fn enrich(&mut self, query: VersionedKeySpaceQuery) {
+        self.query = Some(query);
+    }
+}
+
 impl std::fmt::Debug for MissingKeyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self)
@@ -710,14 +717,18 @@ impl std::fmt::Display for MissingKeyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
-            "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
-            self.key, self.shard, self.cont_lsn, self.request_lsn
+            "could not find data for key {} (shard {:?}), original HWM LSN {}",
+            self.keyspace, self.shard, self.original_hwm_lsn
        )?;

        if let Some(ref ancestor_lsn) = self.ancestor_lsn {
            write!(f, ", ancestor {}", ancestor_lsn)?;
        }

+        if let Some(ref query) = self.query {
+            write!(f, ", query {}", query)?;
+        }
+
        if let Some(ref read_path) = self.read_path {
            write!(f, "\n{}", read_path)?;
        }
@@ -817,7 +828,7 @@ pub(crate) enum GetVectoredError {
    InvalidLsn(Lsn),

    #[error("requested key not found: {0}")]
-    MissingKey(MissingKeyError),
+    MissingKey(Box<MissingKeyError>),

    #[error("ancestry walk")]
    GetReadyAncestorError(#[source] GetReadyAncestorError),
@@ -928,7 +939,7 @@ impl std::fmt::Debug for Timeline {
    }
 }

-#[derive(thiserror::Error, Debug)]
+#[derive(thiserror::Error, Debug, Clone)]
 pub(crate) enum WaitLsnError {
    // Called on a timeline which is shutting down
    #[error("Shutdown")]
@@ -1128,14 +1139,12 @@ impl Timeline {
        // page_service.
        debug_assert!(!self.shard_identity.is_key_disposable(&key));

-        let keyspace = KeySpace {
-            ranges: vec![key..key.next()],
-        };
-
        let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential());

+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
+
        let vectored_res = self
-            .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+            .get_vectored_impl(query, &mut reconstruct_state, ctx)
            .await;

        let key_value = vectored_res?.pop_first();
@@ -1153,15 +1162,17 @@ impl Timeline {
                    value
                }
            }
-            None => Err(PageReconstructError::MissingKey(MissingKeyError {
-                key,
-                shard: self.shard_identity.get_shard_number(&key),
-                cont_lsn: Lsn(0),
-                request_lsn: lsn,
-                ancestor_lsn: None,
-                backtrace: None,
-                read_path: None,
-            })),
+            None => Err(PageReconstructError::MissingKey(Box::new(
+                MissingKeyError {
+                    keyspace: KeySpace::single(key..key.next()),
+                    shard: self.shard_identity.get_shard_number(&key),
+                    original_hwm_lsn: lsn,
+                    ancestor_lsn: None,
+                    backtrace: None,
+                    read_path: None,
+                    query: None,
+                },
+            ))),
        }
    }

@@ -1174,21 +1185,18 @@ impl Timeline {
    /// which actually vectorizes the read path.
    pub(crate) async fn get_vectored(
        &self,
-        keyspace: KeySpace,
-        lsn: Lsn,
+        query: VersionedKeySpaceQuery,
        io_concurrency: super::storage_layer::IoConcurrency,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
-        if !lsn.is_valid() {
-            return Err(GetVectoredError::InvalidLsn(lsn));
-        }
+        let total_keyspace = query.total_keyspace();

-        let key_count = keyspace.total_raw_size().try_into().unwrap();
+        let key_count = total_keyspace.total_raw_size().try_into().unwrap();
        if key_count > Timeline::MAX_GET_VECTORED_KEYS {
            return Err(GetVectoredError::Oversized(key_count));
        }

-        for range in &keyspace.ranges {
+        for range in &total_keyspace.ranges {
            let mut key = range.start;
            while key != range.end {
                assert!(!self.shard_identity.is_key_disposable(&key));
@@ -1197,9 +1205,8 @@ impl Timeline {
        }

        trace!(
-            "get vectored request for {:?}@{} from task kind {:?}",
-            keyspace,
-            lsn,
+            "get vectored query {} from task kind {:?}",
+            query,
            ctx.task_kind(),
        );

@@ -1208,12 +1215,7 @@ impl Timeline {
            .map(|metric| (metric, Instant::now()));

        let res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(io_concurrency),
-                ctx,
-            )
+            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
            .await;

        if let Some((metric, start)) = start {
@@ -1264,13 +1266,10 @@ impl Timeline {
            .for_task_kind(ctx.task_kind())
            .map(ScanLatencyOngoingRecording::start_recording);

+        let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);
+
        let vectored_res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(io_concurrency),
-                ctx,
-            )
+            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
            .await;

        if let Some(recording) = start {
@@ -1282,16 +1281,19 @@ impl Timeline {

    pub(super) async fn get_vectored_impl(
        &self,
-        keyspace: KeySpace,
-        lsn: Lsn,
+        query: VersionedKeySpaceQuery,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
        let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
-            Some(ReadPath::new(keyspace.clone(), lsn))
+            Some(ReadPath::new(
+                query.total_keyspace(),
+                query.high_watermark_lsn()?,
+            ))
        } else {
            None
        };
+
        reconstruct_state.read_path = read_path;

        let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
@@ -1311,7 +1313,7 @@ impl Timeline {
                })
                .attached_child();

-            self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx)
+            self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx)
                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                .await
        };
@@ -1324,6 +1326,13 @@ impl Timeline {
                .map(|state| state.collect_pending_ios())
                .collect::<FuturesUnordered<_>>();
            while collect_futs.next().await.is_some() {}
+
+            // Enrich the missing key error with the original query.
+            if let GetVectoredError::MissingKey(mut missing_err) = err {
+                missing_err.enrich(query.clone());
+                return Err(GetVectoredError::MissingKey(missing_err));
+            }
+
            return Err(err);
        };

@@ -1341,6 +1350,8 @@ impl Timeline {

        let futs = FuturesUnordered::new();
        for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
+            let req_lsn_for_key = query.map_key_to_lsn(&key);
+
            futs.push({
                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
                let ctx = RequestContextBuilder::from(&ctx)
@@ -1387,7 +1398,7 @@ impl Timeline {

                    let walredo_deltas = converted.num_deltas();
                    let walredo_res = walredo_self
-                        .reconstruct_value(key, lsn, converted, redo_attempt_type)
+                        .reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type)
                        .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                            info_span!(
                                target: PERF_TRACE_TARGET,
@@ -1414,15 +1425,18 @@ impl Timeline {
        // to avoid infinite results.
        if !results.is_empty() {
            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                let total_keyspace = query.total_keyspace();
+                let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
+
                static LOG_PACER: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
                LOG_PACER.lock().unwrap().call(|| {
-                    let num_keys = keyspace.total_raw_size();
+                    let num_keys = total_keyspace.total_raw_size();
                    let num_pages = results.len();
                    tracing::info!(
                      shard_id = %self.tenant_shard_id.shard_slug(),
-                      lsn = %lsn,
-                      "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
+                      lsn = %max_request_lsn,
+                      "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
                    );
                });
            }
@@ -2476,12 +2490,11 @@ impl Timeline {
        tenant_conf.is_gc_blocked_by_lsn_lease_deadline()
    }

-    pub(crate) fn get_lazy_slru_download(&self) -> bool {
+    pub(crate) fn get_lazy_slru_download(&self, lazy_slru_download_enabled_by_cp: bool) -> bool {
        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .lazy_slru_download
-            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
+        tenant_conf.tenant_conf.lazy_slru_download.unwrap_or(
+            lazy_slru_download_enabled_by_cp || self.conf.default_tenant_conf.lazy_slru_download,
+        )
    }

    /// Checks if a get page request should get perf tracing
@@ -2723,6 +2736,10 @@ impl Timeline {
            .tenant_conf
            .gc_compaction_enabled
            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
+        let gc_compaction_verification = tenant_conf
+            .tenant_conf
+            .gc_compaction_verification
+            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
        let gc_compaction_initial_threshold_kb = tenant_conf
            .tenant_conf
            .gc_compaction_initial_threshold_kb
@@ -2737,6 +2754,7 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
        GcCompactionCombinedSettings {
            gc_compaction_enabled,
+            gc_compaction_verification,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
        }
@@ -3935,6 +3953,154 @@ impl Timeline {
    }
 }

+#[derive(Clone)]
+/// Type representing a query in the ([`Lsn`], [`Key`]) space.
+/// In other words, a set of segments in a 2D space.
+///
+/// This representation has the advantage of avoiding hash map
+/// allocations for uniform queries.
+pub(crate) enum VersionedKeySpaceQuery {
+    /// Variant for queries at a single [`Lsn`]
+    Uniform { keyspace: KeySpace, lsn: Lsn },
+    /// Variant for queries at multiple [`Lsn`]s
+    Scattered {
+        keyspaces_at_lsn: Vec<(Lsn, KeySpace)>,
+    },
+}
+
+impl VersionedKeySpaceQuery {
+    pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self {
+        Self::Uniform { keyspace, lsn }
+    }
+
+    pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self {
+        Self::Scattered { keyspaces_at_lsn }
+    }
+
+    /// Returns the most recent (largest) LSN included in the query.
+    /// If any of the LSNs included in the query are invalid, returns
+    /// an error instead.
+    fn high_watermark_lsn(&self) -> Result<Lsn, GetVectoredError> {
+        match self {
+            Self::Uniform { lsn, .. } => {
+                if !lsn.is_valid() {
+                    return Err(GetVectoredError::InvalidLsn(*lsn));
+                }
+
+                Ok(*lsn)
+            }
+            Self::Scattered { keyspaces_at_lsn } => {
+                let mut max_lsn = None;
+                for (lsn, _keyspace) in keyspaces_at_lsn.iter() {
+                    if !lsn.is_valid() {
+                        return Err(GetVectoredError::InvalidLsn(*lsn));
+                    }
+                    max_lsn = std::cmp::max(max_lsn, Some(lsn));
+                }
+
+                if let Some(computed) = max_lsn {
+                    Ok(*computed)
+                } else {
+                    Err(GetVectoredError::Other(anyhow!("empty input")))
+                }
+            }
+        }
+    }
+
+    /// Returns the total keyspace being queried: the result of projecting
+    /// everything in the key dimensions onto the key axis.
+    fn total_keyspace(&self) -> KeySpace {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.clone(),
+            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
+                .iter()
+                .map(|(_lsn, keyspace)| keyspace)
+                .fold(KeySpace::default(), |mut acc, v| {
+                    acc.merge(v);
+                    acc
+                }),
+        }
+    }
+
+    /// Returns LSN for a specific key.
+    ///
+    /// Invariant: requested key must be part of [`Self::total_keyspace`]
+    pub(super) fn map_key_to_lsn(&self, key: &Key) -> Lsn {
+        match self {
+            Self::Uniform { lsn, .. } => *lsn,
+            Self::Scattered { keyspaces_at_lsn } => {
+                keyspaces_at_lsn
+                    .iter()
+                    .find(|(_lsn, keyspace)| keyspace.contains(key))
+                    .expect("Returned key was requested")
+                    .0
+            }
+        }
+    }
+
+    /// Remove any parts of the query (segments) which overlap with the provided
+    /// key space (also segments).
+    fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove),
+            Self::Scattered { keyspaces_at_lsn } => {
+                let mut removed_accum = KeySpaceRandomAccum::new();
+                keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| {
+                    let removed = keyspace.remove_overlapping_with(to_remove);
+                    removed_accum.add_keyspace(removed);
+                });
+
+                removed_accum.to_keyspace()
+            }
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.is_empty(),
+            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
+                .iter()
+                .all(|(_lsn, keyspace)| keyspace.is_empty()),
+        }
+    }
+
+    /// "Lower" the query on the LSN dimension
+    fn lower(&mut self, to: Lsn) {
+        match self {
+            Self::Uniform { lsn, .. } => {
+                // If the originally requested LSN is smaller than the starting
+                // LSN of the ancestor we are descending into, we need to respect that.
+                // Hence the min.
+                *lsn = std::cmp::min(*lsn, to);
+            }
+            Self::Scattered { keyspaces_at_lsn } => {
+                keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| {
+                    *lsn = std::cmp::min(*lsn, to);
+                });
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for VersionedKeySpaceQuery {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+
+        match self {
+            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
+                write!(f, "{keyspace} @ {lsn}")?;
+            }
+            VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
+                for (lsn, keyspace) in keyspaces_at_lsn.iter() {
+                    write!(f, "{keyspace} @ {lsn},")?;
+                }
+            }
+        }
+
+        write!(f, "]")
+    }
+}
+
 impl Timeline {
    #[allow(clippy::doc_lazy_continuation)]
    /// Get the data needed to reconstruct all keys in the provided keyspace
@@ -3949,16 +4115,16 @@ impl Timeline {
    /// 2.4. If the fringe is empty, go back to 1
    async fn get_vectored_reconstruct_data(
        &self,
-        mut keyspace: KeySpace,
-        request_lsn: Lsn,
+        mut query: VersionedKeySpaceQuery,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
+        // Propagate an invalid or empty query as an error instead of panicking.
+        let original_hwm_lsn = query.high_watermark_lsn()?;
+
        let mut timeline_owned: Arc<Timeline>;
        let mut timeline = self;

-        let mut cont_lsn = Lsn(request_lsn.0 + 1);
-
        let missing_keyspace = loop {
            if self.cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
@@ -3975,15 +4140,14 @@ impl Timeline {
                            parent: crnt_perf_span,
                            "PLAN_IO_TIMELINE",
                            timeline = %timeline.timeline_id,
-                            lsn = %cont_lsn,
+                            high_watermark_lsn = %query.high_watermark_lsn().unwrap(),
                        )
                    })
                    .attached_child();

                Self::get_vectored_reconstruct_data_timeline(
                    timeline,
-                    keyspace.clone(),
-                    cont_lsn,
+                    &query,
                    reconstruct_state,
                    &self.cancel,
                    &ctx,
@@ -3992,23 +4156,23 @@ impl Timeline {
                .await?
            };

-            keyspace.remove_overlapping_with(&completed);
+            query.remove_overlapping_with(&completed);

            // Do not descend into the ancestor timeline for aux files.
            // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
            // stalling compaction.
-            keyspace.remove_overlapping_with(&KeySpace {
+            query.remove_overlapping_with(&KeySpace {
                ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
            });

            // Keyspace is fully retrieved
-            if keyspace.is_empty() {
+            if query.is_empty() {
                break None;
            }

            let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
                // Not fully retrieved but no ancestor timeline.
-                break Some(keyspace);
+                break Some(query.total_keyspace());
            };

            // Now we see if there are keys covered by the image layer but does not exist in the
@@ -4019,7 +4183,7 @@ impl Timeline {
            // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
            // space. If that's not the case, we had at least one key encounter a gap in the image layer
            // and stop the search as a result of that.
-            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            let mut removed = query.remove_overlapping_with(&image_covered_keyspace);
            // Do not fire missing key error and end early for sparse keys. Note that we hava already removed
            // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
            // figuring out what is the inherited key range and do a fine-grained pruning.
@@ -4029,11 +4193,11 @@ impl Timeline {
            if !removed.is_empty() {
                break Some(removed);
            }
-            // If we reached this point, `remove_overlapping_with` should not have made any change to the
-            // keyspace.

-            // Take the min to avoid reconstructing a page with data newer than request Lsn.
-            cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
+            // Each key range in the original query is at some point in the LSN space.
+            // When descending into the ancestor, lower all ranges in the LSN space
+            // such that new changes on the parent timeline are not visible.
+            query.lower(timeline.ancestor_lsn);

            let ctx = RequestContextBuilder::from(ctx)
                .perf_span(|crnt_perf_span| {
@@ -4042,7 +4206,6 @@ impl Timeline {
                        parent: crnt_perf_span,
                        "GET_ANCESTOR",
                        timeline = %timeline.timeline_id,
-                        lsn = %cont_lsn,
                        ancestor = %ancestor_timeline.timeline_id,
                        ancestor_lsn = %timeline.ancestor_lsn
                    )
@@ -4072,22 +4235,47 @@ impl Timeline {
        };

        if let Some(missing_keyspace) = missing_keyspace {
-            return Err(GetVectoredError::MissingKey(MissingKeyError {
-                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
-                shard: self
-                    .shard_identity
-                    .get_shard_number(&missing_keyspace.start().unwrap()),
-                cont_lsn,
-                request_lsn,
+            return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError {
+                keyspace: missing_keyspace, /* better if we can store the full keyspace */
+                shard: self.shard_identity.number,
+                original_hwm_lsn,
                ancestor_lsn: Some(timeline.ancestor_lsn),
                backtrace: None,
                read_path: std::mem::take(&mut reconstruct_state.read_path),
-            }));
+                query: None,
+            })));
        }

        Ok(())
    }

+    async fn get_vectored_init_fringe(
+        &self,
+        query: &VersionedKeySpaceQuery,
+    ) -> Result<LayerFringe, GetVectoredError> {
+        let mut fringe = LayerFringe::new();
+        let guard = self.layers.read().await;
+
+        match query {
+            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
+                // LSNs requested by the compute or determined by the pageserver
+                // are inclusive. Queries to the layer map use exclusive LSNs.
+                // Hence, bump the value before the query - same in the other
+                // match arm.
+                let cont_lsn = Lsn(lsn.0 + 1);
+                guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?;
+            }
+            VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
+                for (lsn, keyspace) in keyspaces_at_lsn.iter() {
+                    let cont_lsn_for_keyspace = Lsn(lsn.0 + 1);
+                    guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?;
+                }
+            }
+        }
+
+        Ok(fringe)
+    }
+
    /// Collect the reconstruct data for a keyspace from the specified timeline.
    ///
    /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
@@ -4106,8 +4294,7 @@ impl Timeline {
    /// decides how to deal with these two keyspaces.
    async fn get_vectored_reconstruct_data_timeline(
        timeline: &Timeline,
-        keyspace: KeySpace,
-        mut cont_lsn: Lsn,
+        query: &VersionedKeySpaceQuery,
        reconstruct_state: &mut ValuesReconstructState,
        cancel: &CancellationToken,
        ctx: &RequestContext,
@@ -4123,14 +4310,7 @@ impl Timeline {
        let _guard = timeline.gc_compaction_layer_update_lock.read().await;

        // Initialize the fringe
-        let mut fringe = {
-            let mut fringe = LayerFringe::new();
-
-            let guard = timeline.layers.read().await;
-            guard.update_search_fringe(&keyspace, cont_lsn, &mut fringe)?;
-
-            fringe
-        };
+        let mut fringe = timeline.get_vectored_init_fringe(query).await?;

        let mut completed_keyspace = KeySpace::default();
        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
@@ -4156,7 +4336,7 @@ impl Timeline {
                .await?;

            let mut unmapped_keyspace = keyspace_to_read;
-            cont_lsn = next_cont_lsn;
+            let cont_lsn = next_cont_lsn;

            reconstruct_state.on_layer_visited(&layer_to_read);

@@ -4805,7 +4985,13 @@ impl Timeline {
        let ctx = ctx.attached_child();
        let work = async move {
            let Some((desc, path)) = frozen_layer
-                .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
+                .write_to_disk(
+                    &ctx,
+                    key_range,
+                    self_clone.l0_flush_global_state.inner(),
+                    &self_clone.gate,
+                    self_clone.cancel.clone(),
+                )
                .await?
            else {
                return Ok(None);
@@ -4991,13 +5177,11 @@ impl Timeline {
                if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
                    || (last_key_in_range && key_request_accum.raw_size() > 0)
                {
+                    let query =
+                        VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn);
+
                    let results = self
-                        .get_vectored(
-                            key_request_accum.consume_keyspace(),
-                            lsn,
-                            io_concurrency.clone(),
-                            ctx,
-                        )
+                        .get_vectored(query, io_concurrency.clone(), ctx)
                        .await?;

                    if self.cancel.is_cancelled() {
@@ -5086,7 +5270,11 @@ impl Timeline {
        // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
        // not contain too many keys, otherwise this takes a lot of memory.
        let data = self
-            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
+            .get_vectored_impl(
+                VersionedKeySpaceQuery::uniform(partition.clone(), lsn),
+                &mut reconstruct_state,
+                ctx,
+            )
            .await?;
        let (data, total_kb_retrieved, total_keys_retrieved) = {
            let mut new_data = BTreeMap::new();
@@ -5343,6 +5531,8 @@ impl Timeline {
                self.tenant_shard_id,
                &img_range,
                lsn,
+                &self.gate,
+                self.cancel.clone(),
                ctx,
            )
            .await?;
@@ -5511,6 +5701,12 @@ impl Timeline {
            return;
        }

+        if self.cancel.is_cancelled() {
+            // We already requested stopping the tenant, so we cannot wait for the logical size
+            // calculation to complete given the task might have been already cancelled.
+            return;
+        }
+
        if let Some(await_bg_cancel) = self
            .current_logical_size
            .cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -6707,6 +6903,8 @@ impl Timeline {
            self.tenant_shard_id,
            &(min_key..end_key),
            lsn,
+            &self.gate,
+            self.cancel.clone(),
            ctx,
        )
        .await?;
@@ -6768,6 +6966,8 @@ impl Timeline {
            self.tenant_shard_id,
            deltas.key_range.start,
            deltas.lsn_range,
+            &self.gate,
+            self.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -56,7 +56,8 @@ use crate::tenant::storage_layer::batch_split_writer::{
 use crate::tenant::storage_layer::filter_iterator::FilterIterator;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::{
-    AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
+    AsLayerDesc, LayerVisibilityHint, PersistentLayerDesc, PersistentLayerKey,
+    ValueReconstructState,
 };
 use crate::tenant::tasks::log_compaction_error;
 use crate::tenant::timeline::{
@@ -69,6 +70,13 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

+/// Ratio of shard-local pages below which we trigger shard ancestor layer rewrites. 0.3 means that
+/// <= 30% of layer pages must belong to the descendant shard to rewrite the layer.
+///
+/// We choose a value < 0.5 to avoid rewriting all visible layers every time we do a power-of-two
+/// shard split, which gets expensive for large tenants.
+const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3;
+
 #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
 pub struct GcCompactionJobId(pub usize);

@@ -80,6 +88,7 @@ impl std::fmt::Display for GcCompactionJobId {

 pub struct GcCompactionCombinedSettings {
    pub gc_compaction_enabled: bool,
+    pub gc_compaction_verification: bool,
    pub gc_compaction_initial_threshold_kb: u64,
    pub gc_compaction_ratio_percent: u64,
 }
@@ -225,6 +234,7 @@ impl GcCompactionQueue {
            gc_compaction_enabled,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
+            ..
        } = timeline.get_gc_compaction_settings();
        if !gc_compaction_enabled {
            return Ok(());
@@ -747,8 +757,8 @@ impl KeyHistoryRetention {
    async fn pipe_to(
        self,
        key: Key,
-        delta_writer: &mut SplitDeltaLayerWriter,
-        mut image_writer: Option<&mut SplitImageLayerWriter>,
+        delta_writer: &mut SplitDeltaLayerWriter<'_>,
+        mut image_writer: Option<&mut SplitImageLayerWriter<'_>>,
        stat: &mut CompactionStatistics,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -788,6 +798,123 @@ impl KeyHistoryRetention {
        }
        Ok(())
    }
+
+    /// Verify if every key in the retention is readable by replaying the logs.
+    async fn verify(
+        &self,
+        key: Key,
+        base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
+        full_history: &[(Key, Lsn, Value)],
+        tline: &Arc<Timeline>,
+    ) -> anyhow::Result<()> {
+        // Usually the min_lsn should be the first record but we do a full iteration to be safe.
+        let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let mut base_img = base_img_from_ancestor
+            .as_ref()
+            .map(|(_, lsn, img)| (*lsn, img));
+        let mut history = Vec::new();
+
+        async fn collect_and_verify(
+            key: Key,
+            lsn: Lsn,
+            base_img: &Option<(Lsn, &Bytes)>,
+            history: &[(Lsn, &NeonWalRecord)],
+            tline: &Arc<Timeline>,
+            skip_empty: bool,
+        ) -> anyhow::Result<()> {
+            if base_img.is_none() && history.is_empty() {
+                if skip_empty {
+                    return Ok(());
+                }
+                anyhow::bail!("verification failed: key {} has no history at {}", key, lsn);
+            };
+
+            let mut records = history
+                .iter()
+                .map(|(lsn, val)| (*lsn, (*val).clone()))
+                .collect::<Vec<_>>();
+
+            // WAL redo requires records in the reverse LSN order
+            records.reverse();
+            let data = ValueReconstructState {
+                img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
+                records,
+            };
+
+            tline
+                .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
+                .await
+                .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
+
+            Ok(())
+        }
+
+        for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
+            for (lsn, val) in logs {
+                match val {
+                    Value::Image(img) => {
+                        base_img = Some((*lsn, img));
+                        history.clear();
+                    }
+                    Value::WalRecord(rec) if val.will_init() => {
+                        base_img = None;
+                        history.clear();
+                        history.push((*lsn, rec));
+                    }
+                    Value::WalRecord(rec) => {
+                        history.push((*lsn, rec));
+                    }
+                }
+            }
+            if *retain_lsn >= min_lsn {
+                // Only verify after the key appears in the full history for the first time.
+
+                // We don't modify history: in theory, we could replace the history with a single
+                // image as in `generate_key_retention` to make redos at later LSNs faster. But we
+                // want to verify everything as if they are read from the real layer map.
+                collect_and_verify(key, *retain_lsn, &base_img, &history, tline, false)
+                    .await
+                    .context("below horizon retain_lsn")?;
+            }
+        }
+
+        for (lsn, val) in &self.above_horizon.0 {
+            match val {
+                Value::Image(img) => {
+                    // Above the GC horizon, we verify every time we see an image.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline, true)
+                        .await
+                        .context("above horizon full image")?;
+                    base_img = Some((*lsn, img));
+                    history.clear();
+                }
+                Value::WalRecord(rec) if val.will_init() => {
+                    // Above the GC horizon, we verify every time we see an init record.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline, true)
+                        .await
+                        .context("above horizon init record")?;
+                    base_img = None;
+                    history.clear();
+                    history.push((*lsn, rec));
+                }
+                Value::WalRecord(rec) => {
+                    history.push((*lsn, rec));
+                }
+            }
+        }
+        // Ensure the latest record is readable.
+        collect_and_verify(key, max_lsn, &base_img, &history, tline, false)
+            .await
+            .context("latest record")?;
+        Ok(())
+    }
 }

 #[derive(Debug, Serialize, Default)]
@@ -1119,7 +1246,17 @@ impl Timeline {
            // being potentially much longer.
            let rewrite_max = partition_count;

-            self.compact_shard_ancestors(rewrite_max, ctx).await?;
+            let outcome = self
+                .compact_shard_ancestors(
+                    rewrite_max,
+                    options.flags.contains(CompactFlags::YieldForL0),
+                    ctx,
+                )
+                .await?;
+            match outcome {
+                CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome),
+                CompactionOutcome::Done | CompactionOutcome::Skipped => {}
+            }
        }

        Ok(CompactionOutcome::Done)
@@ -1136,8 +1273,10 @@ impl Timeline {
    async fn compact_shard_ancestors(
        self: &Arc<Self>,
        rewrite_max: usize,
+        yield_for_l0: bool,
        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
+        let mut outcome = CompactionOutcome::Done;
        let mut drop_layers = Vec::new();
        let mut layers_to_rewrite: Vec<Layer> = Vec::new();

@@ -1148,15 +1287,13 @@ impl Timeline {
        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
        // are rewriting layers.
        let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
-
-        tracing::info!(
-            "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.time
-        );
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;

        let layers = self.layers.read().await;
-        for layer_desc in layers.layer_map()?.iter_historic_layers() {
+        let layers_iter = layers.layer_map()?.iter_historic_layers();
+        let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
+        for layer_desc in layers_iter {
+            layers_checked += 1;
            let layer = layers.get_from_desc(&layer_desc);
            if layer.metadata().shard.shard_count == self.shard_identity.count {
                // This layer does not belong to a historic ancestor, no need to re-image it.
@@ -1171,8 +1308,8 @@ impl Timeline {
                // This ancestral layer only covers keys that belong to other shards.
                // We include the full metadata in the log: if we had some critical bug that caused
                // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
-                info!(%layer, old_metadata=?layer.metadata(),
-                    "dropping layer after shard split, contains no keys for this shard.",
+                debug!(%layer, old_metadata=?layer.metadata(),
+                    "dropping layer after shard split, contains no keys for this shard",
                );

                if cfg!(debug_assertions) {
@@ -1200,14 +1337,17 @@ impl Timeline {
                continue;
            }

-            // Don't bother re-writing a layer unless it will at least halve its size
+            // Only rewrite a layer if we can reclaim significant space: skip layers whose
+            // shard-local page share exceeds the threshold (`u32::MAX` counts are exempt).
            if layer_local_page_count != u32::MAX
-                && layer_local_page_count > layer_raw_page_count / 2
+                && layer_local_page_count as f64 / layer_raw_page_count as f64
+                    > ANCESTOR_COMPACTION_REWRITE_THRESHOLD
            {
                debug!(%layer,
-                    "layer is already mostly local ({}/{}), not rewriting",
-                    layer_local_page_count,
-                    layer_raw_page_count
+                    "layer has a large share of local pages \
+                        ({layer_local_page_count}/{layer_raw_page_count} > \
+                        {ANCESTOR_COMPACTION_REWRITE_THRESHOLD}), not rewriting",
                );
+                continue;
            }

@@ -1219,12 +1357,19 @@ impl Timeline {
                continue;
            }

+            // We do not yet implement rewrite of delta layers.
            if layer_desc.is_delta() {
-                // We do not yet implement rewrite of delta layers
                debug!(%layer, "Skipping rewrite of delta layer");
                continue;
            }

+            // We don't bother rewriting layers that aren't visible, since these won't be needed by
+            // reads and will likely be garbage collected soon.
+            if layer.visibility() != LayerVisibilityHint::Visible {
+                debug!(%layer, "Skipping rewrite of invisible layer");
+                continue;
+            }
+
            // Only rewrite layers if their generations differ.  This guarantees:
            //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
            //  - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
@@ -1234,19 +1379,36 @@ impl Timeline {
            }

            if layers_to_rewrite.len() >= rewrite_max {
-                tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
+                debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
                    layers_to_rewrite.len()
                );
-                continue;
+                outcome = CompactionOutcome::Pending;
+                break;
            }

            // Fall through: all our conditions for doing a rewrite passed.
            layers_to_rewrite.push(layer);
        }

-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop read lock on layer map before we start doing time-consuming I/O.
        drop(layers);

+        // Drop out early if there's nothing to do.
+        if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
+            return Ok(CompactionOutcome::Done);
+        }
+
+        info!(
+            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
+                checked {layers_checked}/{layers_total} layers \
+                (latest_gc_cutoff={} pitr_cutoff={})",
+            layers_to_rewrite.len(),
+            drop_layers.len(),
+            *latest_gc_cutoff,
+            pitr_cutoff,
+        );
+        let started = Instant::now();
+
        let mut replace_image_layers = Vec::new();

        for layer in layers_to_rewrite {
@@ -1254,13 +1416,15 @@ impl Timeline {
                return Err(CompactionError::ShuttingDown);
            }

-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            info!(layer=%layer, "rewriting layer after shard split");
            let mut image_layer_writer = ImageLayerWriter::new(
                self.conf,
                self.timeline_id,
                self.tenant_shard_id,
                &layer.layer_desc().key_range,
                layer.layer_desc().image_layer_lsn(),
+                &self.gate,
+                self.cancel.clone(),
                ctx,
            )
            .await
@@ -1292,7 +1456,7 @@ impl Timeline {
                    .map_err(CompactionError::Other)?;
                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
                    .map_err(CompactionError::Other)?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
                    layer.metadata().file_size,
                    new_layer.metadata().file_size);

@@ -1302,6 +1466,26 @@ impl Timeline {
                // the layer has no data for us with the ShardedRange check above, but
                drop_layers.push(layer);
            }
+
+            // Yield for L0 compaction if necessary, but make sure we update the layer map below
+            // with the work we've already done.
+            if yield_for_l0
+                && self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some()
+            {
+                info!("shard ancestor compaction yielding for L0 compaction");
+                outcome = CompactionOutcome::YieldForL0;
+                break;
+            }
+        }
+
+        for layer in &drop_layers {
+            info!(%layer, old_metadata=?layer.metadata(),
+                "dropping layer after shard split (no keys for this shard)",
+            );
        }

        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
@@ -1319,17 +1503,36 @@ impl Timeline {
        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
        // load.
-        match self.remote_client.wait_completion().await {
-            Ok(()) => (),
-            Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
-            Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                return Err(CompactionError::ShuttingDown);
+        if outcome != CompactionOutcome::YieldForL0 {
+            info!("shard ancestor compaction waiting for uploads");
+            tokio::select! {
+                result = self.remote_client.wait_completion() => match result {
+                    Ok(()) => {},
+                    Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
+                    Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
+                        return Err(CompactionError::ShuttingDown);
+                    }
+                },
+                // Don't wait if there's L0 compaction to do. We don't need to update the outcome
+                // here, because we've already done the actual work.
+                _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {},
            }
        }

+        info!(
+            "shard ancestor compaction done in {:.3}s{}",
+            started.elapsed().as_secs_f64(),
+            match outcome {
+                CompactionOutcome::Pending =>
+                    format!(", with pending work (rewrite_max={rewrite_max})"),
+                CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"),
+                CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(),
+            }
+        );
+
        fail::fail_point!("compact-shard-ancestors-persistent");

-        Ok(())
+        Ok(outcome)
    }

    /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
@@ -1861,6 +2064,8 @@ impl Timeline {
                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
                                lsn_range.clone()
                            },
+                            &self.gate,
+                            self.cancel.clone(),
                            ctx,
                        )
                        .await
@@ -2148,6 +2353,7 @@ impl Timeline {
    /// ```
    ///
    /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
+    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn generate_key_retention(
        self: &Arc<Timeline>,
        key: Key,
@@ -2156,6 +2362,7 @@ impl Timeline {
        retain_lsn_below_horizon: &[Lsn],
        delta_threshold_cnt: usize,
        base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
+        verification: bool,
    ) -> anyhow::Result<KeyHistoryRetention> {
        // Pre-checks for the invariants

@@ -2242,8 +2449,8 @@ impl Timeline {
            "should have at least below + above horizon batches"
        );
        let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        if let Some((key, lsn, img)) = base_img_from_ancestor {
-            replay_history.push((key, lsn, Value::Image(img)));
+        if let Some((key, lsn, ref img)) = base_img_from_ancestor {
+            replay_history.push((key, lsn, Value::Image(img.clone())));
        }

        /// Generate debug information for the replay history
@@ -2357,22 +2564,15 @@ impl Timeline {
            // Whether to reconstruct the image. In debug mode, we will generate an image
            // at every retain_lsn to ensure data is not corrupted, but we won't put the
            // image into the final layer.
-            let generate_image = produce_image || debug_mode;
-            if produce_image {
+            let img_and_lsn = if produce_image {
                records_since_last_image = 0;
-            }
-            let img_and_lsn = if generate_image {
                let replay_history_for_debug = if debug_mode {
                    Some(replay_history.clone())
                } else {
                    None
                };
                let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
-                let history = if produce_image {
-                    std::mem::take(&mut replay_history)
-                } else {
-                    replay_history.clone()
-                };
+                let history = std::mem::take(&mut replay_history);
                let mut img = None;
                let mut records = Vec::with_capacity(history.len());
                if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2407,6 +2607,7 @@ impl Timeline {
                        records.push((lsn, rec));
                    }
                }
+                // WAL redo requires records in the reverse LSN order
                records.reverse();
                let state = ValueReconstructState { img, records };
                // last batch does not generate image so i is always in range, unless we force generate
@@ -2439,10 +2640,16 @@ impl Timeline {
        assert_eq!(retention.len(), lsn_split_points.len() + 1);
        for (idx, logs) in retention.into_iter().enumerate() {
            if idx == lsn_split_points.len() {
-                return Ok(KeyHistoryRetention {
+                let retention = KeyHistoryRetention {
                    below_horizon: result,
                    above_horizon: KeyLogAtLsn(logs),
-                });
+                };
+                if verification {
+                    retention
+                        .verify(key, &base_img_from_ancestor, full_history, self)
+                        .await?;
+                }
+                return Ok(retention);
            } else {
                result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
            }
@@ -2909,6 +3116,9 @@ impl Timeline {
            }
            (false, res)
        };
+
+        let verification = self.get_gc_compaction_settings().gc_compaction_verification;
+
        info!(
            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
            job_desc.selected_layers.len(),
@@ -3055,6 +3265,8 @@ impl Timeline {
                    job_desc.compaction_key_range.start,
                    lowest_retain_lsn,
                    self.get_compaction_target_size(),
+                    &self.gate,
+                    self.cancel.clone(),
                    ctx,
                )
                .await
@@ -3071,6 +3283,8 @@ impl Timeline {
            self.tenant_shard_id,
            lowest_retain_lsn..end_lsn,
            self.get_compaction_target_size(),
+            &self.gate,
+            self.cancel.clone(),
        )
        .await
        .context("failed to create delta layer writer")
@@ -3167,6 +3381,8 @@ impl Timeline {
                                self.tenant_shard_id,
                                desc.key_range.start,
                                desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                ctx,
                            )
                            .await
@@ -3184,6 +3400,8 @@ impl Timeline {
                                self.tenant_shard_id,
                                job_desc.compaction_key_range.end,
                                desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                ctx,
                            )
                            .await
@@ -3225,6 +3443,7 @@ impl Timeline {
                            .await
                            .context("failed to get ancestor image")
                            .map_err(CompactionError::Other)?,
+                        verification,
                    )
                    .await
                    .context("failed to generate key retention")
@@ -3265,6 +3484,7 @@ impl Timeline {
                    .await
                    .context("failed to get ancestor image")
                    .map_err(CompactionError::Other)?,
+                verification,
            )
            .await
            .context("failed to generate key retention")
@@ -3753,6 +3973,8 @@ impl CompactionJobExecutor for TimelineAdaptor {
            self.timeline.tenant_shard_id,
            key_range.start,
            lsn_range.clone(),
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
            ctx,
        )
        .await?;
@@ -3828,6 +4050,8 @@ impl TimelineAdaptor {
            self.timeline.tenant_shard_id,
            key_range,
            lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -30,6 +30,7 @@ use crate::tenant::storage_layer::{
    AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
    ValuesReconstructState,
 };
+use crate::tenant::timeline::VersionedKeySpaceQuery;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 #[derive(Debug, thiserror::Error)]
@@ -212,13 +213,9 @@ async fn generate_tombstone_image_layer(
        }
    }

+    let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn);
    let data = ancestor
-        .get_vectored_impl(
-            KeySpace::single(key_range.clone()),
-            image_lsn,
-            &mut reconstruct_state,
-            ctx,
-        )
+        .get_vectored_impl(query, &mut reconstruct_state, ctx)
        .await
        .context("failed to retrieve aux keys")
        .map_err(|e| Error::launder(e, Error::Prepare))?;
@@ -231,6 +228,8 @@ async fn generate_tombstone_image_layer(
            detached.tenant_shard_id,
            &key_range,
            image_lsn,
+            &detached.gate,
+            detached.cancel.clone(),
            ctx,
        )
        .await
@@ -779,6 +778,8 @@ async fn copy_lsn_prefix(
        target_timeline.tenant_shard_id,
        layer.layer_desc().key_range.start,
        layer.layer_desc().lsn_range.start..end_lsn,
+        &target_timeline.gate,
+        target_timeline.cancel.clone(),
        ctx,
    )
    .await
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -738,6 +738,8 @@ impl ChunkProcessingJob {
            self.timeline.tenant_shard_id,
            &self.range,
            self.pgdata_lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
                                );
                                Ok(())
                            }
+                            WalReceiverError::Cancelled => Ok(()),
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
                                if cancellation.is_cancelled() {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
    /// Generic error
    Other(anyhow::Error),
    ClosedGate,
+    Cancelled,
 }

 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
                                // with a similar error.
                            },
                            WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Cancelled => {
+                                debug!("Connection cancelled")
+                            }
                            WalReceiverError::ClosedGate => {
                                // doesn't happen at runtime
                            }
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(

    let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);

-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+        .await
+        .map_err(|e| match e.kind {
+            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+            _ => WalReceiverError::Other(e.into()),
+        })?;

    let shard = vec![*timeline.get_shard_identity()];

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,13 +21,13 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

+use std::backtrace::Backtrace;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};

-use anyhow::{Result, bail};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::rel_block_to_key;
+use pageserver_api::key::{Key, rel_block_to_key};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -38,7 +38,7 @@ use postgres_ffi::{
    fsm_logical_to_physical, pg_constants,
 };
 use tracing::*;
-use utils::bin_ser::SerializeError;
+use utils::bin_ser::{DeserializeError, SerializeError};
 use utils::lsn::Lsn;
 use utils::rate_limit::RateLimit;
 use utils::{critical, failpoint_support};
@@ -104,12 +104,101 @@ struct WarnIngestLag {
    timestamp_invalid_msg_ratelimit: RateLimit,
 }

+pub struct WalIngestError { // WAL ingest failure: the cause plus where it was raised
+    pub backtrace: std::backtrace::Backtrace, // taken by `Backtrace::capture()` in `From<T>`
+    pub kind: WalIngestErrorKind, // the cause; `Display` and `source()` delegate to it
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum WalIngestErrorKind { // cause of a `WalIngestError`; messages come from `#[error]`
+    #[error(transparent)] // message and source are those of the wrapped error
+    #[allow(private_interfaces)] // NOTE(review): presumably the wrapped type is not `pub` — confirm
+    PageReconstructError(#[from] PageReconstructError), // `#[from]`: `?` converts it
+    #[error(transparent)]
+    DeserializationFailure(#[from] DeserializeError),
+    #[error(transparent)]
+    SerializationFailure(#[from] SerializeError),
+    #[error("the request contains data not supported by pageserver: {0} @ {1}")]
+    InvalidKey(Key, Lsn), // rendered as "<key> @ <lsn>"
+    #[error("twophase file for xid {0} already exists")]
+    FileAlreadyExists(u64), // payload is the xid shown in the message
+    #[error("slru segment {0:?}/{1} already exists")]
+    SlruAlreadyExists(SlruKind, u32), // kind is Debug-formatted, then the segment number
+    #[error("relation already exists")]
+    RelationAlreadyExists(RelTag), // NOTE(review): the RelTag is carried but never printed
+    #[error("invalid reldir key {0}")]
+    InvalidRelDirKey(Key),
+
+    #[error(transparent)]
+    LogicalError(anyhow::Error), // no `#[from]`; built by `ensure_walingest!` or by hand
+    #[error(transparent)]
+    EncodeAuxFileError(anyhow::Error), // no `#[from]`: must be wrapped explicitly
+    #[error(transparent)]
+    MaybeRelSizeV2Error(anyhow::Error), // no `#[from]`: must be wrapped explicitly
+
+    #[error("timeline shutting down")]
+    Cancelled, // the walreceiver maps this to `WalReceiverError::Cancelled`
+}
+
+impl<T> From<T> for WalIngestError
+where
+    WalIngestErrorKind: From<T>,
+{
+    fn from(value: T) -> Self {
+        // Record the call stack first, then wrap whatever converts into a kind.
+        let backtrace = Backtrace::capture();
+        let kind = WalIngestErrorKind::from(value);
+        Self { backtrace, kind }
+    }
+}
+
+impl std::error::Error for WalIngestError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        std::error::Error::source(&self.kind) // the kind decides what, if anything, is the cause
+    }
+}
+
+impl core::fmt::Display for WalIngestError { // shows the kind's message only, no backtrace
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        core::fmt::Display::fmt(&self.kind, f) // explicit trait: the kind is also `Debug`
+    }
+}
+
+impl core::fmt::Debug for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // `{:#?}`: both fields as a map.
+        if f.alternate() {
+            return f
+                .debug_map()
+                .entry(&"backtrace", &self.backtrace)
+                .entry(&"kind", &self.kind)
+                .finish();
+        }
+        // `{:?}`: the kind, then the backtrace only when one was actually captured.
+        writeln!(f, "Error: {:?}", self.kind)?;
+        if matches!(self.backtrace.status(), std::backtrace::BacktraceStatus::Captured) {
+            writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
+        }
+        Ok(())
+    }
+}
+
+#[macro_export] // exported at the crate root, so the expansion spells out full paths
+macro_rules! ensure_walingest {
+    ($($t:tt)*) => {
+        _ = || -> ::std::result::Result<(), ::anyhow::Error> {
+            ::anyhow::ensure!($($t)*); // on failure returns Err from this closure only
+            Ok(())
+        }().map_err($crate::walingest::WalIngestErrorKind::LogicalError)?; // `?` exits the caller
+    };
+}
+
 impl WalIngest {
    pub async fn new(
        timeline: &Timeline,
        startpoint: Lsn,
        ctx: &RequestContext,
-    ) -> anyhow::Result<WalIngest> {
+    ) -> Result<WalIngest, WalIngestError> {
        // Fetch the latest checkpoint into memory, so that we can compare with it
        // quickly in `ingest_record` and update it when it changes.
        let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -145,7 +234,7 @@ impl WalIngest {
        interpreted: InterpretedWalRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<bool> {
+    ) -> Result<bool, WalIngestError> {
        WAL_INGEST.records_received.inc();
        let prev_len = modification.len();

@@ -288,7 +377,7 @@ impl WalIngest {
    }

    /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
-    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
        let next_full_xid =
            enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });

@@ -298,9 +387,9 @@ impl WalIngest {
        if xid > next_xid {
            // Wraparound occurred, must be from a prev epoch.
            if epoch == 0 {
-                bail!(
+                Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
                    "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
-                );
+                )))?;
            }
            epoch -= 1;
        }
@@ -313,7 +402,7 @@ impl WalIngest {
        clear_vm_bits: ClearVmBits,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let ClearVmBits {
            new_heap_blkno,
            old_heap_blkno,
@@ -402,7 +491,7 @@ impl WalIngest {
        create: DbaseCreate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let DbaseCreate {
            db_id,
            tablespace_id,
@@ -505,7 +594,7 @@ impl WalIngest {
        dbase_drop: DbaseDrop,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let DbaseDrop {
            db_id,
            tablespace_ids,
@@ -523,7 +612,7 @@ impl WalIngest {
        create: SmgrCreate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let SmgrCreate { rel } = create;
        self.put_rel_creation(modification, rel, ctx).await?;
        Ok(())
@@ -537,7 +626,7 @@ impl WalIngest {
        truncate: XlSmgrTruncate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let XlSmgrTruncate {
            blkno,
            rnode,
@@ -689,7 +778,7 @@ impl WalIngest {
        record: XactRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let (xact_common, is_commit, is_prepared) = match record {
            XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
                let xid: u64 = if modification.tline.pg_version >= 17 {
@@ -813,7 +902,7 @@ impl WalIngest {
        truncate: ClogTruncate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let ClogTruncate {
            pageno,
            oldest_xid,
@@ -889,7 +978,7 @@ impl WalIngest {
        zero_page: ClogZeroPage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let ClogZeroPage { segno, rpageno } = zero_page;

        self.put_slru_page_image(
@@ -907,7 +996,7 @@ impl WalIngest {
        &mut self,
        modification: &mut DatadirModification,
        xlrec: &XlMultiXactCreate,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Create WAL record for updating the multixact-offsets page
        let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1010,7 +1099,7 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        xlrec: &XlMultiXactTruncate,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let (maxsegment, startsegment, endsegment) =
            enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                cp.oldestMulti = xlrec.end_trunc_off;
@@ -1058,7 +1147,7 @@ impl WalIngest {
        zero_page: MultiXactZeroPage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let MultiXactZeroPage {
            slru_kind,
            segno,
@@ -1080,7 +1169,7 @@ impl WalIngest {
        update: RelmapUpdate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let RelmapUpdate { update, buf } = update;

        modification
@@ -1093,7 +1182,7 @@ impl WalIngest {
        raw_record: RawXlogRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let RawXlogRecord { info, lsn, mut buf } = raw_record;
        let pg_version = modification.tline.pg_version;

@@ -1235,12 +1324,12 @@ impl WalIngest {
        put: PutLogicalMessage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let PutLogicalMessage { path, buf } = put;
        modification.put_file(path.as_str(), &buf, ctx).await
    }

-    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
+    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
        match record {
            StandbyRecord::RunningXacts(running_xacts) => {
                enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
@@ -1258,7 +1347,7 @@ impl WalIngest {
        &mut self,
        record: ReploriginRecord,
        modification: &mut DatadirModification<'_>,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        match record {
            ReploriginRecord::Set(set) => {
                modification
@@ -1278,7 +1367,7 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        rel: RelTag,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        modification.put_rel_creation(rel, 0, ctx).await?;
        Ok(())
    }
@@ -1291,7 +1380,7 @@ impl WalIngest {
        blknum: BlockNumber,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
        self.handle_rel_extend(modification, rel, blknum, ctx)
            .await?;
        modification.put_rel_page_image(rel, blknum, img)?;
@@ -1305,7 +1394,7 @@ impl WalIngest {
        blknum: BlockNumber,
        rec: NeonWalRecord,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        self.handle_rel_extend(modification, rel, blknum, ctx)
            .await?;
        modification.put_rel_wal_record(rel, blknum, rec)?;
@@ -1318,7 +1407,7 @@ impl WalIngest {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        modification.put_rel_truncation(rel, nblocks, ctx).await?;
        Ok(())
    }
@@ -1329,7 +1418,7 @@ impl WalIngest {
        rel: RelTag,
        blknum: BlockNumber,
        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
        let new_nblocks = blknum + 1;
        // Check if the relation exists. We implicitly create relations on first
        // record.
@@ -1423,7 +1512,7 @@ impl WalIngest {
        blknum: BlockNumber,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        if !self.shard.is_shard_zero() {
            return Ok(());
        }
@@ -1441,7 +1530,7 @@ impl WalIngest {
        segno: u32,
        blknum: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // we don't use a cache for this like we do for relations. SLRUS are explcitly
        // extended with ZEROPAGE records, not with commit records, so it happens
        // a lot less frequently.
@@ -1509,6 +1598,7 @@ async fn get_relsize(
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
+    use anyhow::Result;
    use postgres_ffi::RELSEG_SIZE;

    use super::*;
@@ -1530,7 +1620,7 @@ mod tests {
    }

    #[tokio::test]
-    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
        for i in 14..=16 {
            dispatch_pgversion!(i, {
                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
+char	   *neon_project_id;
+char	   *neon_branch_id;
+char	   *neon_endpoint_id;
 int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomStringVariable("neon.project_id",
+							   "Neon project_id the server is running on",
+							   NULL,
+							   &neon_project_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.branch_id",
+							   "Neon branch_id the server is running on",
+							   NULL,
+							   &neon_branch_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.endpoint_id",
+							   "Neon endpoint_id the server is running on",
+							   NULL,
+							   &neon_endpoint_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;
 	wp->state = WPS_COLLECTING_TERMS;
+	wp->mconf.generation = INVALID_GENERATION;
+	wp->mconf.members.len = 0;
+	wp->mconf.new_members.len = 0;

 	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);

@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)

 	if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
 		wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
+	if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
+		wp_log(FATAL, "enabling generations requires protocol version 3");
 	wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);

 	/* Fill the greeting package */
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
 static bool
 WalProposerGenerationsEnabled(WalProposer *wp)
 {
-	return wp->safekeepers_generation != 0;
+	return wp->safekeepers_generation != INVALID_GENERATION;
 }

 /*
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
 }

+/*
+ * Record `sk` in members_safekeepers / new_members_safekeepers at every index
+ * whose wp->mconf entry carries the node id `sk` reported in its greeting.
+ */
+static void
+UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
+{
+	/* the pointer arrays are fixed size, so refuse an oversized mconf */
+	if (wp->mconf.members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
+	if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
+
+	/* before the greeting arrives there is no node id to match on */
+	if (sk->state < SS_WAIT_VOTING)
+		return;
+
+	/* zero is treated as an invalid node id; not expected in practice */
+	if (sk->greetResponse.nodeId == 0)
+	{
+		wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
+		return;
+	}
+
+	for (uint32 idx = 0; idx < wp->mconf.members.len; idx++)
+	{
+		SafekeeperId *member = &wp->mconf.members.m[idx];
+
+		if (member->node_id == sk->greetResponse.nodeId)
+		{
+			/*
+			 * Whenever mconf or the list of safekeepers to connect to
+			 * changes (the latter currently always implies a restart),
+			 * ResetMemberSafekeeperPtrs is expected to run before this
+			 * function. A slot already pointing at a different connection
+			 * therefore suggests the same safekeeper is reachable under
+			 * two host names; warn about it.
+			 */
+			if (wp->members_safekeepers[idx] != NULL && wp->members_safekeepers[idx] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
+					   member->node_id, member->host, member->port, idx, wp->members_safekeepers[idx] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
+				   member->node_id, member->host, member->port, idx, sk - wp->safekeeper);
+			wp->members_safekeepers[idx] = sk;
+		}
+	}
+	/* same matching, this time against new_members */
+	for (uint32 idx = 0; idx < wp->mconf.new_members.len; idx++)
+	{
+		SafekeeperId *member = &wp->mconf.new_members.m[idx];
+
+		if (member->node_id == sk->greetResponse.nodeId)
+		{
+			if (wp->new_members_safekeepers[idx] != NULL && wp->new_members_safekeepers[idx] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
+					   member->node_id, member->host, member->port, idx, wp->new_members_safekeepers[idx] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
+				   member->node_id, member->host, member->port, idx, sk - wp->safekeeper);
+			wp->new_members_safekeepers[idx] = sk;
+		}
+	}
+}
+
+/*
+ * Clear both member -> connection pointer arrays, then refill them from every
+ * connection in state SS_WAIT_VOTING or later. Call after wp->mconf changes.
+ */
+static void
+ResetMemberSafekeeperPtrs(WalProposer *wp)
+{
+	memset(wp->members_safekeepers, 0, sizeof(wp->members_safekeepers));
+	memset(wp->new_members_safekeepers, 0, sizeof(wp->new_members_safekeepers));
+	for (int slot = 0; slot < wp->n_safekeepers; slot++)
+	{
+		if (wp->safekeeper[slot].state >= SS_WAIT_VOTING)
+			UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[slot]);
+	}
+}
+
+static uint32
+MsetQuorum(MemberSet *mset)
+{
+	Assert(mset->len > 0);
+	return (mset->len / 2) + 1;	/* smallest strict majority of the set */
+}
+
+/* Do `n` members form a quorum of mset? */
+static bool
+MsetHasQuorum(MemberSet *mset, uint32 n)
+{
+	return MsetQuorum(mset) <= n;
+}
+
+/*
+ * TermsCollected helper: appends to `s` the members of `mset` currently in
+ * SS_WAIT_VOTING and returns true if they form a quorum of `mset`.
+ *
+ * `msk` parallels `mset` (members_safekeepers or new_members_safekeepers),
+ * so it must be walked up to mset->len, not the length of the other set.
+ */
+static bool
+TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_greeted = 0;
+
+	for (uint32 i = 0; i < mset->len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_VOTING)
+		{
+			if (n_greeted > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_greeted++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
+	return MsetHasQuorum(mset, n_greeted);
+}
+
 /*
 * Have we received greeting from enough (quorum) safekeepers to start voting?
 */
 static bool
 TermsCollected(WalProposer *wp)
 {
-	return wp->n_connected >= wp->quorum;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		collected = wp->n_connected >= wp->quorum;
+		if (collected)
+		{
+			wp->propTerm++;
+			wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
+		}
+		return collected;
+	}
+
+	/*
+	 * With generations enabled, we start campaign only when 1) some mconf is
+	 * actually received 2) we have greetings from majority of members as well
+	 * as from majority of new_members if it exists.
+	 */
+	if (wp->mconf.generation == INVALID_GENERATION)
+		return false;
+
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset greeters: ");
+	if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset greeters: ");
+		if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp->propTerm++;
+	wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }

 static void
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	pfree(mconf_toml);

 	/*
-	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
-	 * restart wp if it started voting.
+	 * Adopt mconf of safekeepers if it is higher.
 	 */
 	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
 	{
+		/* sanity check before adopting, should never happen */
+		if (sk->greetResponse.mconf.members.len == 0)
+		{
+			wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
+		}
+
+		/*
+		 * If we at least started campaign, restart wp to get elected in the
+		 * new mconf. Note: in principle once wp is already elected
+		 * re-election is not required, but being conservative here is not
+		 * bad.
+		 *
+		 * TODO: put mconf to shmem to immediately pick it up on start,
+		 * otherwise if some safekeeper(s) misses latest mconf and gets
+		 * connected the first, it may cause redundant restarts here.
+		 *
+		 * More generally, it would be nice to restart walproposer (wiping
+		 * election state) without restarting the process. In particular, that
+		 * would allow sync-safekeepers not to die here if it intersected with
+		 * sk migration (as well as remove 1s delay).
+		 *
+		 * Note that assign_neon_safekeepers also currently restarts the
+		 * process, so during normal migration walproposer may restart twice.
+		 */
+		if (wp->state >= WPS_CAMPAIGN)
+		{
+			wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
+		}
 		MembershipConfigurationFree(&wp->mconf);
 		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		ResetMemberSafekeeperPtrs(wp);
 		/* full conf was just logged above */
 		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
 	}
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_WAIT_VOTING;

+	/* In greeting safekeeper sent its id; update mappings accordingly. */
+	UpdateMemberSafekeeperPtr(wp, sk);
+
 	/*
 	 * Note: it would be better to track the counter on per safekeeper basis,
 	 * but at worst walproposer would restart with 'term rejected', so leave
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		/* We're still collecting terms from the majority. */
 		wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);

-		/* Quorum is acquried, prepare the vote request. */
+		/* Quorum is acquired, prepare the vote request. */
 		if (TermsCollected(wp))
 		{
-			wp->propTerm++;
-			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
-
 			wp->state = WPS_CAMPAIGN;
 			wp->voteRequest.pam.tag = 'v';
 			wp->voteRequest.generation = wp->mconf.generation;
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
 					   &sk->outbuf, wp->config->proto_version);

 	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
-		   wp->voteRequest.generation, wp->voteRequest.term);
+	wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
+		   sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
 	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
 		return;

 	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
+		   "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
+		   sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
 		   sk->voteResponse.voteGiven,
 		   GetHighestTerm(&sk->voteResponse.termHistory),
 		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
 	}
 }

+/*
+ * VotesCollected helper for `mset`: folds every voter into donor,
+ * donorLastLogTerm, propTermStartLsn and truncateLsn, lists voters in `s` and
+ * returns whether they form a quorum. `msk` parallels `mset` (it is
+ * members_safekeepers or new_members_safekeepers), so index it by member.
+ */
+static bool
+VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_votes = 0;
+
+	for (uint32 i = 0; i < mset->len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_ELECTED)
+		{
+			Assert(sk->voteResponse.voteGiven);
+
+			/*
+			 * Find the highest vote. NULL check is for the legacy case where
+			 * safekeeper might be not initialized with LSN at all and return
+			 * 0 LSN in the vote response; we still want to set donor to
+			 * something in this case.
+			 */
+			if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
+				(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
+				 sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
+				wp->donor == NULL)
+			{
+				wp->donorLastLogTerm = GetLastLogTerm(sk);
+				wp->propTermStartLsn = sk->voteResponse.flushLsn;
+				wp->donor = sk;
+			}
+			wp->truncateLsn = Max(sk->voteResponse.truncateLsn, wp->truncateLsn);
+
+			if (n_votes > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_votes++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
+	return MsetHasQuorum(mset, n_votes);
+}
+
+
 /*
 * Checks if enough votes has been collected to get elected and if that's the
 * case finds the highest vote, setting donor, donorLastLogTerm,
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
 static bool
 VotesCollected(WalProposer *wp)
 {
-	int			n_ready = 0;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;

 	/* assumed to be called only when not elected yet */
 	Assert(wp->state == WPS_CAMPAIGN);
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
 	wp->donorLastLogTerm = 0;
 	wp->truncateLsn = InvalidXLogRecPtr;

-	for (int i = 0; i < wp->n_safekeepers; i++)
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
 	{
-		if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
-		{
-			n_ready++;
+		int			n_ready = 0;

-			if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
-				(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
-				 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
 			{
-				wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
-				wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
-				wp->donor = i;
+				n_ready++;
+
+				if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
+					(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
+					 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
+					wp->donor == NULL)
+				{
+					wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
+					wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
+					wp->donor = &wp->safekeeper[i];
+				}
+				wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 			}
-			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 		}
+		collected = n_ready >= wp->quorum;
+		if (collected)
+		{
+			wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
+		}
+		return collected;
 	}

-	return n_ready >= wp->quorum;
+	/*
+	 * if generations are enabled we're expected to get to voting only when
+	 * mconf is established.
+	 */
+	Assert(wp->mconf.generation != INVALID_GENERATION);
+
+	/*
+	 * We must get votes from both msets if both are present.
+	 */
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset voters: ");
+	if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset voters: ");
+		if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp_log(LOG, "walproposer elected, %s", s.data);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }

 /*
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
 	 * that only for logical replication (and switching logical walsenders to
 	 * neon_walreader is a todo.)
 	 */
-	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
+	if (!wp->api.recovery_download(wp, wp->donor))
 	{
 		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
 	/*
 	 * Proposer's term history is the donor's + its own entry.
 	 */
-	dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
+	dth = &wp->donor->voteResponse.termHistory;
 	wp->propTermHistory.n_entries = dth->n_entries + 1;
 	wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
 	if (dth->n_entries > 0)
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;

-	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		   wp->quorum,
+	wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
 		   wp->propTerm,
 		   LSN_FORMAT_ARGS(wp->propTermStartLsn),
-		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		   wp->donor->host, wp->donor->port,
 		   LSN_FORMAT_ARGS(wp->truncateLsn));

 	/*
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)

 		readAnything = true;

+		/* should never happen: sk is expected to send ERROR instead */
+		if (sk->appendResponse.generation != wp->mconf.generation)
+		{
+			wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
+				   sk->greetResponse.nodeId, sk->host, sk->port,
+				   sk->appendResponse.generation, wp->mconf.generation);
+		}
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
 }

 /*
- * Calculate WAL position acknowledged by quorum
+ * GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
 */
 static XLogRecPtr
-GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
 {
 	XLogRecPtr	responses[MAX_SAFEKEEPERS];

 	/*
-	 * Sort acknowledged LSNs
+	 * Ascending sort acknowledged LSNs.
 	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	Assert(mset->len <= MAX_SAFEKEEPERS);
+	for (uint32 i = 0; i < mset->len; i++)
 	{
+		Safekeeper *sk = msk[i];
+
 		/*
 		 * Like in Raft, we aren't allowed to commit entries from previous
-		 * terms, so ignore reported LSN until it gets to epochStartLsn.
+		 * terms, so ignore reported LSN until it gets to propTermStartLsn.
+		 *
+		 * Note: we ignore sk state, which is ok: before first ack flushLsn is
+		 * 0, and later we just preserve value across reconnections. It would
+		 * be ok to check for SS_ACTIVE as well.
 		 */
-		responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
+		{
+			responses[i] = sk->appendResponse.flushLsn;
+		}
+		else
+		{
+			responses[i] = 0;
+		}
 	}
-	qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+	qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);

 	/*
-	 * Get the smallest LSN committed by quorum
+	 * And get value committed by the quorum. A way to view this: to get the
+	 * highest value committed on the quorum, in the ordered array we skip n -
+	 * n_quorum elements to get to the first (lowest) value present on all sks
+	 * of the highest quorum.
 	 */
-	return responses[wp->n_safekeepers - wp->quorum];
+	return responses[mset->len - MsetQuorum(mset)];
+}
+
+/*
+ * Calculate WAL position acknowledged by quorum, i.e. which may be regarded
+ * committed.
+ *
+ * Zero may be returned when there is no quorum of nodes recovered to term start
+ * lsn which sent feedback yet.
+ */
+static XLogRecPtr
+GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+{
+	XLogRecPtr	committed;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		XLogRecPtr	responses[MAX_SAFEKEEPERS];
+
+		/*
+		 * Sort acknowledged LSNs
+		 */
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			/*
+			 * Like in Raft, we aren't allowed to commit entries from previous
+			 * terms, so ignore reported LSN until it gets to
+			 * propTermStartLsn.
+			 *
+			 * Note: we ignore sk state, which is ok: before first ack
+			 * flushLsn is 0, and later we just preserve value across
+			 * reconnections. It would be ok to check for SS_ACTIVE as well.
+			 */
+			responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		}
+		qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+
+		/*
+		 * Get the smallest LSN committed by quorum
+		 */
+		return responses[wp->n_safekeepers - wp->quorum];
+	}
+
+	committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
+	if (wp->mconf.new_members.len > 0)
+	{
+		XLogRecPtr	new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
+
+		committed = Min(committed, new_mset_committed);
+	}
+	return committed;
 }

 /*
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
 	int			i;
 	XLogRecPtr	donor_lsn = InvalidXLogRecPtr;

-	if (wp->n_votes < wp->quorum)
+	if (wp->state < WPS_ELECTED)
 	{
 		wp_log(WARNING, "UpdateDonorShmem called before elections are won");
 		return;
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
 	 * about its position immediately after election before any feedbacks are
 	 * sent.
 	 */
-	if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
+	if (wp->donor->state >= SS_WAIT_ELECTED)
 	{
-		donor = &wp->safekeeper[wp->donor];
+		donor = wp->donor;
 		donor_lsn = wp->propTermStartLsn;
 	}

@@ -1746,22 +2105,19 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	}

 	/*
-	 * Generally sync is done when majority switched the epoch so we committed
-	 * epochStartLsn and made the majority aware of it, ensuring they are
-	 * ready to give all WAL to pageserver. It would mean whichever majority
-	 * is alive, there will be at least one safekeeper who is able to stream
-	 * WAL to pageserver to make basebackup possible. However, since at the
-	 * moment we don't have any good mechanism of defining the healthy and
-	 * most advanced safekeeper who should push the wal into pageserver and
+	 * Generally sync is done when majority reached propTermStartLsn so we
+	 * committed it and made the majority aware of it, ensuring they are ready
+	 * to give all WAL to pageserver. It would mean whichever majority is
+	 * alive, there will be at least one safekeeper who is able to stream WAL
+	 * to pageserver to make basebackup possible. However, since at the moment
+	 * we don't have any good mechanism of defining the healthy and most
+	 * advanced safekeeper who should push the wal into pageserver and
 	 * basically the random one gets connected, to prevent hanging basebackup
 	 * (due to pageserver connecting to not-synced-safekeeper) we currently
 	 * wait for all seemingly alive safekeepers to get synced.
 	 */
 	if (wp->config->syncSafekeepers)
 	{
-		int			n_synced;
-
-		n_synced = 0;
 		for (int i = 0; i < wp->n_safekeepers; i++)
 		{
 			Safekeeper *sk = &wp->safekeeper[i];
@@ -1770,11 +2126,9 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 			/* alive safekeeper which is not synced yet; wait for it */
 			if (sk->state != SS_OFFLINE && !synced)
 				return;
-			if (synced)
-				n_synced++;
 		}

-		if (n_synced >= wp->quorum)
+		if (newCommitLsn >= wp->propTermStartLsn)
 		{
 			/* A quorum of safekeepers has been synced! */

--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
 * This and following structs pair ones in membership.rs.
 */
 typedef uint32 Generation;
+#define INVALID_GENERATION 0

 typedef struct SafekeeperId
 {
@@ -771,7 +772,17 @@ typedef struct WalProposer
 	/* Current walproposer membership configuration */
 	MembershipConfiguration mconf;

-	/* (n_safekeepers / 2) + 1 */
+	/*
+	 * Parallels mconf.members with pointers to the member's slot in
+	 * safekeepers array of connections, or NULL if such member is not
+	 * connected. Helps to avoid looking slot per id through all
+	 * .safekeepers[] when doing quorum checks.
+	 */
+	Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
+	/* As above, but for new_members. */
+	Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
+
+	/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
 	int			quorum;

 	/*
@@ -829,7 +840,7 @@ typedef struct WalProposer
 	term_t		donorLastLogTerm;

 	/* Most advanced acceptor */
-	int			donor;
+	Safekeeper *donor;

 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -509,7 +509,14 @@ pub async fn run() -> anyhow::Result<()> {
            if let Some(mut redis_kv_client) = redis_kv_client {
                maintenance_tasks.spawn(async move {
                    redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await
+                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
+
+                    drop(redis_kv_client);
+
+                    // `handle_cancel_messages` was terminated due to the tx_cancel
+                    // being dropped. this is not worthy of an error, and this task can only return `Err`,
+                    // so let's wait forever instead.
+                    std::future::pending().await
                });
            }

--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,16 +1,17 @@
-use std::convert::Infallible;
 use std::net::{IpAddr, SocketAddr};
 use std::sync::Arc;

+use anyhow::{Context, anyhow};
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::CancelToken;
 use postgres_client::tls::MakeTlsConnect;
 use pq_proto::CancelKeyData;
+use redis::{FromRedisValue, Pipeline, Value, pipe};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio::sync::{mpsc, oneshot};
-use tracing::{debug, info};
+use tracing::{debug, info, warn};

 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::{AuthError, check_peer_addr_is_in_list};
@@ -30,6 +31,7 @@ type IpSubnetKey = IpNet;

 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
 const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
+const BATCH_SIZE: usize = 8;

 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -54,78 +56,168 @@ pub enum CancelKeyOp {
    },
 }

+impl CancelKeyOp {
+    /// Queues this op's redis commands on `pipe`; returns the reply handle, if any.
+    fn register(self, pipe: &mut Pipeline) -> Option<CancelReplyOp> {
+        #[allow(clippy::used_underscore_binding)]
+        match self {
+            CancelKeyOp::StoreCancelKey {
+                key,
+                field,
+                value,
+                resp_tx,
+                _guard,
+                expire,
+            } => {
+                // One op must yield one pipeline value: keep only `expire`'s result.
+                pipe.hset(&key, field, value).ignore();
+                pipe.expire(key, expire);
+                resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard })
+            }
+            CancelKeyOp::GetCancelData {
+                key,
+                resp_tx,
+                _guard,
+            } => {
+                pipe.hgetall(key);
+                Some(CancelReplyOp::GetCancelData { resp_tx, _guard })
+            }
+            CancelKeyOp::RemoveCancelKey {
+                key,
+                field,
+                resp_tx,
+                _guard,
+            } => {
+                pipe.hdel(key, field);
+                resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard })
+            }
+        }
+    }
+}
+
+// Pending reply for a `CancelKeyOp`: holds the oneshot sender awaiting its redis result
+pub enum CancelReplyOp {
+    StoreCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    GetCancelData {
+        resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    RemoveCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+}
+
+impl CancelReplyOp {
+    /// Fails the waiting caller with `e`.
+    ///
+    /// `StoreCancelKey` and `RemoveCancelKey` carry the same sender type and so
+    /// share an arm. A receiver that has gone away is not an error: it is only
+    /// noted at debug level and otherwise ignored.
+    fn send_err(self, e: anyhow::Error) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard }
+            | CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+
+    /// Decodes the raw redis value `v` into the type this reply expects and
+    /// hands the outcome to the waiting caller.
+    ///
+    /// A decoding failure is forwarded as an `Err` with the context
+    /// "could not parse value" rather than dropped. As in `send_err`, a
+    /// receiver that has gone away is only noted at debug level.
+    fn send_value(self, v: redis::Value) {
+        match self {
+            // Both of these reply with `anyhow::Result<()>`.
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard }
+            | CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                let decoded =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(decoded)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                let decoded =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(decoded)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+}
+
 // Running as a separate task to accept messages through the rx channel
-// In case of problems with RTT: switch to recv_many() + redis pipeline
 pub async fn handle_cancel_messages(
    client: &mut RedisKVClient,
    mut rx: mpsc::Receiver<CancelKeyOp>,
-) -> anyhow::Result<Infallible> {
+) -> anyhow::Result<()> {
+    let mut batch = Vec::new();
+    let mut replies = vec![];
+
    loop {
-        if let Some(msg) = rx.recv().await {
-            match msg {
-                CancelKeyOp::StoreCancelKey {
-                    key,
-                    field,
-                    value,
-                    resp_tx,
-                    _guard,
-                    expire,
-                } => {
-                    let res = client.hset(&key, field, value).await;
-                    if let Some(resp_tx) = resp_tx {
-                        if res.is_ok() {
-                            resp_tx
-                                .send(client.expire(key, expire).await)
-                                .inspect_err(|e| {
-                                    tracing::debug!(
-                                        "failed to send StoreCancelKey response: {:?}",
-                                        e
-                                    );
-                                })
-                                .ok();
-                        } else {
-                            resp_tx
-                                .send(res)
-                                .inspect_err(|e| {
-                                    tracing::debug!(
-                                        "failed to send StoreCancelKey response: {:?}",
-                                        e
-                                    );
-                                })
-                                .ok();
-                        }
-                    } else if res.is_ok() {
-                        drop(client.expire(key, expire).await);
-                    } else {
-                        tracing::warn!("failed to store cancel key: {:?}", res);
-                    }
+        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
+            warn!("shutting down cancellation queue");
+            break Ok(());
+        }
+
+        let batch_size = batch.len();
+        debug!(batch_size, "running cancellation jobs");
+
+        let mut pipe = pipe();
+        for msg in batch.drain(..) {
+            if let Some(reply) = msg.register(&mut pipe) {
+                replies.push(reply);
+            } else {
+                pipe.ignore();
+            }
+        }
+
+        let responses = replies.len();
+
+        match client.query(pipe).await {
+            // for each reply, we expect that many values.
+            Ok(Value::Array(values)) if values.len() == responses => {
+                debug!(
+                    batch_size,
+                    responses, "successfully completed cancellation jobs",
+                );
+                for (value, reply) in std::iter::zip(values, replies.drain(..)) {
+                    reply.send_value(value);
                }
-                CancelKeyOp::GetCancelData {
-                    key,
-                    resp_tx,
-                    _guard,
-                } => {
-                    drop(resp_tx.send(client.hget_all(key).await));
+            }
+            Ok(value) => {
+                debug!(?value, "unexpected redis return value");
+                for reply in replies.drain(..) {
+                    reply.send_err(anyhow!("incorrect response type from redis"));
                }
-                CancelKeyOp::RemoveCancelKey {
-                    key,
-                    field,
-                    resp_tx,
-                    _guard,
-                } => {
-                    if let Some(resp_tx) = resp_tx {
-                        resp_tx
-                            .send(client.hdel(key, field).await)
-                            .inspect_err(|e| {
-                                tracing::debug!("failed to send StoreCancelKey response: {:?}", e);
-                            })
-                            .ok();
-                    } else {
-                        drop(client.hdel(key, field).await);
-                    }
+            }
+            Err(err) => {
+                for reply in replies.drain(..) {
+                    reply.send_err(anyhow!("could not send cmd to redis: {err}"));
                }
            }
        }
+
+        replies.clear();
    }
 }

--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -1,4 +1,5 @@
-use redis::{AsyncCommands, ToRedisArgs};
+use redis::aio::ConnectionLike;
+use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};

 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
@@ -8,6 +9,23 @@ pub struct RedisKVClient {
    limiter: GlobalRateLimiter,
 }

+#[allow(async_fn_in_trait)]
+pub trait Queryable {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T>;
+}
+
+impl Queryable for Pipeline {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
+        self.query_async(conn).await
+    }
+}
+
+impl Queryable for Cmd {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
+        self.query_async(conn).await
+    }
+}
+
 impl RedisKVClient {
    pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self {
        Self {
@@ -27,158 +45,24 @@ impl RedisKVClient {
        Ok(())
    }

-    pub(crate) async fn hset<K, F, V>(&mut self, key: K, field: F, value: V) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-        V: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hset");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hset(&key, &field, &value).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hset(key, field, value)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn hset_multiple<K, V>(
+    pub(crate) async fn query<T: FromRedisValue>(
        &mut self,
-        key: &str,
-        items: &[(K, V)],
-    ) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        V: ToRedisArgs + Send + Sync,
-    {
+        q: impl Queryable,
+    ) -> anyhow::Result<T> {
        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hset_multiple");
+            tracing::info!("Rate limit exceeded. Skipping query");
            return Err(anyhow::anyhow!("Rate limit exceeded"));
        }

-        match self.client.hset_multiple(key, items).await {
-            Ok(()) => return Ok(()),
+        match q.query(&mut self.client).await {
+            Ok(t) => return Ok(t),
            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
+                tracing::error!("failed to run query: {e}");
            }
        }

-        tracing::info!("Redis client is disconnected. Reconnectiong...");
+        tracing::info!("Redis client is disconnected. Reconnecting...");
        self.try_connect().await?;
-        self.client
-            .hset_multiple(key, items)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn expire<K>(&mut self, key: K, seconds: i64) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping expire");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.expire(&key, seconds).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .expire(key, seconds)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn hget<K, F, V>(&mut self, key: K, field: F) -> anyhow::Result<V>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-        V: redis::FromRedisValue,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hget");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hget(&key, &field).await {
-            Ok(value) => return Ok(value),
-            Err(e) => {
-                tracing::error!("failed to get a value: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hget(key, field)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    pub(crate) async fn hget_all<K, V>(&mut self, key: K) -> anyhow::Result<V>
-    where
-        K: ToRedisArgs + Send + Sync,
-        V: redis::FromRedisValue,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hgetall");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hgetall(&key).await {
-            Ok(value) => return Ok(value),
-            Err(e) => {
-                tracing::error!("failed to get a value: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client.hgetall(key).await.map_err(anyhow::Error::new)
-    }
-
-    pub(crate) async fn hdel<K, F>(&mut self, key: K, field: F) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hdel");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hdel(&key, &field).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to delete a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hdel(key, field)
-            .await
-            .map_err(anyhow::Error::new)
+        Ok(q.query(&mut self.client).await?)
    }
 }
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -27,6 +27,7 @@ humantime.workspace = true
 http.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
+jsonwebtoken.workspace = true
 futures.workspace = true
 once_cell.workspace = true
 parking_lot.workspace = true
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -226,9 +226,6 @@ struct Args {
    /// Path to the JWT auth token used to authenticate with other safekeepers.
    #[arg(long)]
    auth_token_path: Option<Utf8PathBuf>,
-
-    #[arg(long, help = "Run in development mode (disables security checks)")]
-    dev: bool,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -346,21 +343,6 @@ async fn main() -> anyhow::Result<()> {
        }
    };

-    if !args.dev {
-        let http_auth_enabled = args.http_auth_public_key_path.is_some();
-        let pg_auth_enabled = args.pg_auth_public_key_path.is_some();
-        let pg_tenant_only_auth_enabled = args.pg_tenant_only_auth_public_key_path.is_some();
-        if !http_auth_enabled || !pg_auth_enabled || !pg_tenant_only_auth_enabled {
-            bail!(
-                "Safekeeper refuses to start with HTTP, PostgreSQL, or tenant-only PostgreSQL API authentication disabled.\n\
-                  Run with --dev to allow running without authentication.\n\
-                  This is insecure and should only be used in development environments."
-            );
-        }
-    } else {
-        warn!("Starting in dev mode: this may be an insecure configuration.");
-    }
-
    // Load JWT auth token to connect to other safekeepers for pull_timeline.
    // First check if the env var is present, then check the arg with the path.
    // We want to deprecate and remove the env var method in the future.
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -6,6 +6,7 @@ use std::str::{self, FromStr};
 use std::sync::Arc;

 use anyhow::Context;
+use jsonwebtoken::TokenData;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
 use postgres_backend::{PostgresBackend, QueryError};
@@ -278,7 +279,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
            .auth
            .as_ref()
            .expect("auth_type is configured but .auth of handler is missing");
-        let data = auth
+        let data: TokenData<Claims> = auth
            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
            .map_err(|e| QueryError::Unauthorized(e.0))?;

--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -31,6 +31,7 @@ pub async fn task_main_https(
    global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
    let cert_resolver = ReloadingCertificateResolver::new(
+        "main",
        &conf.ssl_key_file,
        &conf.ssl_cert_file,
        conf.ssl_cert_reload_period,
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -629,15 +629,13 @@ impl ComputeHook {
        };

        let result = if !self.config.use_local_compute_notifications {
-            let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url {
-                Some(if control_plane_url.ends_with('/') {
-                    format!("{control_plane_url}notify-attach")
-                } else {
-                    format!("{control_plane_url}/notify-attach")
-                })
-            } else {
-                self.config.compute_hook_url.clone()
-            };
+            let compute_hook_url =
+                self.config
+                    .control_plane_url
+                    .as_ref()
+                    .map(|control_plane_url| {
+                        format!("{}/notify-attach", control_plane_url.trim_end_matches('/'))
+                    });

            // We validate this at startup
            let notify_url = compute_hook_url.as_ref().unwrap();
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
    ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
+    TimelineImportRequest,
 };
 use pageserver_api::models::{
    DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
        ForwardOutcome::NotForwarded(req) => req,
    };

-    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.step_down().await)
+    // Spawn a background task: once we start stepping down, we must finish: if the client drops
+    // their request we should avoid stopping in some part-stepped-down state.
+    let handle = tokio::spawn(async move {
+        let state = get_state(&req);
+        state.service.step_down().await
+    });
+
+    let result = handle
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    json_response(StatusCode::OK, result)
 }

 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
    )
 }

+async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
+
+    let state = get_state(&req);
+
+    if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
+            import_req.tenant_id,
+            import_req.timeline_id
+        )));
+    }
+
+    json_response(
+        StatusCode::OK,
+        state.service.timeline_import(import_req).await?,
+    )
+}
+
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -1949,6 +1991,16 @@ pub fn make_router(
                RequestName("debug_v1_tenant_locate"),
            )
        })
+        .post(
+            "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
+            |r| {
+                named_request_span(
+                    r,
+                    handle_timeline_import,
+                    RequestName("debug_v1_timeline_import"),
+                )
+            },
+        )
        .get("/debug/v1/scheduler", |r| {
            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
        })
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -86,10 +86,6 @@ struct Cli {
    #[arg(long)]
    peer_jwt_token: Option<String>,

-    /// URL to control plane compute notification endpoint
-    #[arg(long)]
-    compute_hook_url: Option<String>,
-
    /// URL to control plane storage API prefix
    #[arg(long)]
    control_plane_url: Option<String>,
@@ -360,13 +356,11 @@ async fn async_main() -> anyhow::Result<()> {
                "Insecure config!  One or more secrets is not set.  This is only permitted in `--dev` mode"
            );
        }
-        StrictMode::Strict
-            if args.compute_hook_url.is_none() && args.control_plane_url.is_none() =>
-        {
+        StrictMode::Strict if args.control_plane_url.is_none() => {
            // Production systems should always have a control plane URL set, to prevent falling
            // back to trying to use neon_local.
            anyhow::bail!(
-                "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode"
+                "`--control-plane-url` is not set: this is only permitted in `--dev` mode"
            );
        }
        StrictMode::Strict if args.use_local_compute_notifications => {
@@ -394,7 +388,6 @@ async fn async_main() -> anyhow::Result<()> {
        safekeeper_jwt_token: secrets.safekeeper_jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
        peer_jwt_token: secrets.peer_jwt_token,
-        compute_hook_url: args.compute_hook_url,
        control_plane_url: args.control_plane_url,
        max_offline_interval: args
            .max_offline_interval
@@ -472,6 +465,7 @@ async fn async_main() -> anyhow::Result<()> {
            let https_listener = tcp_listener::bind(https_addr)?;

            let resolver = ReloadingCertificateResolver::new(
+                "main",
                &args.ssl_key_file,
                &args.ssl_cert_file,
                *args.ssl_cert_reload_period,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
-use utils::sync::gate::Gate;
+use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};

 use crate::background_node_operations::{
@@ -357,18 +357,10 @@ pub struct Config {
    // This JWT token will be used to authenticate with other storage controller instances
    pub peer_jwt_token: Option<String>,

-    /// Where the compute hook should send notifications of pageserver attachment locations
-    /// (this URL points to the control plane in prod). If this is None, the compute hook will
-    /// assume it is running in a test environment and try to update neon_local.
-    pub compute_hook_url: Option<String>,
-
    /// Prefix for storage API endpoints of the control plane. We use this prefix to compute
    /// URLs that we use to send pageserver and safekeeper attachment locations.
    /// If this is None, the compute hook will assume it is running in a test environment
    /// and try to invoke neon_local instead.
-    ///
-    /// For now, there is also `compute_hook_url` which allows configuration of the pageserver
-    /// specific endpoint, but it is in the process of being phased out.
    pub control_plane_url: Option<String>,

    /// Grace period within which a pageserver does not respond to heartbeats, but is still
@@ -594,6 +586,8 @@ struct TenantShardSplitAbort {
    new_stripe_size: Option<ShardStripeSize>,
    /// Until this abort op is complete, no other operations may be done on the tenant
    _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    /// The reconciler gate for the duration of the split operation, and any included abort.
+    _gate: GateGuard,
 }

 #[derive(thiserror::Error, Debug)]
@@ -1460,7 +1454,7 @@ impl Service {
            // Retry until shutdown: we must keep this request object alive until it is properly
            // processed, as it holds a lock guard that prevents other operations trying to do things
            // to the tenant while it is in a weird part-split state.
-            while !self.cancel.is_cancelled() {
+            while !self.reconcilers_cancel.is_cancelled() {
                match self.abort_tenant_shard_split(&op).await {
                    Ok(_) => break,
                    Err(e) => {
@@ -1473,9 +1467,12 @@ impl Service {
                        // when we retry, so that the abort op will succeed.  If the abort op is failing
                        // for some other reason, we will keep retrying forever, or until a human notices
                        // and does something about it (either fixing a pageserver or restarting the controller).
-                        tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
-                            .await
-                            .ok();
+                        tokio::time::timeout(
+                            Duration::from_secs(5),
+                            self.reconcilers_cancel.cancelled(),
+                        )
+                        .await
+                        .ok();
                    }
                }
            }
@@ -1847,6 +1844,7 @@ impl Service {
        };

        if insert {
+            let config = attach_req.config.clone().unwrap_or_default();
            let tsp = TenantShardPersistence {
                tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
@@ -1855,7 +1853,7 @@ impl Service {
                generation: attach_req.generation_override.or(Some(0)),
                generation_pageserver: None,
                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
-                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                config: serde_json::to_string(&config).unwrap(),
                splitting: SplitState::default(),
                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                    .unwrap(),
@@ -1878,16 +1876,16 @@ impl Service {
                Ok(()) => {
                    tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);

-                    let mut locked = self.inner.write().unwrap();
-                    locked.tenants.insert(
+                    let mut shard = TenantShard::new(
                        attach_req.tenant_shard_id,
-                        TenantShard::new(
-                            attach_req.tenant_shard_id,
-                            ShardIdentity::unsharded(),
-                            PlacementPolicy::Attached(0),
-                            None,
-                        ),
+                        ShardIdentity::unsharded(),
+                        PlacementPolicy::Attached(0),
+                        None,
                    );
+                    shard.config = config;
+
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(attach_req.tenant_shard_id, shard);
                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
                }
            }
@@ -1972,11 +1970,12 @@ impl Service {
            .set_attached(scheduler, attach_req.node_id);

        tracing::info!(
-            "attach_hook: tenant {} set generation {:?}, pageserver {}",
+            "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
            attach_req.tenant_shard_id,
            tenant_shard.generation,
            // TODO: this is an odd number of 0xf's
-            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
+            attach_req.config,
        );

        // Trick the reconciler into not doing anything for this tenant: this helps
@@ -4910,7 +4909,7 @@ impl Service {
                    1,
                    10,
                    Duration::from_secs(5),
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                )
                .await
            {
@@ -5161,6 +5160,11 @@ impl Service {
        )
        .await;

+        let _gate = self
+            .reconcilers_gate
+            .enter()
+            .map_err(|_| ApiError::ShuttingDown)?;
+
        let new_shard_count = ShardCount::new(split_req.new_shard_count);
        let new_stripe_size = split_req.new_stripe_size;

@@ -5188,6 +5192,7 @@ impl Service {
                        new_shard_count,
                        new_stripe_size,
                        _tenant_lock,
+                        _gate,
                    })
                    // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
                    .ok();
@@ -5527,7 +5532,10 @@ impl Service {
                "failpoint".to_string()
            )));

-            failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
+            failpoint_support::sleep_millis_async!(
+                "shard-split-post-remote-sleep",
+                &self.reconcilers_cancel
+            );

            tracing::info!(
                "Split {} into {}",
@@ -5585,7 +5593,7 @@ impl Service {
                        stripe_size,
                        preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
                    },
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                )
                .await
            {
@@ -8670,9 +8678,24 @@ impl Service {
        failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");

        self.inner.write().unwrap().step_down();
-        // TODO: would it make sense to have a time-out for this?
-        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
-            .await;
+
+        // Wait for reconciliations to stop, or terminate this process if they
+        // fail to stop in time (this indicates a bug in shutdown)
+        tokio::select! {
+            _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
+                tracing::info!("Reconciliations stopped, proceeding with step down");
+            }
+            _ = async {
+                failpoint_support::sleep_millis_async!("step-down-delay-timeout");
+                tokio::time::sleep(Duration::from_secs(10)).await
+            } => {
+                tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
+
+                // The caller may proceed to act as leader when it sees this request fail: reduce the chance
+                // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
+                std::process::exit(1);
+            }
+        }

        let mut global_observed = GlobalObservedState::default();
        let locked = self.inner.read().unwrap();
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -12,13 +12,16 @@ use crate::persistence::{
 use crate::safekeeper::Safekeeper;
 use anyhow::Context;
 use http_utils::error::ApiError;
-use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
+use pageserver_api::controller_api::{
+    SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
+};
 use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::membership::{MemberSet, SafekeeperId};
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
+use utils::lsn::Lsn;

 use super::Service;

@@ -298,6 +301,31 @@ impl Service {
            timeline_id,
        })
    }
+
+    /// Directly insert the timeline into the database without reconciling it with safekeepers.
+    ///
+    /// Useful if the timeline already exists on the specified safekeepers,
+    /// but we want to make it storage controller managed.
+    pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
+        let persistence = TimelinePersistence {
+            tenant_id: req.tenant_id.to_string(),
+            timeline_id: req.timeline_id.to_string(),
+            start_lsn: Lsn::INVALID.into(),
+            generation: 1,
+            sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
+            new_sk_set: None,
+            cplane_notified_generation: 1,
+            deleted_at: None,
+        };
+        let inserted = self.persistence.insert_timeline(persistence).await?;
+        if inserted {
+            tracing::info!("imported timeline into db");
+        } else {
+            tracing::info!("didn't import timeline into db, as it is already present in db");
+        }
+        Ok(())
+    }
+
    /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
    pub(super) async fn tenant_timeline_delete_safekeepers(
        self: &Arc<Self>,
--- a/test_runner/cloud_regress/README.md
+++ b/test_runner/cloud_regress/README.md
@@ -3,19 +3,35 @@
 * Create a Neon project on staging.
 * Grant the superuser privileges to the DB user.
 * (Optional) create a branch for testing
-* Configure the endpoint by updating the control-plane database with the following settings:
+* Add the following settings to the `pg_settings` section of the default endpoint configuration for the project using the admin interface:
  * `Timeone`: `America/Los_Angeles`
  * `DateStyle`: `Postgres,MDY`
  * `compute_query_id`: `off`
+* Add the following section to the project configuration:
+```json
+"preload_libraries": {
+    "use_defaults": false,
+    "enabled_libraries": []
+}
+```
 * Checkout the actual `Neon` sources
 * Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17:
 ```bash
 $ cd vendor/postgres-v17
 $ patch -p1 <../../compute/patches/cloud_regress_pg17.patch
 ```
+* Set the environment variables (please modify according to your configuration):
+```bash
+$ export DEFAULT_PG_VERSION=17
+$ export BUILD_TYPE=release
+```
+* Build the Neon binaries (see [README.md](../../README.md))
 * Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project.
-* Set the environment variable `PG_VERSION` to the version of your project.
+* Update poetry by running:
+```bash
+$ scripts/pysync
+```
 * Run 
 ```bash
-$ pytest -m remote_cluster -k cloud_regress
+$ scripts/pytest -m remote_cluster -k cloud_regress
 ```
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -194,6 +194,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    counter("pageserver_wait_lsn_started_count"),
    counter("pageserver_wait_lsn_finished_count"),
    counter("pageserver_wait_ondemand_download_seconds_sum"),
+    counter("pageserver_page_service_batch_break_reason"),
    *histogram("pageserver_page_service_batch_size"),
    *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,6 +14,7 @@ import threading
 import time
 import uuid
 from collections import defaultdict
+from collections.abc import Mapping
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
@@ -79,7 +80,12 @@ from fixtures.remote_storage import (
    default_remote_storage,
    remote_storage_to_toml_dict,
 )
-from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.http import (
+    MembershipConfiguration,
+    SafekeeperHttpClient,
+    SafekeeperId,
+    TimelineCreateRequest,
+)
 from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
@@ -941,6 +947,8 @@ class NeonEnvBuilder:
                    continue
                if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name):
                    continue
+                if FINAL_METRICS_FILE_NAME == test_file.name:
+                    continue
                log.debug(f"Removing large database {test_file} file")
                test_file.unlink()
            elif test_entry.is_dir():
@@ -1249,6 +1257,7 @@ class NeonEnv:
                "mode": "pipelined",
                "execution": "concurrent-futures",
                "max_batch_size": 32,
+                "batching": "scattered-lsn",
            }

            get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
@@ -1450,6 +1459,12 @@ class NeonEnv:
                except Exception as e:
                    metric_errors.append(e)
                    log.error(f"metric validation failed on {pageserver.id}: {e}")
+
+            try:
+                pageserver.snapshot_final_metrics()
+            except Exception as e:
+                log.error(f"metric snapshot failed on {pageserver.id}: {e}")
+
            try:
                pageserver.stop(immediate=immediate)
            except RuntimeError:
@@ -1980,10 +1995,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
        tenant_shard_id: TenantId | TenantShardId,
        pageserver_id: int,
        generation_override: int | None = None,
+        config: None | dict[str, Any] = None,
    ) -> int:
        body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
        if generation_override is not None:
            body["generation_override"] = generation_override
+        if config is not None:
+            body["config"] = config

        response = self.request(
            "POST",
@@ -2878,13 +2896,14 @@ class NeonPageserver(PgProtocol, LogUtils):
        self,
        immediate: bool = False,
        timeout_in_seconds: int | None = None,
+        extra_env_vars: dict[str, str] | None = None,
    ):
        """
        High level wrapper for restart: restarts the process, and waits for
        tenant state to stabilize.
        """
        self.stop(immediate=immediate)
-        self.start(timeout_in_seconds=timeout_in_seconds)
+        self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
        self.quiesce_tenants()

    def quiesce_tenants(self):
@@ -2961,6 +2980,20 @@ class NeonPageserver(PgProtocol, LogUtils):
            value = self.http_client().get_metric_value(metric)
            assert value == 0, f"Nonzero {metric} == {value}"

+    def snapshot_final_metrics(self):
+        """
+        Dump this pageserver's metrics text into a file inside its work directory.
+
+        A pageserver that is not running is skipped with a log message.
+        """
+        if self.running:
+            # Fetch before opening the file, so a failed request never creates/truncates it.
+            snapshot_text = self.http_client().get_metrics_str()
+            with open(self.workdir / FINAL_METRICS_FILE_NAME, "w") as out:
+                out.write(snapshot_text)
+        else:
+            log.info(f"Skipping metrics snapshot on pageserver {self.id}, it is not running")
+
    def tenant_attach(
        self,
        tenant_id: TenantId,
@@ -2973,11 +3006,12 @@ class NeonPageserver(PgProtocol, LogUtils):
        to call into the pageserver HTTP client.
        """
        client = self.http_client()
-        if generation is None:
-            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
-        elif override_storage_controller_generation:
+        if generation is None or override_storage_controller_generation:
            generation = self.env.storage_controller.attach_hook_issue(
-                tenant_id, self.id, generation
+                tenant_id,
+                self.id,
+                generation_override=generation if override_storage_controller_generation else None,
+                config=config,
            )
        return client.tenant_attach(
            tenant_id,
@@ -4292,31 +4326,32 @@ class Endpoint(PgProtocol, LogUtils):
    def respec_deep(self, **kwargs: Any) -> None:
        """
        Update the endpoint.json file taking into account nested keys.
-        It does one level deep update. Should enough for most cases.
        Distinct method from respec() to do not break existing functionality.
-        NOTE: This method also updates the spec.json file, not endpoint.json.
-        We need it because neon_local also writes to spec.json, so intended
+        NOTE: This method also updates the config.json file, not endpoint.json.
+        We need it because neon_local also writes to config.json, so intended
        use-case is i) start endpoint with some config, ii) respec_deep(),
        iii) call reconfigure() to apply the changes.
        """
-        config_path = os.path.join(self.endpoint_path(), "spec.json")
-        with open(config_path) as f:
-            data_dict: dict[str, Any] = json.load(f)

-        log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
-
-        for key, value in kwargs.items():
-            if isinstance(value, dict):
-                if key not in data_dict:
-                    data_dict[key] = value
+        def update(curr, patch):
+            for k, v in patch.items():
+                if isinstance(v, Mapping):
+                    curr[k] = update(curr.get(k, {}), v)
                else:
-                    data_dict[key] = {**data_dict[key], **value}
-            else:
-                data_dict[key] = value
+                    curr[k] = v
+            return curr
+
+        config_path = os.path.join(self.endpoint_path(), "config.json")
+        with open(config_path) as f:
+            config: dict[str, Any] = json.load(f)
+
+        log.debug("Current compute config: %s", json.dumps(config, indent=4))
+
+        update(config, kwargs)

        with open(config_path, "w") as file:
-            log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
-            json.dump(data_dict, file, indent=4)
+            log.debug("Updating compute config to: %s", json.dumps(config, indent=4))
+            json.dump(config, file, indent=4)

    def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None:
        """
@@ -4333,7 +4368,7 @@ class Endpoint(PgProtocol, LogUtils):
            wait_until(check_migrations_done)

    # Mock the extension part of spec passed from control plane for local testing
-    # endpooint.rs adds content of this file as a part of the spec.json
+    # endpoint.rs adds content of this file as a part of the config.json
    def create_remote_extension_spec(self, spec: dict[str, Any]):
        """Create a remote extension spec file for the endpoint."""
        remote_extensions_spec_path = os.path.join(
@@ -4839,6 +4874,50 @@ class Safekeeper(LogUtils):

        wait_until(paused)

+    @staticmethod
+    def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]:
+        """Map each safekeeper to a `SafekeeperId` on localhost at its `pg_tenant_only` port."""
+        return [SafekeeperId(s.id, "localhost", s.port.pg_tenant_only) for s in sks]
+
+    @staticmethod
+    def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]:
+        """Return the safekeepers of `env` that `mconf` lists as members or new members."""
+        wanted_ids = [member.id for member in mconf.members]
+        if mconf.new_members is not None:
+            wanted_ids.extend(member.id for member in mconf.new_members)
+        return [sk for sk in env.safekeepers if sk.id in wanted_ids]
+
+    @staticmethod
+    def create_timeline(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        ps: NeonPageserver,
+        mconf: MembershipConfiguration,
+        members_sks: list[Safekeeper],
+    ):
+        """
+        Manually create timeline on safekeepers with given (presumably initial)
+        mconf: figure out LSN from pageserver, bake request and execute it on
+        given safekeepers.
+
+        Normally done by storcon, but some tests want to do it manually so far.
+        """
+        ps_http_cli = ps.http_client()
+        # figure out initial LSN.
+        ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
+        init_lsn = ps_timeline_detail["last_record_lsn"]
+        log.info(f"initial LSN: {init_lsn}")
+        # sk timeline creation request expects the minor version included (like 150002 for 15.2)
+        pg_version = ps_timeline_detail["pg_version"] * 10000
+        # build the create request carrying the initial mconf
+        create_r = TimelineCreateRequest(
+            tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+        )
+        log.info(f"sending timeline create: {create_r.to_json()}")
+
+        for sk in members_sks:
+            sk.http_client().timeline_create(create_r)
+

 class NeonBroker(LogUtils):
    """An object managing storage_broker instance"""
@@ -5077,6 +5156,8 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile(
    r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
 )

+FINAL_METRICS_FILE_NAME: str = "final_metrics.txt"
+

 SKIP_DIRS = frozenset(
    (
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -126,8 +126,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    ".*startup_reconcile: Could not scan node.*",
    # Tests run in dev mode
    ".*Starting in dev mode.*",
-    ".*Starting in dev mode - authentication security checks are disabled.*",
-    ".*Starting in dev mode: this may be an insecure configuration.*",
    # Tests that stop endpoints & use the storage controller's neon_local notification
    # mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage
    # controller's attempts to notify the endpoint).
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -65,13 +65,11 @@ def single_timeline(
    assert ps_http.tenant_list() == []

    def attach(tenant):
-        # NB: create the new tenant in the storage controller with the correct tenant config. This
-        # will pick up the existing tenant data from remote storage. If we just attach it to the
-        # Pageserver, the storage controller will reset the tenant config to the default.
-        env.create_tenant(
-            tenant_id=tenant,
-            timeline_id=template_timeline,
-            conf=template_config,
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config,
+            generation=100,
+            override_storage_controller_generation=True,
        )

    with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -199,7 +199,7 @@ def wait_for_last_record_lsn(
    """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""

    current_lsn = Lsn(0)
-    for i in range(1000):
+    for i in range(2000):
        current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
        if current_lsn >= lsn:
            return current_lsn
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -25,7 +25,7 @@ class Walreceiver:

@dataclass
 class SafekeeperTimelineStatus:
-    mconf: Configuration | None
+    mconf: MembershipConfiguration | None
    term: int
    last_log_term: int
    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
@@ -78,17 +78,17 @@ class SafekeeperId:


@dataclass
-class Configuration:
+class MembershipConfiguration:
    generation: int
    members: list[SafekeeperId]
    new_members: list[SafekeeperId] | None

    @classmethod
-    def from_json(cls, d: dict[str, Any]) -> Configuration:
+    def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration:
        generation = d["generation"]
        members = d["members"]
        new_members = d.get("new_members")
-        return Configuration(generation, members, new_members)
+        return MembershipConfiguration(generation, members, new_members)

    def to_json(self) -> str:
        return json.dumps(self, cls=EnhancedJSONEncoder)
@@ -98,7 +98,7 @@ class Configuration:
 class TimelineCreateRequest:
    tenant_id: TenantId
    timeline_id: TimelineId
-    mconf: Configuration
+    mconf: MembershipConfiguration
    # not exactly PgVersion, for example 150002 for 15.2
    pg_version: int
    start_lsn: Lsn
@@ -110,13 +110,13 @@ class TimelineCreateRequest:

@dataclass
 class TimelineMembershipSwitchResponse:
-    previous_conf: Configuration
-    current_conf: Configuration
+    previous_conf: MembershipConfiguration
+    current_conf: MembershipConfiguration

    @classmethod
    def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
-        previous_conf = Configuration.from_json(d["previous_conf"])
-        current_conf = Configuration.from_json(d["current_conf"])
+        previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
+        current_conf = MembershipConfiguration.from_json(d["current_conf"])
        return TimelineMembershipSwitchResponse(previous_conf, current_conf)


@@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        resj = res.json()
        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
        # It is always normally not None, it is allowed only to make forward compat tests happy.
-        mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
+        mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None
        return SafekeeperTimelineStatus(
            mconf=mconf,
            term=resj["acceptor_state"]["term"],
@@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        return self.timeline_status(tenant_id, timeline_id).commit_lsn

    # Get timeline membership configuration.
-    def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
+    def get_membership(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> MembershipConfiguration:
        # make mypy happy
        return self.timeline_status(tenant_id, timeline_id).mconf  # type: ignore

@@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        return res_json

    def timeline_exclude(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
    ) -> dict[str, Any]:
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude",
@@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        return res_json

    def membership_switch(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
    ) -> TimelineMembershipSwitchResponse:
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count(

    n_txns = 500000

-    def setup_wrapper(env: NeonEnv):
-        return setup_tenant_template(env, n_txns)
-
    env = setup_pageserver_with_tenants(
-        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+        neon_env_builder,
+        f"large_slru_count-{n_tenants}-{n_txns}",
+        n_tenants,
+        lambda env: setup_tenant_template(env, n_txns),
    )
    run_benchmark(env, pg_bin, record, duration)

@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
        "gc_period": "0s",  # disable periodic gc
        "checkpoint_timeout": "10 years",
        "compaction_period": "0s",  # disable periodic compaction
-        "compaction_threshold": 10,
-        "compaction_target_size": 134217728,
-        "checkpoint_distance": 268435456,
-        "image_creation_threshold": 3,
    }

    template_tenant, template_timeline = env.create_tenant(set_default=True)
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -1,5 +1,7 @@
+import concurrent.futures
 import dataclasses
 import json
+import threading
 import time
 from dataclasses import dataclass
 from pathlib import Path
@@ -28,38 +30,33 @@ class PageServicePipeliningConfigSerial(PageServicePipeliningConfig):
 class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
    max_batch_size: int
    execution: str
+    batching: str
    mode: str = "pipelined"


-EXECUTION = ["concurrent-futures", "tasks"]
+EXECUTION = ["concurrent-futures"]
+BATCHING = ["uniform-lsn", "scattered-lsn"]

 NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
    for execution in EXECUTION:
-        NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            NON_BATCHABLE.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )

-BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 2, 4, 8, 16, 32]:
+BATCHABLE: list[PageServicePipeliningConfig] = []
+for max_batch_size in [32]:
    for execution in EXECUTION:
-        BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            BATCHABLE.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )


@pytest.mark.parametrize(
    "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
    [
-        # non-batchable workloads
-        # (A separate benchmark will consider latency).
-        *[
-            (
-                50,
-                config,
-                TARGET_RUNTIME,
-                1,
-                128,
-                f"not batchable {dataclasses.asdict(config)}",
-            )
-            for config in NON_BATCHABLE
-        ],
        # batchable workloads should show throughput and CPU efficiency improvements
        *[
            (
@@ -137,7 +134,14 @@ def test_throughput(

    env = neon_env_builder.init_start()
    ps_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # minimal lfc & small shared buffers to force requests to pageserver
+            "neon.max_file_cache_size=1MB",
+            "shared_buffers=10MB",
+        ],
+    )
    conn = endpoint.connect()
    cur = conn.cursor()

@@ -155,7 +159,6 @@ def test_throughput(
    tablesize = tablesize_mib * 1024 * 1024
    npages = tablesize // (8 * 1024)
    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
-    # TODO: can we force postgres to do sequential scans?

    #
    # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
@@ -166,6 +169,7 @@ def test_throughput(
        time: float
        pageserver_batch_size_histo_sum: float
        pageserver_batch_size_histo_count: float
+        pageserver_batch_breaks_reason_count: dict[str, int]
        compute_getpage_count: float
        pageserver_cpu_seconds_total: float

@@ -179,6 +183,10 @@ def test_throughput(
                compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
                - other.pageserver_cpu_seconds_total,
+                pageserver_batch_breaks_reason_count={
+                    reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0)
+                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+                },
            )

        def normalize(self, by) -> "Metrics":
@@ -188,6 +196,10 @@ def test_throughput(
                pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
                compute_getpage_count=self.compute_getpage_count / by,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
+                pageserver_batch_breaks_reason_count={
+                    reason: count / by
+                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+                },
            )

    def get_metrics() -> Metrics:
@@ -197,6 +209,20 @@ def test_throughput(
            )
            compute_getpage_count = cur.fetchall()[0][0]
            pageserver_metrics = ps_http.get_metrics()
+            for name, samples in pageserver_metrics.metrics.items():
+                for sample in samples:
+                    log.info(f"{name=} labels={sample.labels} {sample.value}")
+
+            raw_batch_break_reason_count = pageserver_metrics.query_all(
+                "pageserver_page_service_batch_break_reason_total",
+                filter={"timeline_id": str(env.initial_timeline)},
+            )
+
+            batch_break_reason_count = {
+                sample.labels["reason"]: int(sample.value)
+                for sample in raw_batch_break_reason_count
+            }
+
            return Metrics(
                time=time.time(),
                pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
@@ -205,34 +231,58 @@ def test_throughput(
                pageserver_batch_size_histo_count=pageserver_metrics.query_one(
                    "pageserver_page_service_batch_size_count"
                ).value,
+                pageserver_batch_breaks_reason_count=batch_break_reason_count,
                compute_getpage_count=compute_getpage_count,
                pageserver_cpu_seconds_total=pageserver_metrics.query_one(
                    "libmetrics_process_cpu_seconds_highres"
                ).value,
            )

-    def workload() -> Metrics:
+    def workload(disruptor_started: threading.Event) -> Metrics:
+        disruptor_started.wait()
        start = time.time()
        iters = 0
        while time.time() - start < target_runtime or iters < 2:
-            log.info("Seqscan %d", iters)
            if iters == 1:
                # round zero for warming up
                before = get_metrics()
-            cur.execute(
-                "select clear_buffer_cache()"
-            )  # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
            cur.execute("select sum(data::bigint) from t")
            assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
            iters += 1
        after = get_metrics()
        return (after - before).normalize(iters - 1)

+    def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
+        iters = 0  # statements issued so far; returned to the caller
+        try:
+            conn = endpoint.connect()
+            cur = conn.cursor()
+            while not stop_disruptor.wait(0.001):  # stop check doubles as the 1ms pause
+                cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
+                disruptor_started.set()
+                iters += 1
+        finally:
+            disruptor_started.set()  # unblock workload() even if we failed early
+        return iters
+
    env.pageserver.patch_config_toml_nonrecursive(
        {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
    )
-    env.pageserver.restart()
-    metrics = workload()
+
+    # set trace for log analysis below
+    env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
+
+    log.info("Starting workload")
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        disruptor_started = threading.Event()
+        stop_disruptor = threading.Event()
+        disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
+        workload_fut = executor.submit(workload, disruptor_started)
+        metrics = workload_fut.result()
+        stop_disruptor.set()
+        ndisruptions = disruptor_fut.result()
+        log.info("Disruptor issued %d disrupting requests", ndisruptions)

    log.info("Results: %s", metrics)

@@ -249,7 +299,16 @@ def test_throughput(
    #

    for metric, value in dataclasses.asdict(metrics).items():
-        zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)
+        if metric == "pageserver_batch_breaks_reason_count":
+            assert isinstance(value, dict)
+            for reason, count in value.items():
+                zenbenchmark.record(
+                    f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM
+                )
+        else:
+            zenbenchmark.record(
+                f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM
+            )

    zenbenchmark.record(
        "perfmetric.batching_factor",
@@ -262,7 +321,10 @@ def test_throughput(
 PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
    for execution in EXECUTION:
-        PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            PRECISION_CONFIGS.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )


@pytest.mark.parametrize(
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -97,6 +97,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
    _record_branch_creation_durations(neon_compare, branch_creation_durations)


+@pytest.mark.timeout(1000)
@pytest.mark.parametrize("n_branches", [500, 1024])
@pytest.mark.parametrize("shape", ["one_ancestor", "random"])
 def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str):
@@ -205,7 +206,7 @@ def wait_and_record_startup_metrics(
        assert len(matching) == len(expected_labels)
        return matching

-    samples = wait_until(metrics_are_filled)
+    samples = wait_until(metrics_are_filled, timeout=60)

    for sample in samples:
        phase = sample.labels["phase"]
--- a/test_runner/performance/test_ingest_insert_bulk.py
+++ b/test_runner/performance/test_ingest_insert_bulk.py
@@ -52,6 +52,8 @@ def test_ingest_insert_bulk(
        # would compete with Pageserver for bandwidth.
        # neon_env_builder.enable_safekeeper_remote_storage(s3_storage())

+    neon_env_builder.pageserver_config_override = "wait_lsn_timeout='600 s'"
+
    neon_env_builder.disable_scrub_on_exit()  # immediate shutdown may leave stray layers
    env = neon_env_builder.init_start()

@@ -92,7 +94,18 @@ def test_ingest_insert_bulk(
                    worker_rows = rows / CONCURRENCY
                    pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)

-        end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+        for attempt in range(5):
+            try:
+                end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+                break
+            except Exception as e:
+                # Without backpressure postgres can be unresponsive for over a minute, and new
+                # connection attempts time out after 1 minute, so retry with a new connection.
+                log.error(f"Attempt {attempt + 1}/5: Failed to select current wal lsn: {e}")
+                # Give up from inside the handler: a bare `raise` needs an active exception.
+                if attempt == 4:
+                    log.error("Exceeded maximum retry attempts for selecting current wal lsn")
+                    raise

        # Wait for pageserver to ingest the WAL.
        client = env.pageserver.http_client()
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -64,8 +64,8 @@ def test_ro_replica_lag(

    project = neon_api.create_project(pg_version)
    project_id = project["project"]["id"]
-    log.info("Project ID: {}", project_id)
-    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
+    log.info("Project ID: %s", project_id)
+    log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
    neon_api.wait_for_operation_to_finish(project_id)
    error_occurred = False
    try:
@@ -81,7 +81,7 @@ def test_ro_replica_lag(
            endpoint_type="read_only",
            settings={"pg_settings": {"hot_standby_feedback": "on"}},
        )
-        log.info("Replica endpoint ID: {}", replica["endpoint"]["id"])
+        log.info("Replica endpoint ID: %s", replica["endpoint"]["id"])
        replica_env = master_env.copy()
        replica_env["PGHOST"] = replica["endpoint"]["host"]
        neon_api.wait_for_operation_to_finish(project_id)
@@ -197,8 +197,8 @@ def test_replication_start_stop(

    project = neon_api.create_project(pg_version)
    project_id = project["project"]["id"]
-    log.info("Project ID: {}", project_id)
-    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
+    log.info("Project ID: %s", project_id)
+    log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
    neon_api.wait_for_operation_to_finish(project_id)
    try:
        branch_id = project["branch"]["id"]
@@ -215,7 +215,7 @@ def test_replication_start_stop(
                endpoint_type="read_only",
                settings={"pg_settings": {"hot_standby_feedback": "on"}},
            )
-            log.info("Replica {} endpoint ID: {}", i + 1, replica["endpoint"]["id"])
+            log.info("Replica %d endpoint ID: %s", i + 1, replica["endpoint"]["id"])
            replicas.append(replica)
            neon_api.wait_for_operation_to_finish(project_id)

--- a/test_runner/performance/test_sharded_ingest.py
+++ b/test_runner/performance/test_sharded_ingest.py
@@ -13,7 +13,7 @@ from fixtures.neon_fixtures import (
 )


-@pytest.mark.timeout(600)
+@pytest.mark.timeout(1200)
@pytest.mark.parametrize("shard_count", [1, 8, 32])
@pytest.mark.parametrize(
    "wal_receiver_protocol",
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
        },
        "rel_size_v2_enabled": False,  # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it
        "gc_compaction_enabled": True,
+        "gc_compaction_verification": False,
        "gc_compaction_initial_threshold_kb": 1024000,
        "gc_compaction_ratio_percent": 200,
        "image_creation_preempt_threshold": 5,
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -162,6 +162,8 @@ def test_pageserver_compaction_preempt(
    conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
    env = neon_env_builder.init_start(initial_tenant_conf=conf)

+    env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*")
+
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

--- a/Show More
+++ b/Show More