Merge 2025-04-11 main commit 'c66444ea1538349d13ab5e87bca880394434004b' into yuchen/direct-io-delta-image-layer-write

2026-05-21 07:00:38 +00:00 · 2025-04-11 16:26:41 +02:00
parent af6c433947 c66444ea15
commit 3f417d4ac8
44 changed files with 1694 additions and 519 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2837,6 +2837,7 @@ dependencies = [
 "utils",
 "uuid",
 "workspace_hack",
+ "x509-cert",
 ]

 [[package]]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -523,11 +523,14 @@ impl ComputeNode {

        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
            pspec.tenant_id,
            pspec.timeline_id,
+            pspec.spec.project_id.as_deref().unwrap_or("None"),
+            pspec.spec.branch_id.as_deref().unwrap_or("None"),
+            pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
            pspec.spec.features,
            pspec.spec.remote_extensions,
        );
@@ -631,19 +634,23 @@ impl ComputeNode {
            });
        }

-        // Configure and start rsyslog for HIPAA if necessary
-        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
-            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-            if remote_endpoint.is_empty() {
-                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+        // Configure and start rsyslog for compliance audit logging
+        match pspec.spec.audit_log_level {
+            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
+                let remote_endpoint =
+                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
+                if remote_endpoint.is_empty() {
+                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+                }
+
+                let log_directory_path = Path::new(&self.params.pgdata).join("log");
+                let log_directory_path = log_directory_path.to_string_lossy().to_string();
+                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Launch a background task to clean up the audit logs
+                launch_pgaudit_gc(log_directory_path);
            }
-
-            let log_directory_path = Path::new(&self.params.pgdata).join("log");
-            let log_directory_path = log_directory_path.to_string_lossy().to_string();
-            configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
-
-            // Launch a background task to clean up the audit logs
-            launch_pgaudit_gc(log_directory_path);
+            _ => {}
        }

        // Configure and start rsyslog for Postgres logs export
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
            escape_conf_value(&s.to_string())
        )?;
    }
+    if let Some(s) = &spec.project_id {
+        writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.branch_id {
+        writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.endpoint_id {
+        writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
+    }

    // tls
    if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
    // and don't allow the user or the control plane admin to change them.
    match spec.audit_log_level {
        ComputeAudit::Disabled => {}
-        ComputeAudit::Log => {
+        ComputeAudit::Log | ComputeAudit::Base => {
            writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
            writeln!(file, "pgaudit.log='ddl,role'")?;
            // Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
            }
            writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
        }
-        ComputeAudit::Hipaa => {
+        ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
            writeln!(
                file,
                "# Managed by compute_ctl compliance audit settings: begin"
            )?;
-            // This log level is very verbose
-            // but this is necessary for HIPAA compliance.
-            // Exclude 'misc' category, because it doesn't contain anythig relevant.
-            writeln!(file, "pgaudit.log='all, -misc'")?;
-            writeln!(file, "pgaudit.log_parameter=on")?;
+            // Enable logging of parameters.
+            // This is very verbose and may contain sensitive data.
+            if spec.audit_log_level == ComputeAudit::Full {
+                writeln!(file, "pgaudit.log_parameter=on")?;
+                writeln!(file, "pgaudit.log='all'")?;
+            } else {
+                writeln!(file, "pgaudit.log_parameter=off")?;
+                writeln!(file, "pgaudit.log='all, -misc'")?;
+            }
            // Disable logging of catalog queries
            // The catalog doesn't contain sensitive data, so we don't need to audit it.
            writeln!(file, "pgaudit.log_catalog=off")?;
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -278,12 +278,12 @@ impl ComputeNode {
            // so that all config operations are audit logged.
            match spec.audit_log_level
            {
-                ComputeAudit::Hipaa => {
+                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                    phases.push(CreatePgauditExtension);
                    phases.push(CreatePgauditlogtofileExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
-                ComputeAudit::Log => {
+                ComputeAudit::Log | ComputeAudit::Base => {
                    phases.push(CreatePgauditExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -658,6 +658,9 @@ impl Endpoint {
            delta_operations: None,
            tenant_id: Some(self.tenant_id),
            timeline_id: Some(self.timeline_id),
+            project_id: None,
+            branch_id: None,
+            endpoint_id: Some(self.endpoint_id.clone()),
            mode: self.mode,
            pageserver_connstring: Some(pageserver_connstring),
            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
    TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
    pub node_id: Option<NodeId>,
-    pub generation_override: Option<i32>,
+    pub generation_override: Option<i32>, // only new tenants
+    pub config: Option<TenantConfig>,     // only new tenants
 }

 #[derive(Serialize, Deserialize)]
@@ -805,6 +808,7 @@ impl StorageController {
            tenant_shard_id,
            node_id: Some(pageserver_id),
            generation_override: None,
+            config: None,
        };

        let response = self
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
    pub timeline_id: Option<TimelineId>,
    pub pageserver_connstring: Option<String>,

+    // More neon ids that we expose to the compute_ctl
+    // and to postgres as neon extension GUCs.
+    pub project_id: Option<String>,
+    pub branch_id: Option<String>,
+    pub endpoint_id: Option<String>,
+
    /// Safekeeper membership config generation. It is put in
    /// neon.safekeepers GUC and serves two purposes:
    /// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,13 +165,7 @@ pub struct ComputeSpec {
    #[serde(default)] // Default false
    pub drop_subscriptions_before_start: bool,

-    /// Log level for audit logging:
-    ///
-    /// Disabled - no audit logging. This is the default.
-    /// log - log masked statements to the postgres log using pgaudit extension
-    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
-    ///
-    /// Extensions should be present in shared_preload_libraries
+    /// Log level for compute audit logging
    #[serde(default)]
    pub audit_log_level: ComputeAudit,

@@ -289,14 +289,25 @@ impl ComputeMode {
 }

 /// Log level for audit logging
-/// Disabled, log, hipaa
-/// Default is Disabled
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
    #[default]
    Disabled,
+    // Deprecated, use Base instead
    Log,
+    // (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
+    // logged to the standard postgresql log stream
+    Base,
+    // Deprecated, use Full or Extended instead
    Hipaa,
+    // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access
+    Extended,
+    // (pgaudit.log='all', pgaudit.log_parameter='on'),
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access.
+    Full,
 }

 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -30,6 +30,7 @@ tokio.workspace = true
 tracing.workspace = true
 url.workspace = true
 uuid.workspace = true
+x509-cert.workspace = true

 # to use tokio channels as streams, this is faster to compile than async_stream
 # why is it only here? no other crate should use it, streams are rarely needed.
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -4,6 +4,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use hyper0::Body;
 use hyper0::server::conn::Http;
+use metrics::{IntCounterVec, register_int_counter_vec};
+use once_cell::sync::Lazy;
 use routerify::{RequestService, RequestServiceBuilder};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
    tls_acceptor: Option<TlsAcceptor>,
 }

+static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_started_total",
+        "Number of established http/https connections",
+        &["scheme"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_errors_total",
+        "Number of occured connection errors by type",
+        &["type"]
+    )
+    .expect("failed to define a metric")
+});
+
 impl Server {
    pub fn new(
        request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
            false
        }

+        let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
+        let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
+        let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
+        let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
+        let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
+
+        let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
+        let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
+
        let mut connections = FuturesUnordered::new();
        loop {
            tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
                    let (tcp_stream, remote_addr) = match stream {
                        Ok(stream) => stream,
                        Err(err) => {
+                            tcp_error_cnt.inc();
                            if !suppress_io_error(&err) {
                                info!("Failed to accept TCP connection: {err:#}");
                            }
@@ -78,11 +108,18 @@ impl Server {
                    let tls_acceptor = self.tls_acceptor.clone();
                    let cancel = cancel.clone();

+                    let tls_error_cnt = tls_error_cnt.clone();
+                    let http_error_cnt = http_error_cnt.clone();
+                    let https_error_cnt = https_error_cnt.clone();
+                    let http_connection_cnt = http_connection_cnt.clone();
+                    let https_connection_cnt = https_connection_cnt.clone();
+
                    connections.push(tokio::spawn(
                        async move {
                            match tls_acceptor {
                                Some(tls_acceptor) => {
                                    // Handle HTTPS connection.
+                                    https_connection_cnt.inc();
                                    let tls_stream = tokio::select! {
                                        tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
                                        _ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
                                    let tls_stream = match tls_stream {
                                        Ok(tls_stream) => tls_stream,
                                        Err(err) => {
+                                            tls_error_cnt.inc();
                                            if !suppress_io_error(&err) {
                                                info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                            }
@@ -97,6 +135,7 @@ impl Server {
                                        }
                                    };
                                    if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
+                                        https_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                        }
@@ -104,7 +143,9 @@ impl Server {
                                }
                                None => {
                                    // Handle HTTP connection.
+                                    http_connection_cnt.inc();
                                    if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
+                                        http_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                        }
@@ -115,6 +156,7 @@ impl Server {
                 }
                Some(conn) = connections.next() => {
                    if let Err(err) = conn {
+                        panic_error_cnt.inc();
                        error!("Connection panicked: {err:#}");
                    }
                }
@@ -122,6 +164,7 @@ impl Server {
                    // Wait for graceful shutdown of all connections.
                    while let Some(conn) = connections.next().await {
                        if let Err(err) = conn {
+                            panic_error_cnt.inc();
                            error!("Connection panicked: {err:#}");
                        }
                    }
--- a/libs/http-utils/src/tls_certs.rs
+++ b/libs/http-utils/src/tls_certs.rs
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
 use anyhow::Context;
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
+use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
+use once_cell::sync::Lazy;
 use rustls::{
-    pki_types::{CertificateDer, PrivateKeyDer},
+    pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
    server::{ClientHello, ResolvesServerCert},
    sign::CertifiedKey,
 };
+use x509_cert::der::Reader;

 pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
    let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
    Ok(certified_key)
 }

+/// rustls's CertifiedKey with extra parsed fields used for metrics.
+struct ParsedCertifiedKey {
+    certified_key: CertifiedKey,
+    expiration_time: UnixTime,
+}
+
+/// Parse expiration time from an X509 certificate.
+fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
+    let parsed_cert = x509_cert::der::SliceReader::new(cert)
+        .context("Failed to parse cerficiate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse cerficiate")?;
+
+    Ok(UnixTime::since_unix_epoch(
+        parsed_cert
+            .tbs_certificate
+            .validity
+            .not_after
+            .to_unix_duration(),
+    ))
+}
+
+async fn load_and_parse_certified_key(
+    key_filename: &Utf8Path,
+    cert_filename: &Utf8Path,
+) -> anyhow::Result<ParsedCertifiedKey> {
+    let certified_key = load_certified_key(key_filename, cert_filename).await?;
+    let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
+    Ok(ParsedCertifiedKey {
+        certified_key,
+        expiration_time,
+    })
+}
+
+static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "tls_certs_expiration_time_seconds",
+        "Expiration time of the loaded certificate since unix epoch in seconds",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_started_total",
+        "Number of certificate reload loop iterations started",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_updated_total",
+        "Number of times the certificate was updated to the new one",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_failed_total",
+        "Number of times the certificate reload failed",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
 /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
 /// the disk periodically.
 #[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
 impl ReloadingCertificateResolver {
    /// Creates a new Resolver by loading certificate and private key from FS and
    /// creating tokio::task to reload them with provided reload_period.
+    /// resolver_name is used as metric's label.
    pub async fn new(
+        resolver_name: &str,
        key_filename: &Utf8Path,
        cert_filename: &Utf8Path,
        reload_period: Duration,
    ) -> anyhow::Result<Arc<Self>> {
+        // Create metrics for current resolver.
+        let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
+        let cert_reload_started_counter =
+            CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_updated_counter =
+            CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_failed_counter =
+            CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
+
+        let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
+
        let this = Arc::new(Self {
-            certified_key: ArcSwap::from_pointee(
-                load_certified_key(key_filename, cert_filename).await?,
-            ),
+            certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
        });
+        cert_expiration_time.set(parsed_key.expiration_time.as_secs());

        tokio::spawn({
            let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
                        Some(this) => this,
                        None => break, // Resolver has been destroyed, exit.
                    };
-                    match load_certified_key(&key_filename, &cert_filename).await {
-                        Ok(new_certified_key) => {
-                            if new_certified_key.cert == this.certified_key.load().cert {
+                    cert_reload_started_counter.inc();
+
+                    match load_and_parse_certified_key(&key_filename, &cert_filename).await {
+                        Ok(parsed_key) => {
+                            if parsed_key.certified_key.cert == this.certified_key.load().cert {
                                tracing::debug!("Certificate has not changed since last reloading");
                            } else {
                                tracing::info!("Certificate has been reloaded");
-                                this.certified_key.store(Arc::new(new_certified_key));
+                                this.certified_key.store(Arc::new(parsed_key.certified_key));
+                                cert_expiration_time.set(parsed_key.expiration_time.as_secs());
+                                cert_reload_updated_counter.inc();
                            }
                            last_reload_failed = false;
                        }
                        Err(err) => {
+                            cert_reload_failed_counter.inc();
                            // Note: Reloading certs may fail if it conflicts with the script updating
                            // the files at the same time. Warn only if the error is persistent.
                            if last_reload_failed {
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -180,6 +180,7 @@ pub struct ConfigToml {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
+    pub enable_tls_page_service_api: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -631,6 +632,7 @@ impl Default for ConfigToml {
            load_previous_heatmap: None,
            generate_unarchival_heatmap: None,
            tracing: None,
+            enable_tls_page_service_api: false,
        }
    }
 }
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;

 use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
 use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
    pub scheduling_policy: SkSchedulingPolicy,
 }

+/// Import request for safekeeper timelines.
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TimelineImportRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub start_lsn: Lsn,
+    pub sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
    use serde_json;
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -927,7 +927,7 @@ impl Key {

    /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
    #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
+    pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
        Ok(match self.field1 {
            0x00 => (
                RelTag {
@@ -938,7 +938,7 @@ impl Key {
                },
                self.field6,
            ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+            _ => return Err(ToRelBlockError(self.field1)),
        })
    }
 }
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
    }
 }

+#[derive(Debug)]
+pub struct ToRelBlockError(u8);
+
+impl fmt::Display for ToRelBlockError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "unexpected value kind 0x{:02x}", self.0)
+    }
+}
+
+impl std::error::Error for ToRelBlockError {}
+
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -452,6 +452,24 @@ fn start_pageserver(
    info!("Using auth for http API: {:#?}", conf.http_auth_type);
    info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);

+    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
+    {
+        let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            "main",
+            &conf.ssl_key_file,
+            &conf.ssl_cert_file,
+            conf.ssl_cert_reload_period,
+        ))?;
+
+        let server_config = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_cert_resolver(resolver);
+
+        Some(Arc::new(server_config))
+    } else {
+        None
+    };
+
    match var("NEON_AUTH_TOKEN") {
        Ok(v) => {
            info!("Loaded JWT token for authentication with Safekeeper");
@@ -670,17 +688,11 @@ fn start_pageserver(

        let https_task = match https_listener {
            Some(https_listener) => {
-                let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
-                    &conf.ssl_key_file,
-                    &conf.ssl_cert_file,
-                    conf.ssl_cert_reload_period,
-                ))?;
+                let tls_server_config = tls_server_config
+                    .clone()
+                    .expect("tls_server_config is set earlier if https is enabled");

-                let server_config = rustls::ServerConfig::builder()
-                    .with_no_client_auth()
-                    .with_cert_resolver(resolver);
-
-                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);

                let server =
                    http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -736,6 +748,11 @@ fn start_pageserver(
            tokio::net::TcpListener::from_std(pageserver_listener)
                .context("create tokio listener")?
        },
+        if conf.enable_tls_page_service_api {
+            tls_server_config
+        } else {
+            None
+        },
    );

    // All started up! Now just sit and wait for shutdown signal.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -219,6 +219,11 @@ pub struct PageServerConf {
    pub generate_unarchival_heatmap: bool,

    pub tracing: Option<pageserver_api::config::Tracing>,
+
+    /// Enable TLS in page service API.
+    /// Does not force TLS: the client negotiates TLS usage during the handshake.
+    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
+    pub enable_tls_page_service_api: bool,
 }

 /// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
            load_previous_heatmap,
            generate_unarchival_heatmap,
            tracing,
+            enable_tls_page_service_api,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
            page_service_pipelining,
            get_vectored_concurrent_io,
            tracing,
+            enable_tls_page_service_api,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
        modification
            .put_file(&fname, content.as_bytes(), &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
    }
    modification
        .commit(&ctx)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
-use crate::walingest::WalIngest;
+use crate::walingest::{WalIngest, WalIngestErrorKind};

 // Returns checkpoint LSN from controlfile
 pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
        .put_rel_creation(rel, nblocks as u32, ctx)
        .await
    {
-        match e {
-            RelationError::AlreadyExists => {
-                debug!("Relation {} already exist. We must be extending it.", rel)
+        match e.kind {
+            WalIngestErrorKind::RelationAlreadyExists(rel) => {
+                debug!("Relation {rel} already exists. We must be extending it.")
            }
            _ => return Err(e.into()),
        }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -105,6 +105,7 @@ pub fn spawn(
    pg_auth: Option<Arc<SwappableJwtAuth>>,
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
            perf_trace_dispatch,
            tcp_listener,
            conf.pg_auth_type,
+            tls_config,
            conf.page_service_pipelining.clone(),
            libpq_ctx,
            cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
    perf_trace_dispatch: Option<Dispatch>,
    listener: tokio::net::TcpListener,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
                    local_auth,
                    socket,
                    auth_type,
+                    tls_config.clone(),
                    pipelining_config.clone(),
                    connection_ctx,
                    connections_cancel.child_token(),
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
        cancel.clone(),
        gate_guard,
    );
-    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
+    let pgbackend =
+        PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;

    match pgbackend.run(&mut conn_handler, &cancel).await {
        Ok(()) => {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,8 +9,9 @@
 use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};

-use crate::PERF_TRACE_TARGET;
-use anyhow::{Context, ensure};
+use crate::walingest::{WalIngestError, WalIngestErrorKind};
+use crate::{PERF_TRACE_TARGET, ensure_walingest};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
@@ -136,12 +137,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {

 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
-    #[error("Relation Already Exists")]
-    AlreadyExists,
    #[error("invalid relnode")]
    InvalidRelnode,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
 }

 ///
@@ -1478,8 +1475,8 @@ impl DatadirModification<'_> {
    }

    /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
+        ensure_walingest!(
            lsn >= self.lsn,
            "setting an older lsn {} than {} is not allowed",
            lsn,
@@ -1578,7 +1575,7 @@ impl DatadirModification<'_> {
        &mut self,
        rel: RelTag,
        ctx: &RequestContext,
-    ) -> Result<u32, PageReconstructError> {
+    ) -> Result<u32, WalIngestError> {
        // Get current size and put rel creation if rel doesn't exist
        //
        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1590,13 @@ impl DatadirModification<'_> {
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
-            self.put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
+            self.put_rel_creation(rel, 0, ctx).await?;
            Ok(0)
        } else {
-            self.tline
+            Ok(self
+                .tline
                .get_rel_size(rel, Version::Modified(self), ctx)
-                .await
+                .await?)
        }
    }

@@ -1637,11 +1633,14 @@ impl DatadirModification<'_> {
        // TODO(vlad): remove this argument and replace the shard check with is_key_local
        shard: &ShardIdentity,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let mut gaps_at_lsns = Vec::default();

        for meta in batch.metadata.iter() {
-            let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
+            let key = Key::from_compact(meta.key());
+            let (rel, blkno) = key
+                .to_rel_block()
+                .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
            let new_nblocks = blkno + 1;

            let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1682,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -1696,7 +1695,7 @@ impl DatadirModification<'_> {
        segno: u32,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        if !self.tline.tenant_shard_id.is_shard_zero() {
            return Ok(());
        }
@@ -1714,14 +1713,11 @@ impl DatadirModification<'_> {
        rel: RelTag,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        let key = rel_block_to_key(rel, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
@@ -1733,15 +1729,12 @@ impl DatadirModification<'_> {
        segno: u32,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        let key = slru_block_to_key(kind, segno, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }
        self.put(key, Value::Image(img));
        Ok(())
@@ -1751,15 +1744,11 @@ impl DatadirModification<'_> {
        &mut self,
        rel: RelTag,
        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        let key = rel_block_to_key(rel, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }

        let batch = self
@@ -1776,15 +1765,11 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());
        let key = slru_block_to_key(kind, segno, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }

        let batch = self
@@ -1832,8 +1817,10 @@ impl DatadirModification<'_> {
        dbnode: Oid,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

        // Add it to the directory (if it doesn't exist already)
        let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1861,13 @@ impl DatadirModification<'_> {
        xid: u64,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Add it to the directory entry
        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
        let newdirbuf = if self.tline.pg_version >= 17 {
            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
            }
            self.pending_directory_entries.push((
                DirectoryKind::TwoPhase,
@@ -1891,7 +1878,7 @@ impl DatadirModification<'_> {
            let xid = xid as u32;
            let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
            }
            self.pending_directory_entries.push((
                DirectoryKind::TwoPhase,
@@ -1909,22 +1896,22 @@ impl DatadirModification<'_> {
        &mut self,
        origin_id: RepOriginId,
        origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let key = repl_origin_key(origin_id);
        self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
        Ok(())
    }

-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
+    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
        self.set_replorigin(origin_id, Lsn::INVALID).await
    }

-    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
        self.put(CONTROLFILE_KEY, Value::Image(img));
        Ok(())
    }

-    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
        self.put(CHECKPOINT_KEY, Value::Image(img));
        Ok(())
    }
@@ -1934,7 +1921,7 @@ impl DatadirModification<'_> {
        spcnode: Oid,
        dbnode: Oid,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let total_blocks = self
            .tline
            .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1960,21 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> Result<(), RelationError> {
+    ) -> Result<(), WalIngestError> {
        if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
+            Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
+                "invalid relnode"
+            )))?;
        }
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
-            .context("deserialize db")?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;

        let dbdir_exists =
            if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                // Didn't exist. Update dbdir
                e.insert(false);
-                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+                let buf = DbDirectory::ser(&dbdir)?;
                self.pending_directory_entries.push((
                    DirectoryKind::Db,
                    MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1991,25 @@ impl DatadirModification<'_> {
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                .context("deserialize db")?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
        };

-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

        if v2_enabled {
            if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            let sparse_rel_dir_key =
                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
            // check if the rel_dir_key exists in v2
-            let val = self
-                .sparse_get(sparse_rel_dir_key, ctx)
-                .await
-                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
            let val = RelDirExists::decode_option(val)
-                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
            if val == RelDirExists::Exists {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            self.put(
                sparse_rel_dir_key,
@@ -2039,9 +2025,7 @@ impl DatadirModification<'_> {
                // will be key not found errors if we don't create an empty one for rel_size_v2.
                self.put(
                    rel_dir_key,
-                    Value::Image(Bytes::from(
-                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
-                    )),
+                    Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
                );
            }
            self.pending_directory_entries
@@ -2049,7 +2033,7 @@ impl DatadirModification<'_> {
        } else {
            // Add the new relation to the rel directory entry, and write it back
            if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            if !dbdir_exists {
                self.pending_directory_entries
@@ -2059,9 +2043,7 @@ impl DatadirModification<'_> {
                .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
            self.put(
                rel_dir_key,
-                Value::Image(Bytes::from(
-                    RelDirectory::ser(&rel_dir).context("serialize")?,
-                )),
+                Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
            );
        }

@@ -2086,8 +2068,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        if self
            .tline
            .get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2099,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);

        // Put size
        let size_key = rel_size_to_key(rel);
@@ -2142,8 +2124,10 @@ impl DatadirModification<'_> {
        &mut self,
        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
        for ((spc_node, db_node), rel_tags) in drop_relations {
            let dir_key = rel_dir_to_key(spc_node, db_node);
            let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2147,7 @@ impl DatadirModification<'_> {
                    let key =
                        rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
                    let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
-                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                        .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
                    if val == RelDirExists::Exists {
                        self.pending_directory_entries
                            .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2190,7 @@ impl DatadirModification<'_> {
        segno: u32,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        // Add it to the directory entry
@@ -2215,7 +2199,7 @@ impl DatadirModification<'_> {
        let mut dir = SlruSegmentDirectory::des(&buf)?;

        if !dir.segments.insert(segno) {
-            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
+            Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
        }
        self.pending_directory_entries.push((
            DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2226,7 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        nblocks: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        // Put size
@@ -2258,7 +2242,7 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let dir_key = slru_dir_to_key(kind);
        let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2267,7 @@ impl DatadirModification<'_> {
    }

    /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
        // TODO
        Ok(())
    }
@@ -2293,7 +2277,7 @@ impl DatadirModification<'_> {
        &mut self,
        xid: u64,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
        let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2292,8 @@ impl DatadirModification<'_> {
            ));
            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
        } else {
-            let xid: u32 = u32::try_from(xid)?;
+            let xid: u32 = u32::try_from(xid)
+                .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
            let mut dir = TwoPhaseDirectory::des(&buf)?;

            if !dir.xids.remove(&xid) {
@@ -2333,7 +2318,7 @@ impl DatadirModification<'_> {
        path: &str,
        content: &[u8],
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let key = aux_file::encode_aux_file_key(path);
        // retrieve the key from the engine
        let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2327,7 @@ impl DatadirModification<'_> {
            Err(e) => return Err(e.into()),
        };
        let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-            aux_file::decode_file_value(old_val)?
+            aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
        } else {
            Vec::new()
        };
@@ -2387,7 +2372,8 @@ impl DatadirModification<'_> {
            }
            (None, true) => warn!("removing non-existing aux file: {}", path),
        }
-        let new_val = aux_file::encode_file_value(&new_files)?;
+        let new_val = aux_file::encode_file_value(&new_files)
+            .map_err(WalIngestErrorKind::EncodeAuxFileError)?;
        self.put(key, Value::Image(new_val.into()));

        Ok(())
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11571,6 +11571,99 @@ mod tests {
        Ok(())
    }

+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x24),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x28),
+                // This record will fail to redo
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![], // in-memory layers
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    Lsn(0x20)..Lsn(0x30),
+                    delta1,
+                )], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .applied_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let cancel = CancellationToken::new();
+
+        // Compaction will fail, but should not fire any critical error.
+        // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
+        // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
+        // compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
+        let res = tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: None,
+                    compact_lsn_range: None,
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await;
+        assert!(res.is_err());
+
+        Ok(())
+    }
+
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1269,7 +1269,7 @@ mod test {

        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
        let range = input_start..input_end;

        // Build an image layer to filter
@@ -1314,7 +1314,7 @@ mod test {
            let shard_identity = ShardIdentity::new(
                ShardNumber(shard_number),
                shard_count,
-                ShardStripeSize(0x8000),
+                ShardStripeSize(0x800),
            )
            .unwrap();
            let harness = TenantHarness::create_custom(
@@ -1368,12 +1368,12 @@ mod test {

            // This exact size and those below will need updating as/when the layer encoding changes, but
            // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
+            assert_eq!(original_size, 122880);

            match shard_number {
                0 => {
                    // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();

                    // We should have dropped some of the data
@@ -1381,7 +1381,7 @@ mod test {
                    assert!(replacement.metadata().file_size > 0);

                    // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                1 => {
                    // Shard 1 has no keys in our input range
@@ -1390,19 +1390,19 @@ mod test {
                }
                2 => {
                    // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                3 => {
                    // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
+                    assert_eq!(wrote_keys, 0x1000);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
+                    assert_eq!(replacement.metadata().file_size, 73728);
                }
                _ => unreachable!(),
            }
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1141,6 +1141,7 @@ impl Timeline {
    ) -> Result<(), CompactionError> {
        let mut drop_layers = Vec::new();
        let mut layers_to_rewrite: Vec<Layer> = Vec::new();
+        let mut rewrite_max_exceeded: bool = false;

        // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
        // layer is behind this Lsn, it indicates that the layer is being retained beyond the
@@ -1149,12 +1150,7 @@ impl Timeline {
        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
        // are rewriting layers.
        let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
-
-        tracing::info!(
-            "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.time
-        );
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;

        let layers = self.layers.read().await;
        for layer_desc in layers.layer_map()?.iter_historic_layers() {
@@ -1172,8 +1168,8 @@ impl Timeline {
                // This ancestral layer only covers keys that belong to other shards.
                // We include the full metadata in the log: if we had some critical bug that caused
                // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
-                info!(%layer, old_metadata=?layer.metadata(),
-                    "dropping layer after shard split, contains no keys for this shard.",
+                debug!(%layer, old_metadata=?layer.metadata(),
+                    "dropping layer after shard split, contains no keys for this shard",
                );

                if cfg!(debug_assertions) {
@@ -1235,9 +1231,10 @@ impl Timeline {
            }

            if layers_to_rewrite.len() >= rewrite_max {
-                tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
+                debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
                    layers_to_rewrite.len()
                );
+                rewrite_max_exceeded = true;
                continue;
            }

@@ -1245,9 +1242,24 @@ impl Timeline {
            layers_to_rewrite.push(layer);
        }

-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop read lock on layer map before we start doing time-consuming I/O.
        drop(layers);

+        // Drop out early if there's nothing to do.
+        if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
+            return Ok(());
+        }
+
+        info!(
+            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
+                (latest_gc_cutoff={} pitr_cutoff={})",
+            layers_to_rewrite.len(),
+            drop_layers.len(),
+            *latest_gc_cutoff,
+            pitr_cutoff,
+        );
+        let started = Instant::now();
+
        let mut replace_image_layers = Vec::new();

        for layer in layers_to_rewrite {
@@ -1255,7 +1267,7 @@ impl Timeline {
                return Err(CompactionError::ShuttingDown);
            }

-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            info!(layer=%layer, "rewriting layer after shard split");
            let mut image_layer_writer = ImageLayerWriter::new(
                self.conf,
                self.timeline_id,
@@ -1295,7 +1307,7 @@ impl Timeline {
                    .map_err(CompactionError::Other)?;
                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
                    .map_err(CompactionError::Other)?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
                    layer.metadata().file_size,
                    new_layer.metadata().file_size);

@@ -1307,6 +1319,12 @@ impl Timeline {
            }
        }

+        for layer in &drop_layers {
+            info!(%layer, old_metadata=?layer.metadata(),
+                "dropping layer after shard split (no keys for this shard)",
+            );
+        }
+
        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
        // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
        // to remote index) and be removed. This is inefficient but safe.
@@ -1322,6 +1340,7 @@ impl Timeline {
        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
        // load.
+        info!("shard ancestor compaction waiting for uploads");
        match self.remote_client.wait_completion().await {
            Ok(()) => (),
            Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
@@ -1330,6 +1349,15 @@ impl Timeline {
            }
        }

+        info!(
+            "shard ancestor compaction done in {:.3}s{}",
+            started.elapsed().as_secs_f64(),
+            match rewrite_max_exceeded {
+                true => format!(", more work pending due to rewrite_max={rewrite_max}"),
+                false => String::new(),
+            }
+        );
+
        fail::fail_point!("compact-shard-ancestors-persistent");

        Ok(())
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
                                );
                                Ok(())
                            }
+                            WalReceiverError::Cancelled => Ok(()),
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
                                if cancellation.is_cancelled() {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
    /// Generic error
    Other(anyhow::Error),
    ClosedGate,
+    Cancelled,
 }

 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
                                // with a similar error.
                            },
                            WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Cancelled => {
+                                debug!("Connection cancelled")
+                            }
                            WalReceiverError::ClosedGate => {
                                // doesn't happen at runtime
                            }
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(

    let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);

-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+        .await
+        .map_err(|e| match e.kind {
+            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+            _ => WalReceiverError::Other(e.into()),
+        })?;

    let shard = vec![*timeline.get_shard_identity()];

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,13 +21,13 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

+use std::backtrace::Backtrace;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};

-use anyhow::{Result, bail};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::rel_block_to_key;
+use pageserver_api::key::{Key, rel_block_to_key};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -38,7 +38,7 @@ use postgres_ffi::{
    fsm_logical_to_physical, pg_constants,
 };
 use tracing::*;
-use utils::bin_ser::SerializeError;
+use utils::bin_ser::{DeserializeError, SerializeError};
 use utils::lsn::Lsn;
 use utils::rate_limit::RateLimit;
 use utils::{critical, failpoint_support};
@@ -104,12 +104,101 @@ struct WarnIngestLag {
    timestamp_invalid_msg_ratelimit: RateLimit,
 }

+pub struct WalIngestError {
+    pub backtrace: std::backtrace::Backtrace,
+    pub kind: WalIngestErrorKind,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum WalIngestErrorKind {
+    #[error(transparent)]
+    #[allow(private_interfaces)]
+    PageReconstructError(#[from] PageReconstructError),
+    #[error(transparent)]
+    DeserializationFailure(#[from] DeserializeError),
+    #[error(transparent)]
+    SerializationFailure(#[from] SerializeError),
+    #[error("the request contains data not supported by pageserver: {0} @ {1}")]
+    InvalidKey(Key, Lsn),
+    #[error("twophase file for xid {0} already exists")]
+    FileAlreadyExists(u64),
+    #[error("slru segment {0:?}/{1} already exists")]
+    SlruAlreadyExists(SlruKind, u32),
+    #[error("relation already exists")]
+    RelationAlreadyExists(RelTag),
+    #[error("invalid reldir key {0}")]
+    InvalidRelDirKey(Key),
+
+    #[error(transparent)]
+    LogicalError(anyhow::Error),
+    #[error(transparent)]
+    EncodeAuxFileError(anyhow::Error),
+    #[error(transparent)]
+    MaybeRelSizeV2Error(anyhow::Error),
+
+    #[error("timeline shutting down")]
+    Cancelled,
+}
+
+impl<T> From<T> for WalIngestError
+where
+    WalIngestErrorKind: From<T>,
+{
+    fn from(value: T) -> Self {
+        WalIngestError {
+            backtrace: Backtrace::capture(),
+            kind: WalIngestErrorKind::from(value),
+        }
+    }
+}
+
+impl std::error::Error for WalIngestError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        self.kind.source()
+    }
+}
+
+impl core::fmt::Display for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        self.kind.fmt(f)
+    }
+}
+
+impl core::fmt::Debug for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        if f.alternate() {
+            f.debug_map()
+                .key(&"backtrace")
+                .value(&self.backtrace)
+                .key(&"kind")
+                .value(&self.kind)
+                .finish()
+        } else {
+            writeln!(f, "Error: {:?}", self.kind)?;
+            if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+                writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
+            }
+            Ok(())
+        }
+    }
+}
+
+#[macro_export]
+macro_rules! ensure_walingest {
+    ($($t:tt)*) => {
+        _ = || -> Result<(), anyhow::Error> {
+            anyhow::ensure!($($t)*);
+            Ok(())
+        }().map_err(WalIngestErrorKind::LogicalError)?;
+    };
+}
+
 impl WalIngest {
    pub async fn new(
        timeline: &Timeline,
        startpoint: Lsn,
        ctx: &RequestContext,
-    ) -> anyhow::Result<WalIngest> {
+    ) -> Result<WalIngest, WalIngestError> {
        // Fetch the latest checkpoint into memory, so that we can compare with it
        // quickly in `ingest_record` and update it when it changes.
        let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -145,7 +234,7 @@ impl WalIngest {
        interpreted: InterpretedWalRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<bool> {
+    ) -> Result<bool, WalIngestError> {
        WAL_INGEST.records_received.inc();
        let prev_len = modification.len();

@@ -288,7 +377,7 @@ impl WalIngest {
    }

    /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
-    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
        let next_full_xid =
            enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });

@@ -298,9 +387,9 @@ impl WalIngest {
        if xid > next_xid {
            // Wraparound occurred, must be from a prev epoch.
            if epoch == 0 {
-                bail!(
+                Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
                    "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
-                );
+                )))?;
            }
            epoch -= 1;
        }
@@ -313,7 +402,7 @@ impl WalIngest {
        clear_vm_bits: ClearVmBits,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let ClearVmBits {
            new_heap_blkno,
            old_heap_blkno,
@@ -402,7 +491,7 @@ impl WalIngest {
        create: DbaseCreate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let DbaseCreate {
            db_id,
            tablespace_id,
@@ -505,7 +594,7 @@ impl WalIngest {
        dbase_drop: DbaseDrop,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let DbaseDrop {
            db_id,
            tablespace_ids,
@@ -523,7 +612,7 @@ impl WalIngest {
        create: SmgrCreate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let SmgrCreate { rel } = create;
        self.put_rel_creation(modification, rel, ctx).await?;
        Ok(())
@@ -537,7 +626,7 @@ impl WalIngest {
        truncate: XlSmgrTruncate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let XlSmgrTruncate {
            blkno,
            rnode,
@@ -689,7 +778,7 @@ impl WalIngest {
        record: XactRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let (xact_common, is_commit, is_prepared) = match record {
            XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
                let xid: u64 = if modification.tline.pg_version >= 17 {
@@ -813,7 +902,7 @@ impl WalIngest {
        truncate: ClogTruncate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let ClogTruncate {
            pageno,
            oldest_xid,
@@ -889,7 +978,7 @@ impl WalIngest {
        zero_page: ClogZeroPage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let ClogZeroPage { segno, rpageno } = zero_page;

        self.put_slru_page_image(
@@ -907,7 +996,7 @@ impl WalIngest {
        &mut self,
        modification: &mut DatadirModification,
        xlrec: &XlMultiXactCreate,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Create WAL record for updating the multixact-offsets page
        let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1010,7 +1099,7 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        xlrec: &XlMultiXactTruncate,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let (maxsegment, startsegment, endsegment) =
            enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                cp.oldestMulti = xlrec.end_trunc_off;
@@ -1058,7 +1147,7 @@ impl WalIngest {
        zero_page: MultiXactZeroPage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let MultiXactZeroPage {
            slru_kind,
            segno,
@@ -1080,7 +1169,7 @@ impl WalIngest {
        update: RelmapUpdate,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let RelmapUpdate { update, buf } = update;

        modification
@@ -1093,7 +1182,7 @@ impl WalIngest {
        raw_record: RawXlogRecord,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let RawXlogRecord { info, lsn, mut buf } = raw_record;
        let pg_version = modification.tline.pg_version;

@@ -1235,12 +1324,12 @@ impl WalIngest {
        put: PutLogicalMessage,
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        let PutLogicalMessage { path, buf } = put;
        modification.put_file(path.as_str(), &buf, ctx).await
    }

-    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
+    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
        match record {
            StandbyRecord::RunningXacts(running_xacts) => {
                enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
@@ -1258,7 +1347,7 @@ impl WalIngest {
        &mut self,
        record: ReploriginRecord,
        modification: &mut DatadirModification<'_>,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        match record {
            ReploriginRecord::Set(set) => {
                modification
@@ -1278,7 +1367,7 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        rel: RelTag,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        modification.put_rel_creation(rel, 0, ctx).await?;
        Ok(())
    }
@@ -1291,7 +1380,7 @@ impl WalIngest {
        blknum: BlockNumber,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
        self.handle_rel_extend(modification, rel, blknum, ctx)
            .await?;
        modification.put_rel_page_image(rel, blknum, img)?;
@@ -1305,7 +1394,7 @@ impl WalIngest {
        blknum: BlockNumber,
        rec: NeonWalRecord,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        self.handle_rel_extend(modification, rel, blknum, ctx)
            .await?;
        modification.put_rel_wal_record(rel, blknum, rec)?;
@@ -1318,7 +1407,7 @@ impl WalIngest {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        modification.put_rel_truncation(rel, nblocks, ctx).await?;
        Ok(())
    }
@@ -1329,7 +1418,7 @@ impl WalIngest {
        rel: RelTag,
        blknum: BlockNumber,
        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
        let new_nblocks = blknum + 1;
        // Check if the relation exists. We implicitly create relations on first
        // record.
@@ -1423,7 +1512,7 @@ impl WalIngest {
        blknum: BlockNumber,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
        if !self.shard.is_shard_zero() {
            return Ok(());
        }
@@ -1441,7 +1530,7 @@ impl WalIngest {
        segno: u32,
        blknum: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // we don't use a cache for this like we do for relations. SLRUS are explcitly
        // extended with ZEROPAGE records, not with commit records, so it happens
        // a lot less frequently.
@@ -1509,6 +1598,7 @@ async fn get_relsize(
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
+    use anyhow::Result;
    use postgres_ffi::RELSEG_SIZE;

    use super::*;
@@ -1530,7 +1620,7 @@ mod tests {
    }

    #[tokio::test]
-    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
        for i in 14..=16 {
            dispatch_pgversion!(i, {
                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
+char	   *neon_project_id;
+char	   *neon_branch_id;
+char	   *neon_endpoint_id;
 int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomStringVariable("neon.project_id",
+							   "Neon project_id the server is running on",
+							   NULL,
+							   &neon_project_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.branch_id",
+							   "Neon branch_id the server is running on",
+							   NULL,
+							   &neon_branch_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.endpoint_id",
+							   "Neon endpoint_id the server is running on",
+							   NULL,
+							   &neon_endpoint_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;
 	wp->state = WPS_COLLECTING_TERMS;
+	wp->mconf.generation = INVALID_GENERATION;
+	wp->mconf.members.len = 0;
+	wp->mconf.new_members.len = 0;

 	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);

@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)

 	if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
 		wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
+	if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
+		wp_log(FATAL, "enabling generations requires protocol version 3");
 	wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);

 	/* Fill the greeting package */
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
 static bool
 WalProposerGenerationsEnabled(WalProposer *wp)
 {
-	return wp->safekeepers_generation != 0;
+	return wp->safekeepers_generation != INVALID_GENERATION;
 }

 /*
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
 }

+/*
+ * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
+ * members_safekeepers & new_members_safekeepers to sk.
+ */
+static void
+UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
+{
+	/* members_safekeepers etc are fixed size, sanity check mconf size */
+	if (wp->mconf.members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
+	if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
+
+	/* node id is not known until greeting is received */
+	if (sk->state < SS_WAIT_VOTING)
+		return;
+
+	/* 0 is assumed to be invalid node id, should never happen */
+	if (sk->greetResponse.nodeId == 0)
+	{
+		wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
+		return;
+	}
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.members.m[i];
+
+		if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			/*
+			 * If mconf or list of safekeepers to connect to changed (the
+			 * latter always currently goes through restart though),
+			 * ResetMemberSafekeeperPtrs is expected to be called before
+			 * UpdateMemberSafekeeperPtr. So, other value suggests that we are
+			 * connected to the same sk under different host name, complain
+			 * about that.
+			 */
+			if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->members_safekeepers[i] = sk;
+		}
+	}
+	/* repeat for new_members */
+	for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
+
+		if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->new_members_safekeepers[i] = sk;
+		}
+	}
+}
+
+/*
+ * Reset wp->members_safekeepers & new_members_safekeepers and refill them.
+ * Called after wp changes mconf.
+ */
+static void
+ResetMemberSafekeeperPtrs(WalProposer *wp)
+{
+	memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
+			UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
+	}
+}
+
+static uint32
+MsetQuorum(MemberSet *mset)
+{
+	Assert(mset->len > 0);
+	return mset->len / 2 + 1;
+}
+
+/* Does n forms quorum in mset? */
+static bool
+MsetHasQuorum(MemberSet *mset, uint32 n)
+{
+	return n >= MsetQuorum(mset);
+}
+
+/*
+ * TermsCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_greeted = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_VOTING)
+		{
+			if (n_greeted > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_greeted++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
+	return MsetHasQuorum(mset, n_greeted);
+}
+
 /*
 * Have we received greeting from enough (quorum) safekeepers to start voting?
 */
 static bool
 TermsCollected(WalProposer *wp)
 {
-	return wp->n_connected >= wp->quorum;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		collected = wp->n_connected >= wp->quorum;
+		if (collected)
+		{
+			wp->propTerm++;
+			wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
+		}
+		return collected;
+	}
+
+	/*
+	 * With generations enabled, we start campaign only when 1) some mconf is
+	 * actually received 2) we have greetings from majority of members as well
+	 * as from majority of new_members if it exists.
+	 */
+	if (wp->mconf.generation == INVALID_GENERATION)
+		return false;
+
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset greeters: ");
+	if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset greeters: ");
+		if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp->propTerm++;
+	wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }

 static void
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	pfree(mconf_toml);

 	/*
-	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
-	 * restart wp if it started voting.
+	 * Adopt mconf of safekeepers if it is higher.
 	 */
 	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
 	{
+		/* sanity check before adopting, should never happen */
+		if (sk->greetResponse.mconf.members.len == 0)
+		{
+			wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
+		}
+
+		/*
+		 * If we at least started campaign, restart wp to get elected in the
+		 * new mconf. Note: in principle once wp is already elected
+		 * re-election is not required, but being conservative here is not
+		 * bad.
+		 *
+		 * TODO: put mconf to shmem to immediately pick it up on start,
+		 * otherwise if some safekeeper(s) misses latest mconf and gets
+		 * connected the first, it may cause redundant restarts here.
+		 *
+		 * More generally, it would be nice to restart walproposer (wiping
+		 * election state) without restarting the process. In particular, that
+		 * would allow sync-safekeepers not to die here if it intersected with
+		 * sk migration (as well as remove 1s delay).
+		 *
+		 * Note that assign_neon_safekeepers also currently restarts the
+		 * process, so during normal migration walproposer may restart twice.
+		 */
+		if (wp->state >= WPS_CAMPAIGN)
+		{
+			wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
+		}
 		MembershipConfigurationFree(&wp->mconf);
 		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		ResetMemberSafekeeperPtrs(wp);
 		/* full conf was just logged above */
 		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
 	}
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_WAIT_VOTING;

+	/* In greeting safekeeper sent its id; update mappings accordingly. */
+	UpdateMemberSafekeeperPtr(wp, sk);
+
 	/*
 	 * Note: it would be better to track the counter on per safekeeper basis,
 	 * but at worst walproposer would restart with 'term rejected', so leave
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		/* We're still collecting terms from the majority. */
 		wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);

-		/* Quorum is acquried, prepare the vote request. */
+		/* Quorum is acquired, prepare the vote request. */
 		if (TermsCollected(wp))
 		{
-			wp->propTerm++;
-			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
-
 			wp->state = WPS_CAMPAIGN;
 			wp->voteRequest.pam.tag = 'v';
 			wp->voteRequest.generation = wp->mconf.generation;
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
 					   &sk->outbuf, wp->config->proto_version);

 	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
-		   wp->voteRequest.generation, wp->voteRequest.term);
+	wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
+		   sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
 	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
 		return;

 	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
+		   "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
+		   sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
 		   sk->voteResponse.voteGiven,
 		   GetHighestTerm(&sk->voteResponse.termHistory),
 		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
 	}
 }

+/*
+ * VotesCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_votes = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_ELECTED)
+		{
+			Assert(sk->voteResponse.voteGiven);
+
+			/*
+			 * Find the highest vote. NULL check is for the legacy case where
+			 * safekeeper might be not initialized with LSN at all and return
+			 * 0 LSN in the vote response; we still want to set donor to
+			 * something in this case.
+			 */
+			if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
+				(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
+				 sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
+				wp->donor == NULL)
+			{
+				wp->donorLastLogTerm = GetLastLogTerm(sk);
+				wp->propTermStartLsn = sk->voteResponse.flushLsn;
+				wp->donor = sk;
+			}
+			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
+
+			if (n_votes > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_votes++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
+	return MsetHasQuorum(mset, n_votes);
+}
+
+
 /*
 * Checks if enough votes has been collected to get elected and if that's the
 * case finds the highest vote, setting donor, donorLastLogTerm,
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
 static bool
 VotesCollected(WalProposer *wp)
 {
-	int			n_ready = 0;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;

 	/* assumed to be called only when not elected yet */
 	Assert(wp->state == WPS_CAMPAIGN);
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
 	wp->donorLastLogTerm = 0;
 	wp->truncateLsn = InvalidXLogRecPtr;

-	for (int i = 0; i < wp->n_safekeepers; i++)
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
 	{
-		if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
-		{
-			n_ready++;
+		int			n_ready = 0;

-			if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
-				(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
-				 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
 			{
-				wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
-				wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
-				wp->donor = i;
+				n_ready++;
+
+				if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
+					(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
+					 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
+					wp->donor == NULL)
+				{
+					wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
+					wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
+					wp->donor = &wp->safekeeper[i];
+				}
+				wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 			}
-			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 		}
+		collected = n_ready >= wp->quorum;
+		if (collected)
+		{
+			wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
+		}
+		return collected;
 	}

-	return n_ready >= wp->quorum;
+	/*
+	 * if generations are enabled we're expected to get to voting only when
+	 * mconf is established.
+	 */
+	Assert(wp->mconf.generation != INVALID_GENERATION);
+
+	/*
+	 * We must get votes from both msets if both are present.
+	 */
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset voters: ");
+	if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset voters: ");
+		if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp_log(LOG, "walproposer elected, %s", s.data);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }

 /*
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
 	 * that only for logical replication (and switching logical walsenders to
 	 * neon_walreader is a todo.)
 	 */
-	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
+	if (!wp->api.recovery_download(wp, wp->donor))
 	{
 		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
 	/*
 	 * Proposer's term history is the donor's + its own entry.
 	 */
-	dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
+	dth = &wp->donor->voteResponse.termHistory;
 	wp->propTermHistory.n_entries = dth->n_entries + 1;
 	wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
 	if (dth->n_entries > 0)
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;

-	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		   wp->quorum,
+	wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
 		   wp->propTerm,
 		   LSN_FORMAT_ARGS(wp->propTermStartLsn),
-		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		   wp->donor->host, wp->donor->port,
 		   LSN_FORMAT_ARGS(wp->truncateLsn));

 	/*
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)

 		readAnything = true;

+		/* should never happen: sk is expected to send ERROR instead */
+		if (sk->appendResponse.generation != wp->mconf.generation)
+		{
+			wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
+				   sk->greetResponse.nodeId, sk->host, sk->port,
+				   sk->appendResponse.generation, wp->mconf.generation);
+		}
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
 }

 /*
- * Calculate WAL position acknowledged by quorum
+ * GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
 */
 static XLogRecPtr
-GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
 {
 	XLogRecPtr	responses[MAX_SAFEKEEPERS];

 	/*
-	 * Sort acknowledged LSNs
+	 * Ascending sort acknowledged LSNs.
 	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	Assert(mset->len <= MAX_SAFEKEEPERS);
+	for (uint32 i = 0; i < mset->len; i++)
 	{
+		Safekeeper *sk = msk[i];
+
 		/*
 		 * Like in Raft, we aren't allowed to commit entries from previous
-		 * terms, so ignore reported LSN until it gets to epochStartLsn.
+		 * terms, so ignore reported LSN until it gets to propTermStartLsn.
+		 *
+		 * Note: we ignore sk state, which is ok: before first ack flushLsn is
+		 * 0, and later we just preserve value across reconnections. It would
+		 * be ok to check for SS_ACTIVE as well.
 		 */
-		responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
+		{
+			responses[i] = sk->appendResponse.flushLsn;
+		}
+		else
+		{
+			responses[i] = 0;
+		}
 	}
-	qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+	qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);

 	/*
-	 * Get the smallest LSN committed by quorum
+	 * And get value committed by the quorum. A way to view this: to get the
+	 * highest value committed on the quorum, in the ordered array we skip n -
+	 * n_quorum elements to get to the first (lowest) value present on all sks
+	 * of the highest quorum.
 	 */
-	return responses[wp->n_safekeepers - wp->quorum];
+	return responses[mset->len - MsetQuorum(mset)];
+}
+
+/*
+ * Calculate WAL position acknowledged by quorum, i.e. which may be regarded
+ * committed.
+ *
+ * Zero may be returned when there is no quorum of nodes recovered to term start
+ * lsn which sent feedback yet.
+ */
+static XLogRecPtr
+GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+{
+	XLogRecPtr	committed;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		XLogRecPtr	responses[MAX_SAFEKEEPERS];
+
+		/*
+		 * Sort acknowledged LSNs
+		 */
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			/*
+			 * Like in Raft, we aren't allowed to commit entries from previous
+			 * terms, so ignore reported LSN until it gets to
+			 * propTermStartLsn.
+			 *
+			 * Note: we ignore sk state, which is ok: before first ack
+			 * flushLsn is 0, and later we just preserve value across
+			 * reconnections. It would be ok to check for SS_ACTIVE as well.
+			 */
+			responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		}
+		qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+
+		/*
+		 * Get the smallest LSN committed by quorum
+		 */
+		return responses[wp->n_safekeepers - wp->quorum];
+	}
+
+	committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
+	if (wp->mconf.new_members.len > 0)
+	{
+		XLogRecPtr	new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
+
+		committed = Min(committed, new_mset_committed);
+	}
+	return committed;
 }

 /*
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
 	int			i;
 	XLogRecPtr	donor_lsn = InvalidXLogRecPtr;

-	if (wp->n_votes < wp->quorum)
+	if (wp->state < WPS_ELECTED)
 	{
 		wp_log(WARNING, "UpdateDonorShmem called before elections are won");
 		return;
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
 	 * about its position immediately after election before any feedbacks are
 	 * sent.
 	 */
-	if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
+	if (wp->donor->state >= SS_WAIT_ELECTED)
 	{
-		donor = &wp->safekeeper[wp->donor];
+		donor = wp->donor;
 		donor_lsn = wp->propTermStartLsn;
 	}

@@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	}

 	/*
-	 * Generally sync is done when majority switched the epoch so we committed
-	 * epochStartLsn and made the majority aware of it, ensuring they are
-	 * ready to give all WAL to pageserver. It would mean whichever majority
-	 * is alive, there will be at least one safekeeper who is able to stream
-	 * WAL to pageserver to make basebackup possible. However, since at the
-	 * moment we don't have any good mechanism of defining the healthy and
-	 * most advanced safekeeper who should push the wal into pageserver and
+	 * Generally sync is done when majority reached propTermStartLsn so we
+	 * committed it and made the majority aware of it, ensuring they are ready
+	 * to give all WAL to pageserver. It would mean whichever majority is
+	 * alive, there will be at least one safekeeper who is able to stream WAL
+	 * to pageserver to make basebackup possible. However, since at the moment
+	 * we don't have any good mechanism of defining the healthy and most
+	 * advanced safekeeper who should push the wal into pageserver and
 	 * basically the random one gets connected, to prevent hanging basebackup
 	 * (due to pageserver connecting to not-synced-safekeeper) we currently
 	 * wait for all seemingly alive safekeepers to get synced.
@@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 				n_synced++;
 		}

-		if (n_synced >= wp->quorum)
+		if (newCommitLsn >= wp->propTermStartLsn)
 		{
 			/* A quorum of safekeepers has been synced! */

--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
 * This and following structs pair ones in membership.rs.
 */
 typedef uint32 Generation;
+#define INVALID_GENERATION 0

 typedef struct SafekeeperId
 {
@@ -771,7 +772,17 @@ typedef struct WalProposer
 	/* Current walproposer membership configuration */
 	MembershipConfiguration mconf;

-	/* (n_safekeepers / 2) + 1 */
+	/*
+	 * Parallels mconf.members with pointers to the member's slot in
+	 * safekeepers array of connections, or NULL if such member is not
+	 * connected. Helps to avoid looking slot per id through all
+	 * .safekeepers[] when doing quorum checks.
+	 */
+	Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
+	/* As above, but for new_members. */
+	Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
+
+	/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
 	int			quorum;

 	/*
@@ -829,7 +840,7 @@ typedef struct WalProposer
 	term_t		donorLastLogTerm;

 	/* Most advanced acceptor */
-	int			donor;
+	Safekeeper *donor;

 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -31,6 +31,7 @@ pub async fn task_main_https(
    global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
    let cert_resolver = ReloadingCertificateResolver::new(
+        "main",
        &conf.ssl_key_file,
        &conf.ssl_cert_file,
        conf.ssl_cert_reload_period,
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
    ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
+    TimelineImportRequest,
 };
 use pageserver_api::models::{
    DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
        ForwardOutcome::NotForwarded(req) => req,
    };

-    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.step_down().await)
+    // Spawn a background task: once we start stepping down, we must finish: if the client drops
+    // their request we should avoid stopping in some part-stepped-down state.
+    let handle = tokio::spawn(async move {
+        let state = get_state(&req);
+        state.service.step_down().await
+    });
+
+    let result = handle
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    json_response(StatusCode::OK, result)
 }

 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
    )
 }

+async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
+
+    let state = get_state(&req);
+
+    if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
+            import_req.tenant_id,
+            import_req.timeline_id
+        )));
+    }
+
+    json_response(
+        StatusCode::OK,
+        state.service.timeline_import(import_req).await?,
+    )
+}
+
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -1949,6 +1991,16 @@ pub fn make_router(
                RequestName("debug_v1_tenant_locate"),
            )
        })
+        .post(
+            "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
+            |r| {
+                named_request_span(
+                    r,
+                    handle_timeline_import,
+                    RequestName("debug_v1_timeline_import"),
+                )
+            },
+        )
        .get("/debug/v1/scheduler", |r| {
            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
        })
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> {
            let https_listener = tcp_listener::bind(https_addr)?;

            let resolver = ReloadingCertificateResolver::new(
+                "main",
                &args.ssl_key_file,
                &args.ssl_cert_file,
                *args.ssl_cert_reload_period,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
-use utils::sync::gate::Gate;
+use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};

 use crate::background_node_operations::{
@@ -594,6 +594,8 @@ struct TenantShardSplitAbort {
    new_stripe_size: Option<ShardStripeSize>,
    /// Until this abort op is complete, no other operations may be done on the tenant
    _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    /// The reconciler gate for the duration of the split operation, and any included abort.
+    _gate: GateGuard,
 }

 #[derive(thiserror::Error, Debug)]
@@ -1460,7 +1462,7 @@ impl Service {
            // Retry until shutdown: we must keep this request object alive until it is properly
            // processed, as it holds a lock guard that prevents other operations trying to do things
            // to the tenant while it is in a weird part-split state.
-            while !self.cancel.is_cancelled() {
+            while !self.reconcilers_cancel.is_cancelled() {
                match self.abort_tenant_shard_split(&op).await {
                    Ok(_) => break,
                    Err(e) => {
@@ -1473,9 +1475,12 @@ impl Service {
                        // when we retry, so that the abort op will succeed.  If the abort op is failing
                        // for some other reason, we will keep retrying forever, or until a human notices
                        // and does something about it (either fixing a pageserver or restarting the controller).
-                        tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
-                            .await
-                            .ok();
+                        tokio::time::timeout(
+                            Duration::from_secs(5),
+                            self.reconcilers_cancel.cancelled(),
+                        )
+                        .await
+                        .ok();
                    }
                }
            }
@@ -1847,6 +1852,7 @@ impl Service {
        };

        if insert {
+            let config = attach_req.config.clone().unwrap_or_default();
            let tsp = TenantShardPersistence {
                tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
@@ -1855,7 +1861,7 @@ impl Service {
                generation: attach_req.generation_override.or(Some(0)),
                generation_pageserver: None,
                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
-                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                config: serde_json::to_string(&config).unwrap(),
                splitting: SplitState::default(),
                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                    .unwrap(),
@@ -1878,16 +1884,16 @@ impl Service {
                Ok(()) => {
                    tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);

-                    let mut locked = self.inner.write().unwrap();
-                    locked.tenants.insert(
+                    let mut shard = TenantShard::new(
                        attach_req.tenant_shard_id,
-                        TenantShard::new(
-                            attach_req.tenant_shard_id,
-                            ShardIdentity::unsharded(),
-                            PlacementPolicy::Attached(0),
-                            None,
-                        ),
+                        ShardIdentity::unsharded(),
+                        PlacementPolicy::Attached(0),
+                        None,
                    );
+                    shard.config = config;
+
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(attach_req.tenant_shard_id, shard);
                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
                }
            }
@@ -1972,11 +1978,12 @@ impl Service {
            .set_attached(scheduler, attach_req.node_id);

        tracing::info!(
-            "attach_hook: tenant {} set generation {:?}, pageserver {}",
+            "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
            attach_req.tenant_shard_id,
            tenant_shard.generation,
            // TODO: this is an odd number of 0xf's
-            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
+            attach_req.config,
        );

        // Trick the reconciler into not doing anything for this tenant: this helps
@@ -4910,7 +4917,7 @@ impl Service {
                    1,
                    10,
                    Duration::from_secs(5),
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                )
                .await
            {
@@ -5161,6 +5168,11 @@ impl Service {
        )
        .await;

+        let _gate = self
+            .reconcilers_gate
+            .enter()
+            .map_err(|_| ApiError::ShuttingDown)?;
+
        let new_shard_count = ShardCount::new(split_req.new_shard_count);
        let new_stripe_size = split_req.new_stripe_size;

@@ -5188,6 +5200,7 @@ impl Service {
                        new_shard_count,
                        new_stripe_size,
                        _tenant_lock,
+                        _gate,
                    })
                    // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
                    .ok();
@@ -5527,7 +5540,10 @@ impl Service {
                "failpoint".to_string()
            )));

-            failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
+            failpoint_support::sleep_millis_async!(
+                "shard-split-post-remote-sleep",
+                &self.reconcilers_cancel
+            );

            tracing::info!(
                "Split {} into {}",
@@ -5585,7 +5601,7 @@ impl Service {
                        stripe_size,
                        preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
                    },
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                )
                .await
            {
@@ -8670,9 +8686,24 @@ impl Service {
        failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");

        self.inner.write().unwrap().step_down();
-        // TODO: would it make sense to have a time-out for this?
-        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
-            .await;
+
+        // Wait for reconciliations to stop, or terminate this process if they
+        // fail to stop in time (this indicates a bug in shutdown)
+        tokio::select! {
+            _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
+                tracing::info!("Reconciliations stopped, proceeding with step down");
+            }
+            _ = async {
+                failpoint_support::sleep_millis_async!("step-down-delay-timeout");
+                tokio::time::sleep(Duration::from_secs(10)).await
+            } => {
+                tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
+
+                // The caller may proceed to act as leader when it sees this request fail: reduce the chance
+                // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
+                std::process::exit(1);
+            }
+        }

        let mut global_observed = GlobalObservedState::default();
        let locked = self.inner.read().unwrap();
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -12,13 +12,16 @@ use crate::persistence::{
 use crate::safekeeper::Safekeeper;
 use anyhow::Context;
 use http_utils::error::ApiError;
-use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
+use pageserver_api::controller_api::{
+    SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
+};
 use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::membership::{MemberSet, SafekeeperId};
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
+use utils::lsn::Lsn;

 use super::Service;

@@ -298,6 +301,31 @@ impl Service {
            timeline_id,
        })
    }
+
+    /// Directly insert the timeline into the database without reconciling it with safekeepers.
+    ///
+    /// Useful if the timeline already exists on the specified safekeepers,
+    /// but we want to make it storage controller managed.
+    pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
+        let persistence = TimelinePersistence {
+            tenant_id: req.tenant_id.to_string(),
+            timeline_id: req.timeline_id.to_string(),
+            start_lsn: Lsn::INVALID.into(),
+            generation: 1,
+            sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
+            new_sk_set: None,
+            cplane_notified_generation: 1,
+            deleted_at: None,
+        };
+        let inserted = self.persistence.insert_timeline(persistence).await?;
+        if inserted {
+            tracing::info!("imported timeline into db");
+        } else {
+            tracing::info!("didn't import timeline into db, as it is already present in db");
+        }
+        Ok(())
+    }
+
    /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
    pub(super) async fn tenant_timeline_delete_safekeepers(
        self: &Arc<Self>,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,6 +14,7 @@ import threading
 import time
 import uuid
 from collections import defaultdict
+from collections.abc import Mapping
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
@@ -79,7 +80,12 @@ from fixtures.remote_storage import (
    default_remote_storage,
    remote_storage_to_toml_dict,
 )
-from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.http import (
+    MembershipConfiguration,
+    SafekeeperHttpClient,
+    SafekeeperId,
+    TimelineCreateRequest,
+)
 from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
@@ -1980,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
        tenant_shard_id: TenantId | TenantShardId,
        pageserver_id: int,
        generation_override: int | None = None,
+        config: None | dict[str, Any] = None,
    ) -> int:
        body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
        if generation_override is not None:
            body["generation_override"] = generation_override
+        if config is not None:
+            body["config"] = config

        response = self.request(
            "POST",
@@ -2878,13 +2887,14 @@ class NeonPageserver(PgProtocol, LogUtils):
        self,
        immediate: bool = False,
        timeout_in_seconds: int | None = None,
+        extra_env_vars: dict[str, str] | None = None,
    ):
        """
        High level wrapper for restart: restarts the process, and waits for
        tenant state to stabilize.
        """
        self.stop(immediate=immediate)
-        self.start(timeout_in_seconds=timeout_in_seconds)
+        self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
        self.quiesce_tenants()

    def quiesce_tenants(self):
@@ -2973,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils):
        to call into the pageserver HTTP client.
        """
        client = self.http_client()
-        if generation is None:
-            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
-        elif override_storage_controller_generation:
+        if generation is None or override_storage_controller_generation:
            generation = self.env.storage_controller.attach_hook_issue(
-                tenant_id, self.id, generation
+                tenant_id,
+                self.id,
+                generation_override=generation if override_storage_controller_generation else None,
+                config=config,
            )
        return client.tenant_attach(
            tenant_id,
@@ -4291,28 +4302,29 @@ class Endpoint(PgProtocol, LogUtils):

    def respec_deep(self, **kwargs: Any) -> None:
        """
-        Update the endpoint.json file taking into account nested keys.
-        It does one level deep update. Should enough for most cases.
-        Distinct method from respec() to do not break existing functionality.
+        Update the spec.json file taking into account nested keys.
+        Distinct method from respec() to not break existing functionality.
        NOTE: This method also updates the spec.json file, not endpoint.json.
        We need it because neon_local also writes to spec.json, so intended
        use-case is i) start endpoint with some config, ii) respec_deep(),
        iii) call reconfigure() to apply the changes.
        """
+
+        def update(curr, patch):
+            for k, v in patch.items():
+                if isinstance(v, Mapping):
+                    curr[k] = update(curr.get(k, {}), v)
+                else:
+                    curr[k] = v
+            return curr
+
        config_path = os.path.join(self.endpoint_path(), "spec.json")
        with open(config_path) as f:
            data_dict: dict[str, Any] = json.load(f)

        log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))

-        for key, value in kwargs.items():
-            if isinstance(value, dict):
-                if key not in data_dict:
-                    data_dict[key] = value
-                else:
-                    data_dict[key] = {**data_dict[key], **value}
-            else:
-                data_dict[key] = value
+        update(data_dict, kwargs)

        with open(config_path, "w") as file:
            log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
@@ -4839,6 +4851,50 @@ class Safekeeper(LogUtils):

        wait_until(paused)

+    @staticmethod
+    def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]:
+        return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks]
+
+    @staticmethod
+    def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]:
+        """
+        List of Safekeepers which are members in `mconf`.
+        """
+        members_ids = [m.id for m in mconf.members]
+        new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else []
+        return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids]
+
+    @staticmethod
+    def create_timeline(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        ps: NeonPageserver,
+        mconf: MembershipConfiguration,
+        members_sks: list[Safekeeper],
+    ):
+        """
+        Manually create timeline on safekeepers with given (presumably inital)
+        mconf: figure out LSN from pageserver, bake request and execute it on
+        given safekeepers.
+
+        Normally done by storcon, but some tests want to do it manually so far.
+        """
+        ps_http_cli = ps.http_client()
+        # figure out initial LSN.
+        ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
+        init_lsn = ps_timeline_detail["last_record_lsn"]
+        log.info(f"initial LSN: {init_lsn}")
+        # sk timeline creation request expects minor version
+        pg_version = ps_timeline_detail["pg_version"] * 10000
+        # create inital mconf
+        create_r = TimelineCreateRequest(
+            tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+        )
+        log.info(f"sending timeline create: {create_r.to_json()}")
+
+        for sk in members_sks:
+            sk.http_client().timeline_create(create_r)
+

 class NeonBroker(LogUtils):
    """An object managing storage_broker instance"""
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -65,13 +65,11 @@ def single_timeline(
    assert ps_http.tenant_list() == []

    def attach(tenant):
-        # NB: create the new tenant in the storage controller with the correct tenant config. This
-        # will pick up the existing tenant data from remote storage. If we just attach it to the
-        # Pageserver, the storage controller will reset the tenant config to the default.
-        env.create_tenant(
-            tenant_id=tenant,
-            timeline_id=template_timeline,
-            conf=template_config,
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config,
+            generation=100,
+            override_storage_controller_generation=True,
        )

    with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -25,7 +25,7 @@ class Walreceiver:

@dataclass
 class SafekeeperTimelineStatus:
-    mconf: Configuration | None
+    mconf: MembershipConfiguration | None
    term: int
    last_log_term: int
    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
@@ -78,17 +78,17 @@ class SafekeeperId:


@dataclass
-class Configuration:
+class MembershipConfiguration:
    generation: int
    members: list[SafekeeperId]
    new_members: list[SafekeeperId] | None

    @classmethod
-    def from_json(cls, d: dict[str, Any]) -> Configuration:
+    def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration:
        generation = d["generation"]
        members = d["members"]
        new_members = d.get("new_members")
-        return Configuration(generation, members, new_members)
+        return MembershipConfiguration(generation, members, new_members)

    def to_json(self) -> str:
        return json.dumps(self, cls=EnhancedJSONEncoder)
@@ -98,7 +98,7 @@ class Configuration:
 class TimelineCreateRequest:
    tenant_id: TenantId
    timeline_id: TimelineId
-    mconf: Configuration
+    mconf: MembershipConfiguration
    # not exactly PgVersion, for example 150002 for 15.2
    pg_version: int
    start_lsn: Lsn
@@ -110,13 +110,13 @@ class TimelineCreateRequest:

@dataclass
 class TimelineMembershipSwitchResponse:
-    previous_conf: Configuration
-    current_conf: Configuration
+    previous_conf: MembershipConfiguration
+    current_conf: MembershipConfiguration

    @classmethod
    def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
-        previous_conf = Configuration.from_json(d["previous_conf"])
-        current_conf = Configuration.from_json(d["current_conf"])
+        previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
+        current_conf = MembershipConfiguration.from_json(d["current_conf"])
        return TimelineMembershipSwitchResponse(previous_conf, current_conf)


@@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        resj = res.json()
        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
        # It is always normally not None, it is allowed only to make forward compat tests happy.
-        mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
+        mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None
        return SafekeeperTimelineStatus(
            mconf=mconf,
            term=resj["acceptor_state"]["term"],
@@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        return self.timeline_status(tenant_id, timeline_id).commit_lsn

    # Get timeline membership configuration.
-    def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
+    def get_membership(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> MembershipConfiguration:
        # make mypy happy
        return self.timeline_status(tenant_id, timeline_id).mconf  # type: ignore

@@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        return res_json

    def timeline_exclude(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
    ) -> dict[str, Any]:
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude",
@@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
        return res_json

    def membership_switch(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
    ) -> TimelineMembershipSwitchResponse:
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count(

    n_txns = 500000

-    def setup_wrapper(env: NeonEnv):
-        return setup_tenant_template(env, n_txns)
-
    env = setup_pageserver_with_tenants(
-        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+        neon_env_builder,
+        f"large_slru_count-{n_tenants}-{n_txns}",
+        n_tenants,
+        lambda env: setup_tenant_template(env, n_txns),
    )
    run_benchmark(env, pg_bin, record, duration)

@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
        "gc_period": "0s",  # disable periodic gc
        "checkpoint_timeout": "10 years",
        "compaction_period": "0s",  # disable periodic compaction
-        "compaction_threshold": 10,
-        "compaction_target_size": 134217728,
-        "checkpoint_distance": 268435456,
-        "image_creation_threshold": 3,
    }

    template_tenant, template_timeline = env.create_tenant(set_default=True)
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -1,5 +1,8 @@
+import concurrent.futures
 import dataclasses
 import json
+import re
+import threading
 import time
 from dataclasses import dataclass
 from pathlib import Path
@@ -31,15 +34,15 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
    mode: str = "pipelined"


-EXECUTION = ["concurrent-futures", "tasks"]
+EXECUTION = ["concurrent-futures"]

 NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
    for execution in EXECUTION:
        NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))

-BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 2, 4, 8, 16, 32]:
+BATCHABLE: list[PageServicePipeliningConfig] = []
+for max_batch_size in [32]:
    for execution in EXECUTION:
        BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))

@@ -47,19 +50,6 @@ for max_batch_size in [1, 2, 4, 8, 16, 32]:
@pytest.mark.parametrize(
    "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
    [
-        # non-batchable workloads
-        # (A separate benchmark will consider latency).
-        *[
-            (
-                50,
-                config,
-                TARGET_RUNTIME,
-                1,
-                128,
-                f"not batchable {dataclasses.asdict(config)}",
-            )
-            for config in NON_BATCHABLE
-        ],
        # batchable workloads should show throughput and CPU efficiency improvements
        *[
            (
@@ -137,7 +127,14 @@ def test_throughput(

    env = neon_env_builder.init_start()
    ps_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # minimal lfc & small shared buffers to force requests to pageserver
+            "neon.max_file_cache_size=1MB",
+            "shared_buffers=10MB",
+        ],
+    )
    conn = endpoint.connect()
    cur = conn.cursor()

@@ -155,7 +152,6 @@ def test_throughput(
    tablesize = tablesize_mib * 1024 * 1024
    npages = tablesize // (8 * 1024)
    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
-    # TODO: can we force postgres to do sequential scans?

    #
    # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
@@ -211,31 +207,73 @@ def test_throughput(
                ).value,
            )

-    def workload() -> Metrics:
+    def workload(disruptor_started: threading.Event) -> Metrics:
+        disruptor_started.wait()
        start = time.time()
        iters = 0
        while time.time() - start < target_runtime or iters < 2:
-            log.info("Seqscan %d", iters)
            if iters == 1:
                # round zero for warming up
                before = get_metrics()
-            cur.execute(
-                "select clear_buffer_cache()"
-            )  # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
            cur.execute("select sum(data::bigint) from t")
            assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
            iters += 1
        after = get_metrics()
        return (after - before).normalize(iters - 1)

+    def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
+        conn = endpoint.connect()
+        cur = conn.cursor()
+        iters = 0
+        while True:
+            cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
+            if stop_disruptor.is_set():
+                break
+            disruptor_started.set()
+            iters += 1
+            time.sleep(0.001)
+        return iters
+
    env.pageserver.patch_config_toml_nonrecursive(
        {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
    )
-    env.pageserver.restart()
-    metrics = workload()
+
+    # set trace for log analysis below
+    env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
+
+    log.info("Starting workload")
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        disruptor_started = threading.Event()
+        stop_disruptor = threading.Event()
+        disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
+        workload_fut = executor.submit(workload, disruptor_started)
+        metrics = workload_fut.result()
+        stop_disruptor.set()
+        ndisruptions = disruptor_fut.result()
+        log.info("Disruptor issued %d disrupting requests", ndisruptions)

    log.info("Results: %s", metrics)

+    since_last_start: list[str] = []
+    for line in env.pageserver.logfile.read_text().splitlines():
+        if "git:" in line:
+            since_last_start = []
+        since_last_start.append(line)
+
+    stopping_batching_because_re = re.compile(
+        r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)"
+    )
+    reasons_for_stopping_batching = {}
+    for line in since_last_start:
+        match = stopping_batching_because_re.search(line)
+        if match:
+            if match.group(1) not in reasons_for_stopping_batching:
+                reasons_for_stopping_batching[match.group(1)] = 0
+            reasons_for_stopping_batching[match.group(1)] += 1
+
+    log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching)
+
    #
    # Sanity-checks on the collected data
    #
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import copy
 import json
 import uuid
 from typing import TYPE_CHECKING
@@ -16,7 +15,6 @@ if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder, PgBin


-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
 def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()

@@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
                "refill_interval": "100ms",
                "refill_amount": int(rate_limit_rps / 10),
                "max": int(rate_limit_rps / 10),
-                "fair": True,
            },
        },
    )
@@ -98,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
    _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))

    log.info("run pagebench")
-    duration_secs = 10
+    duration_secs = 20
    actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)

    log.info("validate the client is capped at the configured rps limit")
    expect_ncompleted = duration_secs * rate_limit_rps
-    delta_abs = abs(expect_ncompleted - actual_ncompleted)
-    threshold = 0.05 * expect_ncompleted
-    assert threshold / rate_limit_rps < 0.1 * duration_secs, (
-        "test self-test: unrealistic expecations regarding precision in this test"
-    )
-    assert delta_abs < 0.05 * expect_ncompleted, (
+    assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
        "the throttling deviates more than 5percent from the expectation"
    )

@@ -122,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
        timeout=compaction_period,
    )

+    log.info("validate the metrics")
    smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
    assert smgr_query_seconds_post is not None
    throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
@@ -130,72 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
    actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
    actual_throttled_secs = actual_throttled_usecs / 1_000_000

-    log.info("validate that the metric doesn't include throttle wait time")
-    assert duration_secs >= 10 * actual_smgr_query_seconds, (
-        "smgr metrics should not include throttle wait time"
-    )
-
-    log.info("validate that the throttling wait time metrics is correct")
    assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
-        "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
+        "throttling and processing latency = total request time; this assert validates thi holds on average"
    )

-
-throttle_config_with_field_fair_set = {
-    "task_kinds": ["PageRequestHandler"],
-    "fair": True,
-    "initial": 27,
-    "refill_interval": "43s",
-    "refill_amount": 23,
-    "max": 42,
-}
-
-
-def assert_throttle_config_with_field_fair_set(conf):
-    """
-    Field `fair` is ignored, so, responses don't contain it
-    """
-    without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
-    without_fair.pop("fair")
-
-    assert conf == without_fair
-
-
-def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-    env = neon_env_builder.init_start()
-    vps_http = env.storage_controller.pageserver_api()
-    # with_fair config should still be settable
-    vps_http.set_tenant_config(
-        env.initial_tenant,
-        {"timeline_get_throttle": throttle_config_with_field_fair_set},
-    )
-    conf = vps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-    assert_throttle_config_with_field_fair_set(
-        conf.tenant_specific_overrides["timeline_get_throttle"]
-    )
-
-
-def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
-    neon_env_builder: NeonEnvBuilder,
-):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-
-    def set_tenant_config(ps_cfg):
-        tenant_config = ps_cfg.setdefault("tenant_config", {})
-        tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
-
-    neon_env_builder.pageserver_config_override = set_tenant_config
-    env = neon_env_builder.init_start()
-    ps_http = env.pageserver.http_client()
-    conf = ps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-
-    env.pageserver.allowed_errors.append(
-        r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
+    # without this assertion, the test would pass even if the throttling was completely broken
+    # but the request processing is so slow that it makes up for the latency that a correct throttling
+    # implementation would add
+    assert actual_smgr_query_seconds < 0.66 * duration_secs, (
+        "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
    )
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
            pageserver.tenant_location_configure(tenant_id, location_conf)
            last_state[pageserver.id] = (mode, generation)

-            if mode.startswith("Attached"):
+            # It's only valid to connect to the last generation. Newer generations may yank layer
+            # files used in older generations.
+            last_generation = max(
+                [s[1] for s in last_state.values() if s[1] is not None], default=None
+            )
+
+            if mode.startswith("Attached") and generation == last_generation:
                # This is a basic test: we are validating that he endpoint works properly _between_
                # configuration changes.  A stronger test would be to validate that clients see
                # no errors while we are making the changes.
--- a/test_runner/regress/test_ssl.py
+++ b/test_runner/regress/test_ssl.py
@@ -1,5 +1,6 @@
 import os
 import ssl
+from datetime import datetime, timedelta

 import pytest
 import requests
@@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder):
    requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
    cur_cert = ssl.get_server_certificate(("localhost", port))
    assert cur_cert == sk_cert
+
+
+def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder):
+    """
+    Test metrics exported from http/https server and tls cert reloader.
+    """
+    neon_env_builder.use_https_pageserver_api = True
+    neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'"
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*Error reloading certificate.*")
+
+    ps_client = env.pageserver.http_client()
+
+    # 1. Test connection started metric.
+    filter_https = {"scheme": "https"}
+    old_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+
+    addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
+    requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
+
+    new_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+    # The counter should increase after the request,
+    # but it may increase by more than one because of storcon requests.
+    assert new_https_conn_count > old_https_conn_count
+
+    # 2. Test tls connection error.
+    # Request without specified CA cert file should fail.
+    with pytest.raises(requests.exceptions.SSLError):
+        requests.get(addr)
+
+    tls_error_cnt = (
+        ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0
+    )
+    assert tls_error_cnt == 1
+
+    # 3. Test expiration time metric.
+    expiration_time = datetime.fromtimestamp(
+        ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0
+    )
+    now = datetime.now()
+    # neon_local generates certs valid for 100 years.
+    # Compare with +-1 year to not care about leap years.
+    assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101)
+
+    # 4. Test cert reload failed metric.
+    reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total")
+    assert reload_error_cnt == 0
+
+    os.remove(env.pageserver.workdir / "server.crt")
+
+    def reload_failed():
+        reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0
+        assert reload_error_cnt > 0
+
+    wait_until(reload_failed)
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer(
        )


+@pytest.mark.parametrize("step_down_times_out", [False, True])
 def test_storage_controller_leadership_transfer_during_split(
    neon_env_builder: NeonEnvBuilder,
    storage_controller_proxy: StorageControllerProxy,
    port_distributor: PortDistributor,
+    step_down_times_out: bool,
 ):
    """
    Exercise a race between shard splitting and graceful leadership transfer.  This is
@@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
        )
    env.storage_controller.reconcile_until_idle()

+    # We are testing scenarios where the step down API does not complete: either because it is stuck
+    # doing a shard split, or because it totally times out on some other failpoint.
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*step_down.*request was dropped before completing.*",
+            ".*step_down.*operation timed out.*",
+            ".*Send step down request failed, will retry.*",
+            ".*Send step down request still failed after.*retries.*",
+            ".*Leader .+ did not respond to step-down request.*",
+        ]
+    )
+
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        # Start a shard split
        env.storage_controller.allowed_errors.extend(
@@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
        )
        pause_failpoint = "shard-split-pre-complete"
        env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
+
+        if not step_down_times_out:
+            # Prevent the timeout self-terminate code from executing: we will block step down on the
+            # shard split itself
+            env.storage_controller.configure_failpoints(
+                ("step-down-delay-timeout", "return(3600000)")
+            )
+
        split_fut = executor.submit(
            env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
        )
@@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split(
            timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
        )

+        if step_down_times_out:
+            # Step down will time out, original controller will terminate itself
+            env.storage_controller.allowed_errors.extend([".*terminating process.*"])
+        else:
+            # Step down does not time out: original controller hits its shard split completion
+            # code path and realises that it must not purge the parent shards from the database.
+            env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
+
        def passed_split_abort():
            try:
                log.info("Checking log for pattern...")
-                assert env.storage_controller.log_contains(
-                    ".*Using observed state received from leader.*"
-                )
+                # This log is indicative of entering startup_reconcile, which happens
+                # after the point we would abort shard splits
+                assert env.storage_controller.log_contains(".*Populating tenant shards.*")
            except Exception:
                log.exception("Failed to find pattern in log")
                raise
@@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split(
        wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
        assert env.storage_controller.log_contains(".*Aborting shard split.*")

-        # Proxy is still talking to original controller here: disable its pause failpoint so
-        # that its shard split can run to completion.
-        log.info("Disabling failpoint")
-        # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
-        # on handling the shard split request.
-        env.storage_controller.request(
-            "PUT",
-            f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
-            json=[{"name": "shard-split-pre-complete", "actions": "off"}],
-            headers=env.storage_controller.headers(TokenScope.ADMIN),
-        )
+        if step_down_times_out:
+            # We will let the old controller hit a timeout path where it terminates itself, rather than
+            # completing step_down and trying to complete a shard split
+            def old_controller_terminated():
+                assert env.storage_controller.log_contains(".*terminating process.*")

-        def previous_stepped_down():
-            assert (
-                env.storage_controller.get_leadership_status()
-                == StorageControllerLeadershipStatus.STEPPED_DOWN
+            wait_until(old_controller_terminated)
+        else:
+            # Proxy is still talking to original controller here: disable its pause failpoint so
+            # that its shard split can run to completion.
+            log.info("Disabling failpoint")
+            # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
+            # on handling the shard split request.
+            env.storage_controller.request(
+                "PUT",
+                f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
+                json=[{"name": "shard-split-pre-complete", "actions": "off"}],
+                headers=env.storage_controller.headers(TokenScope.ADMIN),
            )

-        log.info("Awaiting step down")
-        wait_until(previous_stepped_down)
+            def previous_stepped_down():
+                assert (
+                    env.storage_controller.get_leadership_status()
+                    == StorageControllerLeadershipStatus.STEPPED_DOWN
+                )

-        # Let the shard split complete: this may happen _after_ the replacement has come up
-        # and tried to clean up the databases
-        log.info("Unblocking & awaiting shard split")
-        with pytest.raises(Exception, match="Unexpected child shard count"):
-            # This split fails when it tries to persist results, because it encounters
-            # changes already made by the new controller's abort-on-startup
-            split_fut.result()
+            log.info("Awaiting step down")
+            wait_until(previous_stepped_down)
+
+            # Let the shard split complete: this may happen _after_ the replacement has come up
+            # and tried to clean up the databases
+            log.info("Unblocking & awaiting shard split")
+            with pytest.raises(Exception, match="Unexpected child shard count"):
+                # This split fails when it tries to persist results, because it encounters
+                # changes already made by the new controller's abort-on-startup
+                split_fut.result()

        log.info("Routing to new leader")
        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split(
    env.storage_controller.wait_until_ready()
    env.storage_controller.consistency_check()

-    # Check that the stepped down instance forwards requests
-    # to the new leader while it's still running.
-    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
-    env.storage_controller.tenant_shard_dump()
-    env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
-    status = env.storage_controller.node_status(env.pageservers[0].id)
-    assert status["scheduling"] == "Pause"
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_shard_dump()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"


 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -45,7 +45,7 @@ from fixtures.remote_storage import (
    s3_storage,
 )
 from fixtures.safekeeper.http import (
-    Configuration,
+    MembershipConfiguration,
    SafekeeperHttpClient,
    SafekeeperId,
    TimelineCreateRequest,
@@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
    for sk in env.safekeepers:
        sk.start()
        cli = sk.http_client()
-        mconf = Configuration(generation=0, members=[], new_members=None)
+        mconf = MembershipConfiguration(generation=0, members=[], new_members=None)
        # set start_lsn to the beginning of the first segment to allow reading
        # WAL from there (could you intidb LSN as well).
        r = TimelineCreateRequest(
@@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
    sk_id_2 = SafekeeperId(11, "localhost", 5434)  # just a mock

    # Request to switch before timeline creation should fail.
-    init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None)
+    init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None)
    with pytest.raises(requests.exceptions.HTTPError):
        http_cli.membership_switch(tenant_id, timeline_id, init_conf)

@@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
    http_cli.timeline_create(create_r)

    # Switch into some conf.
-    joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
+    joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
    resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf)
    log.info(f"joint switch resp: {resp}")
    assert resp.previous_conf.generation == 1
@@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
    assert after_restart.generation == 4

    # Switch into non joint conf of which sk is not a member, must fail.
-    non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None)
+    non_joint_not_member = MembershipConfiguration(
+        generation=5, members=[sk_id_2], new_members=None
+    )
    with pytest.raises(requests.exceptions.HTTPError):
        resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member)

    # Switch into good non joint conf.
-    non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None)
+    non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None)
    resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
    log.info(f"non joint switch resp: {resp}")
    assert resp.previous_conf.generation == 4
    assert resp.current_conf.generation == 6

    # Switch request to lower conf should be rejected.
-    lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None)
+    lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None)
    with pytest.raises(requests.exceptions.HTTPError):
        http_cli.membership_switch(tenant_id, timeline_id, lower_conf)

    # Now, exclude sk from the membership, timeline should be deleted.
-    excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None)
+    excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None)
    http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf)
    with pytest.raises(requests.exceptions.HTTPError):
        http_cli.timeline_status(tenant_id, timeline_id)
@@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    ps = env.pageservers[0]
-    ps_http_cli = ps.http_client()
-
-    http_clis = [sk.http_client() for sk in env.safekeepers]
-
    config_lines = [
        "neon.safekeeper_proto_version = 3",
    ]
@@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
    # expected to fail because timeline is not created on safekeepers
    with pytest.raises(Exception, match=r".*timed out.*"):
        ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
-    # figure out initial LSN.
-    ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
-    init_lsn = ps_timeline_detail["last_record_lsn"]
-    log.info(f"initial LSN: {init_lsn}")
-    # sk timeline creation request expects minor version
-    pg_version = ps_timeline_detail["pg_version"] * 10000
    # create inital mconf
-    sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
-    mconf = Configuration(generation=1, members=sk_ids, new_members=None)
-    create_r = TimelineCreateRequest(
-        tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+    mconf = MembershipConfiguration(
+        generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None
    )
-    log.info(f"sending timeline create: {create_r.to_json()}")
-
-    for sk_http_cli in http_clis:
-        sk_http_cli.timeline_create(create_r)
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers)
    # Once timeline created endpoint should start.
    ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
    ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
    Safekeeper,
 )
 from fixtures.remote_storage import RemoteStorageKind
+from fixtures.safekeeper.http import MembershipConfiguration
 from fixtures.utils import skip_in_debug_build

 if TYPE_CHECKING:
@@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
    asyncio.run(run_concurrent_computes(env))


+async def assert_query_hangs(endpoint: Endpoint, query: str):
+    """
+    Start on endpoint query which is expected to hang and check that it does.
+    """
+    conn = await endpoint.connect_async()
+    bg_query = asyncio.create_task(conn.execute(query))
+    await asyncio.sleep(2)
+    assert not bg_query.done()
+    return bg_query
+
+
 # Stop safekeeper and check that query cannot be executed while safekeeper is down.
 # Query will insert a single row into a table.
-async def check_unavailability(
-    sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2
-):
+async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2):
    # shutdown one of two acceptors, that is, majority
    sk.stop()

-    bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')"))
-
-    await asyncio.sleep(start_delay_sec)
-    # ensure that the query has not been executed yet
-    assert not bg_query.done()
-
+    bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')")
    # start safekeeper and await the query
    sk.start()
    await bg_query
@@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint):
    await conn.execute("INSERT INTO t values (1, 'payload')")

    # stop safekeeper and check that query cannot be executed while safekeeper is down
-    await check_unavailability(env.safekeepers[0], conn, 2)
+    await check_unavailability(env.safekeepers[0], endpoint, 2)

    # for the world's balance, do the same with second safekeeper
-    await check_unavailability(env.safekeepers[1], conn, 3)
+    await check_unavailability(env.safekeepers[1], endpoint, 3)

    # check that we can execute queries after restart
    await conn.execute("INSERT INTO t values (4, 'payload')")
@@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv):
    # insert with only one safekeeper up to create tail of flushed but not committed WAL
    sk1.stop()
    sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 2000), 'payload'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'")
    # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
    ep.stop_and_destroy()

@@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
    # insert with only one sk3 up to create tail of flushed but not committed WAL on it
    sk1.stop()
    sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'")
    # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
    ep.stop_and_destroy()

@@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi
    asyncio.run(run_wal_truncation(env, safekeeper_proto_version))


+async def quorum_sanity_single(
+    env: NeonEnv,
+    compute_sks_ids: list[int],
+    members_sks_ids: list[int],
+    new_members_sks_ids: list[int] | None,
+    sks_to_stop_ids: list[int],
+    should_work_when_stopped: bool,
+):
+    """
+    *_ids params contain safekeeper node ids; it is assumed they are issued
+    from 1 and sequentially assigned to env.safekeepers.
+    """
+    members_sks = [env.safekeepers[i - 1] for i in members_sks_ids]
+    new_members_sks = (
+        [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None
+    )
+    sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids]
+
+    mconf = MembershipConfiguration(
+        generation=1,
+        members=Safekeeper.sks_to_safekeeper_ids(members_sks),
+        new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None,
+    )
+    members_sks = Safekeeper.mconf_sks(env, mconf)
+
+    tenant_id = env.initial_tenant
+    compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids])
+    members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members])
+    new_members_sks_ids_str = "-".join(
+        [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else []
+    )
+    sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop])
+    log.info(
+        f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}"
+    )
+    branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}"
+    timeline_id = env.create_branch(branch_name)
+
+    # create timeline on `members_sks`
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks)
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    ep = env.endpoints.create(branch_name, config_lines=config_lines)
+    ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids)
+    ep.safe_psql("create table t(key int, value text)")
+
+    # stop specified sks and check whether writes work
+    for sk in sks_to_stop:
+        sk.stop()
+    if should_work_when_stopped:
+        log.info("checking that writes still work")
+        ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'")
+        # restarting ep should also be fine
+        ep.stop()
+        ep.start()
+        ep.safe_psql("insert into t select generate_series(1, 100), 'plum'")
+        bg_query = None
+    else:
+        log.info("checking that writes hang")
+        bg_query = await assert_query_hangs(
+            ep, "insert into t select generate_series(1, 100), 'Papaya'"
+        )
+    # start again; now they should work
+    for sk in sks_to_stop:
+        sk.start()
+    if bg_query:
+        log.info("awaiting query")
+        await bg_query
+
+
+# It's a bit tempting to iterate over all possible combinations, but let's stick
+# with this for now.
+async def run_quorum_sanity(env: NeonEnv):
+    # 3 members, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True)
+    # 3 members, 1/3 up, should not work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False)
+
+    # 3 members, all up, should work; wp redundantly talks to 4th.
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False)
+
+    # joint conf of 1-2-3 and 4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True)
+    # joint conf of 1-2-3 and 4, 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False)
+
+    # joint conf of 1-2-3 and 2-3-4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True)
+    # joint conf of 1-2-3 and 2-3-4, 2 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True)
+    # joint conf of 1-2-3 and 2-3-4, 3 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False)
+    # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False)
+
+    # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only.
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # with 1 down should still be ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True)
+    # but with 2 down not ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False)
+
+
+# Test various combinations of membership configurations / neon.safekeepers
+# (list of safekeepers endpoint connects to) values / up & down safekeepers and
+# check that endpont can start and write data when we have quorum and can't when
+# we don't.
+def test_quorum_sanity(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 4
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_quorum_sanity(env))
+
+
 async def run_segment_init_failure(env: NeonEnv):
    env.create_branch("test_segment_init_failure")
    ep = env.endpoints.create_start("test_segment_init_failure")