diff --git a/Cargo.lock b/Cargo.lock
index aea8924f4f..5d2cdcea27 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2837,6 +2837,7 @@ dependencies = [
  "utils",
  "uuid",
  "workspace_hack",
+ "x509-cert",
 ]
 
 [[package]]
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index ad8925e7ab..06d5bbb9c5 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -523,11 +523,14 @@ impl ComputeNode {
 
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
         info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
             pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
             pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
             pspec.tenant_id,
             pspec.timeline_id,
+            pspec.spec.project_id.as_deref().unwrap_or("None"),
+            pspec.spec.branch_id.as_deref().unwrap_or("None"),
+            pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
             pspec.spec.features,
             pspec.spec.remote_extensions,
         );
@@ -631,19 +634,23 @@ impl ComputeNode {
             });
         }
 
-        // Configure and start rsyslog for HIPAA if necessary
-        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
-            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-            if remote_endpoint.is_empty() {
-                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+        // Configure and start rsyslog for compliance audit logging
+        match pspec.spec.audit_log_level {
+            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
+                let remote_endpoint =
+                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
+                if remote_endpoint.is_empty() {
+                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+                }
+
+                let log_directory_path = Path::new(&self.params.pgdata).join("log");
+                let log_directory_path = log_directory_path.to_string_lossy().to_string();
+                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Launch a background task to clean up the audit logs
+                launch_pgaudit_gc(log_directory_path);
             }
-
-            let log_directory_path = Path::new(&self.params.pgdata).join("log");
-            let log_directory_path = log_directory_path.to_string_lossy().to_string();
-            configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
-
-            // Launch a background task to clean up the audit logs
-            launch_pgaudit_gc(log_directory_path);
+            _ => {}
         }
 
         // Configure and start rsyslog for Postgres logs export
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 92939f816c..71c6123c3b 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
             escape_conf_value(&s.to_string())
         )?;
     }
+    if let Some(s) = &spec.project_id {
+        writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.branch_id {
+        writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.endpoint_id {
+        writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
+    }
 
     // tls
     if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
     // and don't allow the user or the control plane admin to change them.
     match spec.audit_log_level {
         ComputeAudit::Disabled => {}
-        ComputeAudit::Log => {
+        ComputeAudit::Log | ComputeAudit::Base => {
             writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
             writeln!(file, "pgaudit.log='ddl,role'")?;
             // Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
             }
             writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
         }
-        ComputeAudit::Hipaa => {
+        ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
             writeln!(
                 file,
                 "# Managed by compute_ctl compliance audit settings: begin"
             )?;
-            // This log level is very verbose
-            // but this is necessary for HIPAA compliance.
-            // Exclude 'misc' category, because it doesn't contain anythig relevant.
-            writeln!(file, "pgaudit.log='all, -misc'")?;
-            writeln!(file, "pgaudit.log_parameter=on")?;
+            // Enable logging of parameters.
+            // This is very verbose and may contain sensitive data.
+            if spec.audit_log_level == ComputeAudit::Full {
+                writeln!(file, "pgaudit.log_parameter=on")?;
+                writeln!(file, "pgaudit.log='all'")?;
+            } else {
+                writeln!(file, "pgaudit.log_parameter=off")?;
+                writeln!(file, "pgaudit.log='all, -misc'")?;
+            }
             // Disable logging of catalog queries
             // The catalog doesn't contain sensitive data, so we don't need to audit it.
             writeln!(file, "pgaudit.log_catalog=off")?;
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index e7d67f6ac5..0d1389dbad 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -278,12 +278,12 @@ impl ComputeNode {
             // so that all config operations are audit logged.
             match spec.audit_log_level
             {
-                ComputeAudit::Hipaa => {
+                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                     phases.push(CreatePgauditExtension);
                     phases.push(CreatePgauditlogtofileExtension);
                     phases.push(DisablePostgresDBPgAudit);
                 }
-                ComputeAudit::Log => {
+                ComputeAudit::Log | ComputeAudit::Base => {
                     phases.push(CreatePgauditExtension);
                     phases.push(DisablePostgresDBPgAudit);
                 }
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 3137bde161..663c024953 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -658,6 +658,9 @@ impl Endpoint {
             delta_operations: None,
             tenant_id: Some(self.tenant_id),
             timeline_id: Some(self.timeline_id),
+            project_id: None,
+            branch_id: None,
+            endpoint_id: Some(self.endpoint_id.clone()),
             mode: self.mode,
             pageserver_connstring: Some(pageserver_connstring),
             safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 8000576e87..a4b56ae5c0 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
     NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
     TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
     pub node_id: Option<NodeId>,
-    pub generation_override: Option<i32>,
+    pub generation_override: Option<i32>, // only new tenants
+    pub config: Option<TenantConfig>,     // only new tenants
 }
 
 #[derive(Serialize, Deserialize)]
@@ -805,6 +808,7 @@ impl StorageController {
             tenant_shard_id,
             node_id: Some(pageserver_id),
             generation_override: None,
+            config: None,
         };
 
         let response = self
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 994a665a88..82950bcbaa 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
     pub timeline_id: Option<TimelineId>,
     pub pageserver_connstring: Option<String>,
 
+    // More neon ids that we expose to the compute_ctl
+    // and to postgres as neon extension GUCs.
+    pub project_id: Option<String>,
+    pub branch_id: Option<String>,
+    pub endpoint_id: Option<String>,
+
     /// Safekeeper membership config generation. It is put in
     /// neon.safekeepers GUC and serves two purposes:
     /// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,13 +165,7 @@ pub struct ComputeSpec {
     #[serde(default)] // Default false
     pub drop_subscriptions_before_start: bool,
 
-    /// Log level for audit logging:
-    ///
-    /// Disabled - no audit logging. This is the default.
-    /// log - log masked statements to the postgres log using pgaudit extension
-    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
-    ///
-    /// Extensions should be present in shared_preload_libraries
+    /// Log level for compute audit logging
     #[serde(default)]
     pub audit_log_level: ComputeAudit,
 
@@ -289,14 +289,25 @@ impl ComputeMode {
 }
 
 /// Log level for audit logging
-/// Disabled, log, hipaa
-/// Default is Disabled
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
     #[default]
     Disabled,
+    // Deprecated, use Base instead
     Log,
+    // (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
+    // logged to the standard postgresql log stream
+    Base,
+    // Deprecated, use Full or Extended instead
     Hipaa,
+    // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access
+    Extended,
+    // (pgaudit.log='all', pgaudit.log_parameter='on'),
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access.
+    Full,
 }
 
 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml
index 6d24ee352a..5f6578f76e 100644
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -30,6 +30,7 @@ tokio.workspace = true
 tracing.workspace = true
 url.workspace = true
 uuid.workspace = true
+x509-cert.workspace = true
 
 # to use tokio channels as streams, this is faster to compile than async_stream
 # why is it only here? no other crate should use it, streams are rarely needed.
diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs
index 07fd56ac01..f93f71c962 100644
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -4,6 +4,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use hyper0::Body;
 use hyper0::server::conn::Http;
+use metrics::{IntCounterVec, register_int_counter_vec};
+use once_cell::sync::Lazy;
 use routerify::{RequestService, RequestServiceBuilder};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
     tls_acceptor: Option<TlsAcceptor>,
 }
 
+static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_started_total",
+        "Number of established http/https connections",
+        &["scheme"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_errors_total",
+        "Number of occured connection errors by type",
+        &["type"]
+    )
+    .expect("failed to define a metric")
+});
+
 impl Server {
     pub fn new(
         request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
             false
         }
 
+        let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
+        let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
+        let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
+        let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
+        let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
+
+        let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
+        let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
+
         let mut connections = FuturesUnordered::new();
         loop {
             tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
                     let (tcp_stream, remote_addr) = match stream {
                         Ok(stream) => stream,
                         Err(err) => {
+                            tcp_error_cnt.inc();
                             if !suppress_io_error(&err) {
                                 info!("Failed to accept TCP connection: {err:#}");
                             }
@@ -78,11 +108,18 @@ impl Server {
                     let tls_acceptor = self.tls_acceptor.clone();
                     let cancel = cancel.clone();
 
+                    let tls_error_cnt = tls_error_cnt.clone();
+                    let http_error_cnt = http_error_cnt.clone();
+                    let https_error_cnt = https_error_cnt.clone();
+                    let http_connection_cnt = http_connection_cnt.clone();
+                    let https_connection_cnt = https_connection_cnt.clone();
+
                     connections.push(tokio::spawn(
                         async move {
                             match tls_acceptor {
                                 Some(tls_acceptor) => {
                                     // Handle HTTPS connection.
+                                    https_connection_cnt.inc();
                                     let tls_stream = tokio::select! {
                                         tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
                                         _ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
                                     let tls_stream = match tls_stream {
                                         Ok(tls_stream) => tls_stream,
                                         Err(err) => {
+                                            tls_error_cnt.inc();
                                             if !suppress_io_error(&err) {
                                                 info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                             }
@@ -97,6 +135,7 @@ impl Server {
                                         }
                                     };
                                     if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
+                                        https_error_cnt.inc();
                                         if !suppress_hyper_error(&err) {
                                             info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                         }
@@ -104,7 +143,9 @@ impl Server {
                                 }
                                 None => {
                                     // Handle HTTP connection.
+                                    http_connection_cnt.inc();
                                     if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
+                                        http_error_cnt.inc();
                                         if !suppress_hyper_error(&err) {
                                             info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                         }
@@ -115,6 +156,7 @@ impl Server {
                  }
                 Some(conn) = connections.next() => {
                     if let Err(err) = conn {
+                        panic_error_cnt.inc();
                         error!("Connection panicked: {err:#}");
                     }
                 }
@@ -122,6 +164,7 @@ impl Server {
                     // Wait for graceful shutdown of all connections.
                     while let Some(conn) = connections.next().await {
                         if let Err(err) = conn {
+                            panic_error_cnt.inc();
                             error!("Connection panicked: {err:#}");
                         }
                     }
diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs
index 0c18d84d98..2799db78a6 100644
--- a/libs/http-utils/src/tls_certs.rs
+++ b/libs/http-utils/src/tls_certs.rs
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
 use anyhow::Context;
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
+use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
+use once_cell::sync::Lazy;
 use rustls::{
-    pki_types::{CertificateDer, PrivateKeyDer},
+    pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
     server::{ClientHello, ResolvesServerCert},
     sign::CertifiedKey,
 };
+use x509_cert::der::Reader;
 
 pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
     let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
     Ok(certified_key)
 }
 
+/// rustls's CertifiedKey with extra parsed fields used for metrics.
+struct ParsedCertifiedKey {
+    certified_key: CertifiedKey,
+    expiration_time: UnixTime,
+}
+
+/// Parse expiration time from an X509 certificate.
+fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
+    let parsed_cert = x509_cert::der::SliceReader::new(cert)
+        .context("Failed to parse cerficiate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse cerficiate")?;
+
+    Ok(UnixTime::since_unix_epoch(
+        parsed_cert
+            .tbs_certificate
+            .validity
+            .not_after
+            .to_unix_duration(),
+    ))
+}
+
+async fn load_and_parse_certified_key(
+    key_filename: &Utf8Path,
+    cert_filename: &Utf8Path,
+) -> anyhow::Result<ParsedCertifiedKey> {
+    let certified_key = load_certified_key(key_filename, cert_filename).await?;
+    let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
+    Ok(ParsedCertifiedKey {
+        certified_key,
+        expiration_time,
+    })
+}
+
+static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "tls_certs_expiration_time_seconds",
+        "Expiration time of the loaded certificate since unix epoch in seconds",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_started_total",
+        "Number of certificate reload loop iterations started",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_updated_total",
+        "Number of times the certificate was updated to the new one",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_failed_total",
+        "Number of times the certificate reload failed",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
 /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
 /// the disk periodically.
 #[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
 impl ReloadingCertificateResolver {
     /// Creates a new Resolver by loading certificate and private key from FS and
     /// creating tokio::task to reload them with provided reload_period.
+    /// resolver_name is used as metric's label.
     pub async fn new(
+        resolver_name: &str,
         key_filename: &Utf8Path,
         cert_filename: &Utf8Path,
         reload_period: Duration,
     ) -> anyhow::Result<Arc<Self>> {
+        // Create metrics for current resolver.
+        let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
+        let cert_reload_started_counter =
+            CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_updated_counter =
+            CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_failed_counter =
+            CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
+
+        let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
+
         let this = Arc::new(Self {
-            certified_key: ArcSwap::from_pointee(
-                load_certified_key(key_filename, cert_filename).await?,
-            ),
+            certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
         });
+        cert_expiration_time.set(parsed_key.expiration_time.as_secs());
 
         tokio::spawn({
             let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
                         Some(this) => this,
                         None => break, // Resolver has been destroyed, exit.
                     };
-                    match load_certified_key(&key_filename, &cert_filename).await {
-                        Ok(new_certified_key) => {
-                            if new_certified_key.cert == this.certified_key.load().cert {
+                    cert_reload_started_counter.inc();
+
+                    match load_and_parse_certified_key(&key_filename, &cert_filename).await {
+                        Ok(parsed_key) => {
+                            if parsed_key.certified_key.cert == this.certified_key.load().cert {
                                 tracing::debug!("Certificate has not changed since last reloading");
                             } else {
                                 tracing::info!("Certificate has been reloaded");
-                                this.certified_key.store(Arc::new(new_certified_key));
+                                this.certified_key.store(Arc::new(parsed_key.certified_key));
+                                cert_expiration_time.set(parsed_key.expiration_time.as_secs());
+                                cert_reload_updated_counter.inc();
                             }
                             last_reload_failed = false;
                         }
                         Err(err) => {
+                            cert_reload_failed_counter.inc();
                             // Note: Reloading certs may fail if it conflicts with the script updating
                             // the files at the same time. Warn only if the error is persistent.
                             if last_reload_failed {
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 8f56d60a4a..bd9f7efb7f 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -180,6 +180,7 @@ pub struct ConfigToml {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub generate_unarchival_heatmap: Option<bool>,
     pub tracing: Option<Tracing>,
+    pub enable_tls_page_service_api: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -631,6 +632,7 @@ impl Default for ConfigToml {
             load_previous_heatmap: None,
             generate_unarchival_heatmap: None,
             tracing: None,
+            enable_tls_page_service_api: false,
         }
     }
 }
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 3cb62f9d18..91f9c03ba4 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;
 
 use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
 use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
     pub scheduling_policy: SkSchedulingPolicy,
 }
 
+/// Import request for safekeeper timelines.
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TimelineImportRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub start_lsn: Lsn,
+    pub sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
     use serde_json;
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 8836e7ec87..0c4d7fd4cb 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -927,7 +927,7 @@ impl Key {
 
     /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
     #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
+    pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
         Ok(match self.field1 {
             0x00 => (
                 RelTag {
@@ -938,7 +938,7 @@ impl Key {
                 },
                 self.field6,
             ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+            _ => return Err(ToRelBlockError(self.field1)),
         })
     }
 }
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
     }
 }
 
+#[derive(Debug)]
+pub struct ToRelBlockError(u8);
+
+impl fmt::Display for ToRelBlockError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "unexpected value kind 0x{:02x}", self.0)
+    }
+}
+
+impl std::error::Error for ToRelBlockError {}
+
 #[cfg(test)]
 mod tests {
     use std::str::FromStr;
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 54fecee588..250d4180f5 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -452,6 +452,24 @@ fn start_pageserver(
     info!("Using auth for http API: {:#?}", conf.http_auth_type);
     info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
 
+    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
+    {
+        let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            "main",
+            &conf.ssl_key_file,
+            &conf.ssl_cert_file,
+            conf.ssl_cert_reload_period,
+        ))?;
+
+        let server_config = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_cert_resolver(resolver);
+
+        Some(Arc::new(server_config))
+    } else {
+        None
+    };
+
     match var("NEON_AUTH_TOKEN") {
         Ok(v) => {
             info!("Loaded JWT token for authentication with Safekeeper");
@@ -670,17 +688,11 @@ fn start_pageserver(
 
         let https_task = match https_listener {
             Some(https_listener) => {
-                let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
-                    &conf.ssl_key_file,
-                    &conf.ssl_cert_file,
-                    conf.ssl_cert_reload_period,
-                ))?;
+                let tls_server_config = tls_server_config
+                    .clone()
+                    .expect("tls_server_config is set earlier if https is enabled");
 
-                let server_config = rustls::ServerConfig::builder()
-                    .with_no_client_auth()
-                    .with_cert_resolver(resolver);
-
-                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
 
                 let server =
                     http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -736,6 +748,11 @@ fn start_pageserver(
             tokio::net::TcpListener::from_std(pageserver_listener)
                 .context("create tokio listener")?
         },
+        if conf.enable_tls_page_service_api {
+            tls_server_config
+        } else {
+            None
+        },
     );
 
     // All started up! Now just sit and wait for shutdown signal.
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index ccc29e59d4..26ae6af70e 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -219,6 +219,11 @@ pub struct PageServerConf {
     pub generate_unarchival_heatmap: bool,
 
     pub tracing: Option<pageserver_api::config::Tracing>,
+
+    /// Enable TLS in page service API.
+    /// Does not force TLS: the client negotiates TLS usage during the handshake.
+    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
+    pub enable_tls_page_service_api: bool,
 }
 
 /// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
             load_previous_heatmap,
             generate_unarchival_heatmap,
             tracing,
+            enable_tls_page_service_api,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
             page_service_pipelining,
             get_vectored_concurrent_io,
             tracing,
+            enable_tls_page_service_api,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 9bb761dc48..bbc4bfae1b 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
         modification
             .put_file(&fname, content.as_bytes(), &ctx)
             .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
     }
     modification
         .commit(&ctx)
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 6dd005de50..911449c7c5 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
-use crate::walingest::WalIngest;
+use crate::walingest::{WalIngest, WalIngestErrorKind};
 
 // Returns checkpoint LSN from controlfile
 pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
         .put_rel_creation(rel, nblocks as u32, ctx)
         .await
     {
-        match e {
-            RelationError::AlreadyExists => {
-                debug!("Relation {} already exist. We must be extending it.", rel)
+        match e.kind {
+            WalIngestErrorKind::RelationAlreadyExists(rel) => {
+                debug!("Relation {rel} already exists. We must be extending it.")
             }
             _ => return Err(e.into()),
         }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 7e3991dbdc..61f524fc29 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -105,6 +105,7 @@ pub fn spawn(
     pg_auth: Option<Arc<SwappableJwtAuth>>,
     perf_trace_dispatch: Option<Dispatch>,
     tcp_listener: tokio::net::TcpListener,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
 ) -> Listener {
     let cancel = CancellationToken::new();
     let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
             perf_trace_dispatch,
             tcp_listener,
             conf.pg_auth_type,
+            tls_config,
             conf.page_service_pipelining.clone(),
             libpq_ctx,
             cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
     perf_trace_dispatch: Option<Dispatch>,
     listener: tokio::net::TcpListener,
     auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
     pipelining_config: PageServicePipeliningConfig,
     listener_ctx: RequestContext,
     listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
                     local_auth,
                     socket,
                     auth_type,
+                    tls_config.clone(),
                     pipelining_config.clone(),
                     connection_ctx,
                     connections_cancel.child_token(),
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
     auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
     pipelining_config: PageServicePipeliningConfig,
     connection_ctx: RequestContext,
     cancel: CancellationToken,
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
         cancel.clone(),
         gate_guard,
     );
-    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
+    let pgbackend =
+        PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
 
     match pgbackend.run(&mut conn_handler, &cancel).await {
         Ok(()) => {
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 4c5a07ba57..f33a8baec1 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,8 +9,9 @@
 use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};
 
-use crate::PERF_TRACE_TARGET;
-use anyhow::{Context, ensure};
+use crate::walingest::{WalIngestError, WalIngestErrorKind};
+use crate::{PERF_TRACE_TARGET, ensure_walingest};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
@@ -136,12 +137,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
 
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
-    #[error("Relation Already Exists")]
-    AlreadyExists,
     #[error("invalid relnode")]
     InvalidRelnode,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
 }
 
 ///
@@ -1478,8 +1475,8 @@ impl DatadirModification<'_> {
     }
 
     /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
+        ensure_walingest!(
             lsn >= self.lsn,
             "setting an older lsn {} than {} is not allowed",
             lsn,
@@ -1578,7 +1575,7 @@ impl DatadirModification<'_> {
         &mut self,
         rel: RelTag,
         ctx: &RequestContext,
-    ) -> Result<u32, PageReconstructError> {
+    ) -> Result<u32, WalIngestError> {
         // Get current size and put rel creation if rel doesn't exist
         //
         // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1590,13 @@ impl DatadirModification<'_> {
             .await?
         {
             // create it with 0 size initially, the logic below will extend it
-            self.put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
+            self.put_rel_creation(rel, 0, ctx).await?;
             Ok(0)
         } else {
-            self.tline
+            Ok(self
+                .tline
                 .get_rel_size(rel, Version::Modified(self), ctx)
-                .await
+                .await?)
         }
     }
 
@@ -1637,11 +1633,14 @@ impl DatadirModification<'_> {
         // TODO(vlad): remove this argument and replace the shard check with is_key_local
         shard: &ShardIdentity,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let mut gaps_at_lsns = Vec::default();
 
         for meta in batch.metadata.iter() {
-            let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
+            let key = Key::from_compact(meta.key());
+            let (rel, blkno) = key
+                .to_rel_block()
+                .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
             let new_nblocks = blkno + 1;
 
             let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1682,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
         Ok(())
     }
@@ -1696,7 +1695,7 @@ impl DatadirModification<'_> {
         segno: u32,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         if !self.tline.tenant_shard_id.is_shard_zero() {
             return Ok(());
         }
@@ -1714,14 +1713,11 @@ impl DatadirModification<'_> {
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         let key = rel_block_to_key(rel, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
@@ -1733,15 +1729,12 @@ impl DatadirModification<'_> {
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
         self.put(key, Value::Image(img));
         Ok(())
@@ -1751,15 +1744,11 @@ impl DatadirModification<'_> {
         &mut self,
         rel: RelTag,
         blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         let key = rel_block_to_key(rel, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
 
         let batch = self
@@ -1776,15 +1765,11 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
 
         let batch = self
@@ -1832,8 +1817,10 @@ impl DatadirModification<'_> {
         dbnode: Oid,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
 
         // Add it to the directory (if it doesn't exist already)
         let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1861,13 @@ impl DatadirModification<'_> {
         xid: u64,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Add it to the directory entry
         let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
         let newdirbuf = if self.tline.pg_version >= 17 {
             let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
             if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
             }
             self.pending_directory_entries.push((
                 DirectoryKind::TwoPhase,
@@ -1891,7 +1878,7 @@ impl DatadirModification<'_> {
             let xid = xid as u32;
             let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
             if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
             }
             self.pending_directory_entries.push((
                 DirectoryKind::TwoPhase,
@@ -1909,22 +1896,22 @@ impl DatadirModification<'_> {
         &mut self,
         origin_id: RepOriginId,
         origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let key = repl_origin_key(origin_id);
         self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
         Ok(())
     }
 
-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
+    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
         self.set_replorigin(origin_id, Lsn::INVALID).await
     }
 
-    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
         self.put(CONTROLFILE_KEY, Value::Image(img));
         Ok(())
     }
 
-    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
         self.put(CHECKPOINT_KEY, Value::Image(img));
         Ok(())
     }
@@ -1934,7 +1921,7 @@ impl DatadirModification<'_> {
         spcnode: Oid,
         dbnode: Oid,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let total_blocks = self
             .tline
             .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1960,21 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> Result<(), RelationError> {
+    ) -> Result<(), WalIngestError> {
         if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
+            Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
+                "invalid relnode"
+            )))?;
         }
         // It's possible that this is the first rel for this db in this
         // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
-            .context("deserialize db")?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
 
         let dbdir_exists =
             if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                 // Didn't exist. Update dbdir
                 e.insert(false);
-                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+                let buf = DbDirectory::ser(&dbdir)?;
                 self.pending_directory_entries.push((
                     DirectoryKind::Db,
                     MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1991,25 @@ impl DatadirModification<'_> {
             RelDirectory::default()
         } else {
             // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                .context("deserialize db")?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
         };
 
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
 
         if v2_enabled {
             if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             let sparse_rel_dir_key =
                 rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
             // check if the rel_dir_key exists in v2
-            let val = self
-                .sparse_get(sparse_rel_dir_key, ctx)
-                .await
-                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
             let val = RelDirExists::decode_option(val)
-                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
             if val == RelDirExists::Exists {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             self.put(
                 sparse_rel_dir_key,
@@ -2039,9 +2025,7 @@ impl DatadirModification<'_> {
                 // will be key not found errors if we don't create an empty one for rel_size_v2.
                 self.put(
                     rel_dir_key,
-                    Value::Image(Bytes::from(
-                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
-                    )),
+                    Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
                 );
             }
             self.pending_directory_entries
@@ -2049,7 +2033,7 @@ impl DatadirModification<'_> {
         } else {
             // Add the new relation to the rel directory entry, and write it back
             if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             if !dbdir_exists {
                 self.pending_directory_entries
@@ -2059,9 +2043,7 @@ impl DatadirModification<'_> {
                 .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
             self.put(
                 rel_dir_key,
-                Value::Image(Bytes::from(
-                    RelDirectory::ser(&rel_dir).context("serialize")?,
-                )),
+                Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
             );
         }
 
@@ -2086,8 +2068,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         if self
             .tline
             .get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2099,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
 
         // Put size
         let size_key = rel_size_to_key(rel);
@@ -2142,8 +2124,10 @@ impl DatadirModification<'_> {
         &mut self,
         drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
         for ((spc_node, db_node), rel_tags) in drop_relations {
             let dir_key = rel_dir_to_key(spc_node, db_node);
             let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2147,7 @@ impl DatadirModification<'_> {
                     let key =
                         rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
                     let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
-                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                        .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
                     if val == RelDirExists::Exists {
                         self.pending_directory_entries
                             .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2190,7 @@ impl DatadirModification<'_> {
         segno: u32,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         // Add it to the directory entry
@@ -2215,7 +2199,7 @@ impl DatadirModification<'_> {
         let mut dir = SlruSegmentDirectory::des(&buf)?;
 
         if !dir.segments.insert(segno) {
-            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
+            Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
         }
         self.pending_directory_entries.push((
             DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2226,7 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         nblocks: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         // Put size
@@ -2258,7 +2242,7 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Remove it from the directory entry
         let dir_key = slru_dir_to_key(kind);
         let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2267,7 @@ impl DatadirModification<'_> {
     }
 
     /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
         // TODO
         Ok(())
     }
@@ -2293,7 +2277,7 @@ impl DatadirModification<'_> {
         &mut self,
         xid: u64,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Remove it from the directory entry
         let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
         let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2292,8 @@ impl DatadirModification<'_> {
             ));
             Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
         } else {
-            let xid: u32 = u32::try_from(xid)?;
+            let xid: u32 = u32::try_from(xid)
+                .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
             let mut dir = TwoPhaseDirectory::des(&buf)?;
 
             if !dir.xids.remove(&xid) {
@@ -2333,7 +2318,7 @@ impl DatadirModification<'_> {
         path: &str,
         content: &[u8],
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let key = aux_file::encode_aux_file_key(path);
         // retrieve the key from the engine
         let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2327,7 @@ impl DatadirModification<'_> {
             Err(e) => return Err(e.into()),
         };
         let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-            aux_file::decode_file_value(old_val)?
+            aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
         } else {
             Vec::new()
         };
@@ -2387,7 +2372,8 @@ impl DatadirModification<'_> {
             }
             (None, true) => warn!("removing non-existing aux file: {}", path),
         }
-        let new_val = aux_file::encode_file_value(&new_files)?;
+        let new_val = aux_file::encode_file_value(&new_files)
+            .map_err(WalIngestErrorKind::EncodeAuxFileError)?;
         self.put(key, Value::Image(new_val.into()));
 
         Ok(())
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d3623fc3b9..ad4a0d804d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11571,6 +11571,99 @@ mod tests {
         Ok(())
     }
 
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x24),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x28),
+                // This record will fail to redo
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![], // in-memory layers
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    Lsn(0x20)..Lsn(0x30),
+                    delta1,
+                )], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .applied_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let cancel = CancellationToken::new();
+
+        // Compaction will fail, but should not fire any critical error.
+        // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
+        // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
+        // compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
+        let res = tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: None,
+                    compact_lsn_range: None,
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await;
+        assert!(res.is_err());
+
+        Ok(())
+    }
+
     #[cfg(feature = "testing")]
     #[tokio::test]
     async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 228048ac9c..490b1ce664 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1269,7 +1269,7 @@ mod test {
 
         // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
         let range = input_start..input_end;
 
         // Build an image layer to filter
@@ -1314,7 +1314,7 @@ mod test {
             let shard_identity = ShardIdentity::new(
                 ShardNumber(shard_number),
                 shard_count,
-                ShardStripeSize(0x8000),
+                ShardStripeSize(0x800),
             )
             .unwrap();
             let harness = TenantHarness::create_custom(
@@ -1368,12 +1368,12 @@ mod test {
 
             // This exact size and those below will need updating as/when the layer encoding changes, but
             // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
+            assert_eq!(original_size, 122880);
 
             match shard_number {
                 0 => {
                     // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                     let replacement = replacement.unwrap();
 
                     // We should have dropped some of the data
@@ -1381,7 +1381,7 @@ mod test {
                     assert!(replacement.metadata().file_size > 0);
 
                     // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                 }
                 1 => {
                     // Shard 1 has no keys in our input range
@@ -1390,19 +1390,19 @@ mod test {
                 }
                 2 => {
                     // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                     let replacement = replacement.unwrap();
                     assert!(replacement.metadata().file_size < original_size);
                     assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                 }
                 3 => {
                     // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
+                    assert_eq!(wrote_keys, 0x1000);
                     let replacement = replacement.unwrap();
                     assert!(replacement.metadata().file_size < original_size);
                     assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
+                    assert_eq!(replacement.metadata().file_size, 73728);
                 }
                 _ => unreachable!(),
             }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8e778e8674..3b38712b0b 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1141,6 +1141,7 @@ impl Timeline {
     ) -> Result<(), CompactionError> {
         let mut drop_layers = Vec::new();
         let mut layers_to_rewrite: Vec<Layer> = Vec::new();
+        let mut rewrite_max_exceeded: bool = false;
 
         // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
         // layer is behind this Lsn, it indicates that the layer is being retained beyond the
@@ -1149,12 +1150,7 @@ impl Timeline {
         // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
         // are rewriting layers.
         let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
-
-        tracing::info!(
-            "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.time
-        );
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
 
         let layers = self.layers.read().await;
         for layer_desc in layers.layer_map()?.iter_historic_layers() {
@@ -1172,8 +1168,8 @@ impl Timeline {
                 // This ancestral layer only covers keys that belong to other shards.
                 // We include the full metadata in the log: if we had some critical bug that caused
                 // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
-                info!(%layer, old_metadata=?layer.metadata(),
-                    "dropping layer after shard split, contains no keys for this shard.",
+                debug!(%layer, old_metadata=?layer.metadata(),
+                    "dropping layer after shard split, contains no keys for this shard",
                 );
 
                 if cfg!(debug_assertions) {
@@ -1235,9 +1231,10 @@ impl Timeline {
             }
 
             if layers_to_rewrite.len() >= rewrite_max {
-                tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
+                debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
                     layers_to_rewrite.len()
                 );
+                rewrite_max_exceeded = true;
                 continue;
             }
 
@@ -1245,9 +1242,24 @@ impl Timeline {
             layers_to_rewrite.push(layer);
         }
 
-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop read lock on layer map before we start doing time-consuming I/O.
         drop(layers);
 
+        // Drop out early if there's nothing to do.
+        if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
+            return Ok(());
+        }
+
+        info!(
+            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
+                (latest_gc_cutoff={} pitr_cutoff={})",
+            layers_to_rewrite.len(),
+            drop_layers.len(),
+            *latest_gc_cutoff,
+            pitr_cutoff,
+        );
+        let started = Instant::now();
+
         let mut replace_image_layers = Vec::new();
 
         for layer in layers_to_rewrite {
@@ -1255,7 +1267,7 @@ impl Timeline {
                 return Err(CompactionError::ShuttingDown);
             }
 
-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            info!(layer=%layer, "rewriting layer after shard split");
             let mut image_layer_writer = ImageLayerWriter::new(
                 self.conf,
                 self.timeline_id,
@@ -1295,7 +1307,7 @@ impl Timeline {
                     .map_err(CompactionError::Other)?;
                 let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
                     .map_err(CompactionError::Other)?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
                     layer.metadata().file_size,
                     new_layer.metadata().file_size);
 
@@ -1307,6 +1319,12 @@ impl Timeline {
             }
         }
 
+        for layer in &drop_layers {
+            info!(%layer, old_metadata=?layer.metadata(),
+                "dropping layer after shard split (no keys for this shard)",
+            );
+        }
+
         // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
         // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
         // to remote index) and be removed. This is inefficient but safe.
@@ -1322,6 +1340,7 @@ impl Timeline {
         // necessary for correctness, but it simplifies testing, and avoids proceeding with another
         // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
         // load.
+        info!("shard ancestor compaction waiting for uploads");
         match self.remote_client.wait_completion().await {
             Ok(()) => (),
             Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
@@ -1330,6 +1349,15 @@ impl Timeline {
             }
         }
 
+        info!(
+            "shard ancestor compaction done in {:.3}s{}",
+            started.elapsed().as_secs_f64(),
+            match rewrite_max_exceeded {
+                true => format!(", more work pending due to rewrite_max={rewrite_max}"),
+                false => String::new(),
+            }
+        );
+
         fail::fail_point!("compact-shard-ancestors-persistent");
 
         Ok(())
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index df2663f6bb..3c3608d1bd 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
                                 );
                                 Ok(())
                             }
+                            WalReceiverError::Cancelled => Ok(()),
                             WalReceiverError::Other(e) => {
                                 // give out an error to have task_mgr give it a really verbose logging
                                 if cancellation.is_cancelled() {
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 6bf05a0f86..52259f205b 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
     /// Generic error
     Other(anyhow::Error),
     ClosedGate,
+    Cancelled,
 }
 
 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
                                 // with a similar error.
                             },
                             WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Cancelled => {
+                                debug!("Connection cancelled")
+                            }
                             WalReceiverError::ClosedGate => {
                                 // doesn't happen at runtime
                             }
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+        .await
+        .map_err(|e| match e.kind {
+            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+            _ => WalReceiverError::Other(e.into()),
+        })?;
 
     let shard = vec![*timeline.get_shard_identity()];
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 18df065f76..e60c590f87 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,13 +21,13 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
+use std::backtrace::Backtrace;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};
 
-use anyhow::{Result, bail};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::rel_block_to_key;
+use pageserver_api::key::{Key, rel_block_to_key};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -38,7 +38,7 @@ use postgres_ffi::{
     fsm_logical_to_physical, pg_constants,
 };
 use tracing::*;
-use utils::bin_ser::SerializeError;
+use utils::bin_ser::{DeserializeError, SerializeError};
 use utils::lsn::Lsn;
 use utils::rate_limit::RateLimit;
 use utils::{critical, failpoint_support};
@@ -104,12 +104,101 @@ struct WarnIngestLag {
     timestamp_invalid_msg_ratelimit: RateLimit,
 }
 
+pub struct WalIngestError {
+    pub backtrace: std::backtrace::Backtrace,
+    pub kind: WalIngestErrorKind,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum WalIngestErrorKind {
+    #[error(transparent)]
+    #[allow(private_interfaces)]
+    PageReconstructError(#[from] PageReconstructError),
+    #[error(transparent)]
+    DeserializationFailure(#[from] DeserializeError),
+    #[error(transparent)]
+    SerializationFailure(#[from] SerializeError),
+    #[error("the request contains data not supported by pageserver: {0} @ {1}")]
+    InvalidKey(Key, Lsn),
+    #[error("twophase file for xid {0} already exists")]
+    FileAlreadyExists(u64),
+    #[error("slru segment {0:?}/{1} already exists")]
+    SlruAlreadyExists(SlruKind, u32),
+    #[error("relation already exists")]
+    RelationAlreadyExists(RelTag),
+    #[error("invalid reldir key {0}")]
+    InvalidRelDirKey(Key),
+
+    #[error(transparent)]
+    LogicalError(anyhow::Error),
+    #[error(transparent)]
+    EncodeAuxFileError(anyhow::Error),
+    #[error(transparent)]
+    MaybeRelSizeV2Error(anyhow::Error),
+
+    #[error("timeline shutting down")]
+    Cancelled,
+}
+
+impl<T> From<T> for WalIngestError
+where
+    WalIngestErrorKind: From<T>,
+{
+    fn from(value: T) -> Self {
+        WalIngestError {
+            backtrace: Backtrace::capture(),
+            kind: WalIngestErrorKind::from(value),
+        }
+    }
+}
+
+impl std::error::Error for WalIngestError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        self.kind.source()
+    }
+}
+
+impl core::fmt::Display for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        self.kind.fmt(f)
+    }
+}
+
+impl core::fmt::Debug for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        if f.alternate() {
+            f.debug_map()
+                .key(&"backtrace")
+                .value(&self.backtrace)
+                .key(&"kind")
+                .value(&self.kind)
+                .finish()
+        } else {
+            writeln!(f, "Error: {:?}", self.kind)?;
+            if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+                writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
+            }
+            Ok(())
+        }
+    }
+}
+
+#[macro_export]
+macro_rules! ensure_walingest {
+    ($($t:tt)*) => {
+        _ = || -> Result<(), anyhow::Error> {
+            anyhow::ensure!($($t)*);
+            Ok(())
+        }().map_err(WalIngestErrorKind::LogicalError)?;
+    };
+}
+
 impl WalIngest {
     pub async fn new(
         timeline: &Timeline,
         startpoint: Lsn,
         ctx: &RequestContext,
-    ) -> anyhow::Result<WalIngest> {
+    ) -> Result<WalIngest, WalIngestError> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
         let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -145,7 +234,7 @@ impl WalIngest {
         interpreted: InterpretedWalRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<bool> {
+    ) -> Result<bool, WalIngestError> {
         WAL_INGEST.records_received.inc();
         let prev_len = modification.len();
 
@@ -288,7 +377,7 @@ impl WalIngest {
     }
 
     /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
-    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
         let next_full_xid =
             enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
 
@@ -298,9 +387,9 @@ impl WalIngest {
         if xid > next_xid {
             // Wraparound occurred, must be from a prev epoch.
             if epoch == 0 {
-                bail!(
+                Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
                     "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
-                );
+                )))?;
             }
             epoch -= 1;
         }
@@ -313,7 +402,7 @@ impl WalIngest {
         clear_vm_bits: ClearVmBits,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClearVmBits {
             new_heap_blkno,
             old_heap_blkno,
@@ -402,7 +491,7 @@ impl WalIngest {
         create: DbaseCreate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let DbaseCreate {
             db_id,
             tablespace_id,
@@ -505,7 +594,7 @@ impl WalIngest {
         dbase_drop: DbaseDrop,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let DbaseDrop {
             db_id,
             tablespace_ids,
@@ -523,7 +612,7 @@ impl WalIngest {
         create: SmgrCreate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let SmgrCreate { rel } = create;
         self.put_rel_creation(modification, rel, ctx).await?;
         Ok(())
@@ -537,7 +626,7 @@ impl WalIngest {
         truncate: XlSmgrTruncate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let XlSmgrTruncate {
             blkno,
             rnode,
@@ -689,7 +778,7 @@ impl WalIngest {
         record: XactRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let (xact_common, is_commit, is_prepared) = match record {
             XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
                 let xid: u64 = if modification.tline.pg_version >= 17 {
@@ -813,7 +902,7 @@ impl WalIngest {
         truncate: ClogTruncate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClogTruncate {
             pageno,
             oldest_xid,
@@ -889,7 +978,7 @@ impl WalIngest {
         zero_page: ClogZeroPage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClogZeroPage { segno, rpageno } = zero_page;
 
         self.put_slru_page_image(
@@ -907,7 +996,7 @@ impl WalIngest {
         &mut self,
         modification: &mut DatadirModification,
         xlrec: &XlMultiXactCreate,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Create WAL record for updating the multixact-offsets page
         let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
         let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1010,7 +1099,7 @@ impl WalIngest {
         modification: &mut DatadirModification<'_>,
         xlrec: &XlMultiXactTruncate,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let (maxsegment, startsegment, endsegment) =
             enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                 cp.oldestMulti = xlrec.end_trunc_off;
@@ -1058,7 +1147,7 @@ impl WalIngest {
         zero_page: MultiXactZeroPage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let MultiXactZeroPage {
             slru_kind,
             segno,
@@ -1080,7 +1169,7 @@ impl WalIngest {
         update: RelmapUpdate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let RelmapUpdate { update, buf } = update;
 
         modification
@@ -1093,7 +1182,7 @@ impl WalIngest {
         raw_record: RawXlogRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let RawXlogRecord { info, lsn, mut buf } = raw_record;
         let pg_version = modification.tline.pg_version;
 
@@ -1235,12 +1324,12 @@ impl WalIngest {
         put: PutLogicalMessage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let PutLogicalMessage { path, buf } = put;
         modification.put_file(path.as_str(), &buf, ctx).await
     }
 
-    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
+    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
         match record {
             StandbyRecord::RunningXacts(running_xacts) => {
                 enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
@@ -1258,7 +1347,7 @@ impl WalIngest {
         &mut self,
         record: ReploriginRecord,
         modification: &mut DatadirModification<'_>,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         match record {
             ReploriginRecord::Set(set) => {
                 modification
@@ -1278,7 +1367,7 @@ impl WalIngest {
         modification: &mut DatadirModification<'_>,
         rel: RelTag,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         modification.put_rel_creation(rel, 0, ctx).await?;
         Ok(())
     }
@@ -1291,7 +1380,7 @@ impl WalIngest {
         blknum: BlockNumber,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
         self.handle_rel_extend(modification, rel, blknum, ctx)
             .await?;
         modification.put_rel_page_image(rel, blknum, img)?;
@@ -1305,7 +1394,7 @@ impl WalIngest {
         blknum: BlockNumber,
         rec: NeonWalRecord,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         self.handle_rel_extend(modification, rel, blknum, ctx)
             .await?;
         modification.put_rel_wal_record(rel, blknum, rec)?;
@@ -1318,7 +1407,7 @@ impl WalIngest {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         modification.put_rel_truncation(rel, nblocks, ctx).await?;
         Ok(())
     }
@@ -1329,7 +1418,7 @@ impl WalIngest {
         rel: RelTag,
         blknum: BlockNumber,
         ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
         let new_nblocks = blknum + 1;
         // Check if the relation exists. We implicitly create relations on first
         // record.
@@ -1423,7 +1512,7 @@ impl WalIngest {
         blknum: BlockNumber,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         if !self.shard.is_shard_zero() {
             return Ok(());
         }
@@ -1441,7 +1530,7 @@ impl WalIngest {
         segno: u32,
         blknum: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // we don't use a cache for this like we do for relations. SLRUS are explcitly
         // extended with ZEROPAGE records, not with commit records, so it happens
         // a lot less frequently.
@@ -1509,6 +1598,7 @@ async fn get_relsize(
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
+    use anyhow::Result;
     use postgres_ffi::RELSEG_SIZE;
 
     use super::*;
@@ -1530,7 +1620,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
         for i in 14..=16 {
             dispatch_pgversion!(i, {
                 pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 9ea708f29a..dfabb6919e 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
+char	   *neon_project_id;
+char	   *neon_branch_id;
+char	   *neon_endpoint_id;
 int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);
 
+	DefineCustomStringVariable("neon.project_id",
+							   "Neon project_id the server is running on",
+							   NULL,
+							   &neon_project_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.branch_id",
+							   "Neon branch_id the server is running on",
+							   NULL,
+							   &neon_branch_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.endpoint_id",
+							   "Neon endpoint_id the server is running on",
+							   NULL,
+							   &neon_endpoint_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 0336d63e8d..6b133e4dc4 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;
 	wp->state = WPS_COLLECTING_TERMS;
+	wp->mconf.generation = INVALID_GENERATION;
+	wp->mconf.members.len = 0;
+	wp->mconf.new_members.len = 0;
 
 	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
 
@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 
 	if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
 		wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
+	if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
+		wp_log(FATAL, "enabling generations requires protocol version 3");
 	wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
 
 	/* Fill the greeting package */
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
 static bool
 WalProposerGenerationsEnabled(WalProposer *wp)
 {
-	return wp->safekeepers_generation != 0;
+	return wp->safekeepers_generation != INVALID_GENERATION;
 }
 
 /*
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
 }
 
+/*
+ * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
+ * members_safekeepers & new_members_safekeepers to sk.
+ */
+static void
+UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
+{
+	/* members_safekeepers etc are fixed size, sanity check mconf size */
+	if (wp->mconf.members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
+	if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
+
+	/* node id is not known until greeting is received */
+	if (sk->state < SS_WAIT_VOTING)
+		return;
+
+	/* 0 is assumed to be invalid node id, should never happen */
+	if (sk->greetResponse.nodeId == 0)
+	{
+		wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
+		return;
+	}
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.members.m[i];
+
+		if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			/*
+			 * If mconf or list of safekeepers to connect to changed (the
+			 * latter always currently goes through restart though),
+			 * ResetMemberSafekeeperPtrs is expected to be called before
+			 * UpdateMemberSafekeeperPtr. So, other value suggests that we are
+			 * connected to the same sk under different host name, complain
+			 * about that.
+			 */
+			if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->members_safekeepers[i] = sk;
+		}
+	}
+	/* repeat for new_members */
+	for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
+
+		if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->new_members_safekeepers[i] = sk;
+		}
+	}
+}
+
+/*
+ * Reset wp->members_safekeepers & new_members_safekeepers and refill them.
+ * Called after wp changes mconf.
+ */
+static void
+ResetMemberSafekeeperPtrs(WalProposer *wp)
+{
+	memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
+			UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
+	}
+}
+
+static uint32
+MsetQuorum(MemberSet *mset)
+{
+	Assert(mset->len > 0);
+	return mset->len / 2 + 1;
+}
+
+/* Does n forms quorum in mset? */
+static bool
+MsetHasQuorum(MemberSet *mset, uint32 n)
+{
+	return n >= MsetQuorum(mset);
+}
+
+/*
+ * TermsCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_greeted = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_VOTING)
+		{
+			if (n_greeted > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_greeted++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
+	return MsetHasQuorum(mset, n_greeted);
+}
+
 /*
  * Have we received greeting from enough (quorum) safekeepers to start voting?
  */
 static bool
 TermsCollected(WalProposer *wp)
 {
-	return wp->n_connected >= wp->quorum;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		collected = wp->n_connected >= wp->quorum;
+		if (collected)
+		{
+			wp->propTerm++;
+			wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
+		}
+		return collected;
+	}
+
+	/*
+	 * With generations enabled, we start campaign only when 1) some mconf is
+	 * actually received 2) we have greetings from majority of members as well
+	 * as from majority of new_members if it exists.
+	 */
+	if (wp->mconf.generation == INVALID_GENERATION)
+		return false;
+
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset greeters: ");
+	if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset greeters: ");
+		if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp->propTerm++;
+	wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }
 
 static void
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	pfree(mconf_toml);
 
 	/*
-	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
-	 * restart wp if it started voting.
+	 * Adopt mconf of safekeepers if it is higher.
 	 */
 	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
 	{
+		/* sanity check before adopting, should never happen */
+		if (sk->greetResponse.mconf.members.len == 0)
+		{
+			wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
+		}
+
+		/*
+		 * If we at least started campaign, restart wp to get elected in the
+		 * new mconf. Note: in principle once wp is already elected
+		 * re-election is not required, but being conservative here is not
+		 * bad.
+		 *
+		 * TODO: put mconf to shmem to immediately pick it up on start,
+		 * otherwise if some safekeeper(s) misses latest mconf and gets
+		 * connected the first, it may cause redundant restarts here.
+		 *
+		 * More generally, it would be nice to restart walproposer (wiping
+		 * election state) without restarting the process. In particular, that
+		 * would allow sync-safekeepers not to die here if it intersected with
+		 * sk migration (as well as remove 1s delay).
+		 *
+		 * Note that assign_neon_safekeepers also currently restarts the
+		 * process, so during normal migration walproposer may restart twice.
+		 */
+		if (wp->state >= WPS_CAMPAIGN)
+		{
+			wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
+		}
 		MembershipConfigurationFree(&wp->mconf);
 		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		ResetMemberSafekeeperPtrs(wp);
 		/* full conf was just logged above */
 		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
 	}
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_WAIT_VOTING;
 
+	/* In greeting safekeeper sent its id; update mappings accordingly. */
+	UpdateMemberSafekeeperPtr(wp, sk);
+
 	/*
 	 * Note: it would be better to track the counter on per safekeeper basis,
 	 * but at worst walproposer would restart with 'term rejected', so leave
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		/* We're still collecting terms from the majority. */
 		wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);
 
-		/* Quorum is acquried, prepare the vote request. */
+		/* Quorum is acquired, prepare the vote request. */
 		if (TermsCollected(wp))
 		{
-			wp->propTerm++;
-			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
-
 			wp->state = WPS_CAMPAIGN;
 			wp->voteRequest.pam.tag = 'v';
 			wp->voteRequest.generation = wp->mconf.generation;
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
 					   &sk->outbuf, wp->config->proto_version);
 
 	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
-		   wp->voteRequest.generation, wp->voteRequest.term);
+	wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
+		   sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
 	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
 		return;
 
 	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
+		   "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
+		   sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
 		   sk->voteResponse.voteGiven,
 		   GetHighestTerm(&sk->voteResponse.termHistory),
 		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
 	}
 }
 
+/*
+ * VotesCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_votes = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_ELECTED)
+		{
+			Assert(sk->voteResponse.voteGiven);
+
+			/*
+			 * Find the highest vote. NULL check is for the legacy case where
+			 * safekeeper might be not initialized with LSN at all and return
+			 * 0 LSN in the vote response; we still want to set donor to
+			 * something in this case.
+			 */
+			if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
+				(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
+				 sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
+				wp->donor == NULL)
+			{
+				wp->donorLastLogTerm = GetLastLogTerm(sk);
+				wp->propTermStartLsn = sk->voteResponse.flushLsn;
+				wp->donor = sk;
+			}
+			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
+
+			if (n_votes > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_votes++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
+	return MsetHasQuorum(mset, n_votes);
+}
+
+
 /*
  * Checks if enough votes has been collected to get elected and if that's the
  * case finds the highest vote, setting donor, donorLastLogTerm,
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
 static bool
 VotesCollected(WalProposer *wp)
 {
-	int			n_ready = 0;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
 
 	/* assumed to be called only when not elected yet */
 	Assert(wp->state == WPS_CAMPAIGN);
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
 	wp->donorLastLogTerm = 0;
 	wp->truncateLsn = InvalidXLogRecPtr;
 
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
 	{
-		if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
-		{
-			n_ready++;
+		int			n_ready = 0;
 
-			if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
-				(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
-				 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
 			{
-				wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
-				wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
-				wp->donor = i;
+				n_ready++;
+
+				if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
+					(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
+					 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
+					wp->donor == NULL)
+				{
+					wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
+					wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
+					wp->donor = &wp->safekeeper[i];
+				}
+				wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 			}
-			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 		}
+		collected = n_ready >= wp->quorum;
+		if (collected)
+		{
+			wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
+		}
+		return collected;
 	}
 
-	return n_ready >= wp->quorum;
+	/*
+	 * if generations are enabled we're expected to get to voting only when
+	 * mconf is established.
+	 */
+	Assert(wp->mconf.generation != INVALID_GENERATION);
+
+	/*
+	 * We must get votes from both msets if both are present.
+	 */
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset voters: ");
+	if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset voters: ");
+		if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp_log(LOG, "walproposer elected, %s", s.data);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }
 
 /*
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
 	 * that only for logical replication (and switching logical walsenders to
 	 * neon_walreader is a todo.)
 	 */
-	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
+	if (!wp->api.recovery_download(wp, wp->donor))
 	{
 		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
 	/*
 	 * Proposer's term history is the donor's + its own entry.
 	 */
-	dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
+	dth = &wp->donor->voteResponse.termHistory;
 	wp->propTermHistory.n_entries = dth->n_entries + 1;
 	wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
 	if (dth->n_entries > 0)
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;
 
-	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		   wp->quorum,
+	wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
 		   wp->propTerm,
 		   LSN_FORMAT_ARGS(wp->propTermStartLsn),
-		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		   wp->donor->host, wp->donor->port,
 		   LSN_FORMAT_ARGS(wp->truncateLsn));
 
 	/*
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)
 
 		readAnything = true;
 
+		/* should never happen: sk is expected to send ERROR instead */
+		if (sk->appendResponse.generation != wp->mconf.generation)
+		{
+			wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
+				   sk->greetResponse.nodeId, sk->host, sk->port,
+				   sk->appendResponse.generation, wp->mconf.generation);
+		}
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
 }
 
 /*
- * Calculate WAL position acknowledged by quorum
+ * GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
  */
 static XLogRecPtr
-GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
 {
 	XLogRecPtr	responses[MAX_SAFEKEEPERS];
 
 	/*
-	 * Sort acknowledged LSNs
+	 * Ascending sort acknowledged LSNs.
 	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	Assert(mset->len <= MAX_SAFEKEEPERS);
+	for (uint32 i = 0; i < mset->len; i++)
 	{
+		Safekeeper *sk = msk[i];
+
 		/*
 		 * Like in Raft, we aren't allowed to commit entries from previous
-		 * terms, so ignore reported LSN until it gets to epochStartLsn.
+		 * terms, so ignore reported LSN until it gets to propTermStartLsn.
+		 *
+		 * Note: we ignore sk state, which is ok: before first ack flushLsn is
+		 * 0, and later we just preserve value across reconnections. It would
+		 * be ok to check for SS_ACTIVE as well.
 		 */
-		responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
+		{
+			responses[i] = sk->appendResponse.flushLsn;
+		}
+		else
+		{
+			responses[i] = 0;
+		}
 	}
-	qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+	qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);
 
 	/*
-	 * Get the smallest LSN committed by quorum
+	 * And get value committed by the quorum. A way to view this: to get the
+	 * highest value committed on the quorum, in the ordered array we skip n -
+	 * n_quorum elements to get to the first (lowest) value present on all sks
+	 * of the highest quorum.
 	 */
-	return responses[wp->n_safekeepers - wp->quorum];
+	return responses[mset->len - MsetQuorum(mset)];
+}
+
+/*
+ * Calculate WAL position acknowledged by quorum, i.e. which may be regarded
+ * committed.
+ *
+ * Zero may be returned when there is no quorum of nodes recovered to term start
+ * lsn which sent feedback yet.
+ */
+static XLogRecPtr
+GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+{
+	XLogRecPtr	committed;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		XLogRecPtr	responses[MAX_SAFEKEEPERS];
+
+		/*
+		 * Sort acknowledged LSNs
+		 */
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			/*
+			 * Like in Raft, we aren't allowed to commit entries from previous
+			 * terms, so ignore reported LSN until it gets to
+			 * propTermStartLsn.
+			 *
+			 * Note: we ignore sk state, which is ok: before first ack
+			 * flushLsn is 0, and later we just preserve value across
+			 * reconnections. It would be ok to check for SS_ACTIVE as well.
+			 */
+			responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		}
+		qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+
+		/*
+		 * Get the smallest LSN committed by quorum
+		 */
+		return responses[wp->n_safekeepers - wp->quorum];
+	}
+
+	committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
+	if (wp->mconf.new_members.len > 0)
+	{
+		XLogRecPtr	new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
+
+		committed = Min(committed, new_mset_committed);
+	}
+	return committed;
 }
 
 /*
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
 	int			i;
 	XLogRecPtr	donor_lsn = InvalidXLogRecPtr;
 
-	if (wp->n_votes < wp->quorum)
+	if (wp->state < WPS_ELECTED)
 	{
 		wp_log(WARNING, "UpdateDonorShmem called before elections are won");
 		return;
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
 	 * about its position immediately after election before any feedbacks are
 	 * sent.
 	 */
-	if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
+	if (wp->donor->state >= SS_WAIT_ELECTED)
 	{
-		donor = &wp->safekeeper[wp->donor];
+		donor = wp->donor;
 		donor_lsn = wp->propTermStartLsn;
 	}
 
@@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	}
 
 	/*
-	 * Generally sync is done when majority switched the epoch so we committed
-	 * epochStartLsn and made the majority aware of it, ensuring they are
-	 * ready to give all WAL to pageserver. It would mean whichever majority
-	 * is alive, there will be at least one safekeeper who is able to stream
-	 * WAL to pageserver to make basebackup possible. However, since at the
-	 * moment we don't have any good mechanism of defining the healthy and
-	 * most advanced safekeeper who should push the wal into pageserver and
+	 * Generally sync is done when majority reached propTermStartLsn so we
+	 * committed it and made the majority aware of it, ensuring they are ready
+	 * to give all WAL to pageserver. It would mean whichever majority is
+	 * alive, there will be at least one safekeeper who is able to stream WAL
+	 * to pageserver to make basebackup possible. However, since at the moment
+	 * we don't have any good mechanism of defining the healthy and most
+	 * advanced safekeeper who should push the wal into pageserver and
 	 * basically the random one gets connected, to prevent hanging basebackup
 	 * (due to pageserver connecting to not-synced-safekeeper) we currently
 	 * wait for all seemingly alive safekeepers to get synced.
@@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 				n_synced++;
 		}
 
-		if (n_synced >= wp->quorum)
+		if (newCommitLsn >= wp->propTermStartLsn)
 		{
 			/* A quorum of safekeepers has been synced! */
 
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index d116bce806..648b0015ad 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
  * This and following structs pair ones in membership.rs.
  */
 typedef uint32 Generation;
+#define INVALID_GENERATION 0
 
 typedef struct SafekeeperId
 {
@@ -771,7 +772,17 @@ typedef struct WalProposer
 	/* Current walproposer membership configuration */
 	MembershipConfiguration mconf;
 
-	/* (n_safekeepers / 2) + 1 */
+	/*
+	 * Parallels mconf.members with pointers to the member's slot in
+	 * safekeepers array of connections, or NULL if such member is not
+	 * connected. Helps to avoid looking slot per id through all
+	 * .safekeepers[] when doing quorum checks.
+	 */
+	Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
+	/* As above, but for new_members. */
+	Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
+
+	/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
 	int			quorum;
 
 	/*
@@ -829,7 +840,7 @@ typedef struct WalProposer
 	term_t		donorLastLogTerm;
 
 	/* Most advanced acceptor */
-	int			donor;
+	Safekeeper *donor;
 
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs
index 003a75faa6..6e7c5d971d 100644
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -31,6 +31,7 @@ pub async fn task_main_https(
     global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
     let cert_resolver = ReloadingCertificateResolver::new(
+        "main",
         &conf.ssl_key_file,
         &conf.ssl_cert_file,
         conf.ssl_cert_reload_period,
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 0d1dc8f8ee..fb4530d0d2 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
     MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
     NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
     ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
+    TimelineImportRequest,
 };
 use pageserver_api::models::{
     DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
         ForwardOutcome::NotForwarded(req) => req,
     };
 
-    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.step_down().await)
+    // Spawn a background task: once we start stepping down, we must finish: if the client drops
+    // their request we should avoid stopping in some part-stepped-down state.
+    let handle = tokio::spawn(async move {
+        let state = get_state(&req);
+        state.service.step_down().await
+    });
+
+    let result = handle
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    json_response(StatusCode::OK, result)
 }
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
     )
 }
 
+async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
+
+    let state = get_state(&req);
+
+    if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
+            import_req.tenant_id,
+            import_req.timeline_id
+        )));
+    }
+
+    json_response(
+        StatusCode::OK,
+        state.service.timeline_import(import_req).await?,
+    )
+}
+
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1949,6 +1991,16 @@ pub fn make_router(
                 RequestName("debug_v1_tenant_locate"),
             )
         })
+        .post(
+            "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
+            |r| {
+                named_request_span(
+                    r,
+                    handle_timeline_import,
+                    RequestName("debug_v1_timeline_import"),
+                )
+            },
+        )
         .get("/debug/v1/scheduler", |r| {
             named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
         })
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 1aa9ae10ae..9358c9da4d 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> {
             let https_listener = tcp_listener::bind(https_addr)?;
 
             let resolver = ReloadingCertificateResolver::new(
+                "main",
                 &args.ssl_key_file,
                 &args.ssl_cert_file,
                 *args.ssl_cert_reload_period,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 2ef09cd2e3..0982e56155 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
-use utils::sync::gate::Gate;
+use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};
 
 use crate::background_node_operations::{
@@ -594,6 +594,8 @@ struct TenantShardSplitAbort {
     new_stripe_size: Option<ShardStripeSize>,
     /// Until this abort op is complete, no other operations may be done on the tenant
     _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    /// The reconciler gate for the duration of the split operation, and any included abort.
+    _gate: GateGuard,
 }
 
 #[derive(thiserror::Error, Debug)]
@@ -1460,7 +1462,7 @@ impl Service {
             // Retry until shutdown: we must keep this request object alive until it is properly
             // processed, as it holds a lock guard that prevents other operations trying to do things
             // to the tenant while it is in a weird part-split state.
-            while !self.cancel.is_cancelled() {
+            while !self.reconcilers_cancel.is_cancelled() {
                 match self.abort_tenant_shard_split(&op).await {
                     Ok(_) => break,
                     Err(e) => {
@@ -1473,9 +1475,12 @@ impl Service {
                         // when we retry, so that the abort op will succeed.  If the abort op is failing
                         // for some other reason, we will keep retrying forever, or until a human notices
                         // and does something about it (either fixing a pageserver or restarting the controller).
-                        tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
-                            .await
-                            .ok();
+                        tokio::time::timeout(
+                            Duration::from_secs(5),
+                            self.reconcilers_cancel.cancelled(),
+                        )
+                        .await
+                        .ok();
                     }
                 }
             }
@@ -1847,6 +1852,7 @@ impl Service {
         };
 
         if insert {
+            let config = attach_req.config.clone().unwrap_or_default();
             let tsp = TenantShardPersistence {
                 tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                 shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
@@ -1855,7 +1861,7 @@ impl Service {
                 generation: attach_req.generation_override.or(Some(0)),
                 generation_pageserver: None,
                 placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
-                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                config: serde_json::to_string(&config).unwrap(),
                 splitting: SplitState::default(),
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
@@ -1878,16 +1884,16 @@ impl Service {
                 Ok(()) => {
                     tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
 
-                    let mut locked = self.inner.write().unwrap();
-                    locked.tenants.insert(
+                    let mut shard = TenantShard::new(
                         attach_req.tenant_shard_id,
-                        TenantShard::new(
-                            attach_req.tenant_shard_id,
-                            ShardIdentity::unsharded(),
-                            PlacementPolicy::Attached(0),
-                            None,
-                        ),
+                        ShardIdentity::unsharded(),
+                        PlacementPolicy::Attached(0),
+                        None,
                     );
+                    shard.config = config;
+
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(attach_req.tenant_shard_id, shard);
                     tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
                 }
             }
@@ -1972,11 +1978,12 @@ impl Service {
             .set_attached(scheduler, attach_req.node_id);
 
         tracing::info!(
-            "attach_hook: tenant {} set generation {:?}, pageserver {}",
+            "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
             attach_req.tenant_shard_id,
             tenant_shard.generation,
             // TODO: this is an odd number of 0xf's
-            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
+            attach_req.config,
         );
 
         // Trick the reconciler into not doing anything for this tenant: this helps
@@ -4910,7 +4917,7 @@ impl Service {
                     1,
                     10,
                     Duration::from_secs(5),
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                 )
                 .await
             {
@@ -5161,6 +5168,11 @@ impl Service {
         )
         .await;
 
+        let _gate = self
+            .reconcilers_gate
+            .enter()
+            .map_err(|_| ApiError::ShuttingDown)?;
+
         let new_shard_count = ShardCount::new(split_req.new_shard_count);
         let new_stripe_size = split_req.new_stripe_size;
 
@@ -5188,6 +5200,7 @@ impl Service {
                         new_shard_count,
                         new_stripe_size,
                         _tenant_lock,
+                        _gate,
                     })
                     // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
                     .ok();
@@ -5527,7 +5540,10 @@ impl Service {
                 "failpoint".to_string()
             )));
 
-            failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
+            failpoint_support::sleep_millis_async!(
+                "shard-split-post-remote-sleep",
+                &self.reconcilers_cancel
+            );
 
             tracing::info!(
                 "Split {} into {}",
@@ -5585,7 +5601,7 @@ impl Service {
                         stripe_size,
                         preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
                     },
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                 )
                 .await
             {
@@ -8670,9 +8686,24 @@ impl Service {
         failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
 
         self.inner.write().unwrap().step_down();
-        // TODO: would it make sense to have a time-out for this?
-        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
-            .await;
+
+        // Wait for reconciliations to stop, or terminate this process if they
+        // fail to stop in time (this indicates a bug in shutdown)
+        tokio::select! {
+            _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
+                tracing::info!("Reconciliations stopped, proceeding with step down");
+            }
+            _ = async {
+                failpoint_support::sleep_millis_async!("step-down-delay-timeout");
+                tokio::time::sleep(Duration::from_secs(10)).await
+            } => {
+                tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
+
+                // The caller may proceed to act as leader when it sees this request fail: reduce the chance
+                // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
+                std::process::exit(1);
+            }
+        }
 
         let mut global_observed = GlobalObservedState::default();
         let locked = self.inner.read().unwrap();
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 099d0305ba..a23b9a4a02 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -12,13 +12,16 @@ use crate::persistence::{
 use crate::safekeeper::Safekeeper;
 use anyhow::Context;
 use http_utils::error::ApiError;
-use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
+use pageserver_api::controller_api::{
+    SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
+};
 use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::membership::{MemberSet, SafekeeperId};
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
+use utils::lsn::Lsn;
 
 use super::Service;
 
@@ -298,6 +301,31 @@ impl Service {
             timeline_id,
         })
     }
+
+    /// Directly insert the timeline into the database without reconciling it with safekeepers.
+    ///
+    /// Useful if the timeline already exists on the specified safekeepers,
+    /// but we want to make it storage controller managed.
+    pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
+        let persistence = TimelinePersistence {
+            tenant_id: req.tenant_id.to_string(),
+            timeline_id: req.timeline_id.to_string(),
+            start_lsn: Lsn::INVALID.into(),
+            generation: 1,
+            sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
+            new_sk_set: None,
+            cplane_notified_generation: 1,
+            deleted_at: None,
+        };
+        let inserted = self.persistence.insert_timeline(persistence).await?;
+        if inserted {
+            tracing::info!("imported timeline into db");
+        } else {
+            tracing::info!("didn't import timeline into db, as it is already present in db");
+        }
+        Ok(())
+    }
+
     /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
     pub(super) async fn tenant_timeline_delete_safekeepers(
         self: &Arc<Self>,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d000dcb69f..9d4068b583 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,6 +14,7 @@ import threading
 import time
 import uuid
 from collections import defaultdict
+from collections.abc import Mapping
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
@@ -79,7 +80,12 @@ from fixtures.remote_storage import (
     default_remote_storage,
     remote_storage_to_toml_dict,
 )
-from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.http import (
+    MembershipConfiguration,
+    SafekeeperHttpClient,
+    SafekeeperId,
+    TimelineCreateRequest,
+)
 from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
@@ -1980,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
         tenant_shard_id: TenantId | TenantShardId,
         pageserver_id: int,
         generation_override: int | None = None,
+        config: None | dict[str, Any] = None,
     ) -> int:
         body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
         if generation_override is not None:
             body["generation_override"] = generation_override
+        if config is not None:
+            body["config"] = config
 
         response = self.request(
             "POST",
@@ -2878,13 +2887,14 @@ class NeonPageserver(PgProtocol, LogUtils):
         self,
         immediate: bool = False,
         timeout_in_seconds: int | None = None,
+        extra_env_vars: dict[str, str] | None = None,
     ):
         """
         High level wrapper for restart: restarts the process, and waits for
         tenant state to stabilize.
         """
         self.stop(immediate=immediate)
-        self.start(timeout_in_seconds=timeout_in_seconds)
+        self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
         self.quiesce_tenants()
 
     def quiesce_tenants(self):
@@ -2973,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils):
         to call into the pageserver HTTP client.
         """
         client = self.http_client()
-        if generation is None:
-            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
-        elif override_storage_controller_generation:
+        if generation is None or override_storage_controller_generation:
             generation = self.env.storage_controller.attach_hook_issue(
-                tenant_id, self.id, generation
+                tenant_id,
+                self.id,
+                generation_override=generation if override_storage_controller_generation else None,
+                config=config,
             )
         return client.tenant_attach(
             tenant_id,
@@ -4291,28 +4302,29 @@ class Endpoint(PgProtocol, LogUtils):
 
     def respec_deep(self, **kwargs: Any) -> None:
         """
-        Update the endpoint.json file taking into account nested keys.
-        It does one level deep update. Should enough for most cases.
-        Distinct method from respec() to do not break existing functionality.
+        Update the spec.json file taking into account nested keys.
+        Distinct method from respec() to not break existing functionality.
         NOTE: This method also updates the spec.json file, not endpoint.json.
         We need it because neon_local also writes to spec.json, so intended
         use-case is i) start endpoint with some config, ii) respec_deep(),
         iii) call reconfigure() to apply the changes.
         """
+
+        def update(curr, patch):
+            for k, v in patch.items():
+                if isinstance(v, Mapping):
+                    curr[k] = update(curr.get(k, {}), v)
+                else:
+                    curr[k] = v
+            return curr
+
         config_path = os.path.join(self.endpoint_path(), "spec.json")
         with open(config_path) as f:
             data_dict: dict[str, Any] = json.load(f)
 
         log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
 
-        for key, value in kwargs.items():
-            if isinstance(value, dict):
-                if key not in data_dict:
-                    data_dict[key] = value
-                else:
-                    data_dict[key] = {**data_dict[key], **value}
-            else:
-                data_dict[key] = value
+        update(data_dict, kwargs)
 
         with open(config_path, "w") as file:
             log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
@@ -4839,6 +4851,50 @@ class Safekeeper(LogUtils):
 
         wait_until(paused)
 
+    @staticmethod
+    def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]:
+        return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks]
+
+    @staticmethod
+    def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]:
+        """
+        List of Safekeepers which are members in `mconf`.
+        """
+        members_ids = [m.id for m in mconf.members]
+        new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else []
+        return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids]
+
+    @staticmethod
+    def create_timeline(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        ps: NeonPageserver,
+        mconf: MembershipConfiguration,
+        members_sks: list[Safekeeper],
+    ):
+        """
+        Manually create timeline on safekeepers with given (presumably inital)
+        mconf: figure out LSN from pageserver, bake request and execute it on
+        given safekeepers.
+
+        Normally done by storcon, but some tests want to do it manually so far.
+        """
+        ps_http_cli = ps.http_client()
+        # figure out initial LSN.
+        ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
+        init_lsn = ps_timeline_detail["last_record_lsn"]
+        log.info(f"initial LSN: {init_lsn}")
+        # sk timeline creation request expects minor version
+        pg_version = ps_timeline_detail["pg_version"] * 10000
+        # create inital mconf
+        create_r = TimelineCreateRequest(
+            tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+        )
+        log.info(f"sending timeline create: {create_r.to_json()}")
+
+        for sk in members_sks:
+            sk.http_client().timeline_create(create_r)
+
 
 class NeonBroker(LogUtils):
     """An object managing storage_broker instance"""
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index eedb693e3d..71c750b9eb 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -65,13 +65,11 @@ def single_timeline(
     assert ps_http.tenant_list() == []
 
     def attach(tenant):
-        # NB: create the new tenant in the storage controller with the correct tenant config. This
-        # will pick up the existing tenant data from remote storage. If we just attach it to the
-        # Pageserver, the storage controller will reset the tenant config to the default.
-        env.create_tenant(
-            tenant_id=tenant,
-            timeline_id=template_timeline,
-            conf=template_config,
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config,
+            generation=100,
+            override_storage_controller_generation=True,
         )
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index e409151b76..839e985419 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -25,7 +25,7 @@ class Walreceiver:
 
 @dataclass
 class SafekeeperTimelineStatus:
-    mconf: Configuration | None
+    mconf: MembershipConfiguration | None
     term: int
     last_log_term: int
     pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
@@ -78,17 +78,17 @@ class SafekeeperId:
 
 
 @dataclass
-class Configuration:
+class MembershipConfiguration:
     generation: int
     members: list[SafekeeperId]
     new_members: list[SafekeeperId] | None
 
     @classmethod
-    def from_json(cls, d: dict[str, Any]) -> Configuration:
+    def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration:
         generation = d["generation"]
         members = d["members"]
         new_members = d.get("new_members")
-        return Configuration(generation, members, new_members)
+        return MembershipConfiguration(generation, members, new_members)
 
     def to_json(self) -> str:
         return json.dumps(self, cls=EnhancedJSONEncoder)
@@ -98,7 +98,7 @@ class Configuration:
 class TimelineCreateRequest:
     tenant_id: TenantId
     timeline_id: TimelineId
-    mconf: Configuration
+    mconf: MembershipConfiguration
     # not exactly PgVersion, for example 150002 for 15.2
     pg_version: int
     start_lsn: Lsn
@@ -110,13 +110,13 @@ class TimelineCreateRequest:
 
 @dataclass
 class TimelineMembershipSwitchResponse:
-    previous_conf: Configuration
-    current_conf: Configuration
+    previous_conf: MembershipConfiguration
+    current_conf: MembershipConfiguration
 
     @classmethod
     def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
-        previous_conf = Configuration.from_json(d["previous_conf"])
-        current_conf = Configuration.from_json(d["current_conf"])
+        previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
+        current_conf = MembershipConfiguration.from_json(d["current_conf"])
         return TimelineMembershipSwitchResponse(previous_conf, current_conf)
 
 
@@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         resj = res.json()
         walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
         # It is always normally not None, it is allowed only to make forward compat tests happy.
-        mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
+        mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None
         return SafekeeperTimelineStatus(
             mconf=mconf,
             term=resj["acceptor_state"]["term"],
@@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         return self.timeline_status(tenant_id, timeline_id).commit_lsn
 
     # Get timeline membership configuration.
-    def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
+    def get_membership(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> MembershipConfiguration:
         # make mypy happy
         return self.timeline_status(tenant_id, timeline_id).mconf  # type: ignore
 
@@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def timeline_exclude(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
     ) -> dict[str, Any]:
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude",
@@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def membership_switch(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
     ) -> TimelineMembershipSwitchResponse:
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index efd423104d..8af52dcbd0 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count(
 
     n_txns = 500000
 
-    def setup_wrapper(env: NeonEnv):
-        return setup_tenant_template(env, n_txns)
-
     env = setup_pageserver_with_tenants(
-        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+        neon_env_builder,
+        f"large_slru_count-{n_tenants}-{n_txns}",
+        n_tenants,
+        lambda env: setup_tenant_template(env, n_txns),
     )
     run_benchmark(env, pg_bin, record, duration)
 
@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
         "gc_period": "0s",  # disable periodic gc
         "checkpoint_timeout": "10 years",
         "compaction_period": "0s",  # disable periodic compaction
-        "compaction_threshold": 10,
-        "compaction_target_size": 134217728,
-        "checkpoint_distance": 268435456,
-        "image_creation_threshold": 3,
     }
 
     template_tenant, template_timeline = env.create_tenant(set_default=True)
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index 2c27368001..5169add6cb 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -1,5 +1,8 @@
+import concurrent.futures
 import dataclasses
 import json
+import re
+import threading
 import time
 from dataclasses import dataclass
 from pathlib import Path
@@ -31,15 +34,15 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
     mode: str = "pipelined"
 
 
-EXECUTION = ["concurrent-futures", "tasks"]
+EXECUTION = ["concurrent-futures"]
 
 NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
     for execution in EXECUTION:
         NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
 
-BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 2, 4, 8, 16, 32]:
+BATCHABLE: list[PageServicePipeliningConfig] = []
+for max_batch_size in [32]:
     for execution in EXECUTION:
         BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
 
@@ -47,19 +50,6 @@ for max_batch_size in [1, 2, 4, 8, 16, 32]:
 @pytest.mark.parametrize(
     "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
     [
-        # non-batchable workloads
-        # (A separate benchmark will consider latency).
-        *[
-            (
-                50,
-                config,
-                TARGET_RUNTIME,
-                1,
-                128,
-                f"not batchable {dataclasses.asdict(config)}",
-            )
-            for config in NON_BATCHABLE
-        ],
         # batchable workloads should show throughput and CPU efficiency improvements
         *[
             (
@@ -137,7 +127,14 @@ def test_throughput(
 
     env = neon_env_builder.init_start()
     ps_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # minimal lfc & small shared buffers to force requests to pageserver
+            "neon.max_file_cache_size=1MB",
+            "shared_buffers=10MB",
+        ],
+    )
     conn = endpoint.connect()
     cur = conn.cursor()
 
@@ -155,7 +152,6 @@ def test_throughput(
     tablesize = tablesize_mib * 1024 * 1024
     npages = tablesize // (8 * 1024)
     cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
-    # TODO: can we force postgres to do sequential scans?
 
     #
     # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
@@ -211,31 +207,73 @@ def test_throughput(
                 ).value,
             )
 
-    def workload() -> Metrics:
+    def workload(disruptor_started: threading.Event) -> Metrics:
+        disruptor_started.wait()
         start = time.time()
         iters = 0
         while time.time() - start < target_runtime or iters < 2:
-            log.info("Seqscan %d", iters)
             if iters == 1:
                 # round zero for warming up
                 before = get_metrics()
-            cur.execute(
-                "select clear_buffer_cache()"
-            )  # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
             cur.execute("select sum(data::bigint) from t")
             assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
             iters += 1
         after = get_metrics()
         return (after - before).normalize(iters - 1)
 
+    def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
+        conn = endpoint.connect()
+        cur = conn.cursor()
+        iters = 0
+        while True:
+            cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
+            if stop_disruptor.is_set():
+                break
+            disruptor_started.set()
+            iters += 1
+            time.sleep(0.001)
+        return iters
+
     env.pageserver.patch_config_toml_nonrecursive(
         {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
     )
-    env.pageserver.restart()
-    metrics = workload()
+
+    # set trace for log analysis below
+    env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
+
+    log.info("Starting workload")
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        disruptor_started = threading.Event()
+        stop_disruptor = threading.Event()
+        disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
+        workload_fut = executor.submit(workload, disruptor_started)
+        metrics = workload_fut.result()
+        stop_disruptor.set()
+        ndisruptions = disruptor_fut.result()
+        log.info("Disruptor issued %d disrupting requests", ndisruptions)
 
     log.info("Results: %s", metrics)
 
+    since_last_start: list[str] = []
+    for line in env.pageserver.logfile.read_text().splitlines():
+        if "git:" in line:
+            since_last_start = []
+        since_last_start.append(line)
+
+    stopping_batching_because_re = re.compile(
+        r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)"
+    )
+    reasons_for_stopping_batching = {}
+    for line in since_last_start:
+        match = stopping_batching_because_re.search(line)
+        if match:
+            if match.group(1) not in reasons_for_stopping_batching:
+                reasons_for_stopping_batching[match.group(1)] = 0
+            reasons_for_stopping_batching[match.group(1)] += 1
+
+    log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching)
+
     #
     # Sanity-checks on the collected data
     #
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 3d7204d883..5ef63e2fe9 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import copy
 import json
 import uuid
 from typing import TYPE_CHECKING
@@ -16,7 +15,6 @@ if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 
 
-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
 def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env = neon_env_builder.init_start()
 
@@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
                 "refill_interval": "100ms",
                 "refill_amount": int(rate_limit_rps / 10),
                 "max": int(rate_limit_rps / 10),
-                "fair": True,
             },
         },
     )
@@ -98,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
 
     log.info("run pagebench")
-    duration_secs = 10
+    duration_secs = 20
     actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
 
     log.info("validate the client is capped at the configured rps limit")
     expect_ncompleted = duration_secs * rate_limit_rps
-    delta_abs = abs(expect_ncompleted - actual_ncompleted)
-    threshold = 0.05 * expect_ncompleted
-    assert threshold / rate_limit_rps < 0.1 * duration_secs, (
-        "test self-test: unrealistic expecations regarding precision in this test"
-    )
-    assert delta_abs < 0.05 * expect_ncompleted, (
+    assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
         "the throttling deviates more than 5percent from the expectation"
     )
 
@@ -122,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
         timeout=compaction_period,
     )
 
+    log.info("validate the metrics")
     smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
     assert smgr_query_seconds_post is not None
     throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
@@ -130,72 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
     actual_throttled_secs = actual_throttled_usecs / 1_000_000
 
-    log.info("validate that the metric doesn't include throttle wait time")
-    assert duration_secs >= 10 * actual_smgr_query_seconds, (
-        "smgr metrics should not include throttle wait time"
-    )
-
-    log.info("validate that the throttling wait time metrics is correct")
     assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
-        "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
+        "throttling and processing latency = total request time; this assert validates thi holds on average"
     )
 
-
-throttle_config_with_field_fair_set = {
-    "task_kinds": ["PageRequestHandler"],
-    "fair": True,
-    "initial": 27,
-    "refill_interval": "43s",
-    "refill_amount": 23,
-    "max": 42,
-}
-
-
-def assert_throttle_config_with_field_fair_set(conf):
-    """
-    Field `fair` is ignored, so, responses don't contain it
-    """
-    without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
-    without_fair.pop("fair")
-
-    assert conf == without_fair
-
-
-def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-    env = neon_env_builder.init_start()
-    vps_http = env.storage_controller.pageserver_api()
-    # with_fair config should still be settable
-    vps_http.set_tenant_config(
-        env.initial_tenant,
-        {"timeline_get_throttle": throttle_config_with_field_fair_set},
-    )
-    conf = vps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-    assert_throttle_config_with_field_fair_set(
-        conf.tenant_specific_overrides["timeline_get_throttle"]
-    )
-
-
-def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
-    neon_env_builder: NeonEnvBuilder,
-):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-
-    def set_tenant_config(ps_cfg):
-        tenant_config = ps_cfg.setdefault("tenant_config", {})
-        tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
-
-    neon_env_builder.pageserver_config_override = set_tenant_config
-    env = neon_env_builder.init_start()
-    ps_http = env.pageserver.http_client()
-    conf = ps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-
-    env.pageserver.allowed_errors.append(
-        r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
+    # without this assertion, the test would pass even if the throttling was completely broken
+    # but the request processing is so slow that it makes up for the latency that a correct throttling
+    # implementation would add
+    assert actual_smgr_query_seconds < 0.66 * duration_secs, (
+        "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
     )
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index d03d05d33d..d48e731394 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
             pageserver.tenant_location_configure(tenant_id, location_conf)
             last_state[pageserver.id] = (mode, generation)
 
-            if mode.startswith("Attached"):
+            # It's only valid to connect to the last generation. Newer generations may yank layer
+            # files used in older generations.
+            last_generation = max(
+                [s[1] for s in last_state.values() if s[1] is not None], default=None
+            )
+
+            if mode.startswith("Attached") and generation == last_generation:
                 # This is a basic test: we are validating that he endpoint works properly _between_
                 # configuration changes.  A stronger test would be to validate that clients see
                 # no errors while we are making the changes.
diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py
index 9a7204ca17..39c94c05a9 100644
--- a/test_runner/regress/test_ssl.py
+++ b/test_runner/regress/test_ssl.py
@@ -1,5 +1,6 @@
 import os
 import ssl
+from datetime import datetime, timedelta
 
 import pytest
 import requests
@@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder):
     requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
     cur_cert = ssl.get_server_certificate(("localhost", port))
     assert cur_cert == sk_cert
+
+
+def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder):
+    """
+    Test metrics exported from http/https server and tls cert reloader.
+    """
+    neon_env_builder.use_https_pageserver_api = True
+    neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'"
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*Error reloading certificate.*")
+
+    ps_client = env.pageserver.http_client()
+
+    # 1. Test connection started metric.
+    filter_https = {"scheme": "https"}
+    old_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+
+    addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
+    requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
+
+    new_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+    # The counter should increase after the request,
+    # but it may increase by more than one because of storcon requests.
+    assert new_https_conn_count > old_https_conn_count
+
+    # 2. Test tls connection error.
+    # Request without specified CA cert file should fail.
+    with pytest.raises(requests.exceptions.SSLError):
+        requests.get(addr)
+
+    tls_error_cnt = (
+        ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0
+    )
+    assert tls_error_cnt == 1
+
+    # 3. Test expiration time metric.
+    expiration_time = datetime.fromtimestamp(
+        ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0
+    )
+    now = datetime.now()
+    # neon_local generates certs valid for 100 years.
+    # Compare with +-1 year to not care about leap years.
+    assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101)
+
+    # 4. Test cert reload failed metric.
+    reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total")
+    assert reload_error_cnt == 0
+
+    os.remove(env.pageserver.workdir / "server.crt")
+
+    def reload_failed():
+        reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0
+        assert reload_error_cnt > 0
+
+    wait_until(reload_failed)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index ce73c9a738..b2c8415e9a 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer(
         )
 
 
+@pytest.mark.parametrize("step_down_times_out", [False, True])
 def test_storage_controller_leadership_transfer_during_split(
     neon_env_builder: NeonEnvBuilder,
     storage_controller_proxy: StorageControllerProxy,
     port_distributor: PortDistributor,
+    step_down_times_out: bool,
 ):
     """
     Exercise a race between shard splitting and graceful leadership transfer.  This is
@@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
         )
     env.storage_controller.reconcile_until_idle()
 
+    # We are testing scenarios where the step down API does not complete: either because it is stuck
+    # doing a shard split, or because it totally times out on some other failpoint.
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*step_down.*request was dropped before completing.*",
+            ".*step_down.*operation timed out.*",
+            ".*Send step down request failed, will retry.*",
+            ".*Send step down request still failed after.*retries.*",
+            ".*Leader .+ did not respond to step-down request.*",
+        ]
+    )
+
     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
         # Start a shard split
         env.storage_controller.allowed_errors.extend(
@@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
         )
         pause_failpoint = "shard-split-pre-complete"
         env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
+
+        if not step_down_times_out:
+            # Prevent the timeout self-terminate code from executing: we will block step down on the
+            # shard split itself
+            env.storage_controller.configure_failpoints(
+                ("step-down-delay-timeout", "return(3600000)")
+            )
+
         split_fut = executor.submit(
             env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
         )
@@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split(
             timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
         )
 
+        if step_down_times_out:
+            # Step down will time out, original controller will terminate itself
+            env.storage_controller.allowed_errors.extend([".*terminating process.*"])
+        else:
+            # Step down does not time out: original controller hits its shard split completion
+            # code path and realises that it must not purge the parent shards from the database.
+            env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
+
         def passed_split_abort():
             try:
                 log.info("Checking log for pattern...")
-                assert env.storage_controller.log_contains(
-                    ".*Using observed state received from leader.*"
-                )
+                # This log is indicative of entering startup_reconcile, which happens
+                # after the point we would abort shard splits
+                assert env.storage_controller.log_contains(".*Populating tenant shards.*")
             except Exception:
                 log.exception("Failed to find pattern in log")
                 raise
@@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split(
         wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
         assert env.storage_controller.log_contains(".*Aborting shard split.*")
 
-        # Proxy is still talking to original controller here: disable its pause failpoint so
-        # that its shard split can run to completion.
-        log.info("Disabling failpoint")
-        # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
-        # on handling the shard split request.
-        env.storage_controller.request(
-            "PUT",
-            f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
-            json=[{"name": "shard-split-pre-complete", "actions": "off"}],
-            headers=env.storage_controller.headers(TokenScope.ADMIN),
-        )
+        if step_down_times_out:
+            # We will let the old controller hit a timeout path where it terminates itself, rather than
+            # completing step_down and trying to complete a shard split
+            def old_controller_terminated():
+                assert env.storage_controller.log_contains(".*terminating process.*")
 
-        def previous_stepped_down():
-            assert (
-                env.storage_controller.get_leadership_status()
-                == StorageControllerLeadershipStatus.STEPPED_DOWN
+            wait_until(old_controller_terminated)
+        else:
+            # Proxy is still talking to original controller here: disable its pause failpoint so
+            # that its shard split can run to completion.
+            log.info("Disabling failpoint")
+            # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
+            # on handling the shard split request.
+            env.storage_controller.request(
+                "PUT",
+                f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
+                json=[{"name": "shard-split-pre-complete", "actions": "off"}],
+                headers=env.storage_controller.headers(TokenScope.ADMIN),
             )
 
-        log.info("Awaiting step down")
-        wait_until(previous_stepped_down)
+            def previous_stepped_down():
+                assert (
+                    env.storage_controller.get_leadership_status()
+                    == StorageControllerLeadershipStatus.STEPPED_DOWN
+                )
 
-        # Let the shard split complete: this may happen _after_ the replacement has come up
-        # and tried to clean up the databases
-        log.info("Unblocking & awaiting shard split")
-        with pytest.raises(Exception, match="Unexpected child shard count"):
-            # This split fails when it tries to persist results, because it encounters
-            # changes already made by the new controller's abort-on-startup
-            split_fut.result()
+            log.info("Awaiting step down")
+            wait_until(previous_stepped_down)
+
+            # Let the shard split complete: this may happen _after_ the replacement has come up
+            # and tried to clean up the databases
+            log.info("Unblocking & awaiting shard split")
+            with pytest.raises(Exception, match="Unexpected child shard count"):
+                # This split fails when it tries to persist results, because it encounters
+                # changes already made by the new controller's abort-on-startup
+                split_fut.result()
 
         log.info("Routing to new leader")
         storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split(
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()
 
-    # Check that the stepped down instance forwards requests
-    # to the new leader while it's still running.
-    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
-    env.storage_controller.tenant_shard_dump()
-    env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
-    status = env.storage_controller.node_status(env.pageservers[0].id)
-    assert status["scheduling"] == "Pause"
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_shard_dump()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"
 
 
 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index e3d39f9315..a9a6699e5c 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -45,7 +45,7 @@ from fixtures.remote_storage import (
     s3_storage,
 )
 from fixtures.safekeeper.http import (
-    Configuration,
+    MembershipConfiguration,
     SafekeeperHttpClient,
     SafekeeperId,
     TimelineCreateRequest,
@@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
     for sk in env.safekeepers:
         sk.start()
         cli = sk.http_client()
-        mconf = Configuration(generation=0, members=[], new_members=None)
+        mconf = MembershipConfiguration(generation=0, members=[], new_members=None)
         # set start_lsn to the beginning of the first segment to allow reading
         # WAL from there (could you intidb LSN as well).
         r = TimelineCreateRequest(
@@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     sk_id_2 = SafekeeperId(11, "localhost", 5434)  # just a mock
 
     # Request to switch before timeline creation should fail.
-    init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None)
+    init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.membership_switch(tenant_id, timeline_id, init_conf)
 
@@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     http_cli.timeline_create(create_r)
 
     # Switch into some conf.
-    joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
+    joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
     resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf)
     log.info(f"joint switch resp: {resp}")
     assert resp.previous_conf.generation == 1
@@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     assert after_restart.generation == 4
 
     # Switch into non joint conf of which sk is not a member, must fail.
-    non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None)
+    non_joint_not_member = MembershipConfiguration(
+        generation=5, members=[sk_id_2], new_members=None
+    )
     with pytest.raises(requests.exceptions.HTTPError):
         resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member)
 
     # Switch into good non joint conf.
-    non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None)
+    non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None)
     resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
     log.info(f"non joint switch resp: {resp}")
     assert resp.previous_conf.generation == 4
     assert resp.current_conf.generation == 6
 
     # Switch request to lower conf should be rejected.
-    lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None)
+    lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
 
     # Now, exclude sk from the membership, timeline should be deleted.
-    excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None)
+    excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None)
     http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.timeline_status(tenant_id, timeline_id)
@@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    ps = env.pageservers[0]
-    ps_http_cli = ps.http_client()
-
-    http_clis = [sk.http_client() for sk in env.safekeepers]
-
     config_lines = [
         "neon.safekeeper_proto_version = 3",
     ]
@@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
     # expected to fail because timeline is not created on safekeepers
     with pytest.raises(Exception, match=r".*timed out.*"):
         ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
-    # figure out initial LSN.
-    ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
-    init_lsn = ps_timeline_detail["last_record_lsn"]
-    log.info(f"initial LSN: {init_lsn}")
-    # sk timeline creation request expects minor version
-    pg_version = ps_timeline_detail["pg_version"] * 10000
     # create inital mconf
-    sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
-    mconf = Configuration(generation=1, members=sk_ids, new_members=None)
-    create_r = TimelineCreateRequest(
-        tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+    mconf = MembershipConfiguration(
+        generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None
     )
-    log.info(f"sending timeline create: {create_r.to_json()}")
-
-    for sk_http_cli in http_clis:
-        sk_http_cli.timeline_create(create_r)
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers)
     # Once timeline created endpoint should start.
     ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
     ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index b7c7478e78..c5dd34f64f 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
     Safekeeper,
 )
 from fixtures.remote_storage import RemoteStorageKind
+from fixtures.safekeeper.http import MembershipConfiguration
 from fixtures.utils import skip_in_debug_build
 
 if TYPE_CHECKING:
@@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
     asyncio.run(run_concurrent_computes(env))
 
 
+async def assert_query_hangs(endpoint: Endpoint, query: str):
+    """
+    Start on endpoint query which is expected to hang and check that it does.
+    """
+    conn = await endpoint.connect_async()
+    bg_query = asyncio.create_task(conn.execute(query))
+    await asyncio.sleep(2)
+    assert not bg_query.done()
+    return bg_query
+
+
 # Stop safekeeper and check that query cannot be executed while safekeeper is down.
 # Query will insert a single row into a table.
-async def check_unavailability(
-    sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2
-):
+async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2):
     # shutdown one of two acceptors, that is, majority
     sk.stop()
 
-    bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')"))
-
-    await asyncio.sleep(start_delay_sec)
-    # ensure that the query has not been executed yet
-    assert not bg_query.done()
-
+    bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')")
     # start safekeeper and await the query
     sk.start()
     await bg_query
@@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint):
     await conn.execute("INSERT INTO t values (1, 'payload')")
 
     # stop safekeeper and check that query cannot be executed while safekeeper is down
-    await check_unavailability(env.safekeepers[0], conn, 2)
+    await check_unavailability(env.safekeepers[0], endpoint, 2)
 
     # for the world's balance, do the same with second safekeeper
-    await check_unavailability(env.safekeepers[1], conn, 3)
+    await check_unavailability(env.safekeepers[1], endpoint, 3)
 
     # check that we can execute queries after restart
     await conn.execute("INSERT INTO t values (4, 'payload')")
@@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv):
     # insert with only one safekeeper up to create tail of flushed but not committed WAL
     sk1.stop()
     sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 2000), 'payload'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'")
     # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
     ep.stop_and_destroy()
 
@@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
     # insert with only one sk3 up to create tail of flushed but not committed WAL on it
     sk1.stop()
     sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'")
     # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
     ep.stop_and_destroy()
 
@@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi
     asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
 
 
+async def quorum_sanity_single(
+    env: NeonEnv,
+    compute_sks_ids: list[int],
+    members_sks_ids: list[int],
+    new_members_sks_ids: list[int] | None,
+    sks_to_stop_ids: list[int],
+    should_work_when_stopped: bool,
+):
+    """
+    *_ids params contain safekeeper node ids; it is assumed they are issued
+    from 1 and sequentially assigned to env.safekeepers.
+    """
+    members_sks = [env.safekeepers[i - 1] for i in members_sks_ids]
+    new_members_sks = (
+        [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None
+    )
+    sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids]
+
+    mconf = MembershipConfiguration(
+        generation=1,
+        members=Safekeeper.sks_to_safekeeper_ids(members_sks),
+        new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None,
+    )
+    members_sks = Safekeeper.mconf_sks(env, mconf)
+
+    tenant_id = env.initial_tenant
+    compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids])
+    members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members])
+    new_members_sks_ids_str = "-".join(
+        [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else []
+    )
+    sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop])
+    log.info(
+        f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}"
+    )
+    branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}"
+    timeline_id = env.create_branch(branch_name)
+
+    # create timeline on `members_sks`
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks)
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    ep = env.endpoints.create(branch_name, config_lines=config_lines)
+    ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids)
+    ep.safe_psql("create table t(key int, value text)")
+
+    # stop specified sks and check whether writes work
+    for sk in sks_to_stop:
+        sk.stop()
+    if should_work_when_stopped:
+        log.info("checking that writes still work")
+        ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'")
+        # restarting ep should also be fine
+        ep.stop()
+        ep.start()
+        ep.safe_psql("insert into t select generate_series(1, 100), 'plum'")
+        bg_query = None
+    else:
+        log.info("checking that writes hang")
+        bg_query = await assert_query_hangs(
+            ep, "insert into t select generate_series(1, 100), 'Papaya'"
+        )
+    # start again; now they should work
+    for sk in sks_to_stop:
+        sk.start()
+    if bg_query:
+        log.info("awaiting query")
+        await bg_query
+
+
+# It's a bit tempting to iterate over all possible combinations, but let's stick
+# with this for now.
+async def run_quorum_sanity(env: NeonEnv):
+    # 3 members, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True)
+    # 3 members, 1/3 up, should not work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False)
+
+    # 3 members, all up, should work; wp redundantly talks to 4th.
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False)
+
+    # joint conf of 1-2-3 and 4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True)
+    # joint conf of 1-2-3 and 4, 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False)
+
+    # joint conf of 1-2-3 and 2-3-4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True)
+    # joint conf of 1-2-3 and 2-3-4, 2 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True)
+    # joint conf of 1-2-3 and 2-3-4, 3 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False)
+    # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False)
+
+    # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only.
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # with 1 down should still be ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True)
+    # but with 2 down not ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False)
+
+
+# Test various combinations of membership configurations / neon.safekeepers
+# (list of safekeepers endpoint connects to) values / up & down safekeepers and
+# check that endpont can start and write data when we have quorum and can't when
+# we don't.
+def test_quorum_sanity(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 4
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_quorum_sanity(env))
+
+
 async def run_segment_init_failure(env: NeonEnv):
     env.create_branch("test_segment_init_failure")
     ep = env.endpoints.create_start("test_segment_init_failure")