Compute release 2025-04-11

storage: add http/https server and cert resover metrics (#11450 )
## Problem We need to export some metrics about certs/connections to configure alerts and make sure that all HTTP requests are gone before turning https-only mode on. - Closes: https://github.com/neondatabase/cloud/issues/25526 ## Summary of changes - Add started connection and connection error metrics to http/https Server. - Add certificate expiration time and reload metrics to ReloadingCertificateResolver.
2026-07-02 19:50:40 +00:00 · 2025-04-11 07:01:04 +00:00 · 2025-04-11 06:11:35 +00:00 · 2025-04-10 19:55:51 +00:00 · 2025-04-10 17:03:22 +00:00 · 2025-04-10 16:55:37 +00:00
119 changed files with 6051 additions and 3236 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -19,6 +19,7 @@
 !pageserver/
 !pgxn/
 !proxy/
+!object_storage/
 !storage_scrubber/
 !safekeeper/
 !storage_broker/
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+/artifact_cache
 /pg_install
 /target
 /tmp_check
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2837,6 +2837,7 @@ dependencies = [
 "utils",
 "uuid",
 "workspace_hack",
+ "x509-cert",
 ]

 [[package]]
@@ -3991,6 +3992,33 @@ dependencies = [
 "memchr",
 ]

+[[package]]
+name = "object_storage"
+version = "0.0.1"
+dependencies = [
+ "anyhow",
+ "axum",
+ "axum-extra",
+ "camino",
+ "camino-tempfile",
+ "futures",
+ "http-body-util",
+ "itertools 0.10.5",
+ "jsonwebtoken",
+ "prometheus",
+ "rand 0.8.5",
+ "remote_storage",
+ "serde",
+ "serde_json",
+ "test-log",
+ "tokio",
+ "tokio-util",
+ "tower 0.5.2",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.20.2"
@@ -4693,7 +4721,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
 "base64 0.22.1",
 "byteorder",
@@ -4727,7 +4755,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
 "bytes",
 "chrono",
@@ -6925,6 +6953,28 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "test-log"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f"
+dependencies = [
+ "env_logger",
+ "test-log-macros",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "test-log-macros"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -7172,7 +7222,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.10"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
 "async-trait",
 "byteorder",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,6 +40,7 @@ members = [
    "libs/proxy/postgres-protocol2",
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
+    "object_storage",
 ]

 [workspace.package]
@@ -208,6 +209,7 @@ tracing-opentelemetry = "0.28"
 tracing-serde = "0.2.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
+test-log = { version = "0.2.17", default-features = false, features = ["log"] }
 twox-hash = { version = "1.6.3", default-features = false }
 typed-json = "0.1"
 url = "2.2"
--- a/2
+++ b/2
@@ -89,6 +89,7 @@ RUN set -e \
      --bin storage_broker  \
      --bin storage_controller  \
      --bin proxy  \
+      --bin object_storage \
      --bin neon_local \
      --bin storage_scrubber \
      --locked --release
@@ -121,6 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -118,16 +118,18 @@ struct Cli {
    #[arg(long)]
    pub set_disk_quota_for_fs: Option<String>,

-    #[arg(short = 's', long = "spec", group = "spec")]
-    pub spec_json: Option<String>,
-
    #[arg(short = 'S', long, group = "spec-path")]
    pub spec_path: Option<OsString>,

    #[arg(short = 'i', long, group = "compute-id")]
    pub compute_id: String,

-    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
+    #[arg(
+        short = 'p',
+        long,
+        conflicts_with = "spec-path",
+        value_name = "CONTROL_PLANE_API_BASE_URL"
+    )]
    pub control_plane_uri: Option<String>,
 }

@@ -172,7 +174,6 @@ fn main() -> Result<()> {
            cgroup: cli.cgroup,
            #[cfg(target_os = "linux")]
            vm_monitor_addr: cli.vm_monitor_addr,
-            live_config_allowed: cli_spec.live_config_allowed,
        },
        cli_spec.spec,
        cli_spec.compute_ctl_config,
@@ -201,23 +202,12 @@ async fn init() -> Result<()> {
 }

 fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
-    // First, try to get cluster spec from the cli argument
-    if let Some(ref spec_json) = cli.spec_json {
-        info!("got spec from cli argument {}", spec_json);
-        return Ok(CliSpecParams {
-            spec: Some(serde_json::from_str(spec_json)?),
-            compute_ctl_config: ComputeCtlConfig::default(),
-            live_config_allowed: false,
-        });
-    }
-
-    // Second, try to read it from the file if path is provided
+    // First, read spec from the path if provided
    if let Some(ref spec_path) = cli.spec_path {
        let file = File::open(Path::new(spec_path))?;
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_reader(file)?),
            compute_ctl_config: ComputeCtlConfig::default(),
-            live_config_allowed: true,
        });
    }

@@ -225,11 +215,12 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
        panic!("must specify --control-plane-uri");
    };

+    // If the spec wasn't provided in the CLI arguments, then retrieve it from
+    // the control plane
    match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
        Ok(resp) => Ok(CliSpecParams {
            spec: resp.0,
            compute_ctl_config: resp.1,
-            live_config_allowed: true,
        }),
        Err(e) => {
            error!(
@@ -247,7 +238,6 @@ struct CliSpecParams {
    spec: Option<ComputeSpec>,
    #[allow(dead_code)]
    compute_ctl_config: ComputeCtlConfig,
-    live_config_allowed: bool,
 }

 fn deinit_and_exit(exit_code: Option<i32>) -> ! {
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -98,13 +98,15 @@ pub async fn get_database_schema(
        .kill_on_drop(true)
        .spawn()?;

-    let stdout = cmd.stdout.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
-    })?;
+    let stdout = cmd
+        .stdout
+        .take()
+        .ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?;

-    let stderr = cmd.stderr.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
-    })?;
+    let stderr = cmd
+        .stderr
+        .take()
+        .ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?;

    let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
    let stderr_reader = BufReader::new(stderr);
@@ -128,8 +130,7 @@ pub async fn get_database_schema(
                }
            });

-            return Err(SchemaDumpError::IO(std::io::Error::new(
-                std::io::ErrorKind::Other,
+            return Err(SchemaDumpError::IO(std::io::Error::other(
                "failed to start pg_dump",
            )));
        }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -93,20 +93,6 @@ pub struct ComputeNodeParams {

    /// the address of extension storage proxy gateway
    pub ext_remote_storage: Option<String>,
-
-    /// We should only allow live re- / configuration of the compute node if
-    /// it uses 'pull model', i.e. it can go to control-plane and fetch
-    /// the latest configuration. Otherwise, there could be a case:
-    /// - we start compute with some spec provided as argument
-    /// - we push new spec and it does reconfiguration
-    /// - but then something happens and compute pod / VM is destroyed,
-    ///   so k8s controller starts it again with the **old** spec
-    ///
-    /// and the same for empty computes:
-    /// - we started compute without any spec
-    /// - we push spec and it does configuration
-    /// - but then it is restarted without any spec again
-    pub live_config_allowed: bool,
 }

 /// Compute node info shared across several `compute_ctl` threads.
@@ -537,11 +523,14 @@ impl ComputeNode {

        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
            pspec.tenant_id,
            pspec.timeline_id,
+            pspec.spec.project_id.as_deref().unwrap_or("None"),
+            pspec.spec.branch_id.as_deref().unwrap_or("None"),
+            pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
            pspec.spec.features,
            pspec.spec.remote_extensions,
        );
@@ -645,31 +634,28 @@ impl ComputeNode {
            });
        }

-        // Configure and start rsyslog for HIPAA if necessary
-        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
-            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-            if remote_endpoint.is_empty() {
-                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+        // Configure and start rsyslog for compliance audit logging
+        match pspec.spec.audit_log_level {
+            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
+                let remote_endpoint =
+                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
+                if remote_endpoint.is_empty() {
+                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+                }
+
+                let log_directory_path = Path::new(&self.params.pgdata).join("log");
+                let log_directory_path = log_directory_path.to_string_lossy().to_string();
+                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Launch a background task to clean up the audit logs
+                launch_pgaudit_gc(log_directory_path);
            }
-
-            let log_directory_path = Path::new(&self.params.pgdata).join("log");
-            let log_directory_path = log_directory_path.to_string_lossy().to_string();
-            configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
-
-            // Launch a background task to clean up the audit logs
-            launch_pgaudit_gc(log_directory_path);
+            _ => {}
        }

        // Configure and start rsyslog for Postgres logs export
-        if self.has_feature(ComputeFeature::PostgresLogsExport) {
-            if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
-                let host = PostgresLogsRsyslogConfig::default_host(project_id);
-                let conf = PostgresLogsRsyslogConfig::new(Some(&host));
-                configure_postgres_logs_export(conf)?;
-            } else {
-                warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
-            }
-        }
+        let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref());
+        configure_postgres_logs_export(conf)?;

        // Launch remaining service threads
        let _monitor_handle = launch_monitor(self);
@@ -1573,6 +1559,10 @@ impl ComputeNode {
            });
        }

+        // Reconfigure rsyslog for Postgres logs export
+        let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref());
+        configure_postgres_logs_export(conf)?;
+
        // Write new config
        let pgdata_path = Path::new(&self.params.pgdata);
        config::write_postgres_conf(
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -7,7 +7,7 @@ use std::io::prelude::*;
 use std::path::Path;

 use compute_api::responses::TlsConfig;
-use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
+use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};

 use crate::pg_helpers::{
    GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
            escape_conf_value(&s.to_string())
        )?;
    }
+    if let Some(s) = &spec.project_id {
+        writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.branch_id {
+        writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.endpoint_id {
+        writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
+    }

    // tls
    if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
    // and don't allow the user or the control plane admin to change them.
    match spec.audit_log_level {
        ComputeAudit::Disabled => {}
-        ComputeAudit::Log => {
+        ComputeAudit::Log | ComputeAudit::Base => {
            writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
            writeln!(file, "pgaudit.log='ddl,role'")?;
            // Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
            }
            writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
        }
-        ComputeAudit::Hipaa => {
+        ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
            writeln!(
                file,
                "# Managed by compute_ctl compliance audit settings: begin"
            )?;
-            // This log level is very verbose
-            // but this is necessary for HIPAA compliance.
-            // Exclude 'misc' category, because it doesn't contain anythig relevant.
-            writeln!(file, "pgaudit.log='all, -misc'")?;
-            writeln!(file, "pgaudit.log_parameter=on")?;
+            // Enable logging of parameters.
+            // This is very verbose and may contain sensitive data.
+            if spec.audit_log_level == ComputeAudit::Full {
+                writeln!(file, "pgaudit.log_parameter=on")?;
+                writeln!(file, "pgaudit.log='all'")?;
+            } else {
+                writeln!(file, "pgaudit.log_parameter=off")?;
+                writeln!(file, "pgaudit.log='all, -misc'")?;
+            }
            // Disable logging of catalog queries
            // The catalog doesn't contain sensitive data, so we don't need to audit it.
            writeln!(file, "pgaudit.log_catalog=off")?;
@@ -255,7 +268,7 @@ pub fn write_postgres_conf(

    // We need Postgres to send logs to rsyslog so that we can forward them
    // further to customers' log aggregation systems.
-    if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
+    if spec.logs_export_host.is_some() {
        writeln!(file, "log_destination='stderr,syslog'")?;
    }

--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -6,20 +6,15 @@ use axum_extra::{
    TypedHeader,
    headers::{Authorization, authorization::Bearer},
 };
+use compute_api::requests::ComputeClaims;
 use futures::future::BoxFuture;
 use http::{Request, Response, StatusCode};
 use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
-use serde::Deserialize;
 use tower_http::auth::AsyncAuthorizeRequest;
 use tracing::warn;

 use crate::http::{JsonResponse, extract::RequestId};

-#[derive(Clone, Debug, Deserialize)]
-pub(in crate::http) struct Claims {
-    compute_id: String,
-}
-
 #[derive(Clone, Debug)]
 pub(in crate::http) struct Authorize {
    compute_id: String,
@@ -112,7 +107,11 @@ impl AsyncAuthorizeRequest<Body> for Authorize {

 impl Authorize {
    /// Verify the token using the JSON Web Key set and return the token data.
-    fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
+    fn verify(
+        jwks: &JwkSet,
+        token: &str,
+        validation: &Validation,
+    ) -> Result<TokenData<ComputeClaims>> {
        for jwk in jwks.keys.iter() {
            let decoding_key = match DecodingKey::from_jwk(jwk) {
                Ok(key) => key,
@@ -127,7 +126,7 @@ impl Authorize {
                }
            };

-            match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
+            match jsonwebtoken::decode::<ComputeClaims>(token, &decoding_key, validation) {
                Ok(data) => return Ok(data),
                Err(e) => {
                    warn!(
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -306,36 +306,6 @@ paths:
              schema:
                $ref: "#/components/schemas/GenericError"

-  /configure_telemetry:
-    post:
-      tags:
-        - Configure
-      summary: Configure rsyslog
-      description: |
-        This API endpoint configures rsyslog to forward Postgres logs
-        to a specified otel collector.
-      operationId: configureTelemetry
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              type: object
-              properties:
-                logs_export_host:
-                  type: string
-                  description: |
-                    Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
-                    Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
-      responses:
-        204:
-          description: "Telemetry configured successfully"
-        500:
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-
 components:
  securitySchemes:
    JWT:
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -1,11 +1,9 @@
 use std::sync::Arc;

-use axum::body::Body;
 use axum::extract::State;
 use axum::response::Response;
-use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
+use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
-use compute_api::spec::ComputeFeature;
 use http::StatusCode;
 use tokio::task;
 use tracing::info;
@@ -13,7 +11,6 @@ use tracing::info;
 use crate::compute::{ComputeNode, ParsedSpec};
 use crate::http::JsonResponse;
 use crate::http::extract::Json;
-use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};

 // Accept spec in JSON format and request compute configuration. If anything
 // goes wrong after we set the compute status to `ConfigurationPending` and
@@ -25,13 +22,6 @@ pub(in crate::http) async fn configure(
    State(compute): State<Arc<ComputeNode>>,
    request: Json<ConfigurationRequest>,
 ) -> Response {
-    if !compute.params.live_config_allowed {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "live configuration is not allowed for this compute node".to_string(),
-        );
-    }
-
    let pspec = match ParsedSpec::try_from(request.spec.clone()) {
        Ok(p) => p,
        Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
@@ -95,25 +85,3 @@ pub(in crate::http) async fn configure(

    JsonResponse::success(StatusCode::OK, body)
 }
-
-pub(in crate::http) async fn configure_telemetry(
-    State(compute): State<Arc<ComputeNode>>,
-    request: Json<ConfigureTelemetryRequest>,
-) -> Response {
-    if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "Postgres logs export feature is not enabled".to_string(),
-        );
-    }
-
-    let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
-    if let Err(err) = configure_postgres_logs_export(conf) {
-        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
-    }
-
-    Response::builder()
-        .status(StatusCode::NO_CONTENT)
-        .body(Body::from(""))
-        .unwrap()
-}
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -87,7 +87,6 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
                let authenticated_router = Router::<Arc<ComputeNode>>::new()
                    .route("/check_writability", post(check_writability::is_writable))
                    .route("/configure", post(configure::configure))
-                    .route("/configure_telemetry", post(configure::configure_telemetry))
                    .route("/database_schema", get(database_schema::get_schema_dump))
                    .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
                    .route("/insights", get(insights::get_insights))
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -119,16 +119,9 @@ impl<'a> PostgresLogsRsyslogConfig<'a> {
        };
        Ok(config_content)
    }
-
-    /// Returns the default host for otel collector that receives Postgres logs
-    pub fn default_host(project_id: &str) -> String {
-        format!(
-            "config-{}-collector.neon-telemetry.svc.cluster.local:10514",
-            project_id
-        )
-    }
 }

+/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog.
 pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
    let new_config = conf.build()?;
    let current_config = PostgresLogsRsyslogConfig::current_config()?;
@@ -261,16 +254,5 @@ mod tests {
            let res = conf.build();
            assert!(res.is_err());
        }
-
-        {
-            // Verify config with default host
-            let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
-            let conf = PostgresLogsRsyslogConfig::new(Some(&host));
-            let res = conf.build();
-            assert!(res.is_ok());
-            let conf_str = res.unwrap();
-            assert!(conf_str.contains(r#"shy-breeze-123"#));
-            assert!(conf_str.contains(r#"port="10514""#));
-        }
    }
 }
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -278,12 +278,12 @@ impl ComputeNode {
            // so that all config operations are audit logged.
            match spec.audit_log_level
            {
-                ComputeAudit::Hipaa => {
+                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                    phases.push(CreatePgauditExtension);
                    phases.push(CreatePgauditlogtofileExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
-                ComputeAudit::Log => {
+                ComputeAudit::Log | ComputeAudit::Base => {
                    phases.push(CreatePgauditExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -20,8 +20,10 @@ use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{
    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
-    SafekeeperConf,
+    ObjectStorageConf, SafekeeperConf,
 };
+use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
+use control_plane::object_storage::ObjectStorage;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
@@ -39,7 +41,7 @@ use pageserver_api::controller_api::{
 use pageserver_api::models::{
    ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
 };
-use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use safekeeper_api::membership::SafekeeperGeneration;
@@ -91,6 +93,8 @@ enum NeonLocalCmd {
    #[command(subcommand)]
    Safekeeper(SafekeeperCmd),
    #[command(subcommand)]
+    ObjectStorage(ObjectStorageCmd),
+    #[command(subcommand)]
    Endpoint(EndpointCmd),
    #[command(subcommand)]
    Mappings(MappingsCmd),
@@ -454,6 +458,32 @@ enum SafekeeperCmd {
    Restart(SafekeeperRestartCmdArgs),
 }

+#[derive(clap::Subcommand)]
+#[clap(about = "Manage object storage")]
+enum ObjectStorageCmd {
+    Start(ObjectStorageStartCmd),
+    Stop(ObjectStorageStopCmd),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start object storage")]
+struct ObjectStorageStartCmd {
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop object storage")]
+struct ObjectStorageStopCmd {
+    #[arg(value_enum, default_value = "fast")]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    stop_mode: StopMode,
+}
+
 #[derive(clap::Args)]
 #[clap(about = "Start local safekeeper")]
 struct SafekeeperStartCmdArgs {
@@ -759,6 +789,7 @@ fn main() -> Result<()> {
            }
            NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
            NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
+            NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
            NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
            NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
        };
@@ -975,6 +1006,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                    }
                })
                .collect(),
+            object_storage: ObjectStorageConf {
+                port: OBJECT_STORAGE_DEFAULT_PORT,
+            },
            pg_distrib_dir: None,
            neon_distrib_dir: None,
            default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
@@ -1083,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
                        stripe_size: args
                            .shard_stripe_size
                            .map(ShardStripeSize)
-                            .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
+                            .unwrap_or(DEFAULT_STRIPE_SIZE),
                    },
                    placement_policy: args.placement_policy.clone(),
                    config: tenant_conf,
@@ -1396,7 +1430,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
                    // If caller is telling us what pageserver to use, this is not a tenant which is
                    // full managed by storage controller, therefore not sharded.
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
+                    DEFAULT_STRIPE_SIZE,
                )
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
@@ -1683,6 +1717,41 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
    Ok(())
 }

+async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
+    use ObjectStorageCmd::*;
+    let storage = ObjectStorage::from_env(env);
+
+    // In tests like test_forward_compatibility or test_graceful_cluster_restart
+    // old neon binaries (without object_storage) are present
+    if !storage.bin.exists() {
+        eprintln!(
+            "{} binary not found. Ignore if this is a compatibility test",
+            storage.bin
+        );
+        return Ok(());
+    }
+
+    match subcmd {
+        Start(ObjectStorageStartCmd { start_timeout }) => {
+            if let Err(e) = storage.start(start_timeout).await {
+                eprintln!("object_storage start failed: {e}");
+                exit(1);
+            }
+        }
+        Stop(ObjectStorageStopCmd { stop_mode }) => {
+            let immediate = match stop_mode {
+                StopMode::Fast => false,
+                StopMode::Immediate => true,
+            };
+            if let Err(e) = storage.stop(immediate) {
+                eprintln!("proxy stop failed: {e}");
+                exit(1);
+            }
+        }
+    };
+    Ok(())
+}
+
 async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
    match subcmd {
        StorageBrokerCmd::Start(args) => {
@@ -1777,6 +1846,13 @@ async fn handle_start_all_impl(
                    .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
            });
        }
+
+        js.spawn(async move {
+            ObjectStorage::from_env(env)
+                .start(&retry_timeout)
+                .await
+                .map_err(|e| e.context("start object_storage"))
+        });
    })();

    let mut errors = Vec::new();
@@ -1874,6 +1950,11 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

+    let storage = ObjectStorage::from_env(env);
+    if let Err(e) = storage.stop(immediate) {
+        eprintln!("object_storage stop failed: {:#}", e);
+    }
+
    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
        if let Err(e) = pageserver.stop(immediate) {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -658,6 +658,9 @@ impl Endpoint {
            delta_operations: None,
            tenant_id: Some(self.tenant_id),
            timeline_id: Some(self.timeline_id),
+            project_id: None,
+            branch_id: None,
+            endpoint_id: Some(self.endpoint_id.clone()),
            mode: self.mode,
            pageserver_connstring: Some(pageserver_connstring),
            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
@@ -670,6 +673,7 @@ impl Endpoint {
            reconfigure_concurrency: self.reconfigure_concurrency,
            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
            audit_log_level: ComputeAudit::Disabled,
+            logs_export_host: None::<String>,
        };

        // this strange code is needed to support respec() in tests
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -10,6 +10,7 @@ mod background_process;
 pub mod broker;
 pub mod endpoint;
 pub mod local_env;
+pub mod object_storage;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -15,9 +15,10 @@ use clap::ValueEnum;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use utils::auth::{Claims, encode_from_key_file};
+use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};

+use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
 use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
 use crate::safekeeper::SafekeeperNode;

@@ -55,6 +56,7 @@ pub struct LocalEnv {

    // used to issue tokens during e.g pg start
    pub private_key_path: PathBuf,
+    pub public_key_path: PathBuf,

    pub broker: NeonBroker,

@@ -68,6 +70,8 @@ pub struct LocalEnv {

    pub safekeepers: Vec<SafekeeperConf>,

+    pub object_storage: ObjectStorageConf,
+
    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
    // be propagated into each pageserver's configuration.
    pub control_plane_api: Url,
@@ -95,6 +99,7 @@ pub struct OnDiskConfig {
    pub neon_distrib_dir: PathBuf,
    pub default_tenant_id: Option<TenantId>,
    pub private_key_path: PathBuf,
+    pub public_key_path: PathBuf,
    pub broker: NeonBroker,
    pub storage_controller: NeonStorageControllerConf,
    #[serde(
@@ -103,6 +108,7 @@ pub struct OnDiskConfig {
    )]
    pub pageservers: Vec<PageServerConf>,
    pub safekeepers: Vec<SafekeeperConf>,
+    pub object_storage: ObjectStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub control_plane_compute_hook_api: Option<Url>,
@@ -136,11 +142,18 @@ pub struct NeonLocalInitConf {
    pub storage_controller: Option<NeonStorageControllerConf>,
    pub pageservers: Vec<NeonLocalInitPageserverConf>,
    pub safekeepers: Vec<SafekeeperConf>,
+    pub object_storage: ObjectStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub generate_local_ssl_certs: bool,
 }

+#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[serde(default)]
+pub struct ObjectStorageConf {
+    pub port: u16,
+}
+
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -398,6 +411,10 @@ impl LocalEnv {
        self.pg_dir(pg_version, "lib")
    }

+    pub fn object_storage_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("object_storage")
+    }
+
    pub fn pageserver_bin(&self) -> PathBuf {
        self.neon_distrib_dir.join("pageserver")
    }
@@ -431,6 +448,10 @@ impl LocalEnv {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

+    pub fn object_storage_data_dir(&self) -> PathBuf {
+        self.base_data_dir.join("object_storage")
+    }
+
    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
        if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
            Ok(conf)
@@ -582,6 +603,7 @@ impl LocalEnv {
                neon_distrib_dir,
                default_tenant_id,
                private_key_path,
+                public_key_path,
                broker,
                storage_controller,
                pageservers,
@@ -591,6 +613,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: _,
                branch_name_mappings,
                generate_local_ssl_certs,
+                object_storage,
            } = on_disk_config;
            LocalEnv {
                base_data_dir: repopath.to_owned(),
@@ -598,6 +621,7 @@ impl LocalEnv {
                neon_distrib_dir,
                default_tenant_id,
                private_key_path,
+                public_key_path,
                broker,
                storage_controller,
                pageservers,
@@ -606,6 +630,7 @@ impl LocalEnv {
                control_plane_hooks_api,
                branch_name_mappings,
                generate_local_ssl_certs,
+                object_storage,
            }
        };

@@ -705,6 +730,7 @@ impl LocalEnv {
                neon_distrib_dir: self.neon_distrib_dir.clone(),
                default_tenant_id: self.default_tenant_id,
                private_key_path: self.private_key_path.clone(),
+                public_key_path: self.public_key_path.clone(),
                broker: self.broker.clone(),
                storage_controller: self.storage_controller.clone(),
                pageservers: vec![], // it's skip_serializing anyway
@@ -714,6 +740,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: None,
                branch_name_mappings: self.branch_name_mappings.clone(),
                generate_local_ssl_certs: self.generate_local_ssl_certs,
+                object_storage: self.object_storage.clone(),
            },
        )
    }
@@ -730,7 +757,7 @@ impl LocalEnv {
    }

    // this function is used only for testing purposes in CLI e g generate tokens during init
-    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
+    pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
        let private_key_path = self.get_private_key_path();
        let key_data = fs::read(private_key_path)?;
        encode_from_key_file(claims, &key_data)
@@ -797,6 +824,7 @@ impl LocalEnv {
            control_plane_api,
            generate_local_ssl_certs,
            control_plane_hooks_api,
+            object_storage,
        } = conf;

        // Find postgres binaries.
@@ -828,6 +856,7 @@ impl LocalEnv {
        )
        .context("generate auth keys")?;
        let private_key_path = PathBuf::from("auth_private_key.pem");
+        let public_key_path = PathBuf::from("auth_public_key.pem");

        // create the runtime type because the remaining initialization code below needs
        // a LocalEnv instance op operation
@@ -838,6 +867,7 @@ impl LocalEnv {
            neon_distrib_dir,
            default_tenant_id: Some(default_tenant_id),
            private_key_path,
+            public_key_path,
            broker,
            storage_controller: storage_controller.unwrap_or_default(),
            pageservers: pageservers.iter().map(Into::into).collect(),
@@ -846,6 +876,7 @@ impl LocalEnv {
            control_plane_hooks_api,
            branch_name_mappings: Default::default(),
            generate_local_ssl_certs,
+            object_storage,
        };

        if generate_local_ssl_certs {
@@ -873,8 +904,13 @@ impl LocalEnv {
                .context("pageserver init failed")?;
        }

+        ObjectStorage::from_env(&env)
+            .init()
+            .context("object storage init failed")?;
+
        // setup remote remote location for default LocalFs remote storage
        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+        std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;

        env.persist_config()
    }
--- a/control_plane/src/object_storage.rs
+++ b/control_plane/src/object_storage.rs
@@ -0,0 +1,107 @@
+use crate::background_process::{self, start_process, stop_process};
+use crate::local_env::LocalEnv;
+use anyhow::anyhow;
+use anyhow::{Context, Result};
+use camino::Utf8PathBuf;
+use std::io::Write;
+use std::time::Duration;
+
+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
+pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
+
+pub struct ObjectStorage {
+    pub bin: Utf8PathBuf,
+    pub data_dir: Utf8PathBuf,
+    pub pemfile: Utf8PathBuf,
+    pub port: u16,
+}
+
+impl ObjectStorage {
+    pub fn from_env(env: &LocalEnv) -> ObjectStorage {
+        ObjectStorage {
+            bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
+            data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
+            pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
+            port: env.object_storage.port,
+        }
+    }
+
+    fn config_path(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.json")
+    }
+
+    fn listen_addr(&self) -> Utf8PathBuf {
+        format!("127.0.0.1:{}", self.port).into()
+    }
+
+    pub fn init(&self) -> Result<()> {
+        println!("Initializing object storage in {:?}", self.data_dir);
+        let parent = self.data_dir.parent().unwrap();
+
+        #[derive(serde::Serialize)]
+        struct Cfg {
+            listen: Utf8PathBuf,
+            pemfile: Utf8PathBuf,
+            local_path: Utf8PathBuf,
+            r#type: String,
+        }
+        let cfg = Cfg {
+            listen: self.listen_addr(),
+            pemfile: parent.join(self.pemfile.clone()),
+            local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
+            r#type: "LocalFs".to_string(),
+        };
+        std::fs::create_dir_all(self.config_path().parent().unwrap())?;
+        std::fs::write(self.config_path(), serde_json::to_string(&cfg)?)
+            .context("write object storage config")?;
+        Ok(())
+    }
+
+    pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
+        println!("Starting s3 proxy at {}", self.listen_addr());
+        std::io::stdout().flush().context("flush stdout")?;
+
+        let process_status_check = || async {
+            tokio::time::sleep(Duration::from_millis(500)).await;
+            let res = reqwest::Client::new()
+                .get(format!("http://{}/metrics", self.listen_addr()))
+                .send()
+                .await;
+            match res {
+                Ok(response) if response.status().is_success() => Ok(true),
+                Ok(_) => Err(anyhow!("Failed to query /metrics")),
+                Err(e) => Err(anyhow!("Failed to check node status: {e}")),
+            }
+        };
+
+        let res = start_process(
+            "object_storage",
+            &self.data_dir.clone().into_std_path_buf(),
+            &self.bin.clone().into_std_path_buf(),
+            vec![self.config_path().to_string()],
+            vec![("RUST_LOG".into(), "debug".into())],
+            background_process::InitialPidFile::Create(self.pid_file()),
+            retry_timeout,
+            process_status_check,
+        )
+        .await;
+        if res.is_err() {
+            eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?);
+        }
+
+        res
+    }
+
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        stop_process(immediate, "object_storage", &self.pid_file())
+    }
+
+    fn log_file(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.log")
+    }
+
+    fn pid_file(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.pid")
+    }
+}
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> {
            let mut node_to_fill_descs = Vec::new();

            for desc in node_descs {
-                let to_drain = nodes.iter().any(|id| *id == desc.id);
+                let to_drain = nodes.contains(&desc.id);
                if to_drain {
                    node_to_drain_descs.push(desc);
                } else {
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -151,7 +151,7 @@ Example body:
 ```
 {
  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
-  "stripe_size": 32768,
+  "stripe_size": 2048,
  "shards": [
      {"node_id": 344, "shard_number": 0},
      {"node_id": 722, "shard_number": 1},
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -5,6 +5,14 @@ use crate::privilege::Privilege;
 use crate::responses::ComputeCtlConfig;
 use crate::spec::{ComputeSpec, ExtVersion, PgIdent};

+/// When making requests to the `compute_ctl` external HTTP server, the client
+/// must specify a set of claims in `Authorization` header JWTs such that
+/// `compute_ctl` can authorize the request.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct ComputeClaims {
+    pub compute_id: String,
+}
+
 /// Request of the /configure API
 ///
 /// We now pass only `spec` in the configuration request, but later we can
@@ -30,9 +38,3 @@ pub struct SetRoleGrantsRequest {
    pub privileges: Vec<Privilege>,
    pub role: PgIdent,
 }
-
-/// Request of the /configure_telemetry API
-#[derive(Debug, Deserialize, Serialize)]
-pub struct ConfigureTelemetryRequest {
-    pub logs_export_host: Option<String>,
-}
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
    pub timeline_id: Option<TimelineId>,
    pub pageserver_connstring: Option<String>,

+    // More neon ids that we expose to the compute_ctl
+    // and to postgres as neon extension GUCs.
+    pub project_id: Option<String>,
+    pub branch_id: Option<String>,
+    pub endpoint_id: Option<String>,
+
    /// Safekeeper membership config generation. It is put in
    /// neon.safekeepers GUC and serves two purposes:
    /// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,15 +165,13 @@ pub struct ComputeSpec {
    #[serde(default)] // Default false
    pub drop_subscriptions_before_start: bool,

-    /// Log level for audit logging:
-    ///
-    /// Disabled - no audit logging. This is the default.
-    /// log - log masked statements to the postgres log using pgaudit extension
-    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
-    ///
-    /// Extensions should be present in shared_preload_libraries
+    /// Log level for compute audit logging
    #[serde(default)]
    pub audit_log_level: ComputeAudit,
+
+    /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
+    /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
+    pub logs_export_host: Option<String>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -179,9 +183,6 @@ pub enum ComputeFeature {
    /// track short-lived connections as user activity.
    ActivityMonitorExperimental,

-    /// Allow to configure rsyslog for Postgres logs export
-    PostgresLogsExport,
-
    /// This is a special feature flag that is used to represent unknown feature flags.
    /// Basically all unknown to enum flags are represented as this one. See unit test
    /// `parse_unknown_features()` for more details.
@@ -288,14 +289,25 @@ impl ComputeMode {
 }

 /// Log level for audit logging
-/// Disabled, log, hipaa
-/// Default is Disabled
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
    #[default]
    Disabled,
+    // Deprecated, use Base instead
    Log,
+    // (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
+    // logged to the standard postgresql log stream
+    Base,
+    // Deprecated, use Full or Extended instead
    Hipaa,
+    // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access
+    Extended,
+    // (pgaudit.log='all', pgaudit.log_parameter='on'),
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access.
+    Full,
 }

 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -30,6 +30,7 @@ tokio.workspace = true
 tracing.workspace = true
 url.workspace = true
 uuid.workspace = true
+x509-cert.workspace = true

 # to use tokio channels as streams, this is faster to compile than async_stream
 # why is it only here? no other crate should use it, streams are rarely needed.
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -4,6 +4,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use hyper0::Body;
 use hyper0::server::conn::Http;
+use metrics::{IntCounterVec, register_int_counter_vec};
+use once_cell::sync::Lazy;
 use routerify::{RequestService, RequestServiceBuilder};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
    tls_acceptor: Option<TlsAcceptor>,
 }

+static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_started_total",
+        "Number of established http/https connections",
+        &["scheme"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_errors_total",
+        "Number of occured connection errors by type",
+        &["type"]
+    )
+    .expect("failed to define a metric")
+});
+
 impl Server {
    pub fn new(
        request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
            false
        }

+        let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
+        let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
+        let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
+        let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
+        let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
+
+        let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
+        let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
+
        let mut connections = FuturesUnordered::new();
        loop {
            tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
                    let (tcp_stream, remote_addr) = match stream {
                        Ok(stream) => stream,
                        Err(err) => {
+                            tcp_error_cnt.inc();
                            if !suppress_io_error(&err) {
                                info!("Failed to accept TCP connection: {err:#}");
                            }
@@ -78,11 +108,18 @@ impl Server {
                    let tls_acceptor = self.tls_acceptor.clone();
                    let cancel = cancel.clone();

+                    let tls_error_cnt = tls_error_cnt.clone();
+                    let http_error_cnt = http_error_cnt.clone();
+                    let https_error_cnt = https_error_cnt.clone();
+                    let http_connection_cnt = http_connection_cnt.clone();
+                    let https_connection_cnt = https_connection_cnt.clone();
+
                    connections.push(tokio::spawn(
                        async move {
                            match tls_acceptor {
                                Some(tls_acceptor) => {
                                    // Handle HTTPS connection.
+                                    https_connection_cnt.inc();
                                    let tls_stream = tokio::select! {
                                        tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
                                        _ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
                                    let tls_stream = match tls_stream {
                                        Ok(tls_stream) => tls_stream,
                                        Err(err) => {
+                                            tls_error_cnt.inc();
                                            if !suppress_io_error(&err) {
                                                info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                            }
@@ -97,6 +135,7 @@ impl Server {
                                        }
                                    };
                                    if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
+                                        https_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                        }
@@ -104,7 +143,9 @@ impl Server {
                                }
                                None => {
                                    // Handle HTTP connection.
+                                    http_connection_cnt.inc();
                                    if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
+                                        http_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                        }
@@ -115,6 +156,7 @@ impl Server {
                 }
                Some(conn) = connections.next() => {
                    if let Err(err) = conn {
+                        panic_error_cnt.inc();
                        error!("Connection panicked: {err:#}");
                    }
                }
@@ -122,6 +164,7 @@ impl Server {
                    // Wait for graceful shutdown of all connections.
                    while let Some(conn) = connections.next().await {
                        if let Err(err) = conn {
+                            panic_error_cnt.inc();
                            error!("Connection panicked: {err:#}");
                        }
                    }
--- a/libs/http-utils/src/tls_certs.rs
+++ b/libs/http-utils/src/tls_certs.rs
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
 use anyhow::Context;
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
+use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
+use once_cell::sync::Lazy;
 use rustls::{
-    pki_types::{CertificateDer, PrivateKeyDer},
+    pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
    server::{ClientHello, ResolvesServerCert},
    sign::CertifiedKey,
 };
+use x509_cert::der::Reader;

 pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
    let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
    Ok(certified_key)
 }

+/// rustls's CertifiedKey with extra parsed fields used for metrics.
+struct ParsedCertifiedKey {
+    certified_key: CertifiedKey,
+    expiration_time: UnixTime,
+}
+
+/// Parse expiration time from an X509 certificate.
+fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
+    let parsed_cert = x509_cert::der::SliceReader::new(cert)
+        .context("Failed to parse cerficiate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse cerficiate")?;
+
+    Ok(UnixTime::since_unix_epoch(
+        parsed_cert
+            .tbs_certificate
+            .validity
+            .not_after
+            .to_unix_duration(),
+    ))
+}
+
+async fn load_and_parse_certified_key(
+    key_filename: &Utf8Path,
+    cert_filename: &Utf8Path,
+) -> anyhow::Result<ParsedCertifiedKey> {
+    let certified_key = load_certified_key(key_filename, cert_filename).await?;
+    let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
+    Ok(ParsedCertifiedKey {
+        certified_key,
+        expiration_time,
+    })
+}
+
+static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "tls_certs_expiration_time_seconds",
+        "Expiration time of the loaded certificate since unix epoch in seconds",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_started_total",
+        "Number of certificate reload loop iterations started",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_updated_total",
+        "Number of times the certificate was updated to the new one",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_failed_total",
+        "Number of times the certificate reload failed",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
 /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
 /// the disk periodically.
 #[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
 impl ReloadingCertificateResolver {
    /// Creates a new Resolver by loading certificate and private key from FS and
    /// creating tokio::task to reload them with provided reload_period.
+    /// resolver_name is used as metric's label.
    pub async fn new(
+        resolver_name: &str,
        key_filename: &Utf8Path,
        cert_filename: &Utf8Path,
        reload_period: Duration,
    ) -> anyhow::Result<Arc<Self>> {
+        // Create metrics for current resolver.
+        let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
+        let cert_reload_started_counter =
+            CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_updated_counter =
+            CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_failed_counter =
+            CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
+
+        let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
+
        let this = Arc::new(Self {
-            certified_key: ArcSwap::from_pointee(
-                load_certified_key(key_filename, cert_filename).await?,
-            ),
+            certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
        });
+        cert_expiration_time.set(parsed_key.expiration_time.as_secs());

        tokio::spawn({
            let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
                        Some(this) => this,
                        None => break, // Resolver has been destroyed, exit.
                    };
-                    match load_certified_key(&key_filename, &cert_filename).await {
-                        Ok(new_certified_key) => {
-                            if new_certified_key.cert == this.certified_key.load().cert {
+                    cert_reload_started_counter.inc();
+
+                    match load_and_parse_certified_key(&key_filename, &cert_filename).await {
+                        Ok(parsed_key) => {
+                            if parsed_key.certified_key.cert == this.certified_key.load().cert {
                                tracing::debug!("Certificate has not changed since last reloading");
                            } else {
                                tracing::info!("Certificate has been reloaded");
-                                this.certified_key.store(Arc::new(new_certified_key));
+                                this.certified_key.store(Arc::new(parsed_key.certified_key));
+                                cert_expiration_time.set(parsed_key.expiration_time.as_secs());
+                                cert_reload_updated_counter.inc();
                            }
                            last_reload_failed = false;
                        }
                        Err(err) => {
+                            cert_reload_failed_counter.inc();
                            // Note: Reloading certs may fail if it conflicts with the script updating
                            // the files at the same time. Warn only if the error is persistent.
                            if last_reload_failed {
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -180,6 +180,7 @@ pub struct ConfigToml {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
+    pub enable_tls_page_service_api: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -631,6 +632,7 @@ impl Default for ConfigToml {
            load_previous_heatmap: None,
            generate_unarchival_heatmap: None,
            tracing: None,
+            enable_tls_page_service_api: false,
        }
    }
 }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -613,8 +613,7 @@ mod tests {
    use rand::{RngCore, SeedableRng};

    use super::*;
-    use crate::models::ShardParameters;
-    use crate::shard::{ShardCount, ShardNumber};
+    use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize};

    // Helper function to create a key range.
    //
@@ -964,12 +963,8 @@ mod tests {
    }
    #[test]
    fn sharded_range_relation_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();

        let range = ShardedRange::new(
            Range {
@@ -985,12 +980,8 @@ mod tests {

    #[test]
    fn shard_identity_keyspaces_single_key() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();

        let range = ShardedRange::new(
            Range {
@@ -1034,12 +1025,8 @@ mod tests {

    #[test]
    fn shard_identity_keyspaces_forkno_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();

        let range = ShardedRange::new(
            Range {
@@ -1061,7 +1048,7 @@ mod tests {
            let shard_identity = ShardIdentity::new(
                ShardNumber(shard_number),
                ShardCount::new(4),
-                ShardParameters::DEFAULT_STRIPE_SIZE,
+                DEFAULT_STRIPE_SIZE,
            )
            .unwrap();

@@ -1144,37 +1131,44 @@ mod tests {
    /// for a single tenant.
    #[test]
    fn sharded_range_fragment_simple() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
        )
        .unwrap();

        // A range which we happen to know covers exactly one stripe which belongs to this shard
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
+        let mut input_end = input_start;
+        input_end.field6 += STRIPE_SIZE; // field6 is block number

        // Ask for stripe_size blocks, we get the whole stripe
        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 32768),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
        );

        // Ask for more, we still get the whole stripe
        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 10000000),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
        );

        // Ask for target_nblocks of half the stripe size, we get two halves
        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16384),
+            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2),
            (
-                32768,
+                STRIPE_SIZE,
                vec![
-                    (16384, input_start..input_start.add(16384)),
-                    (16384, input_start.add(16384)..input_end)
+                    (
+                        STRIPE_SIZE / 2,
+                        input_start..input_start.add(STRIPE_SIZE / 2)
+                    ),
+                    (STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end)
                ]
            )
        );
@@ -1182,40 +1176,53 @@ mod tests {

    #[test]
    fn sharded_range_fragment_multi_stripe() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+        const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
+
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
        )
        .unwrap();

        // A range which covers multiple stripes, exactly one of which belongs to the current shard.
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let mut input_end = input_start;
+        input_end.field6 += RANGE_SIZE; // field6 is block number
+
        // Ask for all the blocks, get a fragment that covers the whole range but reports
        // its size to be just the blocks belonging to our shard.
        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 131072),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
        );

-        // Ask for a sub-stripe quantity
+        // Ask for a sub-stripe quantity that results in 3 fragments.
+        let limit = STRIPE_SIZE / 3 + 1;
        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16000),
+            do_fragment(input_start, input_end, &shard_identity, limit),
            (
-                32768,
+                STRIPE_SIZE,
                vec![
-                    (16000, input_start..input_start.add(16000)),
-                    (16000, input_start.add(16000)..input_start.add(32000)),
-                    (768, input_start.add(32000)..input_end),
+                    (limit, input_start..input_start.add(limit)),
+                    (limit, input_start.add(limit)..input_start.add(2 * limit)),
+                    (
+                        STRIPE_SIZE - 2 * limit,
+                        input_start.add(2 * limit)..input_end
+                    ),
                ]
            )
        );

        // Try on a range that starts slightly after our owned stripe
        assert_eq!(
-            do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
-            (32767, vec![(32767, input_start.add(1)..input_end)])
+            do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE),
+            (
+                STRIPE_SIZE - 1,
+                vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)]
+            )
        );
    }

@@ -1223,32 +1230,40 @@ mod tests {
    /// a previous relation.
    #[test]
    fn sharded_range_fragment_starting_from_logical_size() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+        const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
+
        let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
+        let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap();
+        input_end.field6 += RANGE_SIZE; // field6 is block number

        // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
        )
        .unwrap();
        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x8001, vec![(0x8001, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
+            (
+                STRIPE_SIZE + 1,
+                vec![(STRIPE_SIZE + 1, input_start..input_end)]
+            )
        );

        // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
        // store all logical sizes)
        let shard_identity = ShardIdentity::new(
            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
        )
        .unwrap();
        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x1, vec![(0x1, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
+            (1, vec![(1, input_start..input_end)])
        );
    }

@@ -1284,12 +1299,8 @@ mod tests {
        );

        // Same, but using a sharded identity
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x8000),
            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
@@ -1331,7 +1342,7 @@ mod tests {
                ShardIdentity::new(
                    ShardNumber((prng.next_u32() % shard_count) as u8),
                    ShardCount::new(shard_count as u8),
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
+                    DEFAULT_STRIPE_SIZE,
                )
                .unwrap()
            };
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -26,7 +26,7 @@ use utils::{completion, serde_system_time};
 use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
 use crate::reltag::RelTag;
-use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};

 /// The state of a tenant in this pageserver.
 ///
@@ -438,8 +438,6 @@ pub struct ShardParameters {
 }

 impl ShardParameters {
-    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
-
    pub fn is_unsharded(&self) -> bool {
        self.count.is_unsharded()
    }
@@ -449,7 +447,7 @@ impl Default for ShardParameters {
    fn default() -> Self {
        Self {
            count: ShardCount::new(0),
-            stripe_size: Self::DEFAULT_STRIPE_SIZE,
+            stripe_size: DEFAULT_STRIPE_SIZE,
        }
    }
 }
@@ -1680,6 +1678,7 @@ pub struct SecondaryProgress {
 pub struct TenantScanRemoteStorageShard {
    pub tenant_shard_id: TenantShardId,
    pub generation: Option<u32>,
+    pub stripe_size: Option<ShardStripeSize>,
 }

 #[derive(Serialize, Deserialize, Debug, Default)]
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -58,6 +58,8 @@ pub enum NeonWalRecord {
        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
        /// its references in `timeline.rs`.
        will_init: bool,
+        /// Only append the record if the current image is the same as the one specified in this field.
+        only_if: Option<String>,
    },
 }

@@ -81,6 +83,17 @@ impl NeonWalRecord {
            append: s.as_ref().to_string(),
            clear: false,
            will_init: false,
+            only_if: None,
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
+        Self::Test {
+            append: s.as_ref().to_string(),
+            clear: false,
+            will_init: false,
+            only_if: Some(only_if.as_ref().to_string()),
        }
    }

@@ -90,6 +103,7 @@ impl NeonWalRecord {
            append: s.as_ref().to_string(),
            clear: true,
            will_init: false,
+            only_if: None,
        }
    }

@@ -99,6 +113,7 @@ impl NeonWalRecord {
            append: s.as_ref().to_string(),
            clear: true,
            will_init: true,
+            only_if: None,
        }
    }
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -78,6 +78,12 @@ impl Default for ShardStripeSize {
    }
 }

+impl std::fmt::Display for ShardStripeSize {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Layout version: for future upgrades where we might change how the key->shard mapping works
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
 pub struct ShardLayout(u8);
@@ -86,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1);
 /// ShardIdentity uses a magic layout value to indicate if it is unusable
 const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);

-/// Default stripe size in pages: 256MiB divided by 8kiB page size.
-const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
+/// The default stripe size in pages. 16 MiB divided by 8 kiB page size.
+///
+/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization.
+/// 16 MiB appears to be a reasonable balance: <https://github.com/neondatabase/neon/pull/10510>.
+pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8);

 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
@@ -537,7 +546,7 @@ mod tests {
            field6: 0x7d06,
        };

-        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key);
        assert_eq!(shard, ShardNumber(8));
    }

--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -5,7 +5,6 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
 use std::future::Future;
-use std::io::ErrorKind;
 use std::net::SocketAddr;
 use std::os::fd::{AsRawFd, RawFd};
 use std::pin::Pin;
@@ -227,7 +226,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
            MaybeWriteOnly::WriteOnly(_) => {
-                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+                Err(io::Error::other("reading from write only half").into())
            }
            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
        }
@@ -237,7 +236,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.read_message().await,
            MaybeWriteOnly::WriteOnly(_) => {
-                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+                Err(io::Error::other("reading from write only half").into())
            }
            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
        }
@@ -975,7 +974,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
            .write_message_noflush(&BeMessage::CopyData(buf))
            // write_message only writes to the buffer, so it can fail iff the
            // message is invaid, but CopyData can't be invalid.
-            .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
+            .map_err(|_| io::Error::other("failed to serialize CopyData"))?;

        Poll::Ready(Ok(buf.len()))
    }
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -85,8 +85,8 @@ static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {

 static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
    let mut cursor = Cursor::new(include_bytes!("cert.pem"));
-    let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
-    cert
+
+    rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap()
 });

 // test that basic select with ssl works
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -35,7 +35,7 @@ impl ConnectionError {
    pub fn into_io_error(self) -> io::Error {
        match self {
            ConnectionError::Io(io) => io,
-            ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
+            ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()),
        }
    }
 }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -257,7 +257,7 @@ pub enum ProtocolError {
 impl ProtocolError {
    /// Proxy stream.rs uses only io::Error; provide it.
    pub fn into_io_error(self) -> io::Error {
-        io::Error::new(io::ErrorKind::Other, self.to_string())
+        io::Error::other(self.to_string())
    }
 }

--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -212,7 +212,7 @@ impl ScramSha256 {
                    password,
                    channel_binding,
                } => (nonce, password, channel_binding),
-                _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+                _ => return Err(io::Error::other("invalid SCRAM state")),
            };

        let message =
@@ -291,7 +291,7 @@ impl ScramSha256 {
                server_key,
                auth_message,
            } => (server_key, auth_message),
-            _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+            _ => return Err(io::Error::other("invalid SCRAM state")),
        };

        let message =
@@ -301,10 +301,7 @@ impl ScramSha256 {

        let verifier = match parsed {
            ServerFinalMessage::Error(e) => {
-                return Err(io::Error::new(
-                    io::ErrorKind::Other,
-                    format!("SCRAM error: {}", e),
-                ));
+                return Err(io::Error::other(format!("SCRAM error: {}", e)));
            }
            ServerFinalMessage::Verifier(verifier) => verifier,
        };
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -28,7 +28,7 @@ toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
 metrics.workspace = true
-utils.workspace = true
+utils = { path = "../utils", default-features = false }
 pin-project-lite.workspace = true

 azure_core.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -801,8 +801,7 @@ where
            // that support needs to be hacked in.
            //
            // including {self:?} into the message would be useful, but unsure how to unproject.
-            _ => std::task::Poll::Ready(Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
+            _ => std::task::Poll::Ready(Err(std::io::Error::other(
                "cloned or initial values cannot be read",
            ))),
        }
@@ -855,7 +854,7 @@ where
        };
        Err(azure_core::error::Error::new(
            azure_core::error::ErrorKind::Io,
-            std::io::Error::new(std::io::ErrorKind::Other, msg),
+            std::io::Error::other(msg),
        ))
    }

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,8 @@ edition.workspace = true
 license.workspace = true

 [features]
-default = []
+default = ["rename_noreplace"]
+rename_noreplace = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints"]
@@ -35,7 +36,7 @@ serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["signal"] }
 tokio-tar.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = ["serde"] }
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth {
 }

 // this function is used only for testing purposes in CLI e g generate tokens during init
-pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
+pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
    let key = EncodingKey::from_ed_pem(key_data)?;
    Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
 }
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -81,12 +81,9 @@ pub fn path_with_suffix_extension(
 }

 pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
-    let parent = file_path.parent().ok_or_else(|| {
-        io::Error::new(
-            io::ErrorKind::Other,
-            format!("File {file_path:?} has no parent"),
-        )
-    })?;
+    let parent = file_path
+        .parent()
+        .ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?;

    fsync(file_path)?;
    fsync(parent)?;
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -3,7 +3,9 @@ use std::{fs, io, path::Path};

 use anyhow::Context;

+#[cfg(feature = "rename_noreplace")]
 mod rename_noreplace;
+#[cfg(feature = "rename_noreplace")]
 pub use rename_noreplace::rename_noreplace;

 pub trait PathExt {
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -8,7 +8,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
    dst: &P2,
 ) -> nix::Result<()> {
    {
-        #[cfg(target_os = "linux")]
+        #[cfg(all(target_os = "linux", target_env = "gnu"))]
        {
            nix::fcntl::renameat2(
                None,
@@ -29,7 +29,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
            })??;
            nix::errno::Errno::result(res).map(drop)
        }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        #[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))]
        {
            std::compile_error!("OS does not support no-replace renames");
        }
--- a/libs/utils/src/signals.rs
+++ b/libs/utils/src/signals.rs
@@ -1,6 +1,8 @@
 pub use signal_hook::consts::TERM_SIGNALS;
 pub use signal_hook::consts::signal::*;
 use signal_hook::iterator::Signals;
+use tokio::signal::unix::{SignalKind, signal};
+use tracing::info;

 pub enum Signal {
    Quit,
@@ -36,3 +38,30 @@ impl ShutdownSignals {
        Ok(())
    }
 }
+
+/// Runs in a loop since we want to be responsive to multiple signals
+/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown)
+/// <https://github.com/neondatabase/neon/issues/9740>
+pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
+    let mut sigint = signal(SignalKind::interrupt()).unwrap();
+    let mut sigterm = signal(SignalKind::terminate()).unwrap();
+    let mut sigquit = signal(SignalKind::quit()).unwrap();
+
+    loop {
+        let signal = tokio::select! {
+            _ = sigquit.recv() => {
+                info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
+                std::process::exit(111);
+            }
+            _ = sigint.recv() => "SIGINT",
+            _ = sigterm.recv() => "SIGTERM",
+        };
+
+        if !token.is_cancelled() {
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
+            token.cancel();
+        } else {
+            info!("Got signal {signal}. Already shutting down.");
+        }
+    }
+}
--- a/object_storage/Cargo.toml
+++ b/object_storage/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "object_storage"
+version = "0.0.1"
+edition.workspace = true
+license.workspace = true
+[dependencies]
+anyhow.workspace = true
+axum-extra.workspace = true
+axum.workspace = true
+camino.workspace = true
+futures.workspace = true
+jsonwebtoken.workspace = true
+prometheus.workspace = true
+remote_storage.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+tokio-util.workspace = true
+tokio.workspace = true
+tracing.workspace = true
+utils = { path = "../libs/utils", default-features = false }
+workspace_hack.workspace = true
+[dev-dependencies]
+camino-tempfile.workspace = true
+http-body-util.workspace = true
+itertools.workspace = true
+rand.workspace = true
+test-log.workspace = true
+tower.workspace = true
--- a/object_storage/src/app.rs
+++ b/object_storage/src/app.rs
@@ -0,0 +1,561 @@
+use anyhow::anyhow;
+use axum::body::{Body, Bytes};
+use axum::response::{IntoResponse, Response};
+use axum::{Router, http::StatusCode};
+use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
+use remote_storage::TimeoutOrCancel;
+use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
+use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+use utils::backoff::retry;
+
+pub fn app(state: Arc<Storage>) -> Router<()> {
+    use axum::routing::{delete as _delete, get as _get};
+    let delete_prefix = _delete(delete_prefix);
+    Router::new()
+        .route(
+            "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
+            _get(get).put(set).delete(delete),
+        )
+        .route(
+            "/{tenant_id}/{timeline_id}/{endpoint_id}",
+            delete_prefix.clone(),
+        )
+        .route("/{tenant_id}/{timeline_id}", delete_prefix.clone())
+        .route("/{tenant_id}", delete_prefix)
+        .route("/metrics", _get(metrics))
+        .route("/status", _get(async || StatusCode::OK.into_response()))
+        .with_state(state)
+}
+
+type Result = anyhow::Result<Response, Response>;
+type State = axum::extract::State<Arc<Storage>>;
+
+const CONTENT_TYPE: &str = "content-type";
+const APPLICATION_OCTET_STREAM: &str = "application/octet-stream";
+const WARN_THRESHOLD: u32 = 3;
+const MAX_RETRIES: u32 = 10;
+
+async fn metrics() -> Result {
+    prometheus::TextEncoder::new()
+        .encode_to_string(&prometheus::gather())
+        .map(|s| s.into_response())
+        .map_err(|e| internal_error(e, "/metrics", "collecting metrics"))
+}
+
+async fn get(S3Path { path }: S3Path, state: State) -> Result {
+    info!(%path, "downloading");
+    let download_err = |e| {
+        if let DownloadError::NotFound = e {
+            info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
+            return not_found(&path);
+        }
+        internal_error(e, &path, "downloading")
+    };
+    let cancel = state.cancel.clone();
+    let opts = &DownloadOpts::default();
+
+    let stream = retry(
+        async || state.storage.download(&path, opts, &cancel).await,
+        DownloadError::is_permanent,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "downloading",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(DownloadError::Cancelled))
+    .map_err(download_err)?
+    .download_stream;
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header(CONTENT_TYPE, APPLICATION_OCTET_STREAM)
+        .body(Body::from_stream(stream))
+        .map_err(|e| internal_error(e, path, "reading response"))
+}
+
+// Best solution for files is multipart upload, but remote_storage doesn't support it,
+// so we can either read Bytes in memory and push at once or forward BodyDataStream to
+// remote_storage. The latter may seem more peformant, but BodyDataStream doesn't have a
+// guaranteed size() which may produce issues while uploading to s3.
+// So, currently we're going with an in-memory copy plus a boundary to prevent uploading
+// very large files.
+async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result {
+    info!(%path, "uploading");
+    let request_len = bytes.len();
+    let max_len = state.max_upload_file_limit;
+    if request_len > max_len {
+        return Err(bad_request(
+            anyhow!("File size {request_len} exceeds max {max_len}"),
+            "uploading",
+        ));
+    }
+
+    let cancel = state.cancel.clone();
+    let fun = async || {
+        let stream = bytes_to_stream(bytes.clone());
+        state
+            .storage
+            .upload(stream, request_len, &path, None, &cancel)
+            .await
+    };
+    retry(
+        fun,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "uploading",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("uploading cancelled")))
+    .map_err(|e| internal_error(e, path, "reading response"))?;
+    Ok(ok())
+}
+
+async fn delete(S3Path { path }: S3Path, state: State) -> Result {
+    info!(%path, "deleting");
+    let cancel = state.cancel.clone();
+    retry(
+        async || state.storage.delete(&path, &cancel).await,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "deleting",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("deleting cancelled")))
+    .map_err(|e| internal_error(e, path, "deleting"))?;
+    Ok(ok())
+}
+
+async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result {
+    info!(%path, "deleting prefix");
+    let cancel = state.cancel.clone();
+    retry(
+        async || state.storage.delete_prefix(&path, &cancel).await,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "deleting prefix",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("deleting prefix cancelled")))
+    .map_err(|e| internal_error(e, path, "deleting prefix"))?;
+    Ok(ok())
+}
+
+pub async fn check_storage_permissions(
+    client: &GenericRemoteStorage,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    info!("storage permissions check");
+
+    // as_nanos() as multiple instances proxying same bucket may be started at once
+    let now = SystemTime::now()
+        .duration_since(UNIX_EPOCH)?
+        .as_nanos()
+        .to_string();
+
+    let path = RemotePath::from_string(&format!("write_access_{now}"))?;
+    info!(%path, "uploading");
+
+    let body = now.to_string();
+    let stream = bytes_to_stream(Bytes::from(body.clone()));
+    client
+        .upload(stream, body.len(), &path, None, &cancel)
+        .await?;
+
+    use tokio::io::AsyncReadExt;
+    info!(%path, "downloading");
+    let download_opts = DownloadOpts {
+        kind: remote_storage::DownloadKind::Small,
+        ..Default::default()
+    };
+    let mut body_read_buf = Vec::new();
+    let stream = client
+        .download(&path, &download_opts, &cancel)
+        .await?
+        .download_stream;
+    tokio_util::io::StreamReader::new(stream)
+        .read_to_end(&mut body_read_buf)
+        .await?;
+    let body_read = String::from_utf8(body_read_buf)?;
+    if body != body_read {
+        error!(%body, %body_read, "File contents do not match");
+        anyhow::bail!("Read back file doesn't match original")
+    }
+
+    info!(%path, "removing");
+    client.delete(&path, &cancel).await
+}
+
+fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream<Item = std::io::Result<Bytes>> {
+    futures::stream::once(futures::future::ready(Ok(bytes)))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use axum::{body::Body, extract::Request, response::Response};
+    use http_body_util::BodyExt;
+    use itertools::iproduct;
+    use std::env::var;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_log::test as testlog;
+    use tower::{Service, util::ServiceExt};
+    use utils::id::{TenantId, TimelineId};
+
+    // see libs/remote_storage/tests/test_real_s3.rs
+    const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
+    const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET";
+    const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION";
+
+    async fn proxy() -> (Storage, Option<camino_tempfile::Utf8TempDir>) {
+        let cancel = CancellationToken::new();
+        let (dir, storage) = if var(REAL_S3_ENV).is_err() {
+            // tests execute in parallel and we need a new directory for each of them
+            let dir = camino_tempfile::tempdir().unwrap();
+            let fs =
+                remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap();
+            (Some(dir), GenericRemoteStorage::LocalFs(fs))
+        } else {
+            // test_real_s3::create_s3_client is hard to reference, reimplementing here
+            let millis = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_millis();
+            use rand::Rng;
+            let random = rand::thread_rng().r#gen::<u32>();
+
+            let s3_config = remote_storage::S3Config {
+                bucket_name: var(REAL_S3_BUCKET).unwrap(),
+                bucket_region: var(REAL_S3_REGION).unwrap(),
+                prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
+                endpoint: None,
+                concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(),
+                max_keys_per_list_response: None,
+                upload_storage_class: None,
+            };
+            let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1))
+                .await
+                .unwrap();
+            (None, GenericRemoteStorage::AwsS3(Arc::new(bucket)))
+        };
+
+        let proxy = Storage {
+            auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
+            storage,
+            cancel: cancel.clone(),
+            max_upload_file_limit: usize::MAX,
+        };
+        check_storage_permissions(&proxy.storage, cancel)
+            .await
+            .unwrap();
+        (proxy, dir)
+    }
+
+    // see libs/utils/src/auth.rs
+    const TEST_PUB_KEY_ED25519: &[u8] = b"
+-----BEGIN PUBLIC KEY-----
+MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
+-----END PUBLIC KEY-----
+";
+
+    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
+-----BEGIN PRIVATE KEY-----
+MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
+-----END PRIVATE KEY-----
+"#;
+
+    async fn request(req: Request<Body>) -> Response<Body> {
+        let (proxy, _) = proxy().await;
+        app(Arc::new(proxy))
+            .into_service()
+            .oneshot(req)
+            .await
+            .unwrap()
+    }
+
+    #[testlog(tokio::test)]
+    async fn status() {
+        let res = Request::builder()
+            .uri("/status")
+            .body(Body::empty())
+            .map(request)
+            .unwrap()
+            .await;
+        assert_eq!(res.status(), StatusCode::OK);
+    }
+
+    fn routes() -> impl Iterator<Item = (&'static str, &'static str)> {
+        iproduct!(
+            vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"],
+            vec!["GET", "PUT", "DELETE"]
+        )
+    }
+
+    #[testlog(tokio::test)]
+    async fn no_token() {
+        for (uri, method) in routes() {
+            info!(%uri, %method);
+            let res = Request::builder()
+                .uri(uri)
+                .method(method)
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await;
+            assert!(matches!(
+                res.status(),
+                StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
+            ));
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn invalid_token() {
+        for (uri, method) in routes() {
+            info!(%uri, %method);
+            let status = Request::builder()
+                .uri(uri)
+                .header("Authorization", "Bearer 123")
+                .method(method)
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await;
+            assert!(matches!(
+                status.status(),
+                StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
+            ));
+        }
+    }
+
+    const TENANT_ID: TenantId =
+        TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
+    const TIMELINE_ID: TimelineId =
+        TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
+    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
+    fn token() -> String {
+        let claims = object_storage::Claims {
+            tenant_id: TENANT_ID,
+            timeline_id: TIMELINE_ID,
+            endpoint_id: ENDPOINT_ID.into(),
+            exp: u64::MAX,
+        };
+        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
+        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        jsonwebtoken::encode(&header, &claims, &key).unwrap()
+    }
+
+    #[testlog(tokio::test)]
+    async fn unauthorized() {
+        let (proxy, _) = proxy().await;
+        let mut app = app(Arc::new(proxy)).into_service();
+        let token = token();
+        let args = itertools::iproduct!(
+            vec![TENANT_ID.to_string(), TenantId::generate().to_string()],
+            vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
+            vec![ENDPOINT_ID, "ep-ololo"]
+        )
+        .skip(1);
+
+        for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
+            info!(%uri, %method, %tenant, %timeline, %endpoint);
+            let request = Request::builder()
+                .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key"))
+                .method(method)
+                .header("Authorization", format!("Bearer {}", token))
+                .body(Body::empty())
+                .unwrap();
+            let status = ServiceExt::ready(&mut app)
+                .await
+                .unwrap()
+                .call(request)
+                .await
+                .unwrap()
+                .status();
+            assert_eq!(status, StatusCode::UNAUTHORIZED);
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn method_not_allowed() {
+        let token = token();
+        let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]);
+        for (key, method) in iter {
+            let status = Request::builder()
+                .uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}"))
+                .method(method)
+                .header("Authorization", format!("Bearer {token}"))
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await
+                .status();
+            assert!(matches!(
+                status,
+                StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED
+            ));
+        }
+    }
+
+    async fn requests_chain(
+        chain: impl Iterator<Item = (String, &str, &'static str, StatusCode, bool)>,
+        token: impl Fn(&str) -> String,
+    ) {
+        let (proxy, _) = proxy().await;
+        let mut app = app(Arc::new(proxy)).into_service();
+        for (uri, method, body, expected_status, compare_body) in chain {
+            info!(%uri, %method, %body, %expected_status);
+            let bearer = format!("Bearer {}", token(&uri));
+            let request = Request::builder()
+                .uri(uri)
+                .method(method)
+                .header("Authorization", &bearer)
+                .body(Body::from(body))
+                .unwrap();
+            let response = ServiceExt::ready(&mut app)
+                .await
+                .unwrap()
+                .call(request)
+                .await
+                .unwrap();
+            assert_eq!(response.status(), expected_status);
+            if !compare_body {
+                continue;
+            }
+            let read_body = response.into_body().collect().await.unwrap().to_bytes();
+            assert_eq!(body, read_body);
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn metrics() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
+        let req = vec![
+            (uri.clone(), "PUT", "body", StatusCode::OK, false),
+            (uri.clone(), "DELETE", "", StatusCode::OK, false),
+        ];
+        requests_chain(req.into_iter(), |_| token()).await;
+
+        let res = Request::builder()
+            .uri("/metrics")
+            .body(Body::empty())
+            .map(request)
+            .unwrap()
+            .await;
+        assert_eq!(res.status(), StatusCode::OK);
+        let body = res.into_body().collect().await.unwrap().to_bytes();
+        let body = String::from_utf8_lossy(&body);
+        tracing::debug!(%body);
+        // Storage metrics are not gathered for LocalFs
+        if var(REAL_S3_ENV).is_ok() {
+            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
+        }
+        assert!(body.contains("process_threads"));
+    }
+
+    #[testlog(tokio::test)]
+    async fn insert_retrieve_remove() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
+        let chain = vec![
+            (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
+            (uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false),
+            (uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true),
+            (uri.clone(), "DELETE", "", StatusCode::OK, false),
+            (uri, "GET", "", StatusCode::NOT_FOUND, false),
+        ];
+        requests_chain(chain.into_iter(), |_| token()).await;
+    }
+
+    fn delete_prefix_token(uri: &str) -> String {
+        use serde::Serialize;
+        let parts = uri.split("/").collect::<Vec<&str>>();
+        #[derive(Serialize)]
+        struct PrefixClaims {
+            tenant_id: TenantId,
+            timeline_id: Option<TimelineId>,
+            endpoint_id: Option<object_storage::EndpointId>,
+            exp: u64,
+        }
+        let claims = PrefixClaims {
+            tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(),
+            timeline_id: parts.get(2).map(|c| c.parse().unwrap()),
+            endpoint_id: parts.get(3).map(ToString::to_string),
+            exp: u64::MAX,
+        };
+        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
+        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        jsonwebtoken::encode(&header, &claims, &key).unwrap()
+    }
+
+    // Can't use single digit numbers as they won't be validated as TimelineId and EndpointId
+    #[testlog(tokio::test)]
+    async fn delete_prefix() {
+        let tenant_id =
+            TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string();
+        let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}");
+        // Why extra slash in string literals? Axum is weird with URIs:
+        // /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND
+        //  as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932
+        // The cost of removing trailing slash is suprisingly hard:
+        // * Add tower dependency with NormalizePath layer
+        // * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377
+        // * Rewrite make_service() -> into_make_service()
+        // * Rewrite oneshot() (not available for NormalizePath)
+        // I didn't manage to get it working correctly
+        let chain = vec![
+            // create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents
+            (f(t2, "/3/5"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false),
+            // create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
+            // create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/7/8"), "PUT", "", StatusCode::OK, false),
+            (f(t2, ""), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false),
+            // create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t3, "/8/9"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
+            // create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6
+            (f(t4, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t2, ""), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
+            (f(t4, "/5/6"), "GET", "", StatusCode::OK, false),
+            // delete prefix 1 -> empty
+            (format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+        ];
+        requests_chain(chain.into_iter(), delete_prefix_token).await;
+    }
+}
--- a/object_storage/src/lib.rs
+++ b/object_storage/src/lib.rs
@@ -0,0 +1,344 @@
+use anyhow::Result;
+use axum::extract::{FromRequestParts, Path};
+use axum::response::{IntoResponse, Response};
+use axum::{RequestPartsExt, http::StatusCode, http::request::Parts};
+use axum_extra::TypedHeader;
+use axum_extra::headers::{Authorization, authorization::Bearer};
+use camino::Utf8PathBuf;
+use jsonwebtoken::{DecodingKey, Validation};
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use serde::{Deserialize, Serialize};
+use std::fmt::Display;
+use std::result::Result as StdResult;
+use std::sync::Arc;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error};
+use utils::id::{TenantId, TimelineId};
+
+// simplified version of utils::auth::JwtAuth
+pub struct JwtAuth {
+    decoding_key: DecodingKey,
+    validation: Validation,
+}
+
+pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA;
+impl JwtAuth {
+    pub fn new(key: &[u8]) -> Result<Self> {
+        Ok(Self {
+            decoding_key: DecodingKey::from_ed_pem(key)?,
+            validation: Validation::new(VALIDATION_ALGO),
+        })
+    }
+
+    pub fn decode<T: serde::de::DeserializeOwned>(&self, token: &str) -> Result<T> {
+        Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?)
+    }
+}
+
+fn normalize_key(key: &str) -> StdResult<Utf8PathBuf, String> {
+    let key = clean_utf8(&Utf8PathBuf::from(key));
+    if key.starts_with("..") || key == "." || key == "/" {
+        return Err(format!("invalid key {key}"));
+    }
+    match key.strip_prefix("/").map(Utf8PathBuf::from) {
+        Ok(p) => Ok(p),
+        _ => Ok(key),
+    }
+}
+
+// Copied from path_clean crate with PathBuf->Utf8PathBuf
+fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf {
+    use camino::Utf8Component as Comp;
+    let mut out = Vec::new();
+    for comp in path.components() {
+        match comp {
+            Comp::CurDir => (),
+            Comp::ParentDir => match out.last() {
+                Some(Comp::RootDir) => (),
+                Some(Comp::Normal(_)) => {
+                    out.pop();
+                }
+                None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => {
+                    out.push(comp)
+                }
+            },
+            comp => out.push(comp),
+        }
+    }
+    if !out.is_empty() {
+        out.iter().collect()
+    } else {
+        Utf8PathBuf::from(".")
+    }
+}
+
+pub struct Storage {
+    pub auth: JwtAuth,
+    pub storage: GenericRemoteStorage,
+    pub cancel: CancellationToken,
+    pub max_upload_file_limit: usize,
+}
+
+pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc
+
+#[derive(Deserialize, Serialize, PartialEq)]
+pub struct Claims {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub endpoint_id: EndpointId,
+    pub exp: u64,
+}
+
+impl Display for Claims {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})",
+            self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
+        )
+    }
+}
+
+#[derive(Deserialize, Serialize)]
+struct KeyRequest {
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    endpoint_id: EndpointId,
+    path: String,
+}
+
+#[derive(Debug, PartialEq)]
+pub struct S3Path {
+    pub path: RemotePath,
+}
+
+impl TryFrom<&KeyRequest> for S3Path {
+    type Error = String;
+    fn try_from(req: &KeyRequest) -> StdResult<Self, Self::Error> {
+        let KeyRequest {
+            tenant_id,
+            timeline_id,
+            endpoint_id,
+            path,
+        } = &req;
+        let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",);
+        let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?);
+        let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
+        Ok(S3Path { path })
+    }
+}
+
+fn unauthorized(route: impl Display, claims: impl Display) -> Response {
+    debug!(%route, %claims, "route doesn't match claims");
+    StatusCode::UNAUTHORIZED.into_response()
+}
+
+pub fn bad_request(err: impl Display, desc: &'static str) -> Response {
+    debug!(%err, desc);
+    (StatusCode::BAD_REQUEST, err.to_string()).into_response()
+}
+
+pub fn ok() -> Response {
+    StatusCode::OK.into_response()
+}
+
+pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response {
+    error!(%err, %path, desc);
+    StatusCode::INTERNAL_SERVER_ERROR.into_response()
+}
+
+pub fn not_found(key: impl ToString) -> Response {
+    (StatusCode::NOT_FOUND, key.to_string()).into_response()
+}
+
+impl FromRequestParts<Arc<Storage>> for S3Path {
+    type Rejection = Response;
+    async fn from_request_parts(
+        parts: &mut Parts,
+        state: &Arc<Storage>,
+    ) -> Result<Self, Self::Rejection> {
+        let Path(path): Path<KeyRequest> = parts
+            .extract()
+            .await
+            .map_err(|e| bad_request(e, "invalid route"))?;
+        let TypedHeader(Authorization(bearer)) = parts
+            .extract::<TypedHeader<Authorization<Bearer>>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        let claims: Claims = state
+            .auth
+            .decode(bearer.token())
+            .map_err(|e| bad_request(e, "decoding token"))?;
+        let route = Claims {
+            tenant_id: path.tenant_id,
+            timeline_id: path.timeline_id,
+            endpoint_id: path.endpoint_id.clone(),
+            exp: claims.exp,
+        };
+        if route != claims {
+            return Err(unauthorized(route, claims));
+        }
+        (&path)
+            .try_into()
+            .map_err(|e| bad_request(e, "invalid route"))
+    }
+}
+
+#[derive(Deserialize, Serialize, PartialEq)]
+pub struct PrefixKeyPath {
+    pub tenant_id: TenantId,
+    pub timeline_id: Option<TimelineId>,
+    pub endpoint_id: Option<EndpointId>,
+}
+
+impl Display for PrefixKeyPath {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})",
+            self.tenant_id,
+            self.timeline_id
+                .as_ref()
+                .map(ToString::to_string)
+                .unwrap_or("".to_string()),
+            self.endpoint_id
+                .as_ref()
+                .map(ToString::to_string)
+                .unwrap_or("".to_string())
+        )
+    }
+}
+
+#[derive(Debug, PartialEq)]
+pub struct PrefixS3Path {
+    pub path: RemotePath,
+}
+
+impl From<&PrefixKeyPath> for PrefixS3Path {
+    fn from(path: &PrefixKeyPath) -> Self {
+        let timeline_id = path
+            .timeline_id
+            .as_ref()
+            .map(ToString::to_string)
+            .unwrap_or("".to_string());
+        let endpoint_id = path
+            .endpoint_id
+            .as_ref()
+            .map(ToString::to_string)
+            .unwrap_or("".to_string());
+        let path = Utf8PathBuf::from(path.tenant_id.to_string())
+            .join(timeline_id)
+            .join(endpoint_id);
+        let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
+        PrefixS3Path { path }
+    }
+}
+
+impl FromRequestParts<Arc<Storage>> for PrefixS3Path {
+    type Rejection = Response;
+    async fn from_request_parts(
+        parts: &mut Parts,
+        state: &Arc<Storage>,
+    ) -> Result<Self, Self::Rejection> {
+        let Path(path) = parts
+            .extract::<Path<PrefixKeyPath>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid route"))?;
+        let TypedHeader(Authorization(bearer)) = parts
+            .extract::<TypedHeader<Authorization<Bearer>>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        let claims: PrefixKeyPath = state
+            .auth
+            .decode(bearer.token())
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        if path != claims {
+            return Err(unauthorized(path, claims));
+        }
+        Ok((&path).into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn normalize_key() {
+        let f = super::normalize_key;
+        assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello"));
+        assert_eq!(
+            f("ololo/1/../../not_ololo").unwrap(),
+            Utf8PathBuf::from("not_ololo")
+        );
+        assert!(f("ololo/1/../../../").is_err());
+        assert!(f(".").is_err());
+        assert!(f("../").is_err());
+        assert!(f("").is_err());
+        assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3"));
+        assert!(f("/1/2/3/../../../").is_err());
+        assert!(f("/1/2/3/../../../../").is_err());
+    }
+
+    const TENANT_ID: TenantId =
+        TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
+    const TIMELINE_ID: TimelineId =
+        TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
+    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
+
+    #[test]
+    fn s3_path() {
+        let auth = Claims {
+            tenant_id: TENANT_ID,
+            timeline_id: TIMELINE_ID,
+            endpoint_id: ENDPOINT_ID.into(),
+            exp: u64::MAX,
+        };
+        let s3_path = |key| {
+            let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}");
+            let path = RemotePath::from_string(path).unwrap();
+            S3Path { path }
+        };
+
+        let path = "cache_key".to_string();
+        let mut key_path = KeyRequest {
+            path,
+            tenant_id: auth.tenant_id,
+            timeline_id: auth.timeline_id,
+            endpoint_id: auth.endpoint_id,
+        };
+        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
+
+        key_path.path = "we/can/have/nested/paths".to_string();
+        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
+
+        key_path.path = "../error/hello/../".to_string();
+        assert!(S3Path::try_from(&key_path).is_err());
+    }
+
+    #[test]
+    fn prefix_s3_path() {
+        let mut path = PrefixKeyPath {
+            tenant_id: TENANT_ID,
+            timeline_id: None,
+            endpoint_id: None,
+        };
+        let prefix_path = |s: String| RemotePath::from_string(&s).unwrap();
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}"))
+        );
+
+        path.timeline_id = Some(TIMELINE_ID);
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}"))
+        );
+
+        path.endpoint_id = Some(ENDPOINT_ID.into());
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}"))
+        );
+    }
+}
--- a/object_storage/src/main.rs
+++ b/object_storage/src/main.rs
@@ -0,0 +1,65 @@
+//! `object_storage` is a service which provides API for uploading and downloading
+//! files. It is used by compute and control plane for accessing LFC prewarm data.
+//! This service is deployed either as a separate component or as part of compute image
+//! for large computes.
+mod app;
+use anyhow::Context;
+use tracing::info;
+use utils::logging;
+
+//see set()
+const fn max_upload_file_limit() -> usize {
+    100 * 1024 * 1024
+}
+
+#[derive(serde::Deserialize)]
+#[serde(tag = "type")]
+struct Config {
+    listen: std::net::SocketAddr,
+    pemfile: camino::Utf8PathBuf,
+    #[serde(flatten)]
+    storage_config: remote_storage::RemoteStorageConfig,
+    #[serde(default = "max_upload_file_limit")]
+    max_upload_file_limit: usize,
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    logging::init(
+        logging::LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        logging::Output::Stdout,
+    )?;
+
+    let config: String = std::env::args().skip(1).take(1).collect();
+    if config.is_empty() {
+        anyhow::bail!("Usage: object_storage config.json")
+    }
+    info!("Reading config from {config}");
+    let config = std::fs::read_to_string(config.clone())?;
+    let config: Config = serde_json::from_str(&config).context("parsing config")?;
+    info!("Reading pemfile from {}", config.pemfile.clone());
+    let pemfile = std::fs::read(config.pemfile.clone())?;
+    info!("Loading public key from {}", config.pemfile.clone());
+    let auth = object_storage::JwtAuth::new(&pemfile)?;
+
+    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
+    info!("listening on {}", listener.local_addr().unwrap());
+
+    let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
+    let cancel = tokio_util::sync::CancellationToken::new();
+    app::check_storage_permissions(&storage, cancel.clone()).await?;
+
+    let proxy = std::sync::Arc::new(object_storage::Storage {
+        auth,
+        storage,
+        cancel: cancel.clone(),
+        max_upload_file_limit: config.max_upload_file_limit,
+    });
+
+    tokio::spawn(utils::signals::signal_handler(cancel.clone()));
+    axum::serve(listener, app::app(proxy))
+        .with_graceful_shutdown(async move { cancel.cancelled().await })
+        .await?;
+    Ok(())
+}
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -65,7 +65,7 @@ use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
 use once_cell::sync::Lazy;
 use pageserver::config::PageServerConf;
-use pageserver::walredo::PostgresRedoManager;
+use pageserver::walredo::{PostgresRedoManager, RedoAttemptType};
 use pageserver_api::key::Key;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::shard::TenantShardId;
@@ -223,7 +223,14 @@ impl Request {

        // TODO: avoid these clones
        manager
-            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
+            .request_redo(
+                *key,
+                *lsn,
+                base_img.clone(),
+                records.clone(),
+                *pg_version,
+                RedoAttemptType::ReadPage,
+            )
            .await
            .context("request_redo")
    }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -31,7 +31,6 @@ use pageserver::{
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
-use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -453,6 +452,24 @@ fn start_pageserver(
    info!("Using auth for http API: {:#?}", conf.http_auth_type);
    info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);

+    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
+    {
+        let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            "main",
+            &conf.ssl_key_file,
+            &conf.ssl_cert_file,
+            conf.ssl_cert_reload_period,
+        ))?;
+
+        let server_config = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_cert_resolver(resolver);
+
+        Some(Arc::new(server_config))
+    } else {
+        None
+    };
+
    match var("NEON_AUTH_TOKEN") {
        Ok(v) => {
            info!("Loaded JWT token for authentication with Safekeeper");
@@ -671,17 +688,11 @@ fn start_pageserver(

        let https_task = match https_listener {
            Some(https_listener) => {
-                let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
-                    &conf.ssl_key_file,
-                    &conf.ssl_cert_file,
-                    conf.ssl_cert_reload_period,
-                ))?;
+                let tls_server_config = tls_server_config
+                    .clone()
+                    .expect("tls_server_config is set earlier if https is enabled");

-                let server_config = rustls::ServerConfig::builder()
-                    .with_no_client_auth()
-                    .with_cert_resolver(resolver);
-
-                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);

                let server =
                    http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -737,6 +748,11 @@ fn start_pageserver(
            tokio::net::TcpListener::from_std(pageserver_listener)
                .context("create tokio listener")?
        },
+        if conf.enable_tls_page_service_api {
+            tls_server_config
+        } else {
+            None
+        },
    );

    // All started up! Now just sit and wait for shutdown signal.
@@ -744,32 +760,7 @@ fn start_pageserver(
        let signal_token = CancellationToken::new();
        let signal_cancel = signal_token.child_token();

-        // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
-        // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
-        // https://github.com/neondatabase/neon/issues/9740.
-        tokio::spawn(async move {
-            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
-            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
-            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
-
-            loop {
-                let signal = tokio::select! {
-                    _ = sigquit.recv() => {
-                        info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
-                        std::process::exit(111);
-                    }
-                    _ = sigint.recv() => "SIGINT",
-                    _ = sigterm.recv() => "SIGTERM",
-                };
-
-                if !signal_token.is_cancelled() {
-                    info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
-                    signal_token.cancel();
-                } else {
-                    info!("Got signal {signal}. Already shutting down.");
-                }
-            }
-        });
+        tokio::spawn(utils::signals::signal_handler(signal_token));

        // Wait for cancellation signal and shut down the pageserver.
        //
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -219,6 +219,11 @@ pub struct PageServerConf {
    pub generate_unarchival_heatmap: bool,

    pub tracing: Option<pageserver_api::config::Tracing>,
+
+    /// Enable TLS in page service API.
+    /// Does not force TLS: the client negotiates TLS usage during the handshake.
+    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
+    pub enable_tls_page_service_api: bool,
 }

 /// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
            load_previous_heatmap,
            generate_unarchival_heatmap,
            tracing,
+            enable_tls_page_service_api,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
            page_service_pipelining,
            get_vectored_concurrent_io,
            tracing,
+            enable_tls_page_service_api,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -212,6 +212,12 @@ paths:
              schema:
                type: string
                format: date-time
+        "412":
+          description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreconditionFailedError"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
    parameters:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -67,7 +67,7 @@ use crate::tenant::mgr::{
 };
 use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
-    download_index_part, list_remote_tenant_shards, list_remote_timelines,
+    download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines,
 };
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
@@ -989,7 +989,7 @@ async fn get_lsn_by_timestamp_handler(
    if !tenant_shard_id.is_shard_zero() {
        // Requires SLRU contents, which are only stored on shard zero
        return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
+            "Lsn calculations by timestamp are only available on shard zero"
        )));
    }

@@ -1064,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler(
    if !tenant_shard_id.is_shard_zero() {
        // Requires SLRU contents, which are only stored on shard zero
        return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
+            "Timestamp calculations by lsn are only available on shard zero"
        )));
    }

@@ -1090,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler(
            .to_string();
            json_response(StatusCode::OK, time)
        }
-        None => Err(ApiError::NotFound(
-            anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
+        None => Err(ApiError::PreconditionFailed(
+            format!("Timestamp for lsn {} not found", lsn).into(),
        )),
    }
 }
@@ -2274,6 +2274,7 @@ async fn timeline_compact_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
        flags |= CompactFlags::DryRun;
    }
+    // Manual compaction does not yield for L0.

    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -2911,9 +2912,22 @@ async fn tenant_scan_remote_handler(
            };
        }

+        let result =
+            download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel)
+                .instrument(info_span!("download_tenant_manifest",
+                            tenant_id=%tenant_shard_id.tenant_id,
+                            shard_id=%tenant_shard_id.shard_slug()))
+                .await;
+        let stripe_size = match result {
+            Ok((manifest, _, _)) => manifest.stripe_size,
+            Err(DownloadError::NotFound) => None,
+            Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))),
+        };
+
        response.shards.push(TenantScanRemoteStorageShard {
            tenant_shard_id,
            generation: generation.into(),
+            stripe_size,
        });
    }

@@ -3368,11 +3382,11 @@ async fn put_tenant_timeline_import_basebackup(

        let broker_client = state.broker_client.clone();

-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
+        let mut body = StreamReader::new(
+            request
+                .into_body()
+                .map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
+        );

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -3446,7 +3460,7 @@ async fn put_tenant_timeline_import_wal(

        let mut body = StreamReader::new(request.into_body().map(|res| {
            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+                std::io::Error::other( anyhow::anyhow!(error))
            })
        }));

--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -105,6 +105,7 @@ pub fn spawn(
    pg_auth: Option<Arc<SwappableJwtAuth>>,
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
            perf_trace_dispatch,
            tcp_listener,
            conf.pg_auth_type,
+            tls_config,
            conf.page_service_pipelining.clone(),
            libpq_ctx,
            cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
    perf_trace_dispatch: Option<Dispatch>,
    listener: tokio::net::TcpListener,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
                    local_auth,
                    socket,
                    auth_type,
+                    tls_config.clone(),
                    pipelining_config.clone(),
                    connection_ctx,
                    connections_cancel.child_token(),
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
        cancel.clone(),
        gate_guard,
    );
-    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
+    let pgbackend =
+        PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;

    match pgbackend.run(&mut conn_handler, &cancel).await {
        Ok(()) => {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -691,7 +691,7 @@ impl Timeline {
        Ok(buf.get_u32_le())
    }

-    /// Get size of an SLRU segment
+    /// Does the slru segment exist?
    pub(crate) async fn get_slru_segment_exists(
        &self,
        kind: SlruKind,
@@ -844,9 +844,9 @@ impl Timeline {
        .await
    }

-    /// Obtain the possible timestamp range for the given lsn.
+    /// Obtain the timestamp for the given lsn.
    ///
-    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
+    /// If the lsn has no timestamps (e.g. no commits), returns None.
    pub(crate) async fn get_timestamp_for_lsn(
        &self,
        probe_lsn: Lsn,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -100,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walingest::WalLagCooldown;
-use crate::walredo::PostgresRedoManager;
+use crate::walredo::{PostgresRedoManager, RedoAttemptType};
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo};

 static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
@@ -473,15 +473,16 @@ impl WalRedoManager {
        base_img: Option<(Lsn, bytes::Bytes)>,
        records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
        pg_version: u32,
+        redo_attempt_type: RedoAttemptType,
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
            Self::Prod(_, mgr) => {
-                mgr.request_redo(key, lsn, base_img, records, pg_version)
+                mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
                    .await
            }
            #[cfg(test)]
            Self::Test(mgr) => {
-                mgr.request_redo(key, lsn, base_img, records, pg_version)
+                mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
                    .await
            }
        }
@@ -920,6 +921,7 @@ enum StartCreatingTimelineResult {
    Idempotent(Arc<Timeline>),
 }

+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum TimelineInitAndSyncResult {
    ReadyToActivate(Arc<Timeline>),
    NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
@@ -1006,6 +1008,7 @@ enum CreateTimelineCause {
    Delete,
 }

+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum LoadTimelineCause {
    Attach,
    Unoffload,
@@ -4079,6 +4082,7 @@ impl Tenant {

        TenantManifest {
            version: LATEST_TENANT_MANIFEST_VERSION,
+            stripe_size: Some(self.get_shard_stripe_size()),
            offloaded_timelines,
        }
    }
@@ -4398,10 +4402,7 @@ impl Tenant {
        .to_string();

        fail::fail_point!("tenant-config-before-write", |_| {
-            Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "tenant-config-before-write",
-            ))
+            Err(std::io::Error::other("tenant-config-before-write"))
        });

        // Convert the config to a toml file.
@@ -5879,6 +5880,7 @@ pub(crate) mod harness {
            base_img: Option<(Lsn, Bytes)>,
            records: Vec<(Lsn, NeonWalRecord)>,
            _pg_version: u32,
+            _redo_attempt_type: RedoAttemptType,
        ) -> Result<Bytes, walredo::Error> {
            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
            if records_neon {
@@ -8733,6 +8735,21 @@ mod tests {
                Lsn(0x20),
                Value::WalRecord(NeonWalRecord::wal_init("i")),
            ),
+            (
+                get_key(4),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_init("1")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")),
+            ),
        ];
        let image1 = vec![(get_key(1), "0x10".into())];

@@ -8763,8 +8780,18 @@ mod tests {

        // Need to remove the limit of "Neon WAL redo requires base image".

-        // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
-        // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
+        assert_eq!(
+            tline.get(get_key(3), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"c")
+        );
+        assert_eq!(
+            tline.get(get_key(4), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"ij")
+        );
+
+        // Manual testing required: currently, read errors will panic the process in debug mode. So we
+        // cannot enable this assertion in the unit test.
+        // assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err());

        Ok(())
    }
@@ -11544,6 +11571,99 @@ mod tests {
        Ok(())
    }

+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x24),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x28),
+                // This record will fail to redo
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![], // in-memory layers
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    Lsn(0x20)..Lsn(0x30),
+                    delta1,
+                )], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .applied_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let cancel = CancellationToken::new();
+
+        // Compaction will fail, but should not fire any critical error.
+        // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
+        // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
+        // compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
+        let res = tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: None,
+                    compact_lsn_range: None,
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await;
+        assert!(res.is_err());
+
+        Ok(())
+    }
+
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -15,7 +15,7 @@
 //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use std::cmp::min;
-use std::io::{Error, ErrorKind};
+use std::io::Error;

 use async_compression::Level;
 use bytes::{BufMut, BytesMut};
@@ -331,10 +331,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                    return (
                        (
                            io_buf.slice_len(),
-                            Err(Error::new(
-                                ErrorKind::Other,
-                                format!("blob too large ({len} bytes)"),
-                            )),
+                            Err(Error::other(format!("blob too large ({len} bytes)"))),
                        ),
                        srcbuf,
                    );
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -216,12 +216,8 @@ impl<'a> FileBlockReader<'a> {
        match cache
            .read_immutable_buf(self.file_id, blknum, ctx)
            .await
-            .map_err(|e| {
-                std::io::Error::new(
-                    std::io::ErrorKind::Other,
-                    format!("Failed to read immutable buf: {e:#}"),
-                )
-            })? {
+            .map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))?
+        {
            ReadBufResult::Found(guard) => Ok(guard.into()),
            ReadBufResult::NotFound(write_guard) => {
                // Read the page from disk into the buffer
--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -1,4 +1,5 @@
 use chrono::NaiveDateTime;
+use pageserver_api::shard::ShardStripeSize;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -14,6 +15,12 @@ pub struct TenantManifest {
    /// allow release rollbacks.
    pub version: usize,

+    /// This tenant's stripe size. This is only advisory, and used to recover tenant data from
+    /// remote storage. The autoritative source is the storage controller. If None, assume the
+    /// original default value of 32768 blocks (256 MB).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stripe_size: Option<ShardStripeSize>,
+
    /// The list of offloaded timelines together with enough information
    /// to not have to actually load them.
    ///
@@ -42,7 +49,12 @@ pub struct OffloadedTimelineManifest {

 /// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
 /// do not use deny_unknown_fields, so new fields are not breaking.
-pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
+///
+/// 1: initial version
+/// 2: +stripe_size
+///
+/// When adding new versions, also add a parse_vX test case below.
+pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;

 impl TenantManifest {
    /// Returns true if the manifests are equal, ignoring the version number. This avoids
@@ -56,10 +68,11 @@ impl TenantManifest {
        // We could alternatively just clone and modify the version here.
        let Self {
            version: _, // ignore version
+            stripe_size,
            offloaded_timelines,
        } = self;

-        offloaded_timelines == &other.offloaded_timelines
+        stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
    }

    /// Decodes a manifest from JSON.
@@ -89,6 +102,7 @@ mod tests {
         }"#;
        let expected = TenantManifest {
            version: 0,
+            stripe_size: None,
            offloaded_timelines: Vec::new(),
        };
        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -104,6 +118,7 @@ mod tests {
         }"#;
        let expected = TenantManifest {
            version: 1,
+            stripe_size: None,
            offloaded_timelines: Vec::new(),
        };
        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -130,6 +145,50 @@ mod tests {
         }"#;
        let expected = TenantManifest {
            version: 1,
+            stripe_size: None,
+            offloaded_timelines: vec![
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
+                    ancestor_timeline_id: None,
+                    ancestor_retain_lsn: None,
+                    archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
+                },
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
+                    ancestor_timeline_id: Some(TimelineId::from_str(
+                        "5c4df612fd159e63c1b7853fe94d97da",
+                    )?),
+                    ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
+                    archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
+                },
+            ],
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+
+    /// v2 manifests should be parsed, for backwards compatibility.
+    #[test]
+    fn parse_v2() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 2,
+             "stripe_size": 32768,
+             "offloaded_timelines": [
+                 {
+                     "timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "archived_at": "2025-03-07T11:07:11.373105434"
+                 },
+                 {
+                     "timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
+                     "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "ancestor_retain_lsn": "0/1F79038",
+                     "archived_at": "2025-03-05T11:10:22.257901390"
+                 }
+             ]
+         }"#;
+        let expected = TenantManifest {
+            version: 2,
+            stripe_size: Some(ShardStripeSize(32768)),
            offloaded_timelines: vec![
                OffloadedTimelineManifest {
                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -366,7 +366,7 @@ impl SplitDeltaLayerWriter {
                )
                .await?;
                let (start_key, prev_delta_writer) =
-                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
+                    self.inner.replace((key, next_delta_writer)).unwrap();
                self.batches.add_unfinished_delta_writer(
                    prev_delta_writer,
                    start_key..key,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1192,7 +1192,7 @@ mod test {

        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
        let range = input_start..input_end;

        // Build an image layer to filter
@@ -1235,7 +1235,7 @@ mod test {
            let shard_identity = ShardIdentity::new(
                ShardNumber(shard_number),
                shard_count,
-                ShardStripeSize(0x8000),
+                ShardStripeSize(0x800),
            )
            .unwrap();
            let harness = TenantHarness::create_custom(
@@ -1287,12 +1287,12 @@ mod test {

            // This exact size and those below will need updating as/when the layer encoding changes, but
            // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
+            assert_eq!(original_size, 122880);

            match shard_number {
                0 => {
                    // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();

                    // We should have dropped some of the data
@@ -1300,7 +1300,7 @@ mod test {
                    assert!(replacement.metadata().file_size > 0);

                    // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                1 => {
                    // Shard 1 has no keys in our input range
@@ -1309,19 +1309,19 @@ mod test {
                }
                2 => {
                    // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                3 => {
                    // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
+                    assert_eq!(wrote_keys, 0x1000);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
+                    assert_eq!(replacement.metadata().file_size, 73728);
                }
                _ => unreachable!(),
            }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -766,7 +766,7 @@ mod tests {
                    rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
                    Ok((dst, len))
                }
-                Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
+                Err(e) => Err(std::io::Error::other(e)),
            }
        }
    }
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -59,6 +59,7 @@ impl LayerIterRef<'_> {
 /// 1. Unified iterator for image and delta layers.
 /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
 /// 3. Lazy creation of the real delta/image iterator.
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub(crate) enum IteratorWrapper<'a> {
    NotLoaded {
        ctx: &'a RequestContext,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

 use crate::PERF_TRACE_TARGET;
+use crate::walredo::RedoAttemptType;
 use anyhow::{Context, Result, anyhow, bail, ensure};
 use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
@@ -115,7 +116,7 @@ use crate::pgdatadir_mapping::{
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::AttachmentMode;
 use crate::tenant::gc_result::GcResult;
-use crate::tenant::layer_map::{LayerMap, SearchResult};
+use crate::tenant::layer_map::LayerMap;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
@@ -1039,6 +1040,7 @@ pub(crate) enum ShutdownMode {
    Hard,
 }

+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum ImageLayerCreationOutcome {
    /// We generated an image layer
    Generated {
@@ -1292,6 +1294,12 @@ impl Timeline {
        };
        reconstruct_state.read_path = read_path;

+        let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
+            RedoAttemptType::LegacyCompaction
+        } else {
+            RedoAttemptType::ReadPage
+        };
+
        let traversal_res: Result<(), _> = {
            let ctx = RequestContextBuilder::from(ctx)
                .perf_span(|crnt_perf_span| {
@@ -1379,7 +1387,7 @@ impl Timeline {

                    let walredo_deltas = converted.num_deltas();
                    let walredo_res = walredo_self
-                        .reconstruct_value(key, lsn, converted)
+                        .reconstruct_value(key, lsn, converted, redo_attempt_type)
                        .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                            info_span!(
                                target: PERF_TRACE_TARGET,
@@ -4104,12 +4112,6 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<TimelineVisitOutcome, GetVectoredError> {
-        let mut unmapped_keyspace = keyspace.clone();
-        let mut fringe = LayerFringe::new();
-
-        let mut completed_keyspace = KeySpace::default();
-        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
-
        // Prevent GC from progressing while visiting the current timeline.
        // If we are GC-ing because a new image layer was added while traversing
        // the timeline, then it will remove layers that are required for fulfilling
@@ -4120,11 +4122,44 @@ impl Timeline {
        // See `compaction::compact_with_gc` for why we need this.
        let _guard = timeline.gc_compaction_layer_update_lock.read().await;

-        loop {
+        // Initialize the fringe
+        let mut fringe = {
+            let mut fringe = LayerFringe::new();
+
+            let guard = timeline.layers.read().await;
+            guard.update_search_fringe(&keyspace, cont_lsn, &mut fringe)?;
+
+            fringe
+        };
+
+        let mut completed_keyspace = KeySpace::default();
+        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
+
+        while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
            }

+            if let Some(ref mut read_path) = reconstruct_state.read_path {
+                read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
+            }
+
+            // Visit the layer and plan IOs for it
+            let next_cont_lsn = lsn_range.start;
+            layer_to_read
+                .get_values_reconstruct_data(
+                    keyspace_to_read.clone(),
+                    lsn_range,
+                    reconstruct_state,
+                    ctx,
+                )
+                .await?;
+
+            let mut unmapped_keyspace = keyspace_to_read;
+            cont_lsn = next_cont_lsn;
+
+            reconstruct_state.on_layer_visited(&layer_to_read);
+
            let (keys_done_last_step, keys_with_image_coverage) =
                reconstruct_state.consume_done_keys();
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
@@ -4135,31 +4170,15 @@ impl Timeline {
                image_covered_keyspace.add_range(keys_with_image_coverage);
            }

+            // Query the layer map for the next layers to read.
+            //
            // Do not descent any further if the last layer we visited
            // completed all keys in the keyspace it inspected. This is not
            // required for correctness, but avoids visiting extra layers
            // which turns out to be a perf bottleneck in some cases.
            if !unmapped_keyspace.is_empty() {
                let guard = timeline.layers.read().await;
-                let layers = guard.layer_map()?;
-
-                for range in unmapped_keyspace.ranges.iter() {
-                    let results = layers.range_search(range.clone(), cont_lsn);
-
-                    results
-                        .found
-                        .into_iter()
-                        .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                            (
-                                guard.upgrade(layer),
-                                keyspace_accum.to_keyspace(),
-                                lsn_floor..cont_lsn,
-                            )
-                        })
-                        .for_each(|(layer, keyspace, lsn_range)| {
-                            fringe.update(layer, keyspace, lsn_range)
-                        });
-                }
+                guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;

                // It's safe to drop the layer map lock after planning the next round of reads.
                // The fringe keeps readable handles for the layers which are safe to read even
@@ -4173,28 +4192,6 @@ impl Timeline {
                // at two different time points.
                drop(guard);
            }
-
-            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
-                if let Some(ref mut read_path) = reconstruct_state.read_path {
-                    read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
-                }
-                let next_cont_lsn = lsn_range.start;
-                layer_to_read
-                    .get_values_reconstruct_data(
-                        keyspace_to_read.clone(),
-                        lsn_range,
-                        reconstruct_state,
-                        ctx,
-                    )
-                    .await?;
-
-                unmapped_keyspace = keyspace_to_read;
-                cont_lsn = next_cont_lsn;
-
-                reconstruct_state.on_layer_visited(&layer_to_read);
-            } else {
-                break;
-            }
        }

        Ok(TimelineVisitOutcome {
@@ -6353,37 +6350,21 @@ impl Timeline {

    /// Reconstruct a value, using the given base image and WAL records in 'data'.
    async fn reconstruct_value(
-        &self,
-        key: Key,
-        request_lsn: Lsn,
-        data: ValueReconstructState,
-    ) -> Result<Bytes, PageReconstructError> {
-        self.reconstruct_value_inner(key, request_lsn, data, false)
-            .await
-    }
-
-    /// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
-    /// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
-    async fn reconstruct_value_wo_critical_error(
-        &self,
-        key: Key,
-        request_lsn: Lsn,
-        data: ValueReconstructState,
-    ) -> Result<Bytes, PageReconstructError> {
-        self.reconstruct_value_inner(key, request_lsn, data, true)
-            .await
-    }
-
-    async fn reconstruct_value_inner(
        &self,
        key: Key,
        request_lsn: Lsn,
        mut data: ValueReconstructState,
-        no_critical_error: bool,
+        redo_attempt_type: RedoAttemptType,
    ) -> Result<Bytes, PageReconstructError> {
        // Perform WAL redo if needed
        data.records.reverse();

+        let fire_critical_error = match redo_attempt_type {
+            RedoAttemptType::ReadPage => true,
+            RedoAttemptType::LegacyCompaction => true,
+            RedoAttemptType::GcCompaction => false,
+        };
+
        // If we have a page image, and no WAL, we're all set
        if data.records.is_empty() {
            if let Some((img_lsn, img)) = &data.img {
@@ -6430,13 +6411,20 @@ impl Timeline {
                    .as_ref()
                    .context("timeline has no walredo manager")
                    .map_err(PageReconstructError::WalRedo)?
-                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
+                    .request_redo(
+                        key,
+                        request_lsn,
+                        data.img,
+                        data.records,
+                        self.pg_version,
+                        redo_attempt_type,
+                    )
                    .await;
                let img = match res {
                    Ok(img) => img,
                    Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
                    Err(walredo::Error::Other(err)) => {
-                        if !no_critical_error {
+                        if fire_critical_error {
                            critical!("walredo failure during page reconstruction: {err:?}");
                        }
                        return Err(PageReconstructError::WalRedo(
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -7,7 +7,7 @@
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
-use std::time::Instant;
+use std::time::{Duration, Instant};

 use super::layer_manager::LayerManager;
 use super::{
@@ -16,6 +16,8 @@ use super::{
    Timeline,
 };

+use crate::tenant::timeline::DeltaEntry;
+use crate::walredo::RedoAttemptType;
 use anyhow::{Context, anyhow};
 use bytes::Bytes;
 use enumset::EnumSet;
@@ -315,6 +317,9 @@ impl GcCompactionQueue {
                    flags: {
                        let mut flags = EnumSet::new();
                        flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                        if timeline.get_compaction_l0_first() {
+                            flags |= CompactFlags::YieldForL0;
+                        }
                        flags
                    },
                    sub_compaction: true,
@@ -819,15 +824,16 @@ pub struct CompactionStatistics {
    time_acquire_lock_secs: f64,
    time_analyze_secs: f64,
    time_download_layer_secs: f64,
+    time_to_first_kv_pair_secs: f64,
    time_main_loop_secs: f64,
    time_final_phase_secs: f64,
    time_total_secs: f64,

    // Summary
-    /// Ratio of the key-value size before/after gc-compaction.
-    uncompressed_size_ratio: f64,
-    /// Ratio of the physical size before/after gc-compaction.
-    physical_size_ratio: f64,
+    /// Ratio of the key-value size after/before gc-compaction.
+    uncompressed_retention_ratio: f64,
+    /// Ratio of the physical size after/before gc-compaction.
+    compressed_retention_ratio: f64,
 }

 impl CompactionStatistics {
@@ -896,15 +902,15 @@ impl CompactionStatistics {
    fn finalize(&mut self) {
        let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
        let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
-        self.uncompressed_size_ratio =
-            original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
+        self.uncompressed_retention_ratio =
+            produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0
        let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
        let produced_physical_size = self.image_layer_produced.size
            + self.delta_layer_produced.size
            + self.image_layer_discarded.size
            + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
-        self.physical_size_ratio =
-            original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
+        self.compressed_retention_ratio =
+            produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0
    }
 }

@@ -2411,7 +2417,7 @@ impl Timeline {
                    lsn_split_points[i]
                };
                let img = self
-                    .reconstruct_value_wo_critical_error(key, request_lsn, state)
+                    .reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction)
                    .await?;
                Some((request_lsn, img))
            } else {
@@ -3032,7 +3038,7 @@ impl Timeline {
        .map_err(CompactionError::Other)?;

        let time_download_layer = timer.elapsed();
-        let timer = Instant::now();
+        let mut timer = Instant::now();

        // Step 2: Produce images+deltas.
        let mut accumulated_values = Vec::new();
@@ -3107,6 +3113,7 @@ impl Timeline {
        // Actually, we can decide not to write to the image layer at all at this point because
        // the key and LSN range are determined. However, to keep things simple here, we still
        // create this writer, and discard the writer in the end.
+        let mut time_to_first_kv_pair = None;

        while let Some(((key, lsn, val), desc)) = merge_iter
            .next_with_trace()
@@ -3114,6 +3121,11 @@ impl Timeline {
            .context("failed to get next key-value pair")
            .map_err(CompactionError::Other)?
        {
+            if time_to_first_kv_pair.is_none() {
+                time_to_first_kv_pair = Some(timer.elapsed());
+                timer = Instant::now();
+            }
+
            if cancel.is_cancelled() {
                return Err(CompactionError::ShuttingDown);
            }
@@ -3449,6 +3461,9 @@ impl Timeline {
        let time_final_phase = timer.elapsed();

        stat.time_final_phase_secs = time_final_phase.as_secs_f64();
+        stat.time_to_first_kv_pair_secs = time_to_first_kv_pair
+            .unwrap_or(Duration::ZERO)
+            .as_secs_f64();
        stat.time_main_loop_secs = time_main_loop.as_secs_f64();
        stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
        stat.time_download_layer_secs = time_download_layer.as_secs_f64();
@@ -3909,8 +3924,6 @@ impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
    }
 }

-use crate::tenant::timeline::DeltaEntry;
-
 impl CompactionLayer<Key> for ResidentDeltaLayer {
    fn key_range(&self) -> &Range<Key> {
        &self.0.layer_desc().key_range
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -3,17 +3,18 @@ use std::sync::Arc;

 use anyhow::{Context, bail, ensure};
 use itertools::Itertools;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use tracing::trace;
 use utils::id::TimelineId;
 use utils::lsn::{AtomicLsn, Lsn};

-use super::{ReadableLayer, TimelineWriterState};
+use super::{LayerFringe, ReadableLayer, TimelineWriterState};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::metrics::TimelineMetrics;
-use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
+use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult};
 use crate::tenant::storage_layer::{
    AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
    PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
@@ -38,7 +39,7 @@ impl Default for LayerManager {
 }

 impl LayerManager {
-    pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
+    fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
        match weak {
            ReadableLayerWeak::PersistentLayer(desc) => {
                ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
@@ -147,6 +148,36 @@ impl LayerManager {
        self.layers().keys().cloned().collect_vec()
    }

+    /// Update the [`LayerFringe`] of a read request
+    ///
+    /// Take a key space at a given LSN and query the layer map below each range
+    /// of the key space to find the next layers to visit.
+    pub(crate) fn update_search_fringe(
+        &self,
+        keyspace: &KeySpace,
+        cont_lsn: Lsn,
+        fringe: &mut LayerFringe,
+    ) -> Result<(), Shutdown> {
+        let map = self.layer_map()?;
+
+        for range in keyspace.ranges.iter() {
+            let results = map.range_search(range.clone(), cont_lsn);
+            results
+                .found
+                .into_iter()
+                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                    (
+                        self.upgrade(layer),
+                        keyspace_accum.to_keyspace(),
+                        lsn_floor..cont_lsn,
+                    )
+                })
+                .for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range));
+        }
+
+        Ok(())
+    }
+
    fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
        use LayerManager::*;
        match self {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -445,7 +445,7 @@ pub(super) async fn handle_walreceiver_connection(
                        .inspect_err(|err| {
                            // TODO: we can't differentiate cancellation errors with
                            // anyhow::Error, so just ignore it if we're cancelled.
-                            if !cancellation.is_cancelled() {
+                            if !cancellation.is_cancelled() && !timeline.is_stopping() {
                                critical!("{err:?}")
                            }
                        })?;
@@ -577,7 +577,7 @@ pub(super) async fn handle_walreceiver_connection(
                            .inspect_err(|err| {
                                // TODO: we can't differentiate cancellation errors with
                                // anyhow::Error, so just ignore it if we're cancelled.
-                                if !cancellation.is_cancelled() {
+                                if !cancellation.is_cancelled() && !timeline.is_stopping() {
                                    critical!("{err:?}")
                                }
                            })?;
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -302,6 +302,7 @@ pub struct UploadQueueStoppedDeletable {
    pub(super) deleted_at: SetDeletedFlagProgress,
 }

+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub enum UploadQueueStopped {
    Deletable(UploadQueueStoppedDeletable),
    Uninitialized,
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -136,6 +136,16 @@ macro_rules! bail {
    }
 }

+#[derive(Debug, Clone, Copy)]
+pub enum RedoAttemptType {
+    /// Used for the read path. Will fire critical errors and retry twice if failure.
+    ReadPage,
+    // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure.
+    LegacyCompaction,
+    // Used for gc compaction. Will not fire critical errors and not retry.
+    GcCompaction,
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -156,11 +166,18 @@ impl PostgresRedoManager {
        base_img: Option<(Lsn, Bytes)>,
        records: Vec<(Lsn, NeonWalRecord)>,
        pg_version: u32,
+        redo_attempt_type: RedoAttemptType,
    ) -> Result<Bytes, Error> {
        if records.is_empty() {
            bail!("invalid WAL redo request with no records");
        }

+        let max_retry_attempts = match redo_attempt_type {
+            RedoAttemptType::ReadPage => 2,
+            RedoAttemptType::LegacyCompaction => 1,
+            RedoAttemptType::GcCompaction => 0,
+        };
+
        let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
        let mut img = base_img.map(|p| p.1);
        let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
@@ -180,6 +197,7 @@ impl PostgresRedoManager {
                        &records[batch_start..i],
                        self.conf.wal_redo_timeout,
                        pg_version,
+                        max_retry_attempts,
                    )
                    .await
                };
@@ -201,6 +219,7 @@ impl PostgresRedoManager {
                &records[batch_start..],
                self.conf.wal_redo_timeout,
                pg_version,
+                max_retry_attempts,
            )
            .await
        }
@@ -424,11 +443,11 @@ impl PostgresRedoManager {
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
        pg_version: u32,
+        max_retry_attempts: u32,
    ) -> Result<Bytes, Error> {
        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());

        let (rel, blknum) = key.to_rel_block().context("invalid record")?;
-        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
            let base_img = &base_img;
@@ -486,7 +505,7 @@ impl PostgresRedoManager {
                info!(n_attempts, "retried walredo succeeded");
            }
            n_attempts += 1;
-            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
+            if n_attempts > max_retry_attempts || result.is_ok() {
                return result;
            }
        }
@@ -560,6 +579,7 @@ mod tests {

    use super::PostgresRedoManager;
    use crate::config::PageServerConf;
+    use crate::walredo::RedoAttemptType;

    #[tokio::test]
    async fn test_ping() {
@@ -593,6 +613,7 @@ mod tests {
                None,
                short_records(),
                14,
+                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
            .await
@@ -621,6 +642,7 @@ mod tests {
                None,
                short_records(),
                14,
+                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
            .await
@@ -642,6 +664,7 @@ mod tests {
                None,
                short_records(),
                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
            .await
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -276,6 +276,7 @@ pub(crate) fn apply_in_neon(
            append,
            clear,
            will_init,
+            only_if,
        } => {
            use bytes::BufMut;
            if *will_init {
@@ -288,6 +289,13 @@ pub(crate) fn apply_in_neon(
            if *clear {
                page.clear();
            }
+            if let Some(only_if) = only_if {
+                if page != only_if.as_bytes() {
+                    return Err(anyhow::anyhow!(
+                        "the current image does not match the expected image, cannot append"
+                    ));
+                }
+            }
            page.put_slice(append.as_bytes());
        }
    }
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	communicator.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
--- a/pgxn/neon/communicator.h
+++ b/pgxn/neon/communicator.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator.h
+ *	  internal interface for communicating with remote pageservers
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COMMUNICATOR_h
+#define COMMUNICATOR_h
+
+#include "neon_pgversioncompat.h"
+
+#include "storage/buf_internals.h"
+
+#include "pagestore_client.h"
+
+/* initialization at postmaster startup */
+extern void pg_init_communicator(void);
+
+/* initialization at backend startup */
+extern void communicator_init(void);
+
+extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum,
+								neon_request_lsns *request_lsns);
+extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum,
+										neon_request_lsns *request_lsns);
+extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns);
+extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
+									  BlockNumber base_blockno, neon_request_lsns *request_lsns,
+									  void **buffers, BlockNumber nblocks, const bits8 *mask);
+extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum,
+										 neon_request_lsns *lsns,
+										 BlockNumber nblocks, void **buffers, bits8 *mask);
+extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+												   BlockNumber nblocks, const bits8 *mask);
+extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
+										  neon_request_lsns *request_lsns,
+										  void *buffer);
+
+extern void communicator_reconfigure_timeout_if_needed(void);
+extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
+
+
+#endif
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -21,7 +21,6 @@
 #include "access/xlog.h"
 #include "funcapi.h"
 #include "miscadmin.h"
-#include "pagestore_client.h"
 #include "common/hashfn.h"
 #include "pgstat.h"
 #include "port/pg_iovec.h"
@@ -43,6 +42,7 @@

 #include "hll.h"
 #include "bitmap.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "neon_perf_counters.h"
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * file_cache.h
+ *	  Local File Cache definitions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FILE_CACHE_h
+#define FILE_CACHE_h
+
+#include "neon_pgversioncompat.h"
+
+/* GUCs */
+extern bool lfc_store_prefetch_result;
+
+/* functions for local file cache */
+extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
+					   BlockNumber blkno, const void *const *buffers,
+					   BlockNumber nblocks);
+/* returns number of blocks read, with one bit set in *read for each  */
+extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
+							BlockNumber blkno, void **buffers,
+							BlockNumber nblocks, bits8 *mask);
+
+extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno);
+extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno, int nblocks, bits8 *bitmap);
+extern void lfc_init(void);
+extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+						 const void* buffer, XLogRecPtr lsn);
+
+
+static inline bool
+lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		 void *buffer)
+{
+	bits8		rv = 0;
+	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
+}
+
+static inline void
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		  const void *buffer)
+{
+	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
+}
+
+#endif							/* FILE_CACHE_H */
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
+char	   *neon_project_id;
+char	   *neon_branch_id;
+char	   *neon_endpoint_id;
 int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomStringVariable("neon.project_id",
+							   "Neon project_id the server is running on",
+							   NULL,
+							   &neon_project_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.branch_id",
+							   "Neon branch_id the server is running on",
+							   NULL,
+							   &neon_branch_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.endpoint_id",
+							   "Neon endpoint_id the server is running on",
+							   NULL,
+							   &neon_endpoint_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,
@@ -1475,6 +1503,4 @@ pg_init_libpagestore(void)
 	}

 	memset(page_servers, 0, sizeof(page_servers));
-
-	lfc_init();
 }
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -28,7 +28,9 @@
 #include "utils/guc.h"
 #include "utils/guc_tables.h"

+#include "communicator.h"
 #include "extension_server.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "control_plane_connector.h"
@@ -434,10 +436,11 @@ _PG_init(void)
 #endif

 	pg_init_libpagestore();
+	lfc_init();
 	pg_init_walproposer();
 	init_lwlsncache();

-	pagestore_smgr_init();
+	pg_init_communicator();
 	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitUnstableExtensionsSupport();
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -47,9 +47,18 @@ extern uint32		WAIT_EVENT_NEON_WAL_DL;
 #define WAIT_EVENT_NEON_WAL_DL			WAIT_EVENT_WAL_READ
 #endif

+
+#define NEON_TAG "[NEON_SMGR] "
+#define neon_log(tag, fmt, ...) ereport(tag,                                  \
+										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
+										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);
-extern void pagestore_smgr_init(void);

 extern uint64 BackpressureThrottlingTime(void);
 extern void SetNeonCurrentClusterSize(uint64 size);
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -58,14 +58,6 @@ typedef struct

 #define messageTag(m) (((const NeonMessage *)(m))->tag)

-#define NEON_TAG "[NEON_SMGR] "
-#define neon_log(tag, fmt, ...) ereport(tag,                                  \
-										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
-#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
-														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
-														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
-
 /* SLRUs downloadable from page server */
 typedef enum {
 	SLRU_CLOG,
@@ -234,7 +226,6 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
 extern int  neon_protocol_version;
-extern bool lfc_store_prefetch_result;

 extern shardno_t get_shard_number(BufferTag* tag);

@@ -242,6 +233,7 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);

+
 /*
 * LSN values associated with each request to the pageserver
 */
@@ -278,6 +270,10 @@ extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum,
 										 neon_request_lsns request_lsns, void *buffer);
 extern int64 neon_dbsize(Oid dbNode);

+extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
+								  BlockNumber blkno, neon_request_lsns *output,
+								  BlockNumber nblocks);
+
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
 extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
@@ -285,37 +281,4 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb
 extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);

-/* functions for local file cache */
-extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
-					   BlockNumber blkno, const void *const *buffers,
-					   BlockNumber nblocks);
-/* returns number of blocks read, with one bit set in *read for each  */
-extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
-							BlockNumber blkno, void **buffers,
-							BlockNumber nblocks, bits8 *mask);
-
-extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno);
-extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno, int nblocks, bits8 *bitmap);
-extern void lfc_init(void);
-extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-						 const void* buffer, XLogRecPtr lsn);
-
-
-static inline bool
-lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		 void *buffer)
-{
-	bits8		rv = 0;
-	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
-}
-
-static inline void
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		  const void *buffer)
-{
-	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
-}
-
 #endif							/* PAGESTORE_CLIENT_H */
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;
 	wp->state = WPS_COLLECTING_TERMS;
+	wp->mconf.generation = INVALID_GENERATION;
+	wp->mconf.members.len = 0;
+	wp->mconf.new_members.len = 0;

 	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);

@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)

 	if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
 		wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
+	if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
+		wp_log(FATAL, "enabling generations requires protocol version 3");
 	wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);

 	/* Fill the greeting package */
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
 static bool
 WalProposerGenerationsEnabled(WalProposer *wp)
 {
-	return wp->safekeepers_generation != 0;
+	return wp->safekeepers_generation != INVALID_GENERATION;
 }

 /*
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
 }

+/*
+ * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
+ * members_safekeepers & new_members_safekeepers to sk.
+ */
+static void
+UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
+{
+	/* members_safekeepers etc are fixed size, sanity check mconf size */
+	if (wp->mconf.members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
+	if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
+
+	/* node id is not known until greeting is received */
+	if (sk->state < SS_WAIT_VOTING)
+		return;
+
+	/* 0 is assumed to be invalid node id, should never happen */
+	if (sk->greetResponse.nodeId == 0)
+	{
+		wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
+		return;
+	}
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.members.m[i];
+
+		if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			/*
+			 * If mconf or list of safekeepers to connect to changed (the
+			 * latter always currently goes through restart though),
+			 * ResetMemberSafekeeperPtrs is expected to be called before
+			 * UpdateMemberSafekeeperPtr. So, other value suggests that we are
+			 * connected to the same sk under different host name, complain
+			 * about that.
+			 */
+			if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->members_safekeepers[i] = sk;
+		}
+	}
+	/* repeat for new_members */
+	for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
+
+		if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->new_members_safekeepers[i] = sk;
+		}
+	}
+}
+
+/*
+ * Reset wp->members_safekeepers & new_members_safekeepers and refill them.
+ * Called after wp changes mconf.
+ */
+static void
+ResetMemberSafekeeperPtrs(WalProposer *wp)
+{
+	memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
+			UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
+	}
+}
+
+static uint32
+MsetQuorum(MemberSet *mset)
+{
+	Assert(mset->len > 0);
+	return mset->len / 2 + 1;
+}
+
+/* Does n forms quorum in mset? */
+static bool
+MsetHasQuorum(MemberSet *mset, uint32 n)
+{
+	return n >= MsetQuorum(mset);
+}
+
+/*
+ * TermsCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_greeted = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_VOTING)
+		{
+			if (n_greeted > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_greeted++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
+	return MsetHasQuorum(mset, n_greeted);
+}
+
 /*
 * Have we received greeting from enough (quorum) safekeepers to start voting?
 */
 static bool
 TermsCollected(WalProposer *wp)
 {
-	return wp->n_connected >= wp->quorum;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		collected = wp->n_connected >= wp->quorum;
+		if (collected)
+		{
+			wp->propTerm++;
+			wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
+		}
+		return collected;
+	}
+
+	/*
+	 * With generations enabled, we start campaign only when 1) some mconf is
+	 * actually received 2) we have greetings from majority of members as well
+	 * as from majority of new_members if it exists.
+	 */
+	if (wp->mconf.generation == INVALID_GENERATION)
+		return false;
+
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset greeters: ");
+	if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset greeters: ");
+		if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp->propTerm++;
+	wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }

 static void
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	pfree(mconf_toml);

 	/*
-	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
-	 * restart wp if it started voting.
+	 * Adopt mconf of safekeepers if it is higher.
 	 */
 	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
 	{
+		/* sanity check before adopting, should never happen */
+		if (sk->greetResponse.mconf.members.len == 0)
+		{
+			wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
+		}
+
+		/*
+		 * If we at least started campaign, restart wp to get elected in the
+		 * new mconf. Note: in principle once wp is already elected
+		 * re-election is not required, but being conservative here is not
+		 * bad.
+		 *
+		 * TODO: put mconf to shmem to immediately pick it up on start,
+		 * otherwise if some safekeeper(s) misses latest mconf and gets
+		 * connected the first, it may cause redundant restarts here.
+		 *
+		 * More generally, it would be nice to restart walproposer (wiping
+		 * election state) without restarting the process. In particular, that
+		 * would allow sync-safekeepers not to die here if it intersected with
+		 * sk migration (as well as remove 1s delay).
+		 *
+		 * Note that assign_neon_safekeepers also currently restarts the
+		 * process, so during normal migration walproposer may restart twice.
+		 */
+		if (wp->state >= WPS_CAMPAIGN)
+		{
+			wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
+		}
 		MembershipConfigurationFree(&wp->mconf);
 		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		ResetMemberSafekeeperPtrs(wp);
 		/* full conf was just logged above */
 		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
 	}
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_WAIT_VOTING;

+	/* In greeting safekeeper sent its id; update mappings accordingly. */
+	UpdateMemberSafekeeperPtr(wp, sk);
+
 	/*
 	 * Note: it would be better to track the counter on per safekeeper basis,
 	 * but at worst walproposer would restart with 'term rejected', so leave
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		/* We're still collecting terms from the majority. */
 		wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);

-		/* Quorum is acquried, prepare the vote request. */
+		/* Quorum is acquired, prepare the vote request. */
 		if (TermsCollected(wp))
 		{
-			wp->propTerm++;
-			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
-
 			wp->state = WPS_CAMPAIGN;
 			wp->voteRequest.pam.tag = 'v';
 			wp->voteRequest.generation = wp->mconf.generation;
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
 					   &sk->outbuf, wp->config->proto_version);

 	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
-		   wp->voteRequest.generation, wp->voteRequest.term);
+	wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
+		   sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
 	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
 		return;

 	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
+		   "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
+		   sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
 		   sk->voteResponse.voteGiven,
 		   GetHighestTerm(&sk->voteResponse.termHistory),
 		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
 	}
 }

+/*
+ * VotesCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_votes = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_ELECTED)
+		{
+			Assert(sk->voteResponse.voteGiven);
+
+			/*
+			 * Find the highest vote. NULL check is for the legacy case where
+			 * safekeeper might be not initialized with LSN at all and return
+			 * 0 LSN in the vote response; we still want to set donor to
+			 * something in this case.
+			 */
+			if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
+				(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
+				 sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
+				wp->donor == NULL)
+			{
+				wp->donorLastLogTerm = GetLastLogTerm(sk);
+				wp->propTermStartLsn = sk->voteResponse.flushLsn;
+				wp->donor = sk;
+			}
+			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
+
+			if (n_votes > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_votes++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
+	return MsetHasQuorum(mset, n_votes);
+}
+
+
 /*
 * Checks if enough votes has been collected to get elected and if that's the
 * case finds the highest vote, setting donor, donorLastLogTerm,
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
 static bool
 VotesCollected(WalProposer *wp)
 {
-	int			n_ready = 0;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;

 	/* assumed to be called only when not elected yet */
 	Assert(wp->state == WPS_CAMPAIGN);
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
 	wp->donorLastLogTerm = 0;
 	wp->truncateLsn = InvalidXLogRecPtr;

-	for (int i = 0; i < wp->n_safekeepers; i++)
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
 	{
-		if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
-		{
-			n_ready++;
+		int			n_ready = 0;

-			if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
-				(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
-				 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
 			{
-				wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
-				wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
-				wp->donor = i;
+				n_ready++;
+
+				if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
+					(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
+					 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
+					wp->donor == NULL)
+				{
+					wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
+					wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
+					wp->donor = &wp->safekeeper[i];
+				}
+				wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 			}
-			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 		}
+		collected = n_ready >= wp->quorum;
+		if (collected)
+		{
+			wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
+		}
+		return collected;
 	}

-	return n_ready >= wp->quorum;
+	/*
+	 * if generations are enabled we're expected to get to voting only when
+	 * mconf is established.
+	 */
+	Assert(wp->mconf.generation != INVALID_GENERATION);
+
+	/*
+	 * We must get votes from both msets if both are present.
+	 */
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset voters: ");
+	if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset voters: ");
+		if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp_log(LOG, "walproposer elected, %s", s.data);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }

 /*
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
 	 * that only for logical replication (and switching logical walsenders to
 	 * neon_walreader is a todo.)
 	 */
-	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
+	if (!wp->api.recovery_download(wp, wp->donor))
 	{
 		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
 	/*
 	 * Proposer's term history is the donor's + its own entry.
 	 */
-	dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
+	dth = &wp->donor->voteResponse.termHistory;
 	wp->propTermHistory.n_entries = dth->n_entries + 1;
 	wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
 	if (dth->n_entries > 0)
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;

-	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		   wp->quorum,
+	wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
 		   wp->propTerm,
 		   LSN_FORMAT_ARGS(wp->propTermStartLsn),
-		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		   wp->donor->host, wp->donor->port,
 		   LSN_FORMAT_ARGS(wp->truncateLsn));

 	/*
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)

 		readAnything = true;

+		/* should never happen: sk is expected to send ERROR instead */
+		if (sk->appendResponse.generation != wp->mconf.generation)
+		{
+			wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
+				   sk->greetResponse.nodeId, sk->host, sk->port,
+				   sk->appendResponse.generation, wp->mconf.generation);
+		}
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
 }

 /*
- * Calculate WAL position acknowledged by quorum
+ * GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
 */
 static XLogRecPtr
-GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
 {
 	XLogRecPtr	responses[MAX_SAFEKEEPERS];

 	/*
-	 * Sort acknowledged LSNs
+	 * Ascending sort acknowledged LSNs.
 	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	Assert(mset->len <= MAX_SAFEKEEPERS);
+	for (uint32 i = 0; i < mset->len; i++)
 	{
+		Safekeeper *sk = msk[i];
+
 		/*
 		 * Like in Raft, we aren't allowed to commit entries from previous
-		 * terms, so ignore reported LSN until it gets to epochStartLsn.
+		 * terms, so ignore reported LSN until it gets to propTermStartLsn.
+		 *
+		 * Note: we ignore sk state, which is ok: before first ack flushLsn is
+		 * 0, and later we just preserve value across reconnections. It would
+		 * be ok to check for SS_ACTIVE as well.
 		 */
-		responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
+		{
+			responses[i] = sk->appendResponse.flushLsn;
+		}
+		else
+		{
+			responses[i] = 0;
+		}
 	}
-	qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+	qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);

 	/*
-	 * Get the smallest LSN committed by quorum
+	 * And get value committed by the quorum. A way to view this: to get the
+	 * highest value committed on the quorum, in the ordered array we skip n -
+	 * n_quorum elements to get to the first (lowest) value present on all sks
+	 * of the highest quorum.
 	 */
-	return responses[wp->n_safekeepers - wp->quorum];
+	return responses[mset->len - MsetQuorum(mset)];
+}
+
+/*
+ * Calculate WAL position acknowledged by quorum, i.e. which may be regarded
+ * committed.
+ *
+ * Zero may be returned when there is no quorum of nodes recovered to term start
+ * lsn which sent feedback yet.
+ */
+static XLogRecPtr
+GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+{
+	XLogRecPtr	committed;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		XLogRecPtr	responses[MAX_SAFEKEEPERS];
+
+		/*
+		 * Sort acknowledged LSNs
+		 */
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			/*
+			 * Like in Raft, we aren't allowed to commit entries from previous
+			 * terms, so ignore reported LSN until it gets to
+			 * propTermStartLsn.
+			 *
+			 * Note: we ignore sk state, which is ok: before first ack
+			 * flushLsn is 0, and later we just preserve value across
+			 * reconnections. It would be ok to check for SS_ACTIVE as well.
+			 */
+			responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		}
+		qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+
+		/*
+		 * Get the smallest LSN committed by quorum
+		 */
+		return responses[wp->n_safekeepers - wp->quorum];
+	}
+
+	committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
+	if (wp->mconf.new_members.len > 0)
+	{
+		XLogRecPtr	new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
+
+		committed = Min(committed, new_mset_committed);
+	}
+	return committed;
 }

 /*
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
 	int			i;
 	XLogRecPtr	donor_lsn = InvalidXLogRecPtr;

-	if (wp->n_votes < wp->quorum)
+	if (wp->state < WPS_ELECTED)
 	{
 		wp_log(WARNING, "UpdateDonorShmem called before elections are won");
 		return;
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
 	 * about its position immediately after election before any feedbacks are
 	 * sent.
 	 */
-	if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
+	if (wp->donor->state >= SS_WAIT_ELECTED)
 	{
-		donor = &wp->safekeeper[wp->donor];
+		donor = wp->donor;
 		donor_lsn = wp->propTermStartLsn;
 	}

@@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	}

 	/*
-	 * Generally sync is done when majority switched the epoch so we committed
-	 * epochStartLsn and made the majority aware of it, ensuring they are
-	 * ready to give all WAL to pageserver. It would mean whichever majority
-	 * is alive, there will be at least one safekeeper who is able to stream
-	 * WAL to pageserver to make basebackup possible. However, since at the
-	 * moment we don't have any good mechanism of defining the healthy and
-	 * most advanced safekeeper who should push the wal into pageserver and
+	 * Generally sync is done when majority reached propTermStartLsn so we
+	 * committed it and made the majority aware of it, ensuring they are ready
+	 * to give all WAL to pageserver. It would mean whichever majority is
+	 * alive, there will be at least one safekeeper who is able to stream WAL
+	 * to pageserver to make basebackup possible. However, since at the moment
+	 * we don't have any good mechanism of defining the healthy and most
+	 * advanced safekeeper who should push the wal into pageserver and
 	 * basically the random one gets connected, to prevent hanging basebackup
 	 * (due to pageserver connecting to not-synced-safekeeper) we currently
 	 * wait for all seemingly alive safekeepers to get synced.
@@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 				n_synced++;
 		}

-		if (n_synced >= wp->quorum)
+		if (newCommitLsn >= wp->propTermStartLsn)
 		{
 			/* A quorum of safekeepers has been synced! */

--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
 * This and following structs pair ones in membership.rs.
 */
 typedef uint32 Generation;
+#define INVALID_GENERATION 0

 typedef struct SafekeeperId
 {
@@ -771,7 +772,17 @@ typedef struct WalProposer
 	/* Current walproposer membership configuration */
 	MembershipConfiguration mconf;

-	/* (n_safekeepers / 2) + 1 */
+	/*
+	 * Parallels mconf.members with pointers to the member's slot in
+	 * safekeepers array of connections, or NULL if such member is not
+	 * connected. Helps to avoid looking slot per id through all
+	 * .safekeepers[] when doing quorum checks.
+	 */
+	Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
+	/* As above, but for new_members. */
+	Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
+
+	/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
 	int			quorum;

 	/*
@@ -829,7 +840,7 @@ typedef struct WalProposer
 	term_t		donorLastLogTerm;

 	/* Most advanced acceptor */
-	int			donor;
+	Safekeeper *donor;

 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -425,12 +425,7 @@ impl CancelClosure {
            &mut mk_tls,
            &self.hostname,
        )
-        .map_err(|e| {
-            CancelError::IO(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                e.to_string(),
-            ))
-        })?;
+        .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?;

        self.cancel_token.cancel_query_raw(socket, tls).await?;
        debug!("query was cancelled");
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -568,7 +568,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
 fn helper_create_connect_info(
    mechanism: &TestConnectMechanism,
 ) -> auth::Backend<'static, ComputeCredentials> {
-    let user_info = auth::Backend::ControlPlane(
+    auth::Backend::ControlPlane(
        MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))),
        ComputeCredentials {
            info: ComputeUserInfo {
@@ -578,8 +578,7 @@ fn helper_create_connect_info(
            },
            keys: ComputeCredentialKeys::Password("password".into()),
        },
-    );
-    user_info
+    )
 }

 fn config() -> ComputeConfig {
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -47,6 +47,7 @@ impl ConnInfo {
 }

 #[derive(Clone)]
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub(crate) enum ClientDataEnum {
    Remote(ClientDataRemote),
    Local(ClientDataLocal),
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -31,6 +31,7 @@ pub async fn task_main_https(
    global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
    let cert_resolver = ReloadingCertificateResolver::new(
+        "main",
        &conf.ssl_key_file,
        &conf.ssl_cert_file,
        conf.ssl_cert_reload_period,
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -138,6 +138,7 @@ impl Drop for WriteGuardSharedState<'_> {
 /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
 /// case, SafeKeeper is not available (because WAL is not present on disk) and all
 /// operations can be done only with control file.
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub enum StateSK {
    Loaded(SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>),
    Offloaded(Box<TimelineState<control_file::FileStorage>>),
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -35,7 +35,7 @@ impl Manager {
        next_event: &Option<tokio::time::Instant>,
        state: &StateSnapshot,
    ) -> bool {
-        let ready = self.backup_task.is_none()
+        self.backup_task.is_none()
            && self.recovery_task.is_none()
            && self.wal_removal_task.is_none()
            && self.partial_backup_task.is_none()
@@ -61,8 +61,7 @@ impl Manager {
                .unwrap()
                .flush_lsn
                .segment_number(self.wal_seg_size)
-                == self.last_removed_segno + 1;
-        ready
+                == self.last_removed_segno + 1
    }

    /// Evict the timeline to remote storage. Returns whether the eviction was successful.
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -96,6 +96,7 @@ enum Message {

 impl Message {
    /// Convert proto message to internal message.
+    #[allow(clippy::result_large_err, reason = "TODO")]
    pub fn from(proto_msg: TypedMessage) -> Result<Self, Status> {
        match proto_msg.r#type() {
            MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo(
@@ -127,6 +128,7 @@ impl Message {
    }

    /// Get the tenant_timeline_id from the message.
+    #[allow(clippy::result_large_err, reason = "TODO")]
    pub fn tenant_timeline_id(&self) -> Result<Option<TenantTimelineId>, Status> {
        match self {
            Message::SafekeeperTimelineInfo(msg) => Ok(msg
@@ -185,6 +187,7 @@ enum SubscriptionKey {

 impl SubscriptionKey {
    /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
+    #[allow(clippy::result_large_err, reason = "TODO")]
    pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
        match key {
            ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
@@ -195,6 +198,7 @@ impl SubscriptionKey {
    }

    /// Parse from FilterTenantTimelineId
+    #[allow(clippy::result_large_err, reason = "TODO")]
    pub fn from_proto_filter_tenant_timeline_id(
        opt: Option<&FilterTenantTimelineId>,
    ) -> Result<Self, Status> {
@@ -385,6 +389,7 @@ impl Registry {
    }

    /// Send msg to relevant subscribers.
+    #[allow(clippy::result_large_err, reason = "TODO")]
    pub fn send_msg(&self, msg: &Message) -> Result<(), Status> {
        PROCESSED_MESSAGES_TOTAL.inc();

@@ -436,6 +441,7 @@ struct Publisher {

 impl Publisher {
    /// Send msg to relevant subscribers.
+    #[allow(clippy::result_large_err, reason = "TODO")]
    pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> {
        self.registry.send_msg(msg)
    }
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -79,6 +79,7 @@ impl BrokerClientChannel {
 }

 // parse variable length bytes from protobuf
+#[allow(clippy::result_large_err, reason = "TODO")]
 pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTimelineId, Status> {
    let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id)
        .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?;
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -800,7 +800,7 @@ impl ComputeHook {

 #[cfg(test)]
 pub(crate) mod tests {
-    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
    use utils::id::TenantId;

    use super::*;
@@ -808,6 +808,7 @@ pub(crate) mod tests {
    #[test]
    fn tenant_updates() -> anyhow::Result<()> {
        let tenant_id = TenantId::generate();
+        let stripe_size = DEFAULT_STRIPE_SIZE;
        let mut tenant_state = ComputeHookTenant::new(
            TenantShardId {
                tenant_id,
@@ -848,7 +849,7 @@ pub(crate) mod tests {
                shard_count: ShardCount::new(2),
                shard_number: ShardNumber(1),
            },
-            stripe_size: ShardStripeSize(32768),
+            stripe_size,
            preferred_az: None,
            node_id: NodeId(1),
        });
@@ -864,7 +865,7 @@ pub(crate) mod tests {
                shard_count: ShardCount::new(2),
                shard_number: ShardNumber(0),
            },
-            stripe_size: ShardStripeSize(32768),
+            stripe_size,
            preferred_az: None,
            node_id: NodeId(1),
        });
@@ -874,7 +875,7 @@ pub(crate) mod tests {
            anyhow::bail!("Wrong send result");
        };
        assert_eq!(request.shards.len(), 2);
-        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
+        assert_eq!(request.stripe_size, Some(stripe_size));

        // Simulate successful send
        *guard = Some(ComputeRemoteState {
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1235,8 +1235,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
        ForwardOutcome::NotForwarded(req) => req,
    };

-    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.step_down().await)
+    // Spawn a background task: once we start stepping down, we must finish: if the client drops
+    // their request we should avoid stopping in some part-stepped-down state.
+    let handle = tokio::spawn(async move {
+        let state = get_state(&req);
+        state.service.step_down().await
+    });
+
+    let result = handle
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    json_response(StatusCode::OK, result)
 }

 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> {
            let https_listener = tcp_listener::bind(https_addr)?;

            let resolver = ReloadingCertificateResolver::new(
+                "main",
                &args.ssl_key_file,
                &args.ssl_cert_file,
                *args.ssl_cert_reload_period,
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup {
    /// Size of the in-memory map of pageserver_nodes
    pub(crate) storage_controller_pageserver_nodes: measured::Gauge,

+    /// Count of how many pageserver nodes from in-memory map have https configured
+    pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
+
+    /// Size of the in-memory map of safekeeper_nodes
+    pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
+
+    /// Count of how many safekeeper nodes from in-memory map have https configured
+    pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
+
    /// Reconciler tasks completed, broken down by success/failure/cancelled
    pub(crate) storage_controller_reconcile_complete:
        measured::CounterVec<ReconcileCompleteLabelGroupSet>,
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -89,6 +89,10 @@ impl Node {
        self.scheduling = scheduling
    }

+    pub(crate) fn has_https_port(&self) -> bool {
+        self.listen_https_port.is_some()
+    }
+
    /// Does this registration request match `self`?  This is used when deciding whether a registration
    /// request should be allowed to update an existing record with the same node ID.
    pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
--- a/storage_controller/src/safekeeper.rs
+++ b/storage_controller/src/safekeeper.rs
@@ -89,6 +89,9 @@ impl Safekeeper {
    pub(crate) fn availability(&self) -> SafekeeperState {
        self.availability.clone()
    }
+    pub(crate) fn has_https_port(&self) -> bool {
+        self.listen_https_port.is_some()
+    }
    /// Perform an operation (which is given a [`SafekeeperClient`]) with retries
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn with_client_retries<T, O, F>(
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -43,7 +43,7 @@ use pageserver_api::models::{
    TimelineInfo, TopTenantShardItem, TopTenantShardsRequest,
 };
 use pageserver_api::shard::{
-    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
+    DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
 use pageserver_api::upcall_api::{
    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
-use utils::sync::gate::Gate;
+use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};

 use crate::background_node_operations::{
@@ -594,6 +594,8 @@ struct TenantShardSplitAbort {
    new_stripe_size: Option<ShardStripeSize>,
    /// Until this abort op is complete, no other operations may be done on the tenant
    _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    /// The reconciler gate for the duration of the split operation, and any included abort.
+    _gate: GateGuard,
 }

 #[derive(thiserror::Error, Debug)]
@@ -1460,7 +1462,7 @@ impl Service {
            // Retry until shutdown: we must keep this request object alive until it is properly
            // processed, as it holds a lock guard that prevents other operations trying to do things
            // to the tenant while it is in a weird part-split state.
-            while !self.cancel.is_cancelled() {
+            while !self.reconcilers_cancel.is_cancelled() {
                match self.abort_tenant_shard_split(&op).await {
                    Ok(_) => break,
                    Err(e) => {
@@ -1473,9 +1475,12 @@ impl Service {
                        // when we retry, so that the abort op will succeed.  If the abort op is failing
                        // for some other reason, we will keep retrying forever, or until a human notices
                        // and does something about it (either fixing a pageserver or restarting the controller).
-                        tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
-                            .await
-                            .ok();
+                        tokio::time::timeout(
+                            Duration::from_secs(5),
+                            self.reconcilers_cancel.cancelled(),
+                        )
+                        .await
+                        .ok();
                    }
                }
            }
@@ -1509,6 +1514,10 @@ impl Service {
            .metrics_group
            .storage_controller_pageserver_nodes
            .set(nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(nodes.values().filter(|n| n.has_https_port()).count() as i64);

        tracing::info!("Loading safekeepers from database...");
        let safekeepers = persistence
@@ -1526,6 +1535,14 @@ impl Service {
        let safekeepers: HashMap<NodeId, Safekeeper> =
            safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
        tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_safekeeper_nodes
+            .set(safekeepers.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_safekeeper_nodes
+            .set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64);

        tracing::info!("Loading shards from database...");
        let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
@@ -2742,7 +2759,7 @@ impl Service {
                        count: tenant_shard_id.shard_count,
                        // We only import un-sharded or single-sharded tenants, so stripe
                        // size can be made up arbitrarily here.
-                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                        stripe_size: DEFAULT_STRIPE_SIZE,
                    },
                    placement_policy: Some(placement_policy),
                    config: req.config.tenant_conf,
@@ -4898,7 +4915,7 @@ impl Service {
                    1,
                    10,
                    Duration::from_secs(5),
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                )
                .await
            {
@@ -5149,6 +5166,11 @@ impl Service {
        )
        .await;

+        let _gate = self
+            .reconcilers_gate
+            .enter()
+            .map_err(|_| ApiError::ShuttingDown)?;
+
        let new_shard_count = ShardCount::new(split_req.new_shard_count);
        let new_stripe_size = split_req.new_stripe_size;

@@ -5176,6 +5198,7 @@ impl Service {
                        new_shard_count,
                        new_stripe_size,
                        _tenant_lock,
+                        _gate,
                    })
                    // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
                    .ok();
@@ -5515,7 +5538,10 @@ impl Service {
                "failpoint".to_string()
            )));

-            failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
+            failpoint_support::sleep_millis_async!(
+                "shard-split-post-remote-sleep",
+                &self.reconcilers_cancel
+            );

            tracing::info!(
                "Split {} into {}",
@@ -5573,7 +5599,7 @@ impl Service {
                        stripe_size,
                        preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
                    },
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                )
                .await
            {
@@ -6014,9 +6040,21 @@ impl Service {
            .max()
            .expect("We already validated >0 shards");

-        // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will
-        // only work if they were using the default stripe size.
-        let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE;
+        // Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so
+        // fall back to the original default stripe size of 32768 (256 MB) if it's not specified.
+        const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768);
+        let stripe_size = scan_result
+            .shards
+            .iter()
+            .find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation)
+            .expect("we validated >0 shards above")
+            .stripe_size
+            .unwrap_or_else(|| {
+                if shard_count.count() > 1 {
+                    warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}");
+                }
+                ORIGINAL_STRIPE_SIZE
+            });

        let (response, waiters) = self
            .do_tenant_create(TenantCreateRequest {
@@ -6242,6 +6280,10 @@ impl Service {
            .metrics_group
            .storage_controller_pageserver_nodes
            .set(locked.nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);

        locked.scheduler.node_remove(node_id);

@@ -6333,6 +6375,10 @@ impl Service {
                    .metrics_group
                    .storage_controller_pageserver_nodes
                    .set(nodes.len() as i64);
+                metrics::METRICS_REGISTRY
+                    .metrics_group
+                    .storage_controller_https_pageserver_nodes
+                    .set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
            }
        }

@@ -6557,6 +6603,10 @@ impl Service {
            .metrics_group
            .storage_controller_pageserver_nodes
            .set(locked.nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);

        match registration_status {
            RegistrationStatus::New => {
@@ -7270,7 +7320,7 @@ impl Service {
            }

            // Eventual consistency: if an earlier reconcile job failed, and the shard is still
-            // dirty, spawn another rone
+            // dirty, spawn another one
            if self
                .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                .is_some()
@@ -7829,7 +7879,7 @@ impl Service {
        // old, persisted stripe size.
        let new_stripe_size = match candidate.id.shard_count.count() {
            0 => panic!("invalid shard count 0"),
-            1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE),
+            1 => Some(DEFAULT_STRIPE_SIZE),
            2.. => None,
        };

@@ -8634,9 +8684,24 @@ impl Service {
        failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");

        self.inner.write().unwrap().step_down();
-        // TODO: would it make sense to have a time-out for this?
-        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
-            .await;
+
+        // Wait for reconciliations to stop, or terminate this process if they
+        // fail to stop in time (this indicates a bug in shutdown)
+        tokio::select! {
+            _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
+                tracing::info!("Reconciliations stopped, proceeding with step down");
+            }
+            _ = async {
+                failpoint_support::sleep_millis_async!("step-down-delay-timeout");
+                tokio::time::sleep(Duration::from_secs(10)).await
+            } => {
+                tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
+
+                // The caller may proceed to act as leader when it sees this request fail: reduce the chance
+                // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
+                std::process::exit(1);
+            }
+        }

        let mut global_observed = GlobalObservedState::default();
        let locked = self.inner.read().unwrap();
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -5,6 +5,7 @@ use std::time::Duration;

 use super::safekeeper_reconciler::ScheduleRequest;
 use crate::heartbeater::SafekeeperState;
+use crate::metrics;
 use crate::persistence::{
    DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
 };
@@ -590,6 +591,20 @@ impl Service {
                }
            }
            locked.safekeepers = Arc::new(safekeepers);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_safekeeper_nodes
+                .set(locked.safekeepers.len() as i64);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_https_safekeeper_nodes
+                .set(
+                    locked
+                        .safekeepers
+                        .values()
+                        .filter(|s| s.has_https_port())
+                        .count() as i64,
+                );
        }
        Ok(())
    }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -2000,7 +2000,7 @@ pub(crate) mod tests {
    use std::rc::Rc;

    use pageserver_api::controller_api::NodeAvailability;
-    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
    use rand::SeedableRng;
    use rand::rngs::StdRng;
    use utils::id::TenantId;
@@ -2012,6 +2012,7 @@ pub(crate) mod tests {
        let tenant_id = TenantId::generate();
        let shard_number = ShardNumber(0);
        let shard_count = ShardCount::new(1);
+        let stripe_size = DEFAULT_STRIPE_SIZE;

        let tenant_shard_id = TenantShardId {
            tenant_id,
@@ -2020,12 +2021,7 @@ pub(crate) mod tests {
        };
        TenantShard::new(
            tenant_shard_id,
-            ShardIdentity::new(
-                shard_number,
-                shard_count,
-                pageserver_api::shard::ShardStripeSize(32768),
-            )
-            .unwrap(),
+            ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
            policy,
            None,
        )
@@ -2045,6 +2041,7 @@ pub(crate) mod tests {
        shard_count: ShardCount,
        preferred_az: Option<AvailabilityZone>,
    ) -> Vec<TenantShard> {
+        let stripe_size = DEFAULT_STRIPE_SIZE;
        (0..shard_count.count())
            .map(|i| {
                let shard_number = ShardNumber(i);
@@ -2056,12 +2053,7 @@ pub(crate) mod tests {
                };
                TenantShard::new(
                    tenant_shard_id,
-                    ShardIdentity::new(
-                        shard_number,
-                        shard_count,
-                        pageserver_api::shard::ShardStripeSize(32768),
-                    )
-                    .unwrap(),
+                    ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
                    policy.clone(),
                    preferred_az.clone(),
                )
--- a/Show More
+++ b/Show More