Merge branch 'problame/lease-deadline-tests' into problame/standby-horizon-removal-poc-rip-out

Author: Christian Schwarz
Date: 2025-07-02 16:02:29 +02:00
70 changed files with 2252 additions and 557 deletions

View File

@@ -69,7 +69,7 @@ jobs:
submodules: true
- name: Check for file changes
uses: step-security/paths-filter@v3
uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
id: files-changed
with:
token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -53,7 +53,7 @@ jobs:
submodules: true
- name: Check for Postgres changes
uses: step-security/paths-filter@v3
uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3
id: files_changed
with:
token: ${{ github.token }}

View File

@@ -34,7 +34,7 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
- uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
id: python-src
with:
files: |
@@ -45,7 +45,7 @@ jobs:
poetry.lock
pyproject.toml
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
- uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
id: rust-src
with:
files: |

View File

@@ -60,22 +60,23 @@ jobs:
} >> "$GITHUB_ENV"
- name: Run proxy-bench
run: ./${PROXY_BENCH_PATH}/run.sh
run: ${PROXY_BENCH_PATH}/run.sh
- name: Ingest Bench Results # neon repo script
if: success()
if: always()
run: |
mkdir -p $TEST_OUTPUT
python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT
- name: Push Metrics to Proxy perf database
if: success()
if: always()
env:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}"
REPORT_FROM: $TEST_OUTPUT
run: $NEON_DIR/scripts/generate_and_push_perf_report.sh
- name: Docker cleanup
if: always()
run: docker compose down
- name: Notify Failure

.gitignore (vendored): 1 change
View File

@@ -6,6 +6,7 @@
/tmp_check_cli
__pycache__/
test_output/
neon_previous/
.vscode
.idea
*.swp

Cargo.lock (generated): 21 changes
View File

@@ -1279,6 +1279,7 @@ dependencies = [
"remote_storage",
"serde",
"serde_json",
"url",
"utils",
]
@@ -1304,6 +1305,7 @@ dependencies = [
"fail",
"flate2",
"futures",
"hostname-validator",
"http 1.1.0",
"indexmap 2.9.0",
"itertools 0.10.5",
@@ -2770,6 +2772,12 @@ dependencies = [
"windows",
]
[[package]]
name = "hostname-validator"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f558a64ac9af88b5ba400d99b579451af0d39c6d360980045b91aac966d705e2"
[[package]]
name = "http"
version = "0.2.9"
@@ -4421,6 +4429,7 @@ dependencies = [
"strum",
"strum_macros",
"thiserror 1.0.69",
"tracing",
"tracing-utils",
"utils",
]
@@ -4479,6 +4488,7 @@ dependencies = [
"pageserver_api",
"postgres_ffi_types",
"prost 0.13.5",
"prost-types 0.13.5",
"strum",
"strum_macros",
"thiserror 1.0.69",
@@ -5156,7 +5166,7 @@ dependencies = [
"petgraph",
"prettyplease",
"prost 0.13.5",
"prost-types 0.13.3",
"prost-types 0.13.5",
"regex",
"syn 2.0.100",
"tempfile",
@@ -5199,9 +5209,9 @@ dependencies = [
[[package]]
name = "prost-types"
version = "0.13.3"
version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
dependencies = [
"prost 0.13.5",
]
@@ -6808,6 +6818,7 @@ dependencies = [
"chrono",
"clap",
"clashmap",
"compute_api",
"control_plane",
"cron",
"diesel",
@@ -7641,7 +7652,7 @@ dependencies = [
"prettyplease",
"proc-macro2",
"prost-build 0.13.3",
"prost-types 0.13.3",
"prost-types 0.13.5",
"quote",
"syn 2.0.100",
]
@@ -7653,7 +7664,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1"
dependencies = [
"prost 0.13.5",
"prost-types 0.13.3",
"prost-types 0.13.5",
"tokio",
"tokio-stream",
"tonic 0.13.1",

View File

@@ -152,6 +152,7 @@ pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointe
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.13.5"
prost-types = "0.13.5"
rand = "0.8"
redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"

View File

@@ -1983,7 +1983,7 @@ RUN apt update && \
locales \
lsof \
procps \
rsyslog \
rsyslog-gnutls \
screen \
tcpdump \
$VERSION_INSTALLS && \

View File

@@ -27,6 +27,7 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
hostname-validator = "1.1"
indexmap.workspace = true
itertools.workspace = true
jsonwebtoken.workspace = true

View File

@@ -36,6 +36,8 @@
use std::ffi::OsString;
use std::fs::File;
use std::process::exit;
use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use std::sync::mpsc;
use std::thread;
use std::time::Duration;
@@ -190,7 +192,9 @@ fn main() -> Result<()> {
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
installed_extensions_collection_interval: cli.installed_extensions_collection_interval,
installed_extensions_collection_interval: Arc::new(AtomicU64::new(
cli.installed_extensions_collection_interval,
)),
},
config,
)?;

View File

@@ -1,4 +1,4 @@
use anyhow::{Context, Result, anyhow};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{
@@ -6,7 +6,7 @@ use compute_api::responses::{
LfcPrewarmState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
};
use futures::StreamExt;
use futures::future::join_all;
@@ -25,7 +25,7 @@ use std::os::unix::fs::{PermissionsExt, symlink};
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
@@ -70,6 +70,7 @@ pub static BUILD_TAG: Lazy<String> = Lazy::new(|| {
.unwrap_or(BUILD_TAG_DEFAULT)
.to_string()
});
const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;
/// Static configuration params that don't change after startup. These mostly
/// come from the CLI args, or are derived from them.
@@ -103,7 +104,7 @@ pub struct ComputeNodeParams {
pub remote_ext_base_url: Option<Url>,
/// Interval for installed extensions collection
pub installed_extensions_collection_interval: u64,
pub installed_extensions_collection_interval: Arc<AtomicU64>,
}
/// Compute node info shared across several `compute_ctl` threads.
@@ -126,6 +127,9 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub compute_ctl_config: ComputeCtlConfig,
/// Handle to the extension stats collection task
extension_stats_task: Mutex<Option<tokio::task::JoinHandle<()>>>,
}
// store some metrics about download size that might impact startup time
@@ -428,6 +432,7 @@ impl ComputeNode {
state_changed: Condvar::new(),
ext_download_progress: RwLock::new(HashMap::new()),
compute_ctl_config: config.compute_ctl_config,
extension_stats_task: Mutex::new(None),
})
}
@@ -515,6 +520,9 @@ impl ComputeNode {
None
};
// Terminate the extension stats collection task
this.terminate_extension_stats_task();
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -751,10 +759,15 @@ impl ComputeNode {
// Configure and start rsyslog for compliance audit logging
match pspec.spec.audit_log_level {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
let remote_endpoint =
let remote_tls_endpoint =
std::env::var("AUDIT_LOGGING_TLS_ENDPOINT").unwrap_or("".to_string());
let remote_plain_endpoint =
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
if remote_plain_endpoint.is_empty() && remote_tls_endpoint.is_empty() {
anyhow::bail!(
"AUDIT_LOGGING_ENDPOINT and AUDIT_LOGGING_TLS_ENDPOINT are both empty"
);
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
@@ -770,7 +783,8 @@ impl ComputeNode {
log_directory_path.clone(),
endpoint_id,
project_id,
&remote_endpoint,
&remote_plain_endpoint,
&remote_tls_endpoint,
)?;
// Launch a background task to clean up the audit logs
@@ -1003,19 +1017,12 @@ impl ComputeNode {
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
// Detect the protocol scheme. If the URL doesn't have a scheme, assume libpq.
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let scheme = match Url::parse(shard0_connstr) {
Ok(url) => url.scheme().to_lowercase().to_string(),
Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
Err(err) => return Err(anyhow!("invalid connstring URL: {err}")),
};
let started = Instant::now();
let (connected, size) = match scheme.as_str() {
"postgresql" | "postgres" => self.try_get_basebackup_libpq(spec, lsn)?,
"grpc" => self.try_get_basebackup_grpc(spec, lsn)?,
scheme => return Err(anyhow!("unknown URL scheme {scheme}")),
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
};
let mut state = self.state.lock().unwrap();
@@ -1678,6 +1685,8 @@ impl ComputeNode {
tls_config = self.compute_ctl_config.tls.clone();
}
self.update_installed_extensions_collection_interval(&spec);
let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
// Merge-apply spec & changes to PostgreSQL state.
@@ -1742,6 +1751,8 @@ impl ComputeNode {
let tls_config = self.tls_config(&spec);
self.update_installed_extensions_collection_interval(&spec);
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
@@ -2346,10 +2357,20 @@ LIMIT 100",
}
pub fn spawn_extension_stats_task(&self) {
// Cancel any existing task
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
handle.abort();
}
let conf = self.tokio_conn_conf.clone();
let installed_extensions_collection_interval =
self.params.installed_extensions_collection_interval;
tokio::spawn(async move {
let atomic_interval = self.params.installed_extensions_collection_interval.clone();
let mut installed_extensions_collection_interval =
2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst);
info!(
"[NEON_EXT_SPAWN] Spawning background installed extensions worker with Timeout: {}",
installed_extensions_collection_interval
);
let handle = tokio::spawn(async move {
// An initial sleep is added to ensure that two collections don't happen at the same time.
// The first collection happens during compute startup.
tokio::time::sleep(tokio::time::Duration::from_secs(
@@ -2362,8 +2383,48 @@ LIMIT 100",
loop {
interval.tick().await;
let _ = installed_extensions(conf.clone()).await;
// Acquire a read lock on the compute spec and then update the interval if necessary
interval = tokio::time::interval(tokio::time::Duration::from_secs(std::cmp::max(
installed_extensions_collection_interval,
2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst),
)));
installed_extensions_collection_interval = interval.period().as_secs();
}
});
// Store the new task handle
*self.extension_stats_task.lock().unwrap() = Some(handle);
}
fn terminate_extension_stats_task(&self) {
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
handle.abort();
}
}
fn update_installed_extensions_collection_interval(&self, spec: &ComputeSpec) {
// Update the interval for collecting installed extensions statistics
// If the value is -1 (never suspend) or 0 (use the default), fall back to the
// default installed-extensions collection interval.
if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
);
self.params.installed_extensions_collection_interval.store(
DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
std::sync::atomic::Ordering::SeqCst,
);
} else {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}",
spec.suspend_timeout_seconds
);
self.params.installed_extensions_collection_interval.store(
spec.suspend_timeout_seconds as u64,
std::sync::atomic::Ordering::SeqCst,
);
}
}
}
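The hunk above introduces a pattern worth spelling out: the stats task's JoinHandle is kept in a Mutex<Option<_>> so a respawn or shutdown can abort the previous task, and the collection interval is shared through an Arc<AtomicU64> so applying a new spec changes the period without restarting the worker. Below is a minimal, self-contained sketch of that pattern; the Worker type and field names are illustrative, not the real ComputeNode API.

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;

struct Worker {
    // Shared with whoever applies a new compute spec.
    interval_secs: Arc<AtomicU64>,
    // Handle to the currently running background task, if any.
    task: Mutex<Option<tokio::task::JoinHandle<()>>>,
}

impl Worker {
    fn spawn(&self) {
        // Abort any previously running task before spawning a new one.
        if let Some(handle) = self.task.lock().unwrap().take() {
            handle.abort();
        }
        let interval_secs = self.interval_secs.clone();
        let handle = tokio::spawn(async move {
            loop {
                // Re-read the shared interval on every iteration so a
                // reconfiguration takes effect without a respawn; clamp to
                // at least one second to avoid a busy loop.
                let secs = interval_secs.load(Ordering::SeqCst).max(1);
                tokio::time::sleep(Duration::from_secs(secs)).await;
                // ... collect installed-extensions statistics here ...
            }
        });
        *self.task.lock().unwrap() = Some(handle);
    }

    fn terminate(&self) {
        if let Some(handle) = self.task.lock().unwrap().take() {
            handle.abort();
        }
    }
}

#[tokio::main]
async fn main() {
    let worker = Worker {
        interval_secs: Arc::new(AtomicU64::new(3600)),
        task: Mutex::new(None),
    };
    worker.spawn();
    // A later spec change only needs to store a new value:
    worker.interval_secs.store(600, Ordering::SeqCst);
    worker.terminate();
}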

View File

@@ -10,7 +10,13 @@ input(type="imfile" File="{log_directory}/*.log"
startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,")
# the directory to store rsyslog state files
global(workDirectory="/var/log/rsyslog")
global(
workDirectory="/var/log/rsyslog"
DefaultNetstreamDriverCAFile="/etc/ssl/certs/ca-certificates.crt"
)
# Whether the remote syslog receiver uses tls
set $.remote_syslog_tls = "{remote_syslog_tls}";
# Construct json, endpoint_id and project_id as additional metadata
set $.json_log!endpoint_id = "{endpoint_id}";
@@ -21,5 +27,29 @@ set $.json_log!msg = $msg;
template(name="PgAuditLog" type="string"
string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%")
# Forward to remote syslog receiver (@@<hostname>:<port>;format)
local5.info @@{remote_endpoint};PgAuditLog
# Forward to remote syslog receiver (over TLS)
if ( $syslogtag == 'pgaudit_log' ) then {{
if ( $.remote_syslog_tls == 'true' ) then {{
action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp"
template="PgAuditLog"
queue.type="linkedList"
queue.size="1000"
action.ResumeRetryCount="10"
StreamDriver="gtls"
StreamDriverMode="1"
StreamDriverAuthMode="x509/name"
StreamDriverPermittedPeers="{remote_syslog_host}"
StreamDriver.CheckExtendedKeyPurpose="on"
StreamDriver.PermitExpiredCerts="off"
)
stop
}} else {{
action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp"
template="PgAuditLog"
queue.type="linkedList"
queue.size="1000"
action.ResumeRetryCount="10"
)
stop
}}
}}

View File

@@ -4,7 +4,9 @@ use std::thread;
use std::time::{Duration, SystemTime};
use anyhow::{Result, bail};
use compute_api::spec::ComputeMode;
use compute_api::spec::{ComputeMode, PageserverProtocol};
use itertools::Itertools as _;
use pageserver_page_api as page_api;
use postgres::{NoTls, SimpleQueryMessage};
use tracing::{info, warn};
use utils::id::{TenantId, TimelineId};
@@ -76,25 +78,17 @@ fn acquire_lsn_lease_with_retry(
loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let configs = {
let (connstrings, auth) = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
let conn_strings = spec.pageserver_connstr.split(',');
conn_strings
.map(|connstr| {
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
if let Some(storage_auth_token) = &spec.storage_auth_token {
config.password(storage_auth_token.clone());
}
config
})
.collect::<Vec<_>>()
(
spec.pageserver_connstr.clone(),
spec.storage_auth_token.clone(),
)
};
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
let result =
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
match result {
Ok(Some(res)) => {
return Ok(res);
@@ -116,68 +110,104 @@ fn acquire_lsn_lease_with_retry(
}
}
/// Tries to acquire an LSN lease through PS page_service API.
/// Tries to acquire LSN leases on all Pageserver shards.
fn try_acquire_lsn_lease(
connstrings: &str,
auth: Option<&str>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
configs: &[postgres::Config],
) -> Result<Option<SystemTime>> {
fn get_valid_until(
config: &postgres::Config,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let mut client = config.connect(NoTls)?;
let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
let res = client.simple_query(&cmd)?;
let msg = match res.first() {
Some(msg) => msg,
None => bail!("empty response"),
};
let row = match msg {
SimpleQueryMessage::Row(row) => row,
_ => bail!("error parsing lsn lease response"),
let connstrings = connstrings.split(',').collect_vec();
let shard_count = connstrings.len();
let mut leases = Vec::new();
for (shard_number, &connstring) in connstrings.iter().enumerate() {
let tenant_shard_id = match shard_count {
0 | 1 => TenantShardId::unsharded(tenant_id),
shard_count => TenantShardId {
tenant_id,
shard_number: ShardNumber(shard_number as u8),
shard_count: ShardCount::new(shard_count as u8),
},
};
// Note: this will be None if a lease is explicitly not granted.
let valid_until_str = row.get("valid_until");
let valid_until = valid_until_str.map(|s| {
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
.expect("Time larger than max SystemTime could handle")
});
Ok(valid_until)
let lease = match PageserverProtocol::from_connstring(connstring)? {
PageserverProtocol::Libpq => {
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
PageserverProtocol::Grpc => {
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
};
leases.push(lease);
}
let shard_count = configs.len();
Ok(leases.into_iter().min().flatten())
}
let valid_until = if shard_count > 1 {
configs
.iter()
.enumerate()
.map(|(shard_number, config)| {
let tenant_shard_id = TenantShardId {
tenant_id,
shard_count: ShardCount::new(shard_count as u8),
shard_number: ShardNumber(shard_number as u8),
};
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
})
.collect::<Result<Vec<Option<SystemTime>>>>()?
.into_iter()
.min()
.unwrap()
} else {
get_valid_until(
&configs[0],
TenantShardId::unsharded(tenant_id),
timeline_id,
lsn,
)?
/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a
/// postgresql:// scheme.
fn acquire_lsn_lease_libpq(
connstring: &str,
auth: Option<&str>,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let mut config = postgres::Config::from_str(connstring)?;
if let Some(auth) = auth {
config.password(auth);
}
let mut client = config.connect(NoTls)?;
let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
let res = client.simple_query(&cmd)?;
let msg = match res.first() {
Some(msg) => msg,
None => bail!("empty response"),
};
let row = match msg {
SimpleQueryMessage::Row(row) => row,
_ => bail!("error parsing lsn lease response"),
};
// Note: this will be None if a lease is explicitly not granted.
let valid_until_str = row.get("valid_until");
let valid_until = valid_until_str.map(|s| {
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
.expect("Time larger than max SystemTime could handle")
});
Ok(valid_until)
}
/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a
/// grpc:// scheme.
fn acquire_lsn_lease_grpc(
connstring: &str,
auth: Option<&str>,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::new(
connstring.to_string(),
tenant_shard_id.tenant_id,
timeline_id,
tenant_shard_id.to_index(),
auth.map(String::from),
None,
)
.await?;
let req = page_api::LeaseLsnRequest { lsn };
match client.lease_lsn(req).await {
Ok(expires) => Ok(Some(expires)),
// Lease couldn't be acquired because the LSN has been garbage collected.
Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None),
Err(err) => Err(err.into()),
}
})
}
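The loop above derives one TenantShardId per comma-separated connstring (the shard number is the position in the list) and the caller keeps the minimum of the returned expiries; because None orders before Some, a single shard that refuses the lease makes the overall result None. A standalone sketch of the index arithmetic, using simplified stand-in types rather than the real TenantShardId/ShardNumber/ShardCount:

/// Simplified stand-in for a tenant shard index (illustrative only).
#[derive(Debug, PartialEq)]
struct ShardIndex {
    number: u8,
    count: u8, // 0 means unsharded
}

/// One entry per connstring; the shard number is the position in the list.
fn shard_indices(connstrings: &str) -> Vec<ShardIndex> {
    let parts: Vec<&str> = connstrings.split(',').collect();
    let count = parts.len();
    parts
        .iter()
        .enumerate()
        .map(|(i, _)| match count {
            0 | 1 => ShardIndex { number: 0, count: 0 }, // unsharded tenant
            n => ShardIndex {
                number: i as u8,
                count: n as u8,
            },
        })
        .collect()
}

fn main() {
    let idx = shard_indices("grpc://ps1:51051,grpc://ps2:51051");
    assert_eq!(idx.len(), 2);
    assert_eq!(idx[1], ShardIndex { number: 1, count: 2 });
    // The real code then acquires a lease on every shard and returns the
    // minimum "valid_until" across them.
}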

View File

@@ -4,8 +4,10 @@ use std::path::Path;
use std::process::Command;
use std::time::Duration;
use std::{fs::OpenOptions, io::Write};
use url::{Host, Url};
use anyhow::{Context, Result, anyhow};
use hostname_validator;
use tracing::{error, info, instrument, warn};
const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf";
@@ -82,18 +84,84 @@ fn restart_rsyslog() -> Result<()> {
Ok(())
}
fn parse_audit_syslog_address(
remote_plain_endpoint: &str,
remote_tls_endpoint: &str,
) -> Result<(String, u16, String)> {
let tls;
let remote_endpoint = if !remote_tls_endpoint.is_empty() {
tls = "true".to_string();
remote_tls_endpoint
} else {
tls = "false".to_string();
remote_plain_endpoint
};
// Prefix the remote_endpoint with a scheme so it can be parsed with url::Url.
let url_str = format!("http://{remote_endpoint}");
let url = Url::parse(&url_str).map_err(|err| {
anyhow!("Error parsing {remote_endpoint}, expected host:port, got {err:?}")
})?;
let is_valid = url.scheme() == "http"
&& url.path() == "/"
&& url.query().is_none()
&& url.fragment().is_none()
&& url.username() == ""
&& url.password().is_none();
if !is_valid {
return Err(anyhow!(
"Invalid address format {remote_endpoint}, expected host:port"
));
}
let host = match url.host() {
Some(Host::Domain(h)) if hostname_validator::is_valid(h) => h.to_string(),
Some(Host::Ipv4(ip4)) => ip4.to_string(),
Some(Host::Ipv6(ip6)) => ip6.to_string(),
_ => return Err(anyhow!("Invalid host")),
};
let port = url
.port()
.ok_or_else(|| anyhow!("Invalid port in {remote_endpoint}"))?;
Ok((host, port, tls))
}
fn generate_audit_rsyslog_config(
log_directory: String,
endpoint_id: &str,
project_id: &str,
remote_syslog_host: &str,
remote_syslog_port: u16,
remote_syslog_tls: &str,
) -> String {
format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
endpoint_id = endpoint_id,
project_id = project_id,
remote_syslog_host = remote_syslog_host,
remote_syslog_port = remote_syslog_port,
remote_syslog_tls = remote_syslog_tls
)
}
pub fn configure_audit_rsyslog(
log_directory: String,
endpoint_id: &str,
project_id: &str,
remote_endpoint: &str,
remote_tls_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
endpoint_id = endpoint_id,
project_id = project_id,
remote_endpoint = remote_endpoint
let (remote_syslog_host, remote_syslog_port, remote_syslog_tls) =
parse_audit_syslog_address(remote_endpoint, remote_tls_endpoint).unwrap();
let config_content = generate_audit_rsyslog_config(
log_directory,
endpoint_id,
project_id,
&remote_syslog_host,
remote_syslog_port,
&remote_syslog_tls,
);
info!("rsyslog config_content: {}", config_content);
@@ -258,6 +326,8 @@ pub fn launch_pgaudit_gc(log_directory: String) {
mod tests {
use crate::rsyslog::PostgresLogsRsyslogConfig;
use super::{generate_audit_rsyslog_config, parse_audit_syslog_address};
#[test]
fn test_postgres_logs_config() {
{
@@ -287,4 +357,146 @@ mod tests {
assert!(res.is_err());
}
}
#[test]
fn test_parse_audit_syslog_address() {
{
// host:port format (plaintext)
let parsed = parse_audit_syslog_address("collector.host.tld:5555", "");
assert!(parsed.is_ok());
assert_eq!(
parsed.unwrap(),
(
String::from("collector.host.tld"),
5555,
String::from("false")
)
);
}
{
// host:port format with ipv4 ip address (plaintext)
let parsed = parse_audit_syslog_address("10.0.0.1:5555", "");
assert!(parsed.is_ok());
assert_eq!(
parsed.unwrap(),
(String::from("10.0.0.1"), 5555, String::from("false"))
);
}
{
// host:port format with ipv6 ip address (plaintext)
let parsed =
parse_audit_syslog_address("[7e60:82ed:cb2e:d617:f904:f395:aaca:e252]:5555", "");
assert_eq!(
parsed.unwrap(),
(
String::from("7e60:82ed:cb2e:d617:f904:f395:aaca:e252"),
5555,
String::from("false")
)
);
}
{
// Only TLS host:port defined
let parsed = parse_audit_syslog_address("", "tls.host.tld:5556");
assert_eq!(
parsed.unwrap(),
(String::from("tls.host.tld"), 5556, String::from("true"))
);
}
{
// tls host should take precedence, when both defined
let parsed = parse_audit_syslog_address("plaintext.host.tld:5555", "tls.host.tld:5556");
assert_eq!(
parsed.unwrap(),
(String::from("tls.host.tld"), 5556, String::from("true"))
);
}
{
// host without port (plaintext)
let parsed = parse_audit_syslog_address("collector.host.tld", "");
assert!(parsed.is_err());
}
{
// port without host
let parsed = parse_audit_syslog_address(":5555", "");
assert!(parsed.is_err());
}
{
// valid host with invalid port
let parsed = parse_audit_syslog_address("collector.host.tld:90001", "");
assert!(parsed.is_err());
}
{
// invalid hostname with valid port
let parsed = parse_audit_syslog_address("-collector.host.tld:5555", "");
assert!(parsed.is_err());
}
{
// parse error
let parsed = parse_audit_syslog_address("collector.host.tld:::5555", "");
assert!(parsed.is_err());
}
}
#[test]
fn test_generate_audit_rsyslog_config() {
{
// plaintext version
let log_directory = "/tmp/log".to_string();
let endpoint_id = "ep-test-endpoint-id";
let project_id = "test-project-id";
let remote_syslog_host = "collector.host.tld";
let remote_syslog_port = 5555;
let remote_syslog_tls = "false";
let conf_str = generate_audit_rsyslog_config(
log_directory,
endpoint_id,
project_id,
remote_syslog_host,
remote_syslog_port,
remote_syslog_tls,
);
assert!(conf_str.contains(r#"set $.remote_syslog_tls = "false";"#));
assert!(conf_str.contains(r#"type="omfwd""#));
assert!(conf_str.contains(r#"target="collector.host.tld""#));
assert!(conf_str.contains(r#"port="5555""#));
assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#));
}
{
// TLS version
let log_directory = "/tmp/log".to_string();
let endpoint_id = "ep-test-endpoint-id";
let project_id = "test-project-id";
let remote_syslog_host = "collector.host.tld";
let remote_syslog_port = 5556;
let remote_syslog_tls = "true";
let conf_str = generate_audit_rsyslog_config(
log_directory,
endpoint_id,
project_id,
remote_syslog_host,
remote_syslog_port,
remote_syslog_tls,
);
assert!(conf_str.contains(r#"set $.remote_syslog_tls = "true";"#));
assert!(conf_str.contains(r#"type="omfwd""#));
assert!(conf_str.contains(r#"target="collector.host.tld""#));
assert!(conf_str.contains(r#"port="5556""#));
assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#));
}
}
}

View File

@@ -3,7 +3,8 @@
"timestamp": "2021-05-23T18:25:43.511Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
"suspend_timeout_seconds": 3600,
"cluster": {
"cluster_id": "test-cluster-42",
"name": "Zenith Test",

View File

@@ -16,9 +16,9 @@ use std::time::Duration;
use anyhow::{Context, Result, anyhow, bail};
use clap::Parser;
use compute_api::requests::ComputeClaimsScope;
use compute_api::spec::ComputeMode;
use compute_api::spec::{ComputeMode, PageserverProtocol};
use control_plane::broker::StorageBroker;
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol};
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
use control_plane::local_env;
use control_plane::local_env::{
@@ -1649,7 +1649,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = parse_safekeepers(&args.safekeepers)?;
endpoint.reconfigure(pageservers, None, safekeepers).await?;
endpoint
.reconfigure(Some(pageservers), None, safekeepers, None)
.await?;
}
EndpointCmd::Stop(args) => {
let endpoint_id = &args.endpoint_id;

View File

@@ -56,8 +56,8 @@ use compute_api::responses::{
TlsConfig,
};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
PgIdent, RemoteExtSpec, Role,
};
use jsonwebtoken::jwk::{
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
@@ -373,29 +373,6 @@ impl std::fmt::Display for EndpointTerminateMode {
}
}
/// Protocol used to connect to a Pageserver.
#[derive(Clone, Copy, Debug)]
pub enum PageserverProtocol {
Libpq,
Grpc,
}
impl PageserverProtocol {
/// Returns the URL scheme for the protocol, used in connstrings.
pub fn scheme(&self) -> &'static str {
match self {
Self::Libpq => "postgresql",
Self::Grpc => "grpc",
}
}
}
impl Display for PageserverProtocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.scheme())
}
}
impl Endpoint {
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
if !entry.file_type()?.is_dir() {
@@ -803,6 +780,7 @@ impl Endpoint {
endpoint_storage_addr: Some(endpoint_storage_addr),
endpoint_storage_token: Some(endpoint_storage_token),
autoprewarm: false,
suspend_timeout_seconds: -1, // Only used in neon_local.
};
// this strange code is needed to support respec() in tests
@@ -997,12 +975,11 @@ impl Endpoint {
pub async fn reconfigure(
&self,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
safekeeper_generation: Option<SafekeeperGeneration>,
) -> Result<()> {
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
let (mut spec, compute_ctl_config) = {
let config_path = self.endpoint_path().join("config.json");
let file = std::fs::File::open(config_path)?;
@@ -1014,16 +991,24 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
// If pageservers are not specified, don't change them.
if let Some(pageservers) = pageservers {
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
}
// If safekeepers are not specified, don't change them.
if let Some(safekeepers) = safekeepers {
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
spec.safekeeper_connstrings = safekeeper_connstrings;
if let Some(g) = safekeeper_generation {
spec.safekeepers_generation = Some(g.into_inner());
}
}
let client = reqwest::Client::builder()
@@ -1061,6 +1046,24 @@ impl Endpoint {
}
}
pub async fn reconfigure_pageservers(
&self,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> {
self.reconfigure(Some(pageservers), stripe_size, None, None)
.await
}
pub async fn reconfigure_safekeepers(
&self,
safekeepers: Vec<NodeId>,
generation: SafekeeperGeneration,
) -> Result<()> {
self.reconfigure(None, None, Some(safekeepers), Some(generation))
.await
}
pub async fn stop(
&self,
mode: EndpointTerminateMode,

View File

@@ -212,7 +212,7 @@ pub struct NeonStorageControllerConf {
pub use_local_compute_notifications: bool,
pub timeline_safekeeper_count: Option<i64>,
pub timeline_safekeeper_count: Option<usize>,
pub posthog_config: Option<PostHogConfig>,

View File

@@ -638,7 +638,13 @@ impl StorageController {
args.push("--timelines-onto-safekeepers".to_string());
}
if let Some(sk_cnt) = self.config.timeline_safekeeper_count {
// neon_local is used in test environments where we often have less than 3 safekeepers.
if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 {
let sk_cnt = self
.config
.timeline_safekeeper_count
.unwrap_or(self.env.safekeepers.len());
args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
}
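Net effect of this hunk: --timeline-safekeeper-count is now passed whenever it is explicitly configured or whenever the local environment has fewer than three safekeepers, defaulting to the number of safekeepers in the env. A small sketch of that decision as a hypothetical helper (not part of the diff):

/// Returns the value to pass as --timeline-safekeeper-count, if any.
fn effective_sk_count(configured: Option<usize>, env_safekeepers: usize) -> Option<usize> {
    if configured.is_some() || env_safekeepers < 3 {
        Some(configured.unwrap_or(env_safekeepers))
    } else {
        None // let the storage controller use its own default
    }
}

fn main() {
    // Typical single-safekeeper test env: pass --timeline-safekeeper-count=1.
    assert_eq!(effective_sk_count(None, 1), Some(1));
    // Explicit config always wins.
    assert_eq!(effective_sk_count(Some(2), 5), Some(2));
    // Three or more safekeepers and no explicit config: flag is omitted.
    assert_eq!(effective_sk_count(None, 3), None);
}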

View File

@@ -4,6 +4,7 @@
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"suspend_timeout_seconds": -1,
"cluster": {
"cluster_id": "docker_compose",

View File

@@ -12,6 +12,7 @@ jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true
regex.workspace = true
url.workspace = true
utils = { path = "../utils" }
remote_storage = { version = "0.1", path = "../remote_storage/" }

View File

@@ -4,11 +4,14 @@
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
//! compute_ctl can fetch it by calling the control plane's API.
use std::collections::HashMap;
use std::fmt::Display;
use anyhow::anyhow;
use indexmap::IndexMap;
use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use url::Url;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -181,6 +184,11 @@ pub struct ComputeSpec {
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
#[serde(default)]
pub autoprewarm: bool,
/// Suspend timeout in seconds.
///
/// We use this value to derive other values, such as the collection interval for the installed-extensions metric.
pub suspend_timeout_seconds: i64,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -429,6 +437,47 @@ pub struct JwksSettings {
pub jwt_audience: Option<String>,
}
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
#[derive(Clone, Copy, Debug, Default)]
pub enum PageserverProtocol {
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
#[default]
Libpq,
/// A newer, gRPC-based protocol. Uses grpc:// scheme.
Grpc,
}
impl PageserverProtocol {
/// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given.
/// Errors if the connstring is an invalid URL.
pub fn from_connstring(connstring: &str) -> anyhow::Result<Self> {
let scheme = match Url::parse(connstring) {
Ok(url) => url.scheme().to_lowercase(),
Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()),
Err(err) => return Err(anyhow!("invalid connstring URL: {err}")),
};
match scheme.as_str() {
"postgresql" | "postgres" => Ok(Self::Libpq),
"grpc" => Ok(Self::Grpc),
scheme => Err(anyhow!("invalid protocol scheme: {scheme}")),
}
}
/// Returns the URL scheme for the protocol, for use in connstrings.
pub fn scheme(&self) -> &'static str {
match self {
Self::Libpq => "postgresql",
Self::Grpc => "grpc",
}
}
}
impl Display for PageserverProtocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.scheme())
}
}
#[cfg(test)]
mod tests {
use std::fs::File;
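A hypothetical unit test (illustrative, not part of this commit) makes the parsing rules above concrete: postgresql:// and postgres:// select libpq, grpc:// selects gRPC, a scheme-less connstring falls back to the libpq default, and any other scheme is an error.

#[test]
fn pageserver_protocol_from_connstring_sketch() {
    use super::PageserverProtocol;

    // Explicit libpq scheme.
    assert!(matches!(
        PageserverProtocol::from_connstring("postgresql://no_user@ps1:6400").unwrap(),
        PageserverProtocol::Libpq
    ));
    // gRPC scheme.
    assert!(matches!(
        PageserverProtocol::from_connstring("grpc://ps1:51051").unwrap(),
        PageserverProtocol::Grpc
    ));
    // No URL scheme at all: defaults to libpq.
    assert!(matches!(
        PageserverProtocol::from_connstring("host=ps1 port=6400").unwrap(),
        PageserverProtocol::Libpq
    ));
    // Unknown scheme: rejected.
    assert!(PageserverProtocol::from_connstring("http://ps1:6400").is_err());
}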

View File

@@ -3,6 +3,7 @@
"timestamp": "2021-05-23T18:25:43.511Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
"suspend_timeout_seconds": 3600,
"cluster": {
"cluster_id": "test-cluster-42",

View File

@@ -30,12 +30,13 @@ humantime-serde.workspace = true
chrono = { workspace = true, features = ["serde"] }
itertools.workspace = true
storage_broker.workspace = true
camino = {workspace = true, features = ["serde1"]}
camino = { workspace = true, features = ["serde1"] }
remote_storage.workspace = true
postgres_backend.workspace = true
nix = {workspace = true, optional = true}
nix = { workspace = true, optional = true }
reqwest.workspace = true
rand.workspace = true
tracing.workspace = true
tracing-utils.workspace = true
once_cell.workspace = true

View File

@@ -546,6 +546,11 @@ pub struct TimelineImportRequest {
pub sk_set: Vec<NodeId>,
}
#[derive(serde::Serialize, serde::Deserialize, Clone)]
pub struct TimelineSafekeeperMigrateRequest {
pub new_sk_set: Vec<NodeId>,
}
#[cfg(test)]
mod test {
use serde_json;

View File

@@ -21,7 +21,9 @@ use utils::{completion, serde_system_time};
use crate::config::Ratio;
use crate::key::{CompactKey, Key};
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
use crate::shard::{
DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId,
};
/// The state of a tenant in this pageserver.
///
@@ -475,7 +477,7 @@ pub struct TenantShardSplitResponse {
}
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
#[derive(Serialize, Deserialize, Debug)]
#[derive(Clone, Copy, Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
pub count: ShardCount,
@@ -497,6 +499,15 @@ impl Default for ShardParameters {
}
}
impl From<ShardIdentity> for ShardParameters {
fn from(identity: ShardIdentity) -> Self {
Self {
count: identity.count,
stripe_size: identity.stripe_size,
}
}
}
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub enum FieldPatch<T> {
Upsert(T),
@@ -1724,6 +1735,7 @@ pub enum DownloadRemoteLayersTaskState {
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
pub ignore_lease_deadline: Option<bool>,
}
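Both fields are optional on the wire, so existing callers that issue manual GC requests and omit the new flag keep the default behaviour. A sketch of the resulting JSON as a hypothetical test (assuming no serde renames, as in the definition above):

#[test]
fn timeline_gc_request_wire_format_sketch() {
    let req = TimelineGcRequest {
        gc_horizon: None,
        ignore_lease_deadline: Some(true), // run GC even within the lease deadline
    };
    assert_eq!(
        serde_json::to_string(&req).unwrap(),
        r#"{"gc_horizon":null,"ignore_lease_deadline":true}"#
    );
}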
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -37,6 +37,7 @@ use std::hash::{Hash, Hasher};
pub use ::utils::shard::*;
use postgres_ffi_types::forknum::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::critical;
use crate::key::Key;
use crate::models::ShardParameters;
@@ -179,7 +180,7 @@ impl ShardIdentity {
/// For use when creating ShardIdentity instances for new shards, where a creation request
/// specifies the ShardParameters that apply to all shards.
pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self {
Self {
number,
count: params.count,
@@ -188,6 +189,17 @@ impl ShardIdentity {
}
}
/// Asserts that the given shard identities are equal. Changes to shard parameters will likely
/// result in data corruption.
pub fn assert_equal(&self, other: ShardIdentity) {
if self != &other {
// TODO: for now, we're conservative and just log errors in production. Turn this into a
// real assertion when we're confident it doesn't misfire, and also reject requests that
// attempt to change it with an error response.
critical!("shard identity mismatch: {self:?} != {other:?}");
}
}
fn is_broken(&self) -> bool {
self.layout == LAYOUT_BROKEN
}

View File

@@ -173,7 +173,7 @@ pub struct TimelineStatus {
}
/// Request to switch membership configuration.
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct TimelineMembershipSwitchRequest {
pub mconf: Configuration,
@@ -184,6 +184,8 @@ pub struct TimelineMembershipSwitchRequest {
pub struct TimelineMembershipSwitchResponse {
pub previous_conf: Configuration,
pub current_conf: Configuration,
pub term: Term,
pub flush_lsn: Lsn,
}
#[derive(Clone, Copy, Serialize, Deserialize)]

View File

@@ -11,6 +11,7 @@ futures.workspace = true
pageserver_api.workspace = true
postgres_ffi_types.workspace = true
prost.workspace = true
prost-types.workspace = true
strum.workspace = true
strum_macros.workspace = true
thiserror.workspace = true

View File

@@ -35,6 +35,8 @@
syntax = "proto3";
package page_api;
import "google/protobuf/timestamp.proto";
service PageService {
// Returns whether a relation exists.
rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
@@ -64,6 +66,10 @@ service PageService {
// Fetches an SLRU segment.
rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
// collect the LSN until the lease expires. Must be acquired on all relevant shards.
rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse);
}
// The LSN a request should read at.
@@ -252,3 +258,17 @@ message GetSlruSegmentRequest {
message GetSlruSegmentResponse {
bytes segment = 1;
}
// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
// collect the LSN until the lease expires. Must be acquired on all relevant shards.
message LeaseLsnRequest {
// The LSN to lease. Can't be 0 or below the current GC cutoff.
uint64 lsn = 1;
}
// Lease acquisition response. If the lease could not be granted because the LSN has already been
// garbage collected, a FailedPrecondition status will be returned instead.
message LeaseLsnResponse {
// The lease expiration time.
google.protobuf.Timestamp expires = 1;
}

View File

@@ -187,4 +187,17 @@ impl Client {
let response = self.client.get_slru_segment(proto_req).await?;
Ok(response.into_inner().try_into()?)
}
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
///
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
/// acquired because the LSN has already been garbage collected.
pub async fn lease_lsn(
&mut self,
req: model::LeaseLsnRequest,
) -> Result<model::LeaseLsnResponse, tonic::Status> {
let req = proto::LeaseLsnRequest::from(req);
Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
}
}

View File

@@ -16,6 +16,7 @@
//! stream combinators without dealing with errors, and avoids validating the same message twice.
use std::fmt::Display;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use bytes::Bytes;
use postgres_ffi_types::Oid;
@@ -703,3 +704,54 @@ impl From<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
// SlruKind is defined in pageserver_api::reltag.
pub type SlruKind = pageserver_api::reltag::SlruKind;
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
/// collect the LSN until the lease expires.
pub struct LeaseLsnRequest {
/// The LSN to lease.
pub lsn: Lsn,
}
impl TryFrom<proto::LeaseLsnRequest> for LeaseLsnRequest {
type Error = ProtocolError;
fn try_from(pb: proto::LeaseLsnRequest) -> Result<Self, Self::Error> {
if pb.lsn == 0 {
return Err(ProtocolError::Missing("lsn"));
}
Ok(Self { lsn: Lsn(pb.lsn) })
}
}
impl From<LeaseLsnRequest> for proto::LeaseLsnRequest {
fn from(request: LeaseLsnRequest) -> Self {
Self { lsn: request.lsn.0 }
}
}
/// Lease expiration time. If the lease could not be granted because the LSN has already been
/// garbage collected, a FailedPrecondition status will be returned instead.
pub type LeaseLsnResponse = SystemTime;
impl TryFrom<proto::LeaseLsnResponse> for LeaseLsnResponse {
type Error = ProtocolError;
fn try_from(pb: proto::LeaseLsnResponse) -> Result<Self, Self::Error> {
let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?;
UNIX_EPOCH
.checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32))
.ok_or_else(|| ProtocolError::invalid("expires", expires))
}
}
impl From<LeaseLsnResponse> for proto::LeaseLsnResponse {
fn from(response: LeaseLsnResponse) -> Self {
let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default();
Self {
expires: Some(prost_types::Timestamp {
seconds: expires.as_secs() as i64,
nanos: expires.subsec_nanos() as i32,
}),
}
}
}

View File

@@ -1893,9 +1893,13 @@ async fn update_tenant_config_handler(
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.get_generation(),
&ShardParameters::default(),
ShardParameters::from(tenant.get_shard_identity()),
);
tenant
.get_shard_identity()
.assert_equal(location_conf.shard); // not strictly necessary since we construct it above
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -1937,9 +1941,13 @@ async fn patch_tenant_config_handler(
let location_conf = LocationConf::attached_single(
updated,
tenant.get_generation(),
&ShardParameters::default(),
ShardParameters::from(tenant.get_shard_identity()),
);
tenant
.get_shard_identity()
.assert_equal(location_conf.shard); // not strictly necessary since we construct it above
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

View File

@@ -14,6 +14,7 @@ use std::{io, str};
use anyhow::{Context as _, bail};
use bytes::{Buf as _, BufMut as _, BytesMut};
use chrono::Utc;
use futures::future::BoxFuture;
use futures::{FutureExt, Stream};
use itertools::Itertools;
@@ -3760,6 +3761,36 @@ impl proto::PageService for GrpcPageServiceHandler {
let resp: page_api::GetSlruSegmentResponse = resp.segment;
Ok(tonic::Response::new(resp.into()))
}
#[instrument(skip_all, fields(lsn))]
async fn lease_lsn(
&self,
req: tonic::Request<proto::LeaseLsnRequest>,
) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
// Validate and convert the request, and decorate the span.
let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?;
span_record!(lsn=%req.lsn);
// Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted.
let lease_length = timeline.get_lsn_lease_length();
let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) {
Ok(lease) => lease.valid_until,
Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))),
};
// TODO: is this spammy? Move it compute-side?
info!(
"acquired lease for {} until {}",
req.lsn,
chrono::DateTime::<Utc>::from(expires).to_rfc3339()
);
Ok(tonic::Response::new(expires.into()))
}
}
/// gRPC middleware layer that handles observability concerns:

View File

@@ -3015,7 +3015,7 @@ mod tests {
// This shard will get the even blocks
let shard = ShardIdentity::from_params(
ShardNumber(0),
&ShardParameters {
ShardParameters {
count: ShardCount(2),
stripe_size: ShardStripeSize(1),
},

View File

@@ -34,7 +34,7 @@ use once_cell::sync::Lazy;
pub use pageserver_api::models::TenantState;
use pageserver_api::models::{self, RelSizeMigration};
use pageserver_api::models::{
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem,
WalRedoManagerStatus,
};
use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
@@ -180,6 +180,7 @@ pub(super) struct AttachedTenantConf {
impl AttachedTenantConf {
fn new(
conf: &'static PageServerConf,
tenant_conf: pageserver_api::models::TenantConfig,
location: AttachedLocationConfig,
) -> Self {
@@ -191,9 +192,7 @@ impl AttachedTenantConf {
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
Some(
tokio::time::Instant::now()
+ tenant_conf
.lsn_lease_length
.unwrap_or(LsnLease::DEFAULT_LENGTH),
+ TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf),
)
} else {
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
@@ -208,10 +207,13 @@ impl AttachedTenantConf {
}
}
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
fn try_from(
conf: &'static PageServerConf,
location_conf: LocationConf,
) -> anyhow::Result<Self> {
match &location_conf.mode {
LocationMode::Attached(attach_conf) => {
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf))
}
LocationMode::Secondary(_) => {
anyhow::bail!(
@@ -1145,6 +1147,22 @@ pub(crate) enum LoadConfigError {
NotFound(Utf8PathBuf),
}
pub(crate) enum IgnoreLeaseDeadline {
Default,
No,
Yes,
}
impl From<Option<bool>> for IgnoreLeaseDeadline {
fn from(value: Option<bool>) -> Self {
match value {
None => IgnoreLeaseDeadline::Default,
Some(false) => IgnoreLeaseDeadline::No,
Some(true) => IgnoreLeaseDeadline::Yes,
}
}
}
impl TenantShard {
/// Yet another helper for timeline initialization.
///
@@ -3076,6 +3094,7 @@ impl TenantShard {
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
ignore_lease_deadline: IgnoreLeaseDeadline,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, GcError> {
@@ -3101,7 +3120,23 @@ impl TenantShard {
return Ok(GcResult::default());
}
if conf.is_gc_blocked_by_lsn_lease_deadline() {
// Skip GC if we're within lease deadline.
//
// Rust & Python tests set single-digit second gc_period and/or
// do immediate gc via mgmt API. The lease deadline is hard-coded to
// 10min, which would make most if not all tests skip GC here.
// Rust tests could in theory tokio::time::advance, but Python
// tests have no such options.
// Since lease deadline is a crutch we hopefully eventually replace
// with durable leases, take a shortcut here and skip lease deadline check
// for all tests.
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
let ignore_lease_deadline = match ignore_lease_deadline {
IgnoreLeaseDeadline::Default => cfg!(test) || cfg!(feature = "testing"),
IgnoreLeaseDeadline::No => false,
IgnoreLeaseDeadline::Yes => true,
};
if !ignore_lease_deadline && conf.is_gc_blocked_by_lsn_lease_deadline() {
info!("Skipping GC because lsn lease deadline is not reached");
return Ok(GcResult::default());
}
@@ -3872,6 +3907,10 @@ impl TenantShard {
&self.tenant_shard_id
}
pub(crate) fn get_shard_identity(&self) -> ShardIdentity {
self.shard_identity
}
pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
self.shard_identity.stripe_size
}
@@ -4197,10 +4236,16 @@ impl TenantShard {
}
pub fn get_lsn_lease_length(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf)
}
pub fn get_lsn_lease_length_impl(
conf: &'static PageServerConf,
tenant_conf: &pageserver_api::models::TenantConfig,
) -> Duration {
tenant_conf
.lsn_lease_length
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
.unwrap_or(conf.default_tenant_conf.lsn_lease_length)
}
pub fn get_timeline_offloading_enabled(&self) -> bool {
@@ -4525,6 +4570,10 @@ impl TenantShard {
Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
}
/// Stores a tenant location config to disk.
///
/// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid
/// changes to shard parameters that may result in data corruption.
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) async fn persist_tenant_config(
conf: &'static PageServerConf,
@@ -6008,7 +6057,7 @@ pub(crate) mod harness {
AttachedTenantConf::try_from(LocationConf::attached_single(
self.tenant_conf.clone(),
self.generation,
&ShardParameters::default(),
ShardParameters::default(),
))
.unwrap(),
self.shard_identity,
@@ -6660,17 +6709,13 @@ mod tests {
tline.freeze_and_flush().await.map_err(|e| e.into())
}
#[tokio::test(start_paused = true)]
#[tokio::test]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
.await?
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -6685,6 +6730,7 @@ mod tests {
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&CancellationToken::new(),
&ctx,
)
@@ -6798,6 +6844,7 @@ mod tests {
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&CancellationToken::new(),
&ctx,
)
@@ -6858,6 +6905,7 @@ mod tests {
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&CancellationToken::new(),
&ctx,
)
@@ -6892,6 +6940,7 @@ mod tests {
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&CancellationToken::new(),
&ctx,
)
@@ -7173,7 +7222,14 @@ mod tests {
// this doesn't really need to use the timeline_id target, but it is closer to what it
// originally was.
let res = tenant
.gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx)
.gc_iteration(
Some(timeline.timeline_id),
0,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&cancel,
ctx,
)
.await?;
assert_eq!(res.layers_removed, 0, "this never removes anything");
@@ -7791,7 +7847,14 @@ mod tests {
// Perform a cycle of flush, and GC
tline.freeze_and_flush().await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.gc_iteration(
Some(tline.timeline_id),
0,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&cancel,
&ctx,
)
.await?;
}
@@ -7882,7 +7945,14 @@ mod tests {
tline.freeze_and_flush().await?;
tline.compact(&cancel, EnumSet::default(), &ctx).await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.gc_iteration(
Some(tline.timeline_id),
0,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&cancel,
&ctx,
)
.await?;
}
@@ -8218,7 +8288,14 @@ mod tests {
)
.await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.gc_iteration(
Some(tline.timeline_id),
0,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&cancel,
&ctx,
)
.await?;
}
}
@@ -9369,17 +9446,13 @@ mod tests {
Ok(())
}
#[tokio::test(start_paused = true)]
#[tokio::test]
async fn test_lsn_lease() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
.await
.unwrap()
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_lsn = Lsn(0x100);
@@ -9443,6 +9516,7 @@ mod tests {
Some(TIMELINE_ID),
0,
Duration::ZERO,
IgnoreLeaseDeadline::Default,
&CancellationToken::new(),
&ctx,
)

View File

@@ -12,6 +12,7 @@
use pageserver_api::models;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::{Deserialize, Serialize};
use utils::critical;
use utils::generation::Generation;
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -136,7 +137,7 @@ impl LocationConf {
pub(crate) fn attached_single(
tenant_conf: pageserver_api::models::TenantConfig,
generation: Generation,
shard_params: &models::ShardParameters,
shard_params: models::ShardParameters,
) -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
@@ -171,6 +172,16 @@ impl LocationConf {
}
}
// This should never happen.
// TODO: turn this into a proper assertion.
if stripe_size != self.shard.stripe_size {
critical!(
"stripe size mismatch: {} != {}",
self.shard.stripe_size,
stripe_size,
);
}
self.shard.stripe_size = stripe_size;
}

View File

@@ -664,7 +664,7 @@ pub async fn init_tenant_mgr(
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
@@ -842,7 +842,7 @@ impl TenantManager {
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(
AttachedTenantConf::try_from(new_location_config.clone())
AttachedTenantConf::try_from(self.conf, new_location_config.clone())
.map_err(UpsertLocationError::BadRequest)?,
);
@@ -880,6 +880,9 @@ impl TenantManager {
// phase of writing config and/or waiting for flush, before returning.
match fast_path_taken {
Some(FastPathModified::Attached(tenant)) => {
tenant
.shard_identity
.assert_equal(new_location_config.shard);
TenantShard::persist_tenant_config(
self.conf,
&tenant_shard_id,
@@ -914,7 +917,10 @@ impl TenantManager {
return Ok(Some(tenant));
}
Some(FastPathModified::Secondary(_secondary_tenant)) => {
Some(FastPathModified::Secondary(secondary_tenant)) => {
secondary_tenant
.shard_identity
.assert_equal(new_location_config.shard);
TenantShard::persist_tenant_config(
self.conf,
&tenant_shard_id,
@@ -948,6 +954,10 @@ impl TenantManager {
match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
tenant
.shard_identity
.assert_equal(new_location_config.shard);
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
@@ -981,9 +991,13 @@ impl TenantManager {
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Eager;
}
Some(TenantSlot::Secondary(state)) => {
Some(TenantSlot::Secondary(secondary_tenant)) => {
secondary_tenant
.shard_identity
.assert_equal(new_location_config.shard);
info!("Shutting down secondary tenant");
state.shutdown().await;
secondary_tenant.shutdown().await;
}
Some(TenantSlot::InProgress(_)) => {
// This should never happen: acquire_slot should error out
@@ -1032,7 +1046,7 @@ impl TenantManager {
// Testing hack: if we are configured with no control plane, then drop the generation
// from upserts. This enables creating generation-less tenants even though neon_local
// always uses generations when calling the location conf API.
let attached_conf = AttachedTenantConf::try_from(new_location_config)
let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config)
.map_err(UpsertLocationError::BadRequest)?;
let tenant = tenant_spawn(
@@ -1236,7 +1250,7 @@ impl TenantManager {
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
AttachedTenantConf::try_from(self.conf, config)?,
shard_identity,
None,
SpawnMode::Eager,
@@ -2117,7 +2131,7 @@ impl TenantManager {
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?,
AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?,
shard_identity,
None,
SpawnMode::Eager,
@@ -2351,7 +2365,14 @@ impl TenantManager {
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.gc_iteration(
Some(timeline_id),
gc_horizon,
pitr,
crate::tenant::IgnoreLeaseDeadline::from(gc_req.ignore_lease_deadline),
&cancel,
&ctx,
)
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
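
A sketch of how a request flag can map onto such an override enum, assuming the request field is a plain bool; the definition below is illustrative, not the real `IgnoreLeaseDeadline`.

#[derive(Debug, Default, Clone, Copy)]
enum IgnoreLeaseDeadline {
    #[default]
    Default, // respect the lease deadline as usual
    Yes,     // caller explicitly asked GC to ignore it
}

impl From<bool> for IgnoreLeaseDeadline {
    fn from(ignore: bool) -> Self {
        if ignore { Self::Yes } else { Self::Default }
    }
}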

View File

@@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant {
// Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However,
// storing these enables us to report our full LocationConf, enabling convenient reconciliation
// by the control plane (see [`Self::get_location_conf`])
shard_identity: ShardIdentity,
pub(crate) shard_identity: ShardIdentity,
tenant_conf: std::sync::Mutex<pageserver_api::models::TenantConfig>,
// Internal state used by the Downloader.

View File

@@ -55,11 +55,11 @@ pub struct BatchLayerWriter {
}
impl BatchLayerWriter {
pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
Ok(Self {
pub fn new(conf: &'static PageServerConf) -> Self {
Self {
generated_layer_writers: Vec::new(),
conf,
})
}
}
pub fn add_unfinished_image_writer(
@@ -182,7 +182,7 @@ impl BatchLayerWriter {
/// An image writer that takes images and produces multiple image layers.
#[must_use]
pub struct SplitImageLayerWriter<'a> {
inner: ImageLayerWriter,
inner: Option<ImageLayerWriter>,
target_layer_size: u64,
lsn: Lsn,
conf: &'static PageServerConf,
@@ -196,7 +196,7 @@ pub struct SplitImageLayerWriter<'a> {
impl<'a> SplitImageLayerWriter<'a> {
#[allow(clippy::too_many_arguments)]
pub async fn new(
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -205,30 +205,19 @@ impl<'a> SplitImageLayerWriter<'a> {
target_layer_size: u64,
gate: &'a utils::sync::gate::Gate,
cancel: CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
) -> Self {
Self {
target_layer_size,
inner: ImageLayerWriter::new(
conf,
timeline_id,
tenant_shard_id,
&(start_key..Key::MAX),
lsn,
gate,
cancel.clone(),
ctx,
)
.await?,
inner: None,
conf,
timeline_id,
tenant_shard_id,
batches: BatchLayerWriter::new(conf).await?,
batches: BatchLayerWriter::new(conf),
lsn,
start_key,
gate,
cancel,
})
}
}
pub async fn put_image(
@@ -237,12 +226,31 @@ impl<'a> SplitImageLayerWriter<'a> {
img: Bytes,
ctx: &RequestContext,
) -> Result<(), PutError> {
if self.inner.is_none() {
self.inner = Some(
ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(self.start_key..Key::MAX),
self.lsn,
self.gate,
self.cancel.clone(),
ctx,
)
.await
.map_err(PutError::Other)?,
);
}
let inner = self.inner.as_mut().unwrap();
// The current estimation is an upper bound of the space that the key/image could take
// because we did not consider compression in this estimation. The resulting image layer
// could be smaller than the target size.
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
if inner.num_keys() >= 1
&& inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_image_writer = ImageLayerWriter::new(
self.conf,
@@ -256,7 +264,7 @@ impl<'a> SplitImageLayerWriter<'a> {
)
.await
.map_err(PutError::Other)?;
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
let prev_image_writer = std::mem::replace(inner, next_image_writer);
self.batches.add_unfinished_image_writer(
prev_image_writer,
self.start_key..key,
@@ -264,7 +272,7 @@ impl<'a> SplitImageLayerWriter<'a> {
);
self.start_key = key;
}
self.inner.put_image(key, img, ctx).await
inner.put_image(key, img, ctx).await
}
pub(crate) async fn finish_with_discard_fn<D, F>(
@@ -281,8 +289,10 @@ impl<'a> SplitImageLayerWriter<'a> {
let Self {
mut batches, inner, ..
} = self;
if inner.num_keys() != 0 {
batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
if let Some(inner) = inner {
if inner.num_keys() != 0 {
batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
}
}
batches.finish_with_discard_fn(tline, ctx, discard_fn).await
}
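
The lazy-initialisation idea above, reduced to a toy: defer building the inner writer until the first write, so a writer that never receives data never creates an output file. `Vec<u8>` stands in for the real `ImageLayerWriter`.

struct LazyWriter {
    inner: Option<Vec<u8>>, // created on first put(), not in new()
}

impl LazyWriter {
    fn new() -> Self {
        Self { inner: None }
    }

    fn put(&mut self, byte: u8) {
        // The first write materialises the inner writer.
        self.inner.get_or_insert_with(Vec::new).push(byte);
    }

    fn finish(self) -> Option<Vec<u8>> {
        // If nothing was ever written, there is nothing to flush.
        self.inner.filter(|buf| !buf.is_empty())
    }
}
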
@@ -319,7 +329,7 @@ pub struct SplitDeltaLayerWriter<'a> {
}
impl<'a> SplitDeltaLayerWriter<'a> {
pub async fn new(
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -327,8 +337,8 @@ impl<'a> SplitDeltaLayerWriter<'a> {
target_layer_size: u64,
gate: &'a utils::sync::gate::Gate,
cancel: CancellationToken,
) -> anyhow::Result<Self> {
Ok(Self {
) -> Self {
Self {
target_layer_size,
inner: None,
conf,
@@ -336,10 +346,10 @@ impl<'a> SplitDeltaLayerWriter<'a> {
tenant_shard_id,
lsn_range,
last_key_written: Key::MIN,
batches: BatchLayerWriter::new(conf).await?,
batches: BatchLayerWriter::new(conf),
gate,
cancel,
})
}
}
pub async fn put_value(
@@ -497,10 +507,7 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
&ctx,
)
.await
.unwrap();
);
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
@@ -510,9 +517,7 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
)
.await
.unwrap();
);
image_writer
.put_image(get_key(0), get_img(0), &ctx)
@@ -578,10 +583,7 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
&ctx,
)
.await
.unwrap();
);
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
@@ -590,9 +592,7 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
)
.await
.unwrap();
);
const N: usize = 2000;
for i in 0..N {
let i = i as u32;
@@ -679,10 +679,7 @@ mod tests {
4 * 1024,
&tline.gate,
tline.cancel.clone(),
&ctx,
)
.await
.unwrap();
);
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
@@ -692,9 +689,7 @@ mod tests {
4 * 1024,
&tline.gate,
tline.cancel.clone(),
)
.await
.unwrap();
);
image_writer
.put_image(get_key(0), get_img(0), &ctx)
@@ -770,9 +765,7 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
)
.await
.unwrap();
);
for i in 0..N {
let i = i as u32;

View File

@@ -406,6 +406,7 @@ async fn gc_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
None,
gc_horizon,
tenant.get_pitr_interval(),
crate::tenant::IgnoreLeaseDeadline::Default,
&cancel,
&ctx,
))

View File

@@ -761,7 +761,7 @@ pub(crate) enum CreateImageLayersError {
PageReconstructError(#[source] PageReconstructError),
#[error(transparent)]
Other(#[from] anyhow::Error),
Other(anyhow::Error),
}
impl From<layer_manager::Shutdown> for CreateImageLayersError {
@@ -5586,7 +5586,7 @@ impl Timeline {
self.should_check_if_image_layers_required(lsn)
};
let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
let mut batch_image_writer = BatchLayerWriter::new(self.conf);
let mut all_generated = true;
@@ -5690,7 +5690,8 @@ impl Timeline {
self.cancel.clone(),
ctx,
)
.await?;
.await
.map_err(CreateImageLayersError::Other)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
@@ -5785,7 +5786,10 @@ impl Timeline {
}
}
let image_layers = batch_image_writer.finish(self, ctx).await?;
let image_layers = batch_image_writer
.finish(self, ctx)
.await
.map_err(CreateImageLayersError::Other)?;
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;

View File

@@ -3500,22 +3500,16 @@ impl Timeline {
// Only create image layers when there is no ancestor branches. TODO: create covering image layer
// when some condition meet.
let mut image_layer_writer = if !has_data_below {
Some(
SplitImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
job_desc.compaction_key_range.start,
lowest_retain_lsn,
self.get_compaction_target_size(),
&self.gate,
self.cancel.clone(),
ctx,
)
.await
.context("failed to create image layer writer")
.map_err(CompactionError::Other)?,
)
Some(SplitImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
job_desc.compaction_key_range.start,
lowest_retain_lsn,
self.get_compaction_target_size(),
&self.gate,
self.cancel.clone(),
))
} else {
None
};
@@ -3528,10 +3522,7 @@ impl Timeline {
self.get_compaction_target_size(),
&self.gate,
self.cancel.clone(),
)
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?;
);
#[derive(Default)]
struct RewritingLayers {
@@ -4327,7 +4318,8 @@ impl TimelineAdaptor {
self.timeline.cancel.clone(),
ctx,
)
.await?;
.await
.map_err(CreateImageLayersError::Other)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
@@ -4336,7 +4328,10 @@ impl TimelineAdaptor {
});
let keyspace = KeySpace {
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
ranges: self
.get_keyspace(key_range, lsn, ctx)
.await
.map_err(CreateImageLayersError::Other)?,
};
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
let outcome = self
@@ -4355,9 +4350,13 @@ impl TimelineAdaptor {
unfinished_image_layer,
} = outcome
{
let (desc, path) = unfinished_image_layer.finish(ctx).await?;
let (desc, path) = unfinished_image_layer
.finish(ctx)
.await
.map_err(CreateImageLayersError::Other)?;
let image_layer =
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)
.map_err(CreateImageLayersError::Other)?;
self.new_images.push(image_layer);
}
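
A sketch of the error-handling shift visible in these hunks: without `#[from]` on the `Other` variant, `?` no longer converts `anyhow::Error` implicitly, so each call site names the variant it maps into. The enum and functions below are illustrative, assuming `thiserror` as in the surrounding code.

use thiserror::Error;

#[derive(Debug, Error)]
enum CreateError {
    #[error("cancelled")]
    Cancelled,
    #[error(transparent)]
    Other(anyhow::Error), // no #[from]: conversion is opt-in
}

fn create_layer() -> Result<(), CreateError> {
    write_layer().map_err(CreateError::Other)?; // explicit, visible mapping
    Ok(())
}

fn write_layer() -> anyhow::Result<()> {
    Ok(())
}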

View File

@@ -885,7 +885,7 @@ async fn remote_copy(
}
tracing::info!("Deleting orphan layer file to make way for hard linking");
// Delete orphan layer file and try again, to ensure this layer has a well understood source
std::fs::remove_file(adopted_path)
std::fs::remove_file(&adoptee_path)
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
std::fs::hard_link(adopted_path, &adoptee_path)
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;

View File

@@ -887,7 +887,7 @@ mod tests {
.expect("we still have it");
}
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> Key {
rel_block_to_key(
RelTag {
spcnode: 1663,
@@ -917,14 +917,14 @@ mod tests {
let child0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
shard: ShardIdentity::from_params(ShardNumber(0), child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child1 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
shard: ShardIdentity::from_params(ShardNumber(1), child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
@@ -937,7 +937,7 @@ mod tests {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
&StubManager {
shards: vec![parent.clone()],
},
@@ -961,7 +961,7 @@ mod tests {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
&StubManager {
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
},
@@ -978,7 +978,7 @@ mod tests {
let parent_handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)),
&StubManager {
shards: vec![parent.clone()],
},
@@ -995,7 +995,7 @@ mod tests {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
&StubManager {
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
},

View File

@@ -78,16 +78,6 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static {
fn get_error_kind(&self) -> ErrorKind;
}
impl ReportableError for postgres_client::error::Error {
fn get_error_kind(&self) -> ErrorKind {
if self.as_db_error().is_some() {
ErrorKind::Postgres
} else {
ErrorKind::Compute
}
}
}
/// Flattens `Result<Result<T>>` into `Result<T>`.
pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
r.context("join error").and_then(|x| x)

View File

@@ -404,7 +404,15 @@ impl ReportableError for HttpConnError {
fn get_error_kind(&self) -> ErrorKind {
match self {
HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
HttpConnError::PostgresConnectionError(p) => p.get_error_kind(),
HttpConnError::PostgresConnectionError(p) => {
if p.as_db_error().is_some() {
// postgres rejected the connection
ErrorKind::Postgres
} else {
// couldn't even reach postgres
ErrorKind::Compute
}
}
HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute,
HttpConnError::ComputeCtl(_) => ErrorKind::Service,
HttpConnError::JwtPayloadError(_) => ErrorKind::User,

View File

@@ -22,7 +22,7 @@ use serde_json::Value;
use serde_json::value::RawValue;
use tokio::time::{self, Instant};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info};
use tracing::{Level, debug, error, info};
use typed_json::json;
use url::Url;
use uuid::Uuid;
@@ -390,12 +390,35 @@ pub(crate) async fn handle(
let line = get(db_error, |db| db.line().map(|l| l.to_string()));
let routine = get(db_error, |db| db.routine());
tracing::info!(
kind=error_kind.to_metric_label(),
error=%e,
msg=message,
"forwarding error to user"
);
match &e {
SqlOverHttpError::Postgres(e)
if e.as_db_error().is_some() && error_kind == ErrorKind::User =>
{
// this error contains too much info, and it's not an error we care about.
if tracing::enabled!(Level::DEBUG) {
tracing::debug!(
kind=error_kind.to_metric_label(),
error=%e,
msg=message,
"forwarding error to user"
);
} else {
tracing::info!(
kind = error_kind.to_metric_label(),
error = "bad query",
"forwarding error to user"
);
}
}
_ => {
tracing::info!(
kind=error_kind.to_metric_label(),
error=%e,
msg=message,
"forwarding error to user"
);
}
}
json_response(
e.get_http_status_code(),
@@ -460,7 +483,15 @@ impl ReportableError for SqlOverHttpError {
SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
SqlOverHttpError::Postgres(p) => p.get_error_kind(),
// customer initiated SQL errors.
SqlOverHttpError::Postgres(p) => {
if p.as_db_error().is_some() {
ErrorKind::User
} else {
ErrorKind::Compute
}
}
// proxy initiated SQL errors.
SqlOverHttpError::InternalPostgres(p) => {
if p.as_db_error().is_some() {
ErrorKind::Service
@@ -468,6 +499,7 @@ impl ReportableError for SqlOverHttpError {
ErrorKind::Compute
}
}
// postgres returned a bad row format that we couldn't parse.
SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
SqlOverHttpError::Cancelled(c) => c.get_error_kind(),
}
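
A small self-contained version of the logging branch above: emit the full, user-derived error text only when DEBUG logging is enabled, and fall back to a terse INFO line otherwise. Function and field names are illustrative.

use tracing::Level;

fn log_forwarded_error(kind: &str, err: &str) {
    if tracing::enabled!(Level::DEBUG) {
        // Verbose variant: includes the (potentially noisy) error text.
        tracing::debug!(kind, error = %err, "forwarding error to user");
    } else {
        // Terse variant: the query text never reaches the logs at INFO.
        tracing::info!(kind, error = "bad query", "forwarding error to user");
    }
}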

View File

@@ -52,7 +52,7 @@ pub trait ResponseErrorMessageExt: Sized {
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
if status.is_success() {
return Ok(self);
}
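
For comparison, the two predicates side by side; note the new form is slightly stricter, since 1xx and 3xx responses now fall through to the error-body path instead of being returned as-is.

fn old_check(status: reqwest::StatusCode) -> bool {
    // 1xx, 2xx and 3xx all passed this test.
    !(status.is_client_error() || status.is_server_error())
}

fn new_check(status: reqwest::StatusCode) -> bool {
    // Only 2xx passes now.
    status.is_success()
}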

View File

@@ -220,7 +220,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
stripe_size: ShardStripeSize(stripe_size),
};
self.shard =
Some(ShardIdentity::from_params(ShardNumber(number), &params));
Some(ShardIdentity::from_params(ShardNumber(number), params));
}
_ => {
return Err(QueryError::Other(anyhow::anyhow!(

View File

@@ -9,7 +9,7 @@ use anyhow::{Result, bail};
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_versioninfo::{PgMajorVersion, PgVersionId};
use safekeeper_api::membership::Configuration;
use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse};
use safekeeper_api::models::TimelineTermBumpResponse;
use safekeeper_api::{INITIAL_TERM, ServerInfo, Term};
use serde::{Deserialize, Serialize};
use tracing::info;
@@ -83,6 +83,11 @@ pub enum EvictionState {
Offloaded(Lsn),
}
pub struct MembershipSwitchResult {
pub previous_conf: Configuration,
pub current_conf: Configuration,
}
impl TimelinePersistentState {
/// commit_lsn is the same as start_lsn in the normal creation; see
/// `TimelineCreateRequest` comments.
@@ -261,10 +266,7 @@ where
/// Switch into membership configuration `to` if it is higher than the
/// current one.
pub async fn membership_switch(
&mut self,
to: Configuration,
) -> Result<TimelineMembershipSwitchResponse> {
pub async fn membership_switch(&mut self, to: Configuration) -> Result<MembershipSwitchResult> {
let before = self.mconf.clone();
// Is switch allowed?
if to.generation <= self.mconf.generation {
@@ -278,7 +280,7 @@ where
self.finish_change(&state).await?;
info!("switched membership conf to {} from {}", to, before);
}
Ok(TimelineMembershipSwitchResponse {
Ok(MembershipSwitchResult {
previous_conf: before,
current_conf: self.mconf.clone(),
})
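
A toy model of the switch semantics, under the assumption that only the generation matters for ordering: the configuration moves only to a strictly higher generation, and the caller always gets back both the previous and the resulting configuration. Types here are illustrative, not the safekeeper's real ones.

#[derive(Clone, Debug)]
struct Conf {
    generation: u64,
}

struct SwitchResult {
    previous: Conf,
    current: Conf,
}

fn membership_switch(current: &mut Conf, to: Conf) -> SwitchResult {
    let previous = current.clone();
    if to.generation > current.generation {
        *current = to; // only ever switch forward
    }
    SwitchResult { previous, current: current.clone() }
}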

View File

@@ -190,7 +190,14 @@ impl StateSK {
&mut self,
to: Configuration,
) -> Result<TimelineMembershipSwitchResponse> {
self.state_mut().membership_switch(to).await
let result = self.state_mut().membership_switch(to).await?;
Ok(TimelineMembershipSwitchResponse {
previous_conf: result.previous_conf,
current_conf: result.current_conf,
term: self.state().acceptor_state.term,
flush_lsn: self.flush_lsn(),
})
}
/// Close open WAL files to release FDs.

View File

@@ -154,8 +154,8 @@ pub struct PhysicalStorage {
/// record
///
/// Partial segment 002 has no WAL records, and it will be removed by the
/// next truncate_wal(). This flag will be set to true after the first
/// truncate_wal() call.
/// next truncate_wal(). This flag will be set to false after the first
/// successful truncate_wal() call.
///
/// [`write_lsn`]: Self::write_lsn
pending_wal_truncation: bool,
@@ -202,6 +202,8 @@ impl PhysicalStorage {
ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn,
);
if flush_lsn < state.commit_lsn {
// note: can never happen. find_end_of_wal returns provided start_lsn
// (state.commit_lsn in our case) if it doesn't find anything.
bail!(
"timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file",
ttid.timeline_id,

View File

@@ -20,6 +20,7 @@ camino.workspace = true
chrono.workspace = true
clap.workspace = true
clashmap.workspace = true
compute_api.workspace = true
cron.workspace = true
fail.workspace = true
futures.workspace = true

View File

@@ -5,7 +5,8 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::Context;
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus, PageserverProtocol};
use compute_api::spec::PageserverProtocol;
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use futures::StreamExt;
use hyper::StatusCode;
@@ -13,11 +14,12 @@ use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT;
use pageserver_api::controller_api::AvailabilityZone;
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, info_span};
use utils::backoff::{self};
use utils::id::{NodeId, TenantId};
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use crate::service::Config;
@@ -35,7 +37,7 @@ struct UnshardedComputeHookTenant {
preferred_az: Option<AvailabilityZone>,
// Must hold this lock to send a notification.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>>,
send_lock: Arc<tokio::sync::Mutex<Option<ComputeRemoteTenantState>>>,
}
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
@@ -48,7 +50,7 @@ struct ShardedComputeHookTenant {
// Must hold this lock to send a notification. The contents represent
// the last successfully sent notification, and are used to coalesce multiple
// updates by only sending when there is a chance since our last successful send.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>>,
send_lock: Arc<tokio::sync::Mutex<Option<ComputeRemoteTenantState>>>,
}
/// Represents our knowledge of the compute's state: we can update this when we get a
@@ -56,9 +58,9 @@ struct ShardedComputeHookTenant {
///
/// Should be wrapped in an Option<>, as we cannot always know the remote state.
#[derive(PartialEq, Eq, Debug)]
struct ComputeRemoteState {
struct ComputeRemoteState<R> {
// The request body which was acked by the compute
request: ComputeHookNotifyRequest,
request: R,
// Whether the cplane indicated that the state was applied to running computes, or just
// persisted. In the Neon control plane, this is the difference between a 423 response (meaning
@@ -66,6 +68,36 @@ struct ComputeRemoteState {
applied: bool,
}
type ComputeRemoteTenantState = ComputeRemoteState<NotifyAttachRequest>;
type ComputeRemoteTimelineState = ComputeRemoteState<NotifySafekeepersRequest>;
/// The trait which define the handler-specific types and methods.
/// We have two implementations of this trait so far:
/// - [`ComputeHookTenant`] for tenant attach notifications ("/notify-attach")
/// - [`ComputeHookTimeline`] for safekeeper change notifications ("/notify-safekeepers")
trait ApiMethod {
/// Type of the key which identifies the resource.
/// It's either TenantId for tenant attach notifications,
/// or TenantTimelineId for safekeeper change notifications.
type Key: std::cmp::Eq + std::hash::Hash + Clone;
type Request: serde::Serialize + std::fmt::Debug;
const API_PATH: &'static str;
fn maybe_send(
&self,
key: Self::Key,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeRemoteState<Self::Request>>>>,
) -> MaybeSendResult<Self::Request, Self::Key>;
async fn notify_local(
env: &LocalEnv,
cplane: &ComputeControlPlane,
req: &Self::Request,
) -> Result<(), NotifyError>;
}
enum ComputeHookTenant {
Unsharded(UnshardedComputeHookTenant),
Sharded(ShardedComputeHookTenant),
@@ -96,7 +128,7 @@ impl ComputeHookTenant {
}
}
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>> {
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeRemoteTenantState>>> {
match self {
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
@@ -190,19 +222,136 @@ impl ComputeHookTenant {
}
}
/// The state of a timeline we need to notify the compute about.
struct ComputeHookTimeline {
generation: SafekeeperGeneration,
safekeepers: Vec<SafekeeperInfo>,
send_lock: Arc<tokio::sync::Mutex<Option<ComputeRemoteTimelineState>>>,
}
impl ComputeHookTimeline {
/// Construct a new ComputeHookTimeline with the given safekeepers and generation.
fn new(generation: SafekeeperGeneration, safekeepers: Vec<SafekeeperInfo>) -> Self {
Self {
generation,
safekeepers,
send_lock: Arc::default(),
}
}
/// Update the state with a new SafekeepersUpdate.
/// Noop if the update generation is not greater than the current generation.
fn update(&mut self, sk_update: SafekeepersUpdate) {
if sk_update.generation > self.generation {
self.generation = sk_update.generation;
self.safekeepers = sk_update.safekeepers;
}
}
}
impl ApiMethod for ComputeHookTimeline {
type Key = TenantTimelineId;
type Request = NotifySafekeepersRequest;
const API_PATH: &'static str = "notify-safekeepers";
fn maybe_send(
&self,
ttid: TenantTimelineId,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeRemoteTimelineState>>>,
) -> MaybeSendNotifySafekeepersResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::timelines`] lock.
let Ok(locked) = self.send_lock.clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock((ttid, self.send_lock.clone()));
};
locked
}
};
if locked
.as_ref()
.is_some_and(|s| s.request.generation >= self.generation)
{
return MaybeSendResult::Noop;
}
MaybeSendResult::Transmit((
NotifySafekeepersRequest {
tenant_id: ttid.tenant_id,
timeline_id: ttid.timeline_id,
generation: self.generation,
safekeepers: self.safekeepers.clone(),
},
locked,
))
}
async fn notify_local(
_env: &LocalEnv,
cplane: &ComputeControlPlane,
req: &NotifySafekeepersRequest,
) -> Result<(), NotifyError> {
let NotifySafekeepersRequest {
tenant_id,
timeline_id,
generation,
safekeepers,
} = req;
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == *tenant_id
&& endpoint.timeline_id == *timeline_id
&& endpoint.status() == EndpointStatus::Running
{
tracing::info!("Reconfiguring safekeepers for endpoint {endpoint_name}");
let safekeepers = safekeepers.iter().map(|sk| sk.id).collect::<Vec<_>>();
endpoint
.reconfigure_safekeepers(safekeepers, *generation)
.await
.map_err(NotifyError::NeonLocal)?;
}
}
Ok(())
}
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
struct ComputeHookNotifyRequestShard {
struct NotifyAttachRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
struct ComputeHookNotifyRequest {
struct NotifyAttachRequest {
tenant_id: TenantId,
preferred_az: Option<String>,
stripe_size: Option<ShardStripeSize>,
shards: Vec<ComputeHookNotifyRequestShard>,
shards: Vec<NotifyAttachRequestShard>,
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone)]
pub(crate) struct SafekeeperInfo {
pub id: NodeId,
/// Hostname of the safekeeper.
/// It exists for better debuggability. Might be missing.
/// Should not be used for anything else.
pub hostname: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
struct NotifySafekeepersRequest {
tenant_id: TenantId,
timeline_id: TimelineId,
generation: SafekeeperGeneration,
safekeepers: Vec<SafekeeperInfo>,
}
/// Error type for attempts to call into the control plane compute notification hook
@@ -234,42 +383,50 @@ pub(crate) enum NotifyError {
NeonLocal(anyhow::Error),
}
enum MaybeSendResult {
enum MaybeSendResult<R, K> {
// Please send this request while holding the lock, and if you succeed then write
// the request into the lock.
Transmit(
(
ComputeHookNotifyRequest,
tokio::sync::OwnedMutexGuard<Option<ComputeRemoteState>>,
R,
tokio::sync::OwnedMutexGuard<Option<ComputeRemoteState<R>>>,
),
),
// Something requires sending, but you must wait for a current sender then call again
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>>),
AwaitLock((K, Arc<tokio::sync::Mutex<Option<ComputeRemoteState<R>>>>)),
// Nothing requires sending
Noop,
}
impl ComputeHookTenant {
type MaybeSendNotifyAttachResult = MaybeSendResult<NotifyAttachRequest, TenantId>;
type MaybeSendNotifySafekeepersResult = MaybeSendResult<NotifySafekeepersRequest, TenantTimelineId>;
impl ApiMethod for ComputeHookTenant {
type Key = TenantId;
type Request = NotifyAttachRequest;
const API_PATH: &'static str = "notify-attach";
fn maybe_send(
&self,
tenant_id: TenantId,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeRemoteState>>>,
) -> MaybeSendResult {
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeRemoteTenantState>>>,
) -> MaybeSendNotifyAttachResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::tenants`] lock.
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
return MaybeSendResult::AwaitLock((tenant_id, self.get_send_lock().clone()));
};
locked
}
};
let request = match self {
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
Self::Unsharded(unsharded_tenant) => Some(NotifyAttachRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shards: vec![NotifyAttachRequestShard {
shard_number: ShardNumber(0),
node_id: unsharded_tenant.node_id,
}],
@@ -282,12 +439,12 @@ impl ComputeHookTenant {
Self::Sharded(sharded_tenant)
if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
{
Some(ComputeHookNotifyRequest {
Some(NotifyAttachRequest {
tenant_id,
shards: sharded_tenant
.shards
.iter()
.map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
.map(|(shard_number, node_id)| NotifyAttachRequestShard {
shard_number: *shard_number,
node_id: *node_id,
})
@@ -332,98 +489,22 @@ impl ComputeHookTenant {
}
}
}
}
/// The compute hook is a destination for notifications about changes to tenant:pageserver
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing environments, to serialize calls into neon_local
neon_local_lock: tokio::sync::Mutex<()>,
// We share a client across all notifications to enable connection re-use etc when
// sending large numbers of notifications
client: reqwest::Client,
}
/// Callers may give us a list of these when asking us to send a bulk batch
/// of notifications in the background. This is a 'notification' in the sense of
/// other code notifying us of a shard's status, rather than being the final notification
/// that we send upwards to the control plane for the whole tenant.
pub(crate) struct ShardUpdate<'a> {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) node_id: NodeId,
pub(crate) stripe_size: ShardStripeSize,
pub(crate) preferred_az: Option<Cow<'a, AvailabilityZone>>,
}
impl ComputeHook {
pub(super) fn new(config: Config) -> anyhow::Result<Self> {
let authorization_header = config
.control_plane_jwt_token
.clone()
.map(|jwt| format!("Bearer {jwt}"));
let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT);
for cert in &config.ssl_ca_certs {
client = client.add_root_certificate(cert.clone());
}
let client = client
.build()
.context("Failed to build http client for compute hook")?;
Ok(Self {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
client,
})
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local(
&self,
reconfigure_request: &ComputeHookNotifyRequest,
async fn notify_local(
env: &LocalEnv,
cplane: &ComputeControlPlane,
req: &NotifyAttachRequest,
) -> Result<(), NotifyError> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else {
tracing::warn!(
"neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update"
);
return Ok(());
};
let env = match LocalEnv::load_config(repo_dir) {
Ok(e) => e,
Err(e) => {
tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
return Ok(());
}
};
let cplane =
ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
let ComputeHookNotifyRequest {
let NotifyAttachRequest {
tenant_id,
shards,
stripe_size,
preferred_az: _preferred_az,
} = reconfigure_request;
} = req;
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {endpoint_name}");
tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}");
let pageservers = shards
.iter()
@@ -445,7 +526,7 @@ impl ComputeHook {
.collect::<Vec<_>>();
endpoint
.reconfigure(pageservers, *stripe_size, None)
.reconfigure_pageservers(pageservers, *stripe_size)
.await
.map_err(NotifyError::NeonLocal)?;
}
@@ -453,11 +534,102 @@ impl ComputeHook {
Ok(())
}
}
async fn do_notify_iteration(
/// The compute hook is a destination for notifications about changes to tenant:pageserver
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
tenants: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
timelines: std::sync::Mutex<HashMap<TenantTimelineId, ComputeHookTimeline>>,
authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing environments, to serialize calls into neon_local
neon_local_lock: tokio::sync::Mutex<()>,
// We share a client across all notifications to enable connection re-use etc when
// sending large numbers of notifications
client: reqwest::Client,
}
/// Callers may give us a list of these when asking us to send a bulk batch
/// of notifications in the background. This is a 'notification' in the sense of
/// other code notifying us of a shard's status, rather than being the final notification
/// that we send upwards to the control plane for the whole tenant.
pub(crate) struct ShardUpdate<'a> {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) node_id: NodeId,
pub(crate) stripe_size: ShardStripeSize,
pub(crate) preferred_az: Option<Cow<'a, AvailabilityZone>>,
}
pub(crate) struct SafekeepersUpdate {
pub(crate) tenant_id: TenantId,
pub(crate) timeline_id: TimelineId,
pub(crate) generation: SafekeeperGeneration,
pub(crate) safekeepers: Vec<SafekeeperInfo>,
}
impl ComputeHook {
pub(super) fn new(config: Config) -> anyhow::Result<Self> {
let authorization_header = config
.control_plane_jwt_token
.clone()
.map(|jwt| format!("Bearer {jwt}"));
let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT);
for cert in &config.ssl_ca_certs {
client = client.add_root_certificate(cert.clone());
}
let client = client
.build()
.context("Failed to build http client for compute hook")?;
Ok(Self {
tenants: Default::default(),
timelines: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
client,
})
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local<M: ApiMethod>(&self, req: &M::Request) -> Result<(), NotifyError> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else {
tracing::warn!(
"neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update"
);
return Ok(());
};
let env = match LocalEnv::load_config(repo_dir) {
Ok(e) => e,
Err(e) => {
tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
return Ok(());
}
};
let cplane =
ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
M::notify_local(&env, &cplane, req).await
}
async fn do_notify_iteration<Req: serde::Serialize + std::fmt::Debug>(
&self,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: &Req,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let req = self.client.request(reqwest::Method::PUT, url);
@@ -479,9 +651,7 @@ impl ComputeHook {
};
// Treat all 2xx responses as success
if response.status() >= reqwest::StatusCode::OK
&& response.status() < reqwest::StatusCode::MULTIPLE_CHOICES
{
if response.status().is_success() {
if response.status() != reqwest::StatusCode::OK {
// Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so
// log a warning.
@@ -532,10 +702,10 @@ impl ComputeHook {
}
}
async fn do_notify(
async fn do_notify<R: serde::Serialize + std::fmt::Debug>(
&self,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: &R,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
// We hold these semaphore units across all retries, rather than only across each
@@ -567,13 +737,13 @@ impl ComputeHook {
}
/// Synchronous phase: update the per-tenant state for the next intended notification
fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult {
let mut state_locked = self.state.lock().unwrap();
fn notify_attach_prepare(&self, shard_update: ShardUpdate) -> MaybeSendNotifyAttachResult {
let mut tenants_locked = self.tenants.lock().unwrap();
use std::collections::hash_map::Entry;
let tenant_shard_id = shard_update.tenant_shard_id;
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
let tenant = match tenants_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => {
let ShardUpdate {
tenant_shard_id,
@@ -597,10 +767,37 @@ impl ComputeHook {
tenant.maybe_send(tenant_shard_id.tenant_id, None)
}
async fn notify_execute(
fn notify_safekeepers_prepare(
&self,
maybe_send_result: MaybeSendResult,
tenant_shard_id: TenantShardId,
safekeepers_update: SafekeepersUpdate,
) -> MaybeSendNotifySafekeepersResult {
let mut timelines_locked = self.timelines.lock().unwrap();
let ttid = TenantTimelineId {
tenant_id: safekeepers_update.tenant_id,
timeline_id: safekeepers_update.timeline_id,
};
use std::collections::hash_map::Entry;
let timeline = match timelines_locked.entry(ttid) {
Entry::Vacant(e) => e.insert(ComputeHookTimeline::new(
safekeepers_update.generation,
safekeepers_update.safekeepers,
)),
Entry::Occupied(e) => {
let timeline = e.into_mut();
timeline.update(safekeepers_update);
timeline
}
};
timeline.maybe_send(ttid, None)
}
async fn notify_execute<M: ApiMethod>(
&self,
state: &std::sync::Mutex<HashMap<M::Key, M>>,
maybe_send_result: MaybeSendResult<M::Request, M::Key>,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
// Process result: we may get an update to send, or we may have to wait for a lock
@@ -609,7 +806,7 @@ impl ComputeHook {
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::AwaitLock(send_lock) => {
MaybeSendResult::AwaitLock((key, send_lock)) => {
let send_locked = tokio::select! {
guard = send_lock.lock_owned() => {guard},
_ = cancel.cancelled() => {
@@ -620,11 +817,11 @@ impl ComputeHook {
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
// try_lock.
let state_locked = self.state.lock().unwrap();
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
let state_locked = state.lock().unwrap();
let Some(resource_state) = state_locked.get(&key) else {
return Ok(());
};
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
match resource_state.maybe_send(key, Some(send_locked)) {
MaybeSendResult::AwaitLock(_) => {
unreachable!("We supplied lock guard")
}
@@ -643,14 +840,18 @@ impl ComputeHook {
.control_plane_url
.as_ref()
.map(|control_plane_url| {
format!("{}/notify-attach", control_plane_url.trim_end_matches('/'))
format!(
"{}/{}",
control_plane_url.trim_end_matches('/'),
M::API_PATH
)
});
// We validate this at startup
let notify_url = compute_hook_url.as_ref().unwrap();
self.do_notify(notify_url, &request, cancel).await
} else {
self.do_notify_local(&request).await.map_err(|e| {
self.do_notify_local::<M>(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("neon_local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
@@ -686,7 +887,7 @@ impl ComputeHook {
/// Infallible synchronous fire-and-forget version of notify(), that sends its results to
/// a channel. Something should consume the channel and arrange to try notifying again
/// if something failed.
pub(super) fn notify_background(
pub(super) fn notify_attach_background(
self: &Arc<Self>,
notifications: Vec<ShardUpdate>,
result_tx: tokio::sync::mpsc::Sender<Result<(), (TenantShardId, NotifyError)>>,
@@ -695,7 +896,7 @@ impl ComputeHook {
let mut maybe_sends = Vec::new();
for shard_update in notifications {
let tenant_shard_id = shard_update.tenant_shard_id;
let maybe_send_result = self.notify_prepare(shard_update);
let maybe_send_result = self.notify_attach_prepare(shard_update);
maybe_sends.push((tenant_shard_id, maybe_send_result))
}
@@ -714,10 +915,10 @@ impl ComputeHook {
async move {
this
.notify_execute(maybe_send_result, tenant_shard_id, &cancel)
.notify_execute(&this.tenants, maybe_send_result, &cancel)
.await.map_err(|e| (tenant_shard_id, e))
}.instrument(info_span!(
"notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()
"notify_attach_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()
))
})
.buffered(API_CONCURRENCY);
@@ -760,14 +961,23 @@ impl ComputeHook {
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
/// the proper pageserver nodes for a tenant.
#[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))]
pub(super) async fn notify<'a>(
pub(super) async fn notify_attach<'a>(
&self,
shard_update: ShardUpdate<'a>,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let tenant_shard_id = shard_update.tenant_shard_id;
let maybe_send_result = self.notify_prepare(shard_update);
self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
let maybe_send_result = self.notify_attach_prepare(shard_update);
self.notify_execute(&self.tenants, maybe_send_result, cancel)
.await
}
pub(super) async fn notify_safekeepers(
&self,
safekeepers_update: SafekeepersUpdate,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let maybe_send_result = self.notify_safekeepers_prepare(safekeepers_update);
self.notify_execute(&self.timelines, maybe_send_result, cancel)
.await
}
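
A stripped-down shape of the trait-based generalisation above, offered as a sketch rather than the real storcon trait: the shared driver only needs a key type, a request body and an API path; everything endpoint-specific hides behind the trait.

trait NotificationKind {
    type Key: std::hash::Hash + Eq + Clone;
    type Request;
    const API_PATH: &'static str;
}

struct NotifyAttach;

impl NotificationKind for NotifyAttach {
    type Key = u64;        // stands in for TenantId
    type Request = String; // stands in for NotifyAttachRequest
    const API_PATH: &'static str = "notify-attach";
}

fn notify_url<M: NotificationKind>(control_plane_url: &str) -> String {
    // e.g. "https://cplane/" + "notify-attach" -> "https://cplane/notify-attach"
    format!("{}/{}", control_plane_url.trim_end_matches('/'), M::API_PATH)
}
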
@@ -783,8 +993,8 @@ impl ComputeHook {
) {
use std::collections::hash_map::Entry;
let mut state_locked = self.state.lock().unwrap();
match state_locked.entry(tenant_shard_id.tenant_id) {
let mut tenants_locked = self.tenants.lock().unwrap();
match tenants_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(_) => {
// This is a valid but niche case, where the tenant was previously attached
// as a Secondary location and then detached, so has no previously notified

View File

@@ -22,7 +22,7 @@ use pageserver_api::controller_api::{
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
TimelineImportRequest,
TimelineImportRequest, TimelineSafekeeperMigrateRequest,
};
use pageserver_api::models::{
DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -34,6 +34,7 @@ use pageserver_api::upcall_api::{
PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest,
};
use pageserver_client::{BlockUnblock, mgmt_api};
use routerify::Middleware;
use tokio_util::sync::CancellationToken;
use tracing::warn;
@@ -635,6 +636,32 @@ async fn handle_tenant_timeline_download_heatmap_layers(
json_response(StatusCode::OK, ())
}
async fn handle_tenant_timeline_safekeeper_migrate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
maybe_rate_limit(&req, tenant_id).await;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
let mut req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let migrate_req = json_request::<TimelineSafekeeperMigrateRequest>(&mut req).await?;
service
.tenant_timeline_safekeeper_migrate(tenant_id, timeline_id, migrate_req)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_timeline_lsn_lease(
service: Arc<Service>,
req: Request<Body>,
@@ -2458,6 +2485,16 @@ pub fn make_router(
)
},
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate",
|r| {
tenant_service_handler(
r,
handle_tenant_timeline_safekeeper_migrate,
RequestName("v1_tenant_timeline_safekeeper_migrate"),
)
},
)
// LSN lease passthrough to all shards
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease",

View File

@@ -6,9 +6,7 @@ use std::time::Duration;
use anyhow::{Context, anyhow};
use camino::Utf8PathBuf;
#[cfg(feature = "testing")]
use clap::ArgAction;
use clap::Parser;
use clap::{ArgAction, Parser};
use futures::future::OptionFuture;
use http_utils::tls_certs::ReloadingCertificateResolver;
use hyper0::Uri;
@@ -216,14 +214,13 @@ struct Cli {
/// Number of safekeepers to choose for a timeline when creating it.
/// Safekeepers will be chosen from different availability zones.
/// This option exists primarily for testing purposes.
#[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))]
timeline_safekeeper_count: i64,
#[arg(long, default_value = "3", value_parser = clap::builder::RangedU64ValueParser::<usize>::new().range(1..))]
timeline_safekeeper_count: usize,
/// When set, actively checks and initiates heatmap downloads/uploads during reconciliation.
/// This speed up migrations by avoiding the default wait for the heatmap download interval.
/// Primarily useful for testing to reduce test execution time.
#[cfg(feature = "testing")]
#[arg(long, default_value = "true", action=ArgAction::Set)]
#[arg(long, default_value = "false", action=ArgAction::Set)]
kick_secondary_downloads: bool,
}
@@ -472,7 +469,6 @@ async fn async_main() -> anyhow::Result<()> {
use_local_compute_notifications: args.use_local_compute_notifications,
timeline_safekeeper_count: args.timeline_safekeeper_count,
posthog_config: posthog_config.clone(),
#[cfg(feature = "testing")]
kick_secondary_downloads: args.kick_secondary_downloads,
};

View File

@@ -333,6 +333,7 @@ pub(crate) enum DatabaseErrorLabel {
ConnectionPool,
Logical,
Migration,
Cas,
}
impl DatabaseError {
@@ -343,6 +344,7 @@ impl DatabaseError {
Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
Self::Logical(_) => DatabaseErrorLabel::Logical,
Self::Migration(_) => DatabaseErrorLabel::Migration,
Self::Cas(_) => DatabaseErrorLabel::Cas,
}
}
}

View File

@@ -29,6 +29,7 @@ use pageserver_api::shard::{
use rustls::client::WebPkiServerVerifier;
use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
use rustls::crypto::ring;
use safekeeper_api::membership::SafekeeperGeneration;
use scoped_futures::ScopedBoxFuture;
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
@@ -94,6 +95,8 @@ pub(crate) enum DatabaseError {
Logical(String),
#[error("Migration error: {0}")]
Migration(String),
#[error("CAS error: {0}")]
Cas(String),
}
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
@@ -126,6 +129,7 @@ pub(crate) enum DatabaseOperation {
UpdateLeader,
SetPreferredAzs,
InsertTimeline,
UpdateTimelineMembership,
GetTimeline,
InsertTimelineReconcile,
RemoveTimelineReconcile,
@@ -1410,6 +1414,56 @@ impl Persistence {
.await
}
/// Update timeline membership configuration in the database.
/// Perform a compare-and-swap (CAS) operation on the timeline's generation.
/// The `new_generation` must be the next (+1) generation after the one in the database.
pub(crate) async fn update_timeline_membership(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
new_generation: SafekeeperGeneration,
sk_set: &[NodeId],
new_sk_set: Option<&[NodeId]>,
) -> DatabaseResult<()> {
use crate::schema::timelines::dsl;
let prev_generation = new_generation.previous().unwrap();
let tenant_id = &tenant_id;
let timeline_id = &timeline_id;
self.with_measured_conn(DatabaseOperation::UpdateTimelineMembership, move |conn| {
Box::pin(async move {
let updated = diesel::update(dsl::timelines)
.filter(dsl::tenant_id.eq(&tenant_id.to_string()))
.filter(dsl::timeline_id.eq(&timeline_id.to_string()))
.filter(dsl::generation.eq(prev_generation.into_inner() as i32))
.set((
dsl::generation.eq(new_generation.into_inner() as i32),
dsl::sk_set.eq(sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>()),
dsl::new_sk_set.eq(new_sk_set
.map(|set| set.iter().map(|id| id.0 as i64).collect::<Vec<_>>())),
))
.execute(conn)
.await?;
match updated {
0 => {
// TODO(diko): It makes sense to select the current generation
// and include it in the error message for better debuggability.
Err(DatabaseError::Cas(
"Failed to update membership configuration".to_string(),
))
}
1 => Ok(()),
_ => Err(DatabaseError::Logical(format!(
"unexpected number of rows ({updated})"
))),
}
})
})
.await
}
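
An in-memory analogue of the compare-and-swap above, as a sketch: the update succeeds only if the stored generation is exactly the predecessor of the one being installed, so a concurrent bump surfaces as a conflict instead of a silent overwrite.

use std::collections::HashMap;

fn cas_bump_generation(
    rows: &mut HashMap<String, u32>,
    key: &str,
    new_generation: u32,
) -> Result<(), String> {
    match rows.get_mut(key) {
        Some(stored) if *stored + 1 == new_generation => {
            *stored = new_generation;
            Ok(())
        }
        Some(stored) => Err(format!(
            "CAS failed: stored generation is {stored}, tried to install {new_generation}"
        )),
        None => Err("row not found".to_string()),
    }
}
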
/// Load timeline from db. Returns `None` if not present.
pub(crate) async fn get_timeline(
&self,

View File

@@ -65,7 +65,7 @@ pub(super) struct Reconciler {
pub(crate) compute_hook: Arc<ComputeHook>,
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// past failures in [`ComputeHook::notify_attach`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool,
@@ -1023,7 +1023,7 @@ impl Reconciler {
if let Some(node) = &self.intent.attached {
let result = self
.compute_hook
.notify(
.notify_attach(
compute_hook::ShardUpdate {
tenant_shard_id: self.tenant_shard_id,
node_id: node.get_id(),

View File

@@ -2,6 +2,7 @@ use std::time::Duration;
use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
use reqwest::StatusCode;
use safekeeper_api::membership::SafekeeperId;
use safekeeper_client::mgmt_api;
use tokio_util::sync::CancellationToken;
use utils::backoff;
@@ -92,6 +93,13 @@ impl Safekeeper {
pub(crate) fn has_https_port(&self) -> bool {
self.listen_https_port.is_some()
}
pub(crate) fn get_safekeeper_id(&self) -> SafekeeperId {
SafekeeperId {
id: self.id,
host: self.skp.host.clone(),
pg_port: self.skp.port as u16,
}
}
/// Perform an operation (which is given a [`SafekeeperClient`]) with retries
#[allow(clippy::too_many_arguments)]
pub(crate) async fn with_client_retries<T, O, F>(

View File

@@ -56,6 +56,10 @@ impl SafekeeperClient {
}
}
pub(crate) fn node_id_label(&self) -> &str {
&self.node_id_label
}
pub(crate) async fn create_timeline(
&self,
req: &TimelineCreateRequest,

View File

@@ -161,6 +161,7 @@ enum TenantOperations {
DropDetached,
DownloadHeatmapLayers,
TimelineLsnLease,
TimelineSafekeeperMigrate,
}
#[derive(Clone, strum_macros::Display)]
@@ -471,12 +472,12 @@ pub struct Config {
/// Number of safekeepers to choose for a timeline when creating it.
/// Safekeepers will be chosen from different availability zones.
pub timeline_safekeeper_count: i64,
pub timeline_safekeeper_count: usize,
/// PostHog integration config
pub posthog_config: Option<PostHogConfig>,
#[cfg(feature = "testing")]
/// When set, actively checks and initiates heatmap downloads/uploads.
pub kick_secondary_downloads: bool,
}
@@ -491,6 +492,7 @@ impl From<DatabaseError> for ApiError {
DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => {
ApiError::InternalServerError(anyhow::anyhow!(reason))
}
DatabaseError::Cas(reason) => ApiError::Conflict(reason),
}
}
}
@@ -876,18 +878,18 @@ impl Service {
// Emit compute hook notifications for all tenants which are already stably attached. Other tenants
// will emit compute hook notifications when they reconcile.
//
// Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later
// Ordering: our calls to notify_attach_background synchronously establish a relative order for these notifications vs. any later
// calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later
// calls will be correctly ordered wrt these.
//
// Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them
// Concurrency: we call notify_attach_background for all tenants, which will create O(N) tokio tasks, but almost all of them
// will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore
// unit and start doing I/O.
tracing::info!(
"Sending {} compute notifications",
compute_notifications.len()
);
self.compute_hook.notify_background(
self.compute_hook.notify_attach_background(
compute_notifications,
bg_compute_notify_result_tx.clone(),
&self.cancel,
@@ -2582,7 +2584,7 @@ impl Service {
.do_initial_shard_scheduling(
tenant_shard_id,
initial_generation,
&create_req.shard_parameters,
create_req.shard_parameters,
create_req.config.clone(),
placement_policy.clone(),
preferred_az_id.as_ref(),
@@ -2639,7 +2641,7 @@ impl Service {
&self,
tenant_shard_id: TenantShardId,
initial_generation: Option<Generation>,
shard_params: &ShardParameters,
shard_params: ShardParameters,
config: TenantConfig,
placement_policy: PlacementPolicy,
preferred_az_id: Option<&AvailabilityZone>,
@@ -6279,7 +6281,7 @@ impl Service {
for (child_id, child_ps, stripe_size) in child_locations {
if let Err(e) = self
.compute_hook
.notify(
.notify_attach(
compute_hook::ShardUpdate {
tenant_shard_id: child_id,
node_id: child_ps,
@@ -8364,7 +8366,6 @@ impl Service {
"Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"
);
#[cfg(feature = "testing")]
if progress.heatmap_mtime.is_none() {
// No heatmap might mean the attached location has never uploaded one, or that
// the secondary download hasn't happened yet. This is relatively unusual in the field,
@@ -8389,7 +8390,6 @@ impl Service {
/// happens on multi-minute timescales in the field, which is fine because optimisation is meant
/// to be a lazy background thing. However, when testing, it is not practical to wait around, so
/// we have this helper to move things along faster.
#[cfg(feature = "testing")]
async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
if !self.config.kick_secondary_downloads {
// No-op if kick_secondary_downloads functionality is not configured

View File

@@ -145,7 +145,7 @@ pub(crate) async fn load_schedule_requests(
}
let Some(sk) = safekeepers.get(&other_node_id) else {
tracing::warn!(
"couldnt find safekeeper with pending op id {other_node_id}, not pulling from it"
"couldn't find safekeeper with pending op id {other_node_id}, not pulling from it"
);
return None;
};

View File

@@ -1,25 +1,34 @@
use std::cmp::max;
use std::collections::HashSet;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use super::safekeeper_reconciler::ScheduleRequest;
use crate::compute_hook;
use crate::heartbeater::SafekeeperState;
use crate::id_lock_map::trace_shared_lock;
use crate::metrics;
use crate::persistence::{
DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
};
use crate::safekeeper::Safekeeper;
use crate::safekeeper_client::SafekeeperClient;
use crate::service::TenantOperations;
use crate::timeline_import::TimelineImportFinalizeError;
use anyhow::Context;
use http_utils::error::ApiError;
use pageserver_api::controller_api::{
SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
TimelineSafekeeperMigrateRequest,
};
use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
use safekeeper_api::PgVersionId;
use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId};
use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration};
use safekeeper_api::models::{
PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse,
};
use safekeeper_api::{INITIAL_TERM, Term};
use safekeeper_client::mgmt_api;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -36,6 +45,33 @@ pub struct TimelineLocateResponse {
}
impl Service {
fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, ApiError> {
let members = safekeepers
.iter()
.map(|sk| sk.get_safekeeper_id())
.collect::<Vec<_>>();
MemberSet::new(members).map_err(ApiError::InternalServerError)
}
fn get_safekeepers(&self, ids: &[i64]) -> Result<Vec<Safekeeper>, ApiError> {
let safekeepers = {
let locked = self.inner.read().unwrap();
locked.safekeepers.clone()
};
ids.iter()
.map(|&id| {
let node_id = NodeId(id as u64);
safekeepers.get(&node_id).cloned().ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!(
"safekeeper {node_id} is not registered"
))
})
})
.collect::<Result<Vec<_>, _>>()
}
/// Timeline creation on safekeepers
///
/// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers,
@@ -48,35 +84,9 @@ impl Service {
pg_version: PgVersionId,
timeline_persistence: &TimelinePersistence,
) -> Result<Vec<NodeId>, ApiError> {
// If quorum is reached, return if we are outside of a specified timeout
let jwt = self
.config
.safekeeper_jwt_token
.clone()
.map(SecretString::from);
let mut joinset = JoinSet::new();
let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?;
// Prepare membership::Configuration from chosen safekeepers.
let safekeepers = {
let locked = self.inner.read().unwrap();
locked.safekeepers.clone()
};
let mut members = Vec::new();
for sk_id in timeline_persistence.sk_set.iter() {
let sk_id = NodeId(*sk_id as u64);
let Some(safekeeper) = safekeepers.get(&sk_id) else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"couldn't find entry for safekeeper with id {sk_id}"
)))?;
};
members.push(SafekeeperId {
id: sk_id,
host: safekeeper.skp.host.clone(),
pg_port: safekeeper.skp.port as u16,
});
}
let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?;
let mset = Self::make_member_set(&safekeepers)?;
let mconf = safekeeper_api::membership::Configuration::new(mset);
let req = safekeeper_api::models::TimelineCreateRequest {
@@ -89,79 +99,150 @@ impl Service {
timeline_id,
wal_seg_size: None,
};
const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
for sk in timeline_persistence.sk_set.iter() {
let sk_id = NodeId(*sk as u64);
let safekeepers = safekeepers.clone();
let results = self
.tenant_timeline_safekeeper_op_quorum(
&safekeepers,
move |client| {
let req = req.clone();
async move { client.create_timeline(&req).await }
},
SK_CREATE_TIMELINE_RECONCILE_TIMEOUT,
)
.await?;
Ok(results
.into_iter()
.enumerate()
.filter_map(|(idx, res)| {
if res.is_ok() {
None // Success, don't return this safekeeper
} else {
Some(safekeepers[idx].get_id()) // Failure, return this safekeeper
}
})
.collect::<Vec<_>>())
}
/// Perform an operation on a list of safekeepers in parallel with retries.
///
/// Return the results of the operation on each safekeeper in the input order.
async fn tenant_timeline_safekeeper_op<T, O, F>(
&self,
safekeepers: &[Safekeeper],
op: O,
timeout: Duration,
) -> Result<Vec<mgmt_api::Result<T>>, ApiError>
where
O: FnMut(SafekeeperClient) -> F + Send + 'static,
O: Clone,
F: std::future::Future<Output = mgmt_api::Result<T>> + Send + 'static,
T: Sync + Send + 'static,
{
let jwt = self
.config
.safekeeper_jwt_token
.clone()
.map(SecretString::from);
let mut joinset = JoinSet::new();
for (idx, sk) in safekeepers.iter().enumerate() {
let sk = sk.clone();
let http_client = self.http_client.clone();
let jwt = jwt.clone();
let req = req.clone();
let op = op.clone();
joinset.spawn(async move {
// Unwrap is fine as we already would have returned error above
let sk_p = safekeepers.get(&sk_id).unwrap();
let res = sk_p
let res = sk
.with_client_retries(
|client| {
let req = req.clone();
async move { client.create_timeline(&req).await }
},
op,
&http_client,
&jwt,
3,
3,
SK_CREATE_TIMELINE_RECONCILE_TIMEOUT,
// TODO(diko): This is the wrong timeout;
// it should be scaled to the retry count.
timeout,
&CancellationToken::new(),
)
.await;
(sk_id, sk_p.skp.host.clone(), res)
(idx, res)
});
}
// Initialize results with timeout errors in case we never get a response.
let mut results: Vec<mgmt_api::Result<T>> = safekeepers
.iter()
.map(|_| {
Err(mgmt_api::Error::Timeout(
"safekeeper operation timed out".to_string(),
))
})
.collect();
// After we have built the joinset, we now wait for the tasks to complete,
// but with a specified timeout to make sure we return swiftly, either with
// a failure or success.
let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT;
let reconcile_deadline = tokio::time::Instant::now() + timeout;
// Wait until all tasks finish or timeout is hit, whichever occurs
// first.
let mut reconcile_results = Vec::new();
let mut result_count = 0;
loop {
if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await
{
let Some(res) = res else { break };
match res {
Ok(res) => {
Ok((idx, res)) => {
let sk = &safekeepers[idx];
tracing::info!(
"response from safekeeper id:{} at {}: {:?}",
res.0,
res.1,
res.2
sk.get_id(),
sk.skp.host,
// Only print errors, as there is no Debug trait for T.
res.as_ref().map(|_| ()),
);
reconcile_results.push(res);
results[idx] = res;
result_count += 1;
}
Err(join_err) => {
tracing::info!("join_err for task in joinset: {join_err}");
}
}
} else {
tracing::info!(
"timeout for creation call after {} responses",
reconcile_results.len()
);
tracing::info!("timeout for operation call after {result_count} responses",);
break;
}
}
// Now check now if quorum was reached in reconcile_results.
let total_result_count = reconcile_results.len();
let remaining = reconcile_results
.into_iter()
.filter_map(|res| res.2.is_err().then_some(res.0))
.collect::<Vec<_>>();
tracing::info!(
"Got {} non-successful responses from initial creation request of total {total_result_count} responses",
remaining.len()
);
let target_sk_count = timeline_persistence.sk_set.len();
Ok(results)
}
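// A reduced, standalone sketch of the fan-out pattern above (an illustration under
// stated assumptions, not the real Service code): spawn one task per target in a tokio
// JoinSet, pre-fill every result slot with a timeout error, then drain the set until
// all tasks finish or a shared deadline fires, so slow targets simply keep their
// pre-filled error.
use std::time::Duration;
use tokio::task::JoinSet;
#[tokio::main]
async fn main() {
let delays_ms = [10u64, 20, 10_000]; // the last "safekeeper" will miss the deadline
let mut joinset = JoinSet::new();
for (idx, delay) in delays_ms.iter().copied().enumerate() {
joinset.spawn(async move {
tokio::time::sleep(Duration::from_millis(delay)).await;
(idx, Ok::<u64, String>(delay))
});
}
// Every slot starts out as a timeout error and is overwritten on completion.
let mut results: Vec<Result<u64, String>> =
vec![Err("safekeeper operation timed out".to_string()); delays_ms.len()];
let deadline = tokio::time::Instant::now() + Duration::from_millis(100);
while let Ok(Some(joined)) = tokio::time::timeout_at(deadline, joinset.join_next()).await {
if let Ok((idx, res)) = joined {
results[idx] = res;
}
}
assert!(results[0].is_ok() && results[1].is_ok() && results[2].is_err());
}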
/// Perform an operation on a list of safekeepers in parallel with retries,
/// and validate that a quorum of successful responses is reached.
///
/// Return the results of the operation on each safekeeper in the input order.
/// It's guaranteed that at least a quorum of the responses are successful.
async fn tenant_timeline_safekeeper_op_quorum<T, O, F>(
&self,
safekeepers: &[Safekeeper],
op: O,
timeout: Duration,
) -> Result<Vec<mgmt_api::Result<T>>, ApiError>
where
O: FnMut(SafekeeperClient) -> F,
O: Clone + Send + 'static,
F: std::future::Future<Output = mgmt_api::Result<T>> + Send + 'static,
T: Sync + Send + 'static,
{
let results = self
.tenant_timeline_safekeeper_op(safekeepers, op, timeout)
.await?;
// Now check if quorum was reached in results.
let target_sk_count = safekeepers.len();
let quorum_size = match target_sk_count {
0 => {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
@@ -180,7 +261,7 @@ impl Service {
// in order to schedule work to them
tracing::warn!(
"couldn't find at least 3 safekeepers for timeline, found: {:?}",
timeline_persistence.sk_set
target_sk_count
);
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"couldn't find at least 3 safekeepers to put timeline to"
@@ -189,7 +270,7 @@ impl Service {
}
_ => target_sk_count / 2 + 1,
};
let success_count = target_sk_count - remaining.len();
let success_count = results.iter().filter(|res| res.is_ok()).count();
if success_count < quorum_size {
// Failure
return Err(ApiError::InternalServerError(anyhow::anyhow!(
@@ -197,7 +278,7 @@ impl Service {
)));
}
Ok(remaining)
Ok(results)
}
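// The quorum arithmetic above in isolation: a minimal sketch assuming nothing beyond
// the standard library. The production match also has an arm rejecting sets smaller
// than three outside of testing builds, elided here; this only shows the
// `target_sk_count / 2 + 1` arm.
fn quorum_size(target_sk_count: usize) -> Option<usize> {
match target_sk_count {
0 => None, // no safekeepers at all can never reach a quorum
n => Some(n / 2 + 1),
}
}
fn main() {
assert_eq!(quorum_size(1), Some(1));
assert_eq!(quorum_size(3), Some(2));
assert_eq!(quorum_size(5), Some(3));
}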
/// Create timeline in controller database and on safekeepers.
@@ -654,13 +735,7 @@ impl Service {
)
});
// Number of safekeepers in different AZs we are looking for
let mut wanted_count = self.config.timeline_safekeeper_count as usize;
// TODO(diko): remove this when `timeline_safekeeper_count` option is in the release
// branch and is specified in tests/neon_local config.
if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count {
// In testing mode, we can have less safekeepers than the config says
wanted_count = max(all_safekeepers.len(), 1);
}
let wanted_count = self.config.timeline_safekeeper_count;
let mut sks = Vec::new();
let mut azs = HashSet::new();
@@ -804,4 +879,486 @@ impl Service {
}
Ok(())
}
/// Call `switch_timeline_membership` on all safekeepers with retries
/// until a quorum of successful responses is reached.
///
/// If min_position is not None, validates that a majority of safekeepers
/// reached at least min_position.
///
/// Return responses from safekeepers in the input order.
async fn tenant_timeline_set_membership_quorum(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
safekeepers: &[Safekeeper],
config: &membership::Configuration,
min_position: Option<(Term, Lsn)>,
) -> Result<Vec<mgmt_api::Result<TimelineMembershipSwitchResponse>>, ApiError> {
let req = TimelineMembershipSwitchRequest {
mconf: config.clone(),
};
const SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
let results = self
.tenant_timeline_safekeeper_op_quorum(
safekeepers,
move |client| {
let req = req.clone();
async move {
let mut res = client
.switch_timeline_membership(tenant_id, timeline_id, &req)
.await;
// If min_position is not reached, map the response to an error,
// so it isn't counted toward the quorum.
if let Some(min_position) = min_position {
if let Ok(ok_res) = &res {
if (ok_res.term, ok_res.flush_lsn) < min_position {
// Use Error::Timeout to make this error retriable.
res = Err(mgmt_api::Error::Timeout(
format!(
"safekeeper {} returned position {:?} which is less than minimum required position {:?}",
client.node_id_label(),
(ok_res.term, ok_res.flush_lsn),
min_position
)
));
}
}
}
res
}
},
SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT,
)
.await?;
for res in results.iter().flatten() {
if res.current_conf.generation > config.generation {
// Another switch_membership raced us.
return Err(ApiError::Conflict(format!(
"received configuration with generation {} from safekeeper, but expected {}",
res.current_conf.generation, config.generation
)));
} else if res.current_conf.generation < config.generation {
// Note: should never happen.
// If we get a response, it should be at least the sent generation.
tracing::error!(
"received configuration with generation {} from safekeeper, but expected {}",
res.current_conf.generation,
config.generation
);
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"received configuration with generation {} from safekeeper, but expected {}",
res.current_conf.generation,
config.generation
)));
}
}
Ok(results)
}
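// How the min_position gate above behaves, as a minimal sketch: (term, lsn) pairs are
// compared lexicographically, so a response counts toward the quorum only if its
// position is at least the required one. Plain u64s stand in for the real Term/Lsn types.
fn reached(min_position: Option<(u64, u64)>, term: u64, flush_lsn: u64) -> bool {
match min_position {
None => true, // no requirement: any successful response counts
Some(min) => (term, flush_lsn) >= min,
}
}
fn main() {
// Same term, lower LSN: not synced yet, so the response is mapped to an error.
assert!(!reached(Some((3, 0x100)), 3, 0x80));
// A higher term satisfies the requirement regardless of the LSN.
assert!(reached(Some((3, 0x100)), 4, 0x10));
}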
/// Pull timeline to to_safekeepers from from_safekeepers with retries.
///
/// Returns Ok(()) only if all the pull_timeline requests were successful.
async fn tenant_timeline_pull_from_peers(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
to_safekeepers: &[Safekeeper],
from_safekeepers: &[Safekeeper],
) -> Result<(), ApiError> {
let http_hosts = from_safekeepers
.iter()
.map(|sk| sk.base_url())
.collect::<Vec<_>>();
tracing::info!(
"pulling timeline to {:?} from {:?}",
to_safekeepers
.iter()
.map(|sk| sk.get_id())
.collect::<Vec<_>>(),
from_safekeepers
.iter()
.map(|sk| sk.get_id())
.collect::<Vec<_>>()
);
// TODO(diko): need to pass mconf/generation with the request
// to properly handle tombstones. Ignore tombstones for now.
// Worst case: we leave a timeline on a safekeeper which is not in the current set.
let req = PullTimelineRequest {
tenant_id,
timeline_id,
http_hosts,
ignore_tombstone: Some(true),
};
const SK_PULL_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
let responses = self
.tenant_timeline_safekeeper_op(
to_safekeepers,
move |client| {
let req = req.clone();
async move { client.pull_timeline(&req).await }
},
SK_PULL_TIMELINE_RECONCILE_TIMEOUT,
)
.await?;
if let Some((idx, err)) = responses
.iter()
.enumerate()
.find_map(|(idx, res)| Some((idx, res.as_ref().err()?)))
{
let sk_id = to_safekeepers[idx].get_id();
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"pull_timeline to {sk_id} failed: {err}",
)));
}
Ok(())
}
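// The "all must succeed" check above, reduced to a sketch over plain Results: find the
// first failed response together with its index so the error can be attributed to a
// concrete safekeeper; None means every pull succeeded.
fn first_failure<T, E>(results: &[Result<T, E>]) -> Option<(usize, &E)> {
results
.iter()
.enumerate()
.find_map(|(idx, res)| Some((idx, res.as_ref().err()?)))
}
fn main() {
let results: Vec<Result<(), String>> = vec![Ok(()), Err("pull failed".to_string()), Ok(())];
let failure = first_failure(&results);
assert_eq!(failure.map(|(idx, err)| (idx, err.as_str())), Some((1, "pull failed")));
}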
/// Exclude a timeline from safekeepers in parallel with retries.
/// If an exclude request is unsuccessful, it is handed to the reconciler
/// for retrying, and the function still succeeds.
async fn tenant_timeline_safekeeper_exclude(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
safekeepers: &[Safekeeper],
config: &membership::Configuration,
) -> Result<(), ApiError> {
let req = TimelineMembershipSwitchRequest {
mconf: config.clone(),
};
const SK_EXCLUDE_TIMELINE_TIMEOUT: Duration = Duration::from_secs(30);
let results = self
.tenant_timeline_safekeeper_op(
safekeepers,
move |client| {
let req = req.clone();
async move { client.exclude_timeline(tenant_id, timeline_id, &req).await }
},
SK_EXCLUDE_TIMELINE_TIMEOUT,
)
.await?;
let mut reconcile_requests = Vec::new();
for (idx, res) in results.iter().enumerate() {
if res.is_err() {
let sk_id = safekeepers[idx].skp.id;
let pending_op = TimelinePendingOpPersistence {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
generation: config.generation.into_inner() as i32,
op_kind: SafekeeperTimelineOpKind::Exclude,
sk_id,
};
tracing::info!("writing pending exclude op for sk id {sk_id}");
self.persistence.insert_pending_op(pending_op).await?;
let req = ScheduleRequest {
safekeeper: Box::new(safekeepers[idx].clone()),
host_list: Vec::new(),
tenant_id,
timeline_id: Some(timeline_id),
generation: config.generation.into_inner(),
kind: SafekeeperTimelineOpKind::Exclude,
};
reconcile_requests.push(req);
}
}
if !reconcile_requests.is_empty() {
let locked = self.inner.read().unwrap();
for req in reconcile_requests {
locked.safekeeper_reconcilers.schedule_request(req);
}
}
Ok(())
}
/// Migrate timeline safekeeper set to a new set.
///
/// This function implements an algorithm from RFC-035.
/// <https://github.com/neondatabase/neon/blob/main/docs/rfcs/035-safekeeper-dynamic-membership-change.md>
pub(crate) async fn tenant_timeline_safekeeper_migrate(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
req: TimelineSafekeeperMigrateRequest,
) -> Result<(), ApiError> {
let all_safekeepers = self.inner.read().unwrap().safekeepers.clone();
let new_sk_set = req.new_sk_set;
for sk_id in new_sk_set.iter() {
if !all_safekeepers.contains_key(sk_id) {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"safekeeper {sk_id} does not exist"
)));
}
}
// TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
let _tenant_lock = trace_shared_lock(
&self.tenant_op_locks,
tenant_id,
TenantOperations::TimelineSafekeeperMigrate,
)
.await;
// 1. Fetch current timeline configuration from the configuration storage.
let timeline = self
.persistence
.get_timeline(tenant_id, timeline_id)
.await?;
let Some(timeline) = timeline else {
return Err(ApiError::NotFound(
anyhow::anyhow!(
"timeline {tenant_id}/{timeline_id} doesn't exist in timelines table"
)
.into(),
));
};
let cur_sk_set = timeline
.sk_set
.iter()
.map(|&id| NodeId(id as u64))
.collect::<Vec<_>>();
tracing::info!(
?cur_sk_set,
?new_sk_set,
"Migrating timeline to new safekeeper set",
);
let mut generation = SafekeeperGeneration::new(timeline.generation as u32);
if let Some(ref persistent_new_sk_set) = timeline.new_sk_set {
// 2. If it is already a joint one and new_sk_set is different from the desired set, refuse to change.
if persistent_new_sk_set
.iter()
.map(|&id| NodeId(id as u64))
.ne(new_sk_set.iter().cloned())
{
tracing::info!(
?persistent_new_sk_set,
?new_sk_set,
"a different new safekeeper set is already set in the database",
);
return Err(ApiError::Conflict(format!(
"the timeline is already migrating to a different safekeeper set: {persistent_new_sk_set:?}"
)));
}
// If it is the same new_sk_set, we can continue the migration (retry).
} else {
// 3. No active migration yet.
// Increment current generation and put desired_set to new_sk_set.
generation = generation.next();
self.persistence
.update_timeline_membership(
tenant_id,
timeline_id,
generation,
&cur_sk_set,
Some(&new_sk_set),
)
.await?;
}
let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?;
let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
let new_sk_member_set = Self::make_member_set(&new_safekeepers)?;
let joint_config = membership::Configuration {
generation,
members: cur_sk_member_set,
new_members: Some(new_sk_member_set.clone()),
};
// 4. Call PUT configuration on safekeepers from the current set,
// delivering them joint_conf.
// Notify cplane/compute about the membership change BEFORE changing the membership on safekeepers.
// This way the compute will know about the new safekeepers from joint_config before we start
// requiring a quorum from them.
self.cplane_notify_safekeepers(tenant_id, timeline_id, &joint_config)
.await?;
let results = self
.tenant_timeline_set_membership_quorum(
tenant_id,
timeline_id,
&cur_safekeepers,
&joint_config,
None, // no min position
)
.await?;
let mut sync_position = (INITIAL_TERM, Lsn::INVALID);
for res in results.into_iter().flatten() {
let sk_position = (res.term, res.flush_lsn);
if sync_position < sk_position {
sync_position = sk_position;
}
}
tracing::info!(
%generation,
?sync_position,
"safekeepers set membership updated",
);
// 5. Initialize timeline on safekeeper(s) from new_sk_set where it doesn't exist yet
// by doing pull_timeline from the majority of the current set.
// Filter out safekeepers which are already in the current set.
let from_ids: HashSet<NodeId> = cur_safekeepers.iter().map(|sk| sk.get_id()).collect();
let pull_to_safekeepers = new_safekeepers
.iter()
.filter(|sk| !from_ids.contains(&sk.get_id()))
.cloned()
.collect::<Vec<_>>();
self.tenant_timeline_pull_from_peers(
tenant_id,
timeline_id,
&pull_to_safekeepers,
&cur_safekeepers,
)
.await?;
// 6. Call POST bump_term(sync_term) on safekeepers from the new set. Success on majority is enough.
// TODO(diko): do we need to bump timeline term?
// 7. Repeatedly call PUT configuration on safekeepers from the new set,
// delivering them joint_conf and collecting their positions.
tracing::info!(?sync_position, "waiting for safekeepers to sync position");
self.tenant_timeline_set_membership_quorum(
tenant_id,
timeline_id,
&new_safekeepers,
&joint_config,
Some(sync_position),
)
.await?;
// 8. Create new_conf: Configuration incrementing joint_conf generation and
// having new safekeeper set as sk_set and None new_sk_set.
let generation = generation.next();
let new_conf = membership::Configuration {
generation,
members: new_sk_member_set,
new_members: None,
};
self.persistence
.update_timeline_membership(tenant_id, timeline_id, generation, &new_sk_set, None)
.await?;
// TODO(diko): at this point we have already updated the timeline in the database,
// but we still need to notify safekeepers and cplane about the new configuration,
// and hand deletion of the timeline from the old safekeepers to the reconciler.
// Ideally this should be done atomically, but for now it isn't.
// Worst case: the timeline is not deleted from the old safekeepers,
// and the compute may require both quorums until the migration is retried and completed.
self.tenant_timeline_set_membership_quorum(
tenant_id,
timeline_id,
&new_safekeepers,
&new_conf,
None, // no min position
)
.await?;
let new_ids: HashSet<NodeId> = new_safekeepers.iter().map(|sk| sk.get_id()).collect();
let exclude_safekeepers = cur_safekeepers
.into_iter()
.filter(|sk| !new_ids.contains(&sk.get_id()))
.collect::<Vec<_>>();
self.tenant_timeline_safekeeper_exclude(
tenant_id,
timeline_id,
&exclude_safekeepers,
&new_conf,
)
.await?;
// Notify cplane/compute about the membership change AFTER changing the membership on safekeepers.
// This way the compute will stop talking to the excluded safekeepers only after we stop
// requiring a quorum from them.
self.cplane_notify_safekeepers(tenant_id, timeline_id, &new_conf)
.await?;
Ok(())
}
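// A sketch of step 7's sync position, under the assumption that the inputs are just
// the (term, flush_lsn) pairs collected from the current set in step 4: the migration
// takes the maximum pair in lexicographic order and then waits until a quorum of the
// new set reports at least that position.
fn sync_position(positions: &[(u64, u64)]) -> (u64, u64) {
positions
.iter()
.copied()
.fold((0, 0), |acc, pos| if pos > acc { pos } else { acc })
}
fn main() {
// Term 3 dominates even though its flush LSN is the smallest.
assert_eq!(sync_position(&[(2, 0x500), (3, 0x100), (2, 0x700)]), (3, 0x100));
}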
/// Notify cplane about safekeeper membership change.
/// The cplane will receive a joint set of safekeepers as a safekeeper list.
async fn cplane_notify_safekeepers(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
mconf: &membership::Configuration,
) -> Result<(), ApiError> {
let mut safekeepers = Vec::new();
let mut ids: HashSet<_> = HashSet::new();
for member in mconf
.members
.m
.iter()
.chain(mconf.new_members.iter().flat_map(|m| m.m.iter()))
{
if ids.insert(member.id) {
safekeepers.push(compute_hook::SafekeeperInfo {
id: member.id,
hostname: Some(member.host.clone()),
});
}
}
self.compute_hook
.notify_safekeepers(
compute_hook::SafekeepersUpdate {
tenant_id,
timeline_id,
generation: mconf.generation,
safekeepers,
},
&self.cancel,
)
.await
.map_err(|err| {
ApiError::InternalServerError(anyhow::anyhow!(
"failed to notify cplane about safekeeper membership change: {err}"
))
})
}
}
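// The deduplication inside cplane_notify_safekeepers, shown standalone with
// illustrative types (the real code uses compute_hook::SafekeeperInfo and the
// membership MemberSet): old and new members are chained and a HashSet on the node id
// keeps each safekeeper once, since a node present in both sets must not be reported twice.
use std::collections::HashSet;
#[derive(Clone, Debug, PartialEq)]
struct Member {
id: u64,
host: String,
}
fn joint_members(members: &[Member], new_members: Option<&[Member]>) -> Vec<Member> {
let mut ids = HashSet::new();
members
.iter()
.chain(new_members.into_iter().flatten())
.filter(|m| ids.insert(m.id))
.cloned()
.collect()
}
fn main() {
let cur = vec![
Member { id: 1, host: "sk-1".into() },
Member { id: 2, host: "sk-2".into() },
];
let new = vec![
Member { id: 2, host: "sk-2".into() },
Member { id: 3, host: "sk-3".into() },
];
let ids: Vec<u64> = joint_members(&cur, Some(&new)).iter().map(|m| m.id).collect();
assert_eq!(ids, vec![1, 2, 3]);
}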

View File

@@ -453,7 +453,7 @@ class NeonEnvBuilder:
pageserver_get_vectored_concurrent_io: str | None = None,
pageserver_tracing_config: PageserverTracingConfig | None = None,
pageserver_import_config: PageserverImportConfig | None = None,
storcon_kick_secondary_downloads: bool | None = None,
storcon_kick_secondary_downloads: bool | None = True,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -724,15 +724,21 @@ class NeonEnvBuilder:
shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
assert not (storcon_db_to_dir / "postgres.log").exists()
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
# will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
# controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
for ps in self.env.pageservers:
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';\n"
for sk in self.env.safekeepers:
patch_script += f"UPDATE safekeepers SET http_port={sk.port.http}, port={sk.port.pg} WHERE id = '{sk.id}';\n"
patch_script_path.write_text(patch_script)
# Update the config with info about tenants and timelines
@@ -1215,6 +1221,13 @@ class NeonEnv:
storage_controller_config = storage_controller_config or {}
storage_controller_config["use_https_safekeeper_api"] = True
# TODO(diko): uncomment when timeline_safekeeper_count option is in the release branch,
# so the compat tests will not fail because of its presence.
# if config.num_safekeepers < 3:
# storage_controller_config = storage_controller_config or {}
# if "timeline_safekeeper_count" not in storage_controller_config:
# storage_controller_config["timeline_safekeeper_count"] = config.num_safekeepers
if storage_controller_config is not None:
cfg["storage_controller"] = storage_controller_config
@@ -2226,6 +2239,21 @@ class NeonStorageController(MetricsGetter, LogUtils):
response.raise_for_status()
log.info(f"timeline_create success: {response.json()}")
def migrate_safekeepers(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
new_sk_set: list[int],
):
response = self.request(
"POST",
f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate",
json={"new_sk_set": new_sk_set},
headers=self.headers(TokenScope.PAGE_SERVER_API),
)
response.raise_for_status()
log.info(f"migrate_safekeepers success: {response.json()}")
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
"""
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}

View File

@@ -695,6 +695,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
tenant_id: TenantId | TenantShardId,
timeline_id: TimelineId,
gc_horizon: int | None,
ignore_lease_deadline: bool | None = None,
) -> dict[str, Any]:
"""
Unlike most handlers, this will wait for the layers to be actually
@@ -707,7 +708,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
json={"gc_horizon": gc_horizon},
json={"gc_horizon": gc_horizon, "ignore_lease_deadline": ignore_lease_deadline},
)
log.info(f"Got GC request response code: {res.status_code}")
self.verbose_error(res)

View File

@@ -76,6 +76,7 @@ if TYPE_CHECKING:
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
#
# # Build previous version of binaries and store them somewhere:
# rm -rf pg_install target
@@ -102,6 +103,7 @@ if TYPE_CHECKING:
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
# export NEON_BIN=target/${BUILD_TYPE}
# export POSTGRES_DISTRIB_DIR=pg_install
#

View File

@@ -201,11 +201,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
for shard, ps in tenant_get_shards(env, env.initial_tenant):
client = ps.http_client()
layers_guarded_before_gc = get_layers_protected_by_lease(
client, shard, env.initial_timeline, lease_lsn=lsn
client, shard, env.initial_timeline, lease_lsn=lease_lsn
)
gc_result = client.timeline_gc(
shard, env.initial_timeline, 0, ignore_lease_deadline=False
)
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
layers_guarded_after_gc = get_layers_protected_by_lease(
client, shard, env.initial_timeline, lease_lsn=lsn
client, shard, env.initial_timeline, lease_lsn=lease_lsn
)
# Note: cannot assert on `layers_removed` here because it could be layers

View File

@@ -0,0 +1,76 @@
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
"""
Simple safekeeper migration test.
Creates 3 safekeepers. The timeline is configured to use only one safekeeper.
1. Go through all safekeepers, migrating the timeline to each one in turn.
2. Stop the other safekeepers. Validate that an insert succeeds.
3. Start the other safekeepers again and move on to the next safekeeper.
4. Validate that the table contains all inserted values.
"""
neon_env_builder.num_safekeepers = 3
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": True,
"timeline_safekeeper_count": 1,
}
env = neon_env_builder.init_start()
# TODO(diko): the pageserver spams various errors during safekeeper migration.
# Fix the code so it handles the migration better.
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*wal receiver task finished with an error.*",
]
)
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["new_sk_set"] is None
assert len(mconf["sk_set"]) == 1
assert mconf["generation"] == 1
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
for active_sk in range(1, 4):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, [active_sk]
)
other_sks = [sk for sk in range(1, 4) if sk != active_sk]
for sk in other_sks:
env.safekeepers[sk - 1].stop()
ep.safe_psql(f"INSERT INTO t VALUES ({active_sk})")
for sk in other_sks:
env.safekeepers[sk - 1].start()
ep.clear_buffers()
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
# 1 initial generation + 2 generation increments per migration (one migration per loop iteration).
expected_gen = 1 + 2 * 3
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["generation"] == expected_gen
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{expected_gen}:")
# Restart and check again to make sure data is persistent.
ep.stop()
ep.start(safekeeper_generation=1, safekeepers=[3])
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]