Merge branch 'main' into amasterov/random-ops-add-snapshots

Modify weights
Rename a method for consistency
2026-05-20 22:50:38 +00:00 · 2025-07-31 12:10:43 +02:00 · 2025-07-23 15:54:39 +02:00 · 2025-07-23 15:38:59 +02:00 · 2025-07-23 15:27:36 +02:00 · 2025-07-23 15:21:21 +02:00
20 changed files with 819 additions and 977 deletions
--- a/.github/workflows/verify_runner_perf.yml
+++ b/.github/workflows/verify_runner_perf.yml
@@ -1,99 +0,0 @@
-name: verify runner performance with sysbench
-
-on:
-  # uncomment to run on push for debugging your PR
-  push:
-    branches: [ 'bodobolero/sysbench_4_perf_runner' ]
-  workflow_dispatch:
-    inputs:
-      runner_labels_json:
-        description: JSON array of runner labels to test (e.g. ["small-amd64","large-amd64"]) 
-        required: false
-        default: '["unit-perf-aws-arm"]'
-
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-concurrency:
-  group: sysbench-runner-perf
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  sysbench:
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: ${{ fromJSON((github.event_name == 'workflow_dispatch' && inputs.runner_labels_json != '' && inputs.runner_labels_json) || '["unit-perf-aws-arm"]') }}
-    permissions:
-      id-token: write # aws-actions/configure-aws-credentials
-      statuses: write
-      contents: write
-      pull-requests: write
-    runs-on: ${{ matrix.runner }}
-    timeout-minutes: 120
-    container:
-      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
-      credentials:
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
-
-    steps:
-      - name: Checkout sysbench source
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: akopytov/sysbench
-          ref: master
-          path: sysbench
-
-      - name: Build sysbench
-        run: |
-          cd "$GITHUB_WORKSPACE/sysbench"
-          ./autogen.sh
-          ./configure --without-mysql
-          make -j"$(nproc || sysctl -n hw.ncpu || echo 2)"
-          ./src/sysbench --version
-
-      - name: sysbench io prepare
-        run: |
-          "$GITHUB_WORKSPACE/sysbench/src/sysbench" fileio \
-            --file-total-size=2G \
-            --file-test-mode=rndrw \
-            --file-extra-flags=direct \
-            --file-fsync-freq=0 \
-            --threads=4 \
-            --time=60 prepare
-
-      - name: sysbench io run
-        run: |
-          "$GITHUB_WORKSPACE/sysbench/src/sysbench" fileio \
-            --file-total-size=2G \
-            --file-test-mode=rndrw \
-            --file-extra-flags=direct \
-            --file-fsync-freq=0 \
-            --threads=4 \
-            --time=60 run
-
-      - name: sysbench cpu
-        run: |
-          "$GITHUB_WORKSPACE/sysbench/src/sysbench" cpu \
-            --cpu-max-prime=200000 \
-            --threads=8 \
-            --time=60 run
-
-      - name: sysbench memory
-        run: |
-          "$GITHUB_WORKSPACE/sysbench/src/sysbench" memory \
-            --memory-block-size=1M \
-            --memory-total-size=0 \
-            --threads=8 \
-            --time=60 \
-            --memory-oper=write \
-            run
-
-
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -2780,7 +2780,7 @@ LIMIT 100",
                // 4. We start again and try to prewarm with the state from 2. instead of the previous complete state
                if matches!(
                    prewarm_state,
-                    LfcPrewarmState::Completed { .. }
+                    LfcPrewarmState::Completed
                        | LfcPrewarmState::NotPrewarmed
                        | LfcPrewarmState::Skipped
                ) {
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -7,11 +7,19 @@ use http::StatusCode;
 use reqwest::Client;
 use std::mem::replace;
 use std::sync::Arc;
-use std::time::Instant;
 use tokio::{io::AsyncReadExt, select, spawn};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info};

+#[derive(serde::Serialize, Default)]
+pub struct LfcPrewarmStateWithProgress {
+    #[serde(flatten)]
+    base: LfcPrewarmState,
+    total: i32,
+    prewarmed: i32,
+    skipped: i32,
+}
+
 /// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks
 struct EndpointStoragePair {
    url: String,
@@ -20,7 +28,7 @@ struct EndpointStoragePair {

 const KEY: &str = "lfc_state";
 impl EndpointStoragePair {
-    /// endpoint_id is set to None while prewarming from other endpoint, see compute_promote.rs
+    /// endpoint_id is set to None while prewarming from other endpoint, see replica promotion
    /// If not None, takes precedence over pspec.spec.endpoint_id
    fn from_spec_and_endpoint(
        pspec: &crate::compute::ParsedSpec,
@@ -46,8 +54,36 @@ impl EndpointStoragePair {
 }

 impl ComputeNode {
-    pub async fn lfc_prewarm_state(&self) -> LfcPrewarmState {
-        self.state.lock().unwrap().lfc_prewarm_state.clone()
+    // If prewarm failed, we want to get overall number of segments as well as done ones.
+    // However, this function should be reliable even if querying postgres failed.
+    pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress {
+        info!("requesting LFC prewarm state from postgres");
+        let mut state = LfcPrewarmStateWithProgress::default();
+        {
+            state.base = self.state.lock().unwrap().lfc_prewarm_state.clone();
+        }
+
+        let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await {
+            Ok(client) => client,
+            Err(err) => {
+                error!(%err, "connecting to postgres");
+                return state;
+            }
+        };
+        let row = match client
+            .query_one("select * from neon.get_prewarm_info()", &[])
+            .await
+        {
+            Ok(row) => row,
+            Err(err) => {
+                error!(%err, "querying LFC prewarm status");
+                return state;
+            }
+        };
+        state.total = row.try_get(0).unwrap_or_default();
+        state.prewarmed = row.try_get(1).unwrap_or_default();
+        state.skipped = row.try_get(2).unwrap_or_default();
+        state
    }

    pub fn lfc_offload_state(&self) -> LfcOffloadState {
@@ -97,6 +133,7 @@ impl ComputeNode {
    }

    /// Request LFC state from endpoint storage and load corresponding pages into Postgres.
+    /// Returns a result with `false` if the LFC state is not found in endpoint storage.
    async fn prewarm_impl(
        &self,
        from_endpoint: Option<String>,
@@ -111,7 +148,6 @@ impl ComputeNode {
        fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint"));

        info!(%url, "requesting LFC state from endpoint storage");
-        let mut now = Instant::now();
        let request = Client::new().get(&url).bearer_auth(storage_token);
        let response = select! {
            _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
@@ -124,8 +160,6 @@ impl ComputeNode {
            StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped),
            status => bail!("{status} querying endpoint storage"),
        }
-        let state_download_time_ms = now.elapsed().as_millis() as u32;
-        now = Instant::now();

        let mut uncompressed = Vec::new();
        let lfc_state = select! {
@@ -140,8 +174,6 @@ impl ComputeNode {
            read = decoder.read_to_end(&mut uncompressed) => read
        }
        .context("decoding LFC state")?;
-        let uncompress_time_ms = now.elapsed().as_millis() as u32;
-        now = Instant::now();

        let uncompressed_len = uncompressed.len();
        info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}");
@@ -164,34 +196,15 @@ impl ComputeNode {
        }
        .context("loading LFC state into postgres")
        .map(|_| ())?;
-        let prewarm_time_ms = now.elapsed().as_millis() as u32;

-        let row = client
-            .query_one("select * from neon.get_prewarm_info()", &[])
-            .await
-            .context("querying prewarm info")?;
-        let total = row.try_get(0).unwrap_or_default();
-        let prewarmed = row.try_get(1).unwrap_or_default();
-        let skipped = row.try_get(2).unwrap_or_default();
-
-        Ok(LfcPrewarmState::Completed {
-            total,
-            prewarmed,
-            skipped,
-            state_download_time_ms,
-            uncompress_time_ms,
-            prewarm_time_ms,
-        })
+        Ok(LfcPrewarmState::Completed)
    }

    /// If offload request is ongoing, return false, true otherwise
    pub fn offload_lfc(self: &Arc<Self>) -> bool {
        {
            let state = &mut self.state.lock().unwrap().lfc_offload_state;
-            if matches!(
-                replace(state, LfcOffloadState::Offloading),
-                LfcOffloadState::Offloading
-            ) {
+            if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
                return false;
            }
        }
@@ -203,10 +216,7 @@ impl ComputeNode {
    pub async fn offload_lfc_async(self: &Arc<Self>) {
        {
            let state = &mut self.state.lock().unwrap().lfc_offload_state;
-            if matches!(
-                replace(state, LfcOffloadState::Offloading),
-                LfcOffloadState::Offloading
-            ) {
+            if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
                return;
            }
        }
@@ -224,6 +234,7 @@ impl ComputeNode {
                LfcOffloadState::Failed { error }
            }
        };
+
        self.state.lock().unwrap().lfc_offload_state = state;
    }

@@ -231,7 +242,6 @@ impl ComputeNode {
        let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
        info!(%url, "requesting LFC state from Postgres");

-        let mut now = Instant::now();
        let row = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?
@@ -245,36 +255,25 @@ impl ComputeNode {
            info!(%url, "empty LFC state, not exporting");
            return Ok(LfcOffloadState::Skipped);
        };
-        let state_query_time_ms = now.elapsed().as_millis() as u32;
-        now = Instant::now();

        let mut compressed = Vec::new();
        ZstdEncoder::new(state)
            .read_to_end(&mut compressed)
            .await
            .context("compressing LFC state")?;
-        let compress_time_ms = now.elapsed().as_millis() as u32;
-        now = Instant::now();

        let compressed_len = compressed.len();
-        info!(%url, "downloaded LFC state, compressed size {compressed_len}");
+        info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage");

        let request = Client::new().put(url).bearer_auth(token).body(compressed);
-        let response = request
-            .send()
-            .await
-            .context("writing to endpoint storage")?;
-        let state_upload_time_ms = now.elapsed().as_millis() as u32;
-        let status = response.status();
-        if status != StatusCode::OK {
-            bail!("request to endpoint storage failed: {status}");
+        match request.send().await {
+            Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed),
+            Ok(res) => bail!(
+                "Request to endpoint storage failed with status: {}",
+                res.status()
+            ),
+            Err(err) => Err(err).context("writing to endpoint storage"),
        }
-
-        Ok(LfcOffloadState::Completed {
-            compress_time_ms,
-            state_query_time_ms,
-            state_upload_time_ms,
-        })
    }

    pub fn cancel_prewarm(self: &Arc<Self>) {
--- a/compute_tools/src/compute_promote.rs
+++ b/compute_tools/src/compute_promote.rs
@@ -1,24 +1,32 @@
 use crate::compute::ComputeNode;
-use anyhow::{Context, bail};
+use anyhow::{Context, Result, bail};
 use compute_api::responses::{LfcPrewarmState, PromoteConfig, PromoteState};
-use std::time::Instant;
+use compute_api::spec::ComputeMode;
+use itertools::Itertools;
+use std::collections::HashMap;
+use std::{sync::Arc, time::Duration};
+use tokio::time::sleep;
 use tracing::info;
+use utils::lsn::Lsn;

 impl ComputeNode {
-    /// Returns only when promote fails or succeeds. If http client calling this function
-    /// disconnects, this does not stop promotion, and subsequent calls block until promote finishes.
+    /// Returns only when promote fails or succeeds. If a network error occurs
+    /// and http client disconnects, this does not stop promotion, and subsequent
+    /// calls block until promote finishes.
    /// Called by control plane on secondary after primary endpoint is terminated
    /// Has a failpoint "compute-promotion"
-    pub async fn promote(self: &std::sync::Arc<Self>, cfg: PromoteConfig) -> PromoteState {
-        let this = self.clone();
-        let promote_fn = async move || match this.promote_impl(cfg).await {
-            Ok(state) => state,
-            Err(err) => {
-                tracing::error!(%err, "promoting replica");
-                let error = format!("{err:#}");
-                PromoteState::Failed { error }
+    pub async fn promote(self: &Arc<Self>, cfg: PromoteConfig) -> PromoteState {
+        let cloned = self.clone();
+        let promote_fn = async move || {
+            let Err(err) = cloned.promote_impl(cfg).await else {
+                return PromoteState::Completed;
+            };
+            tracing::error!(%err, "promoting");
+            PromoteState::Failed {
+                error: format!("{err:#}"),
            }
        };
+
        let start_promotion = || {
            let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
            tokio::spawn(async move { tx.send(promote_fn().await) });
@@ -26,31 +34,36 @@ impl ComputeNode {
        };

        let mut task;
-        // promote_impl locks self.state so we need to unlock it before calling task.changed()
+        // self.state is unlocked after block ends so we lock it in promote_impl
+        // and task.changed() is reached
        {
-            let promote_state = &mut self.state.lock().unwrap().promote_state;
-            task = promote_state.get_or_insert_with(start_promotion).clone()
-        }
-        if task.changed().await.is_err() {
-            let error = "promote sender dropped".to_string();
-            return PromoteState::Failed { error };
+            task = self
+                .state
+                .lock()
+                .unwrap()
+                .promote_state
+                .get_or_insert_with(start_promotion)
+                .clone()
        }
+        task.changed().await.expect("promote sender dropped");
        task.borrow().clone()
    }

-    async fn promote_impl(&self, cfg: PromoteConfig) -> anyhow::Result<PromoteState> {
+    async fn promote_impl(&self, mut cfg: PromoteConfig) -> Result<()> {
        {
            let state = self.state.lock().unwrap();
            let mode = &state.pspec.as_ref().unwrap().spec.mode;
-            if *mode != compute_api::spec::ComputeMode::Replica {
-                bail!("compute mode \"{}\" is not replica", mode.to_type_str());
+            if *mode != ComputeMode::Replica {
+                bail!("{} is not replica", mode.to_type_str());
            }
+
+            // we don't need to query Postgres so not self.lfc_prewarm_state()
            match &state.lfc_prewarm_state {
-                status @ (LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming) => {
-                    bail!("compute {status}")
+                LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
+                    bail!("prewarm not requested or pending")
                }
                LfcPrewarmState::Failed { error } => {
-                    tracing::warn!(%error, "compute prewarm failed")
+                    tracing::warn!(%error, "replica prewarm failed")
                }
                _ => {}
            }
@@ -59,10 +72,9 @@ impl ComputeNode {
        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?;
-        let mut now = Instant::now();

        let primary_lsn = cfg.wal_flush_lsn;
-        let mut standby_lsn = utils::lsn::Lsn::INVALID;
+        let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
        const RETRIES: i32 = 20;
        for i in 0..=RETRIES {
            let row = client
@@ -70,18 +82,16 @@ impl ComputeNode {
                .await
                .context("getting last replay lsn")?;
            let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
-            standby_lsn = lsn.into();
-            if standby_lsn >= primary_lsn {
+            last_wal_replay_lsn = lsn.into();
+            if last_wal_replay_lsn >= primary_lsn {
                break;
            }
-            info!(%standby_lsn, %primary_lsn, "catching up, try {i}");
-            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+            info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
+            sleep(Duration::from_secs(1)).await;
        }
-        if standby_lsn < primary_lsn {
+        if last_wal_replay_lsn < primary_lsn {
            bail!("didn't catch up with primary in {RETRIES} retries");
        }
-        let lsn_wait_time_ms = now.elapsed().as_millis() as u32;
-        now = Instant::now();

        // using $1 doesn't work with ALTER SYSTEM SET
        let safekeepers_sql = format!(
@@ -92,33 +102,27 @@ impl ComputeNode {
            .query(&safekeepers_sql, &[])
            .await
            .context("setting safekeepers")?;
-        client
-            .query(
-                "ALTER SYSTEM SET synchronous_standby_names=walproposer",
-                &[],
-            )
-            .await
-            .context("setting synchronous_standby_names")?;
        client
            .query("SELECT pg_catalog.pg_reload_conf()", &[])
            .await
            .context("reloading postgres config")?;

        #[cfg(feature = "testing")]
-        fail::fail_point!("compute-promotion", |_| bail!(
-            "compute-promotion failpoint"
-        ));
+        fail::fail_point!("compute-promotion", |_| {
+            bail!("promotion configured to fail because of a failpoint")
+        });

        let row = client
            .query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
            .await
            .context("pg_promote")?;
        if !row.get::<usize, bool>(0) {
-            bail!("pg_promote() failed");
+            bail!("pg_promote() returned false");
        }
-        let pg_promote_time_ms = now.elapsed().as_millis() as u32;
-        let now = Instant::now();

+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+            .await
+            .context("connecting to postgres")?;
        let row = client
            .query_one("SHOW transaction_read_only", &[])
            .await
@@ -127,47 +131,36 @@ impl ComputeNode {
            bail!("replica in read only mode after promotion");
        }

-        // Already checked validity in http handler
-        #[allow(unused_mut)]
-        let mut new_pspec = crate::compute::ParsedSpec::try_from(cfg.spec).expect("invalid spec");
        {
            let mut state = self.state.lock().unwrap();
-
-            // Local setup has different ports for pg process (port=) for primary and secondary.
-            // Primary is stopped so we need secondary's "port" value
-            #[cfg(feature = "testing")]
-            {
-                let old_spec = &state.pspec.as_ref().unwrap().spec;
-                let Some(old_conf) = old_spec.cluster.postgresql_conf.as_ref() else {
-                    bail!("pspec.spec.cluster.postgresql_conf missing for endpoint");
-                };
-                let set: std::collections::HashMap<&str, &str> = old_conf
-                    .split_terminator('\n')
-                    .map(|e| e.split_once("=").expect("invalid item"))
-                    .collect();
-
-                let Some(new_conf) = new_pspec.spec.cluster.postgresql_conf.as_mut() else {
-                    bail!("pspec.spec.cluster.postgresql_conf missing for supplied config");
-                };
-                new_conf.push_str(&format!("port={}\n", set["port"]));
-            }
-
-            tracing::debug!("applied spec: {:#?}", new_pspec.spec);
-            if self.params.lakebase_mode {
-                ComputeNode::set_spec(&self.params, &mut state, new_pspec);
-            } else {
-                state.pspec = Some(new_pspec);
-            }
+            let spec = &mut state.pspec.as_mut().unwrap().spec;
+            spec.mode = ComputeMode::Primary;
+            let new_conf = cfg.spec.cluster.postgresql_conf.as_mut().unwrap();
+            let existing_conf = spec.cluster.postgresql_conf.as_ref().unwrap();
+            Self::merge_spec(new_conf, existing_conf);
        }
-
        info!("applied new spec, reconfiguring as primary");
-        self.reconfigure()?;
-        let reconfigure_time_ms = now.elapsed().as_millis() as u32;
+        self.reconfigure()
+    }

-        Ok(PromoteState::Completed {
-            lsn_wait_time_ms,
-            pg_promote_time_ms,
-            reconfigure_time_ms,
-        })
+    /// Merge old and new Postgres conf specs to apply on secondary.
+    /// Change new spec's port and safekeepers since they are supplied
+    /// differenly
+    fn merge_spec(new_conf: &mut String, existing_conf: &str) {
+        let mut new_conf_set: HashMap<&str, &str> = new_conf
+            .split_terminator('\n')
+            .map(|e| e.split_once("=").expect("invalid item"))
+            .collect();
+        new_conf_set.remove("neon.safekeepers");
+
+        let existing_conf_set: HashMap<&str, &str> = existing_conf
+            .split_terminator('\n')
+            .map(|e| e.split_once("=").expect("invalid item"))
+            .collect();
+        new_conf_set.insert("port", existing_conf_set["port"]);
+        *new_conf = new_conf_set
+            .iter()
+            .map(|(k, v)| format!("{k}={v}"))
+            .join("\n");
    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -65,19 +65,14 @@ pub fn write_postgres_conf(
        writeln!(file, "{conf}")?;
    }

+    // Stripe size GUC should be defined prior to connection string
+    if let Some(stripe_size) = spec.shard_stripe_size {
+        writeln!(file, "neon.stripe_size={stripe_size}")?;
+    }
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    writeln!(file)?;
    if let Some(conninfo) = &spec.pageserver_connection_info {
-        // Stripe size GUC should be defined prior to connection string
-        if let Some(stripe_size) = conninfo.stripe_size {
-            writeln!(
-                file,
-                "# from compute spec's pageserver_connection_info.stripe_size field"
-            )?;
-            writeln!(file, "neon.stripe_size={stripe_size}")?;
-        }
-
        let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
        let num_shards = if conninfo.shard_count.0 == 0 {
            1 // unsharded, treat it as a single shard
@@ -115,7 +110,7 @@ pub fn write_postgres_conf(
        if let Some(libpq_urls) = libpq_urls {
            writeln!(
                file,
-                "# derived from compute spec's pageserver_connection_info field"
+                "# derived from compute spec's pageserver_conninfo field"
            )?;
            writeln!(
                file,
@@ -125,16 +120,24 @@ pub fn write_postgres_conf(
        } else {
            writeln!(file, "# no neon.pageserver_connstring")?;
        }
-    } else {
-        // Stripe size GUC should be defined prior to connection string
-        if let Some(stripe_size) = spec.shard_stripe_size {
-            writeln!(file, "# from compute spec's shard_stripe_size field")?;
+
+        if let Some(stripe_size) = conninfo.stripe_size {
+            writeln!(
+                file,
+                "# from compute spec's pageserver_conninfo.stripe_size field"
+            )?;
            writeln!(file, "neon.stripe_size={stripe_size}")?;
        }
+    } else {
        if let Some(s) = &spec.pageserver_connstring {
            writeln!(file, "# from compute spec's pageserver_connstring field")?;
            writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
        }
+
+        if let Some(stripe_size) = spec.shard_stripe_size {
+            writeln!(file, "# from compute spec's shard_stripe_size field")?;
+            writeln!(file, "neon.stripe_size={stripe_size}")?;
+        }
    }

    if !spec.safekeeper_connstrings.is_empty() {
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -617,6 +617,9 @@ components:
      type: object
      required:
        - status
+        - total
+        - prewarmed
+        - skipped
      properties:
        status:
          description: LFC prewarm status
@@ -634,15 +637,6 @@ components:
        skipped:
          description: Pages processed but not prewarmed
          type: integer
-        state_download_time_ms:
-          description: Time it takes to download LFC state to compute
-          type: integer
-        uncompress_time_ms:
-          description: Time it takes to uncompress LFC state
-          type: integer
-        prewarm_time_ms:
-          description: Time it takes to prewarm LFC state in Postgres
-          type: integer

    LfcOffloadState:
      type: object
@@ -656,16 +650,6 @@ components:
        error:
          description: LFC offload error, if any
          type: string
-        state_query_time_ms:
-          description: Time it takes to get LFC state from Postgres
-          type: integer
-        compress_time_ms:
-          description: Time it takes to compress LFC state
-          type: integer
-        state_upload_time_ms:
-          description: Time it takes to upload LFC state to endpoint storage
-          type: integer
-

    PromoteState:
      type: object
@@ -679,15 +663,6 @@ components:
        error:
          description: Promote error, if any
          type: string
-        lsn_wait_time_ms:
-          description: Time it takes for secondary to catch up with primary WAL flush LSN
-          type: integer
-        pg_promote_time_ms:
-          description: Time it takes to call pg_promote on secondary
-          type: integer
-        reconfigure_time_ms:
-          description: Time it takes to reconfigure promoted secondary
-          type: integer

    SetRoleGrantsRequest:
      type: object
--- a/compute_tools/src/http/routes/lfc.rs
+++ b/compute_tools/src/http/routes/lfc.rs
@@ -1,11 +1,12 @@
+use crate::compute_prewarm::LfcPrewarmStateWithProgress;
 use crate::http::JsonResponse;
 use axum::response::{IntoResponse, Response};
 use axum::{Json, http::StatusCode};
 use axum_extra::extract::OptionalQuery;
-use compute_api::responses::{LfcOffloadState, LfcPrewarmState};
+use compute_api::responses::LfcOffloadState;
 type Compute = axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>;

-pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json<LfcPrewarmState> {
+pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json<LfcPrewarmStateWithProgress> {
    Json(compute.lfc_prewarm_state().await)
 }

--- a/compute_tools/src/http/routes/promote.rs
+++ b/compute_tools/src/http/routes/promote.rs
@@ -1,22 +1,11 @@
 use crate::http::JsonResponse;
 use axum::extract::Json;
-use compute_api::responses::PromoteConfig;
 use http::StatusCode;

 pub(in crate::http) async fn promote(
    compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
-    Json(cfg): Json<PromoteConfig>,
+    Json(cfg): Json<compute_api::responses::PromoteConfig>,
 ) -> axum::response::Response {
-    // Return early at the cost of extra parsing spec
-    let pspec = match crate::compute::ParsedSpec::try_from(cfg.spec) {
-        Ok(p) => p,
-        Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
-    };
-
-    let cfg = PromoteConfig {
-        spec: pspec.spec,
-        wal_flush_lsn: cfg.wal_flush_lsn,
-    };
    let state = compute.promote(cfg).await;
    if let compute_api::responses::PromoteState::Failed { error: _ } = state {
        return JsonResponse::create_response(StatusCode::INTERNAL_SERVER_ERROR, state);
--- a/compute_tools/src/sql/anon_ext_fn_reassign.sql
+++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql
@@ -0,0 +1,13 @@
+DO $$
+DECLARE
+    query varchar;
+BEGIN
+    FOR query IN
+    SELECT pg_catalog.format('ALTER FUNCTION %I(%s) OWNER TO {db_owner};', p.oid::regproc, pg_catalog.pg_get_function_identity_arguments(p.oid))
+    FROM pg_catalog.pg_proc p
+        WHERE p.pronamespace OPERATOR(pg_catalog.=) 'anon'::regnamespace::oid
+    LOOP
+        EXECUTE query;
+    END LOOP;
+END
+$$;
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -71,9 +71,8 @@ const DEFAULT_PG_VERSION_NUM: &str = "17";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

-/// Neon CLI.
 #[derive(clap::Parser)]
-#[command(version = GIT_VERSION, name = "Neon CLI")]
+#[command(version = GIT_VERSION, about, name = "Neon CLI")]
 struct Cli {
    #[command(subcommand)]
    command: NeonLocalCmd,
@@ -108,31 +107,30 @@ enum NeonLocalCmd {
    Stop(StopCmdArgs),
 }

-/// Initialize a new Neon repository, preparing configs for services to start with.
 #[derive(clap::Args)]
+#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")]
 struct InitCmdArgs {
-    /// How many pageservers to create (default 1).
-    #[clap(long)]
+    #[clap(long, help("How many pageservers to create (default 1)"))]
    num_pageservers: Option<u16>,

    #[clap(long)]
    config: Option<PathBuf>,

-    /// Force initialization even if the repository is not empty.
-    #[clap(long, default_value = "must-not-exist")]
+    #[clap(long, help("Force initialization even if the repository is not empty"))]
    #[arg(value_parser)]
+    #[clap(default_value = "must-not-exist")]
    force: InitForceMode,
 }

-/// Start pageserver and safekeepers.
 #[derive(clap::Args)]
+#[clap(about = "Start pageserver and safekeepers")]
 struct StartCmdArgs {
    #[clap(long = "start-timeout", default_value = "10s")]
    timeout: humantime::Duration,
 }

-/// Stop pageserver and safekeepers.
 #[derive(clap::Args)]
+#[clap(about = "Stop pageserver and safekeepers")]
 struct StopCmdArgs {
    #[arg(value_enum)]
    #[clap(long, default_value_t = StopMode::Fast)]
@@ -145,8 +143,8 @@ enum StopMode {
    Immediate,
 }

-/// Manage tenants.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage tenants")]
 enum TenantCmd {
    List,
    Create(TenantCreateCmdArgs),
@@ -157,36 +155,38 @@ enum TenantCmd {

 #[derive(clap::Args)]
 struct TenantCreateCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: Option<TenantId>,

-    /// Use a specific timeline id when creating a tenant and its initial timeline.
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Use a specific timeline id when creating a tenant and its initial timeline"
+    )]
    timeline_id: Option<TimelineId>,

    #[clap(short = 'c')]
    config: Vec<String>,

-    /// Postgres version to use for the initial timeline.
    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
-    #[clap(long)]
+    #[clap(long, help = "Postgres version to use for the initial timeline")]
    pg_version: PgMajorVersion,

-    /// Use this tenant in future CLI commands where tenant_id is needed, but not specified.
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified"
+    )]
    set_default: bool,

-    /// Number of shards in the new tenant.
-    #[clap(long)]
+    #[clap(long, help = "Number of shards in the new tenant")]
    #[arg(default_value_t = 0)]
    shard_count: u8,
-    /// Sharding stripe size in pages.
-    #[clap(long)]
+    #[clap(long, help = "Sharding stripe size in pages")]
    shard_stripe_size: Option<u32>,

-    /// Placement policy shards in this tenant.
-    #[clap(long)]
+    #[clap(long, help = "Placement policy shards in this tenant")]
    #[arg(value_parser = parse_placement_policy)]
    placement_policy: Option<PlacementPolicy>,
 }
@@ -195,35 +195,44 @@ fn parse_placement_policy(s: &str) -> anyhow::Result<PlacementPolicy> {
    Ok(serde_json::from_str::<PlacementPolicy>(s)?)
 }

-/// Set a particular tenant as default in future CLI commands where tenant_id is needed, but not
-/// specified.
 #[derive(clap::Args)]
+#[clap(
+    about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"
+)]
 struct TenantSetDefaultCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: TenantId,
 }

 #[derive(clap::Args)]
 struct TenantConfigCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: Option<TenantId>,

    #[clap(short = 'c')]
    config: Vec<String>,
 }

-/// Import a tenant that is present in remote storage, and create branches for its timelines.
 #[derive(clap::Args)]
+#[clap(
+    about = "Import a tenant that is present in remote storage, and create branches for its timelines"
+)]
 struct TenantImportCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: TenantId,
 }

-/// Manage timelines.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage timelines")]
 enum TimelineCmd {
    List(TimelineListCmdArgs),
    Branch(TimelineBranchCmdArgs),
@@ -231,87 +240,98 @@ enum TimelineCmd {
    Import(TimelineImportCmdArgs),
 }

-/// List all timelines available to this pageserver.
 #[derive(clap::Args)]
+#[clap(about = "List all timelines available to this pageserver")]
 struct TimelineListCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_shard_id: Option<TenantShardId>,
 }

-/// Create a new timeline, branching off from another timeline.
 #[derive(clap::Args)]
+#[clap(about = "Create a new timeline, branching off from another timeline")]
 struct TimelineBranchCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: Option<TenantId>,
-    /// New timeline's ID, as a 32-byte hexadecimal string.
-    #[clap(long)]
+
+    #[clap(long, help = "New timeline's ID")]
    timeline_id: Option<TimelineId>,
-    /// Human-readable alias for the new timeline.
-    #[clap(long)]
+
+    #[clap(long, help = "Human-readable alias for the new timeline")]
    branch_name: String,
-    /// Use last Lsn of another timeline (and its data) as base when creating the new timeline. The
-    /// timeline gets resolved by its branch name.
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name."
+    )]
    ancestor_branch_name: Option<String>,
-    /// When using another timeline as base, use a specific Lsn in it instead of the latest one.
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "When using another timeline as base, use a specific Lsn in it instead of the latest one"
+    )]
    ancestor_start_lsn: Option<Lsn>,
 }

-/// Create a new blank timeline.
 #[derive(clap::Args)]
+#[clap(about = "Create a new blank timeline")]
 struct TimelineCreateCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: Option<TenantId>,
-    /// New timeline's ID, as a 32-byte hexadecimal string.
-    #[clap(long)]
+
+    #[clap(long, help = "New timeline's ID")]
    timeline_id: Option<TimelineId>,
-    /// Human-readable alias for the new timeline.
-    #[clap(long)]
+
+    #[clap(long, help = "Human-readable alias for the new timeline")]
    branch_name: String,

-    /// Postgres version.
    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
-    #[clap(long)]
+    #[clap(long, help = "Postgres version")]
    pg_version: PgMajorVersion,
 }

-/// Import a timeline from a basebackup directory.
 #[derive(clap::Args)]
+#[clap(about = "Import timeline from a basebackup directory")]
 struct TimelineImportCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: Option<TenantId>,
-    /// New timeline's ID, as a 32-byte hexadecimal string.
-    #[clap(long)]
+
+    #[clap(long, help = "New timeline's ID")]
    timeline_id: TimelineId,
-    /// Human-readable alias for the new timeline.
-    #[clap(long)]
+
+    #[clap(long, help = "Human-readable alias for the new timeline")]
    branch_name: String,
-    /// Basebackup tarfile to import.
-    #[clap(long)]
+
+    #[clap(long, help = "Basebackup tarfile to import")]
    base_tarfile: PathBuf,
-    /// LSN the basebackup starts at.
-    #[clap(long)]
+
+    #[clap(long, help = "Lsn the basebackup starts at")]
    base_lsn: Lsn,
-    /// WAL to add after base.
-    #[clap(long)]
+
+    #[clap(long, help = "Wal to add after base")]
    wal_tarfile: Option<PathBuf>,
-    /// LSN the basebackup ends at.
-    #[clap(long)]
+
+    #[clap(long, help = "Lsn the basebackup ends at")]
    end_lsn: Option<Lsn>,

-    /// Postgres version of the basebackup being imported.
    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
-    #[clap(long)]
+    #[clap(long, help = "Postgres version of the backup being imported")]
    pg_version: PgMajorVersion,
 }

-/// Manage pageservers.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage pageservers")]
 enum PageserverCmd {
    Status(PageserverStatusCmdArgs),
    Start(PageserverStartCmdArgs),
@@ -319,202 +339,223 @@ enum PageserverCmd {
    Restart(PageserverRestartCmdArgs),
 }

-/// Show status of a local pageserver.
 #[derive(clap::Args)]
+#[clap(about = "Show status of a local pageserver")]
 struct PageserverStatusCmdArgs {
-    /// Pageserver ID.
-    #[clap(long = "id")]
+    #[clap(long = "id", help = "pageserver id")]
    pageserver_id: Option<NodeId>,
 }

-/// Start local pageserver.
 #[derive(clap::Args)]
+#[clap(about = "Start local pageserver")]
 struct PageserverStartCmdArgs {
-    /// Pageserver ID.
-    #[clap(long = "id")]
+    #[clap(long = "id", help = "pageserver id")]
    pageserver_id: Option<NodeId>,
-    /// Timeout until we fail the command.
-    #[clap(short = 't', long)]
+
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
 }

-/// Stop local pageserver.
 #[derive(clap::Args)]
+#[clap(about = "Stop local pageserver")]
 struct PageserverStopCmdArgs {
-    /// Pageserver ID.
-    #[clap(long = "id")]
+    #[clap(long = "id", help = "pageserver id")]
    pageserver_id: Option<NodeId>,
-    /// If 'immediate', don't flush repository data at shutdown
-    #[clap(short = 'm')]
+
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
    #[arg(value_enum, default_value = "fast")]
    stop_mode: StopMode,
 }

-/// Restart local pageserver.
 #[derive(clap::Args)]
+#[clap(about = "Restart local pageserver")]
 struct PageserverRestartCmdArgs {
-    /// Pageserver ID.
-    #[clap(long = "id")]
+    #[clap(long = "id", help = "pageserver id")]
    pageserver_id: Option<NodeId>,
-    /// Timeout until we fail the command.
-    #[clap(short = 't', long)]
+
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
 }

-/// Manage storage controller.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage storage controller")]
 enum StorageControllerCmd {
    Start(StorageControllerStartCmdArgs),
    Stop(StorageControllerStopCmdArgs),
 }

-/// Start storage controller.
 #[derive(clap::Args)]
+#[clap(about = "Start storage controller")]
 struct StorageControllerStartCmdArgs {
-    /// Timeout until we fail the command.
-    #[clap(short = 't', long)]
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
-    /// Identifier used to distinguish storage controller instances.
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Identifier used to distinguish storage controller instances"
+    )]
    #[arg(default_value_t = 1)]
    instance_id: u8,
-    /// Base port for the storage controller instance identified by instance-id (defaults to
-    /// pageserver cplane api).
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
+    )]
    base_port: Option<u16>,

-    /// Whether the storage controller should handle pageserver-reported local disk loss events.
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Whether the storage controller should handle pageserver-reported local disk loss events."
+    )]
    handle_ps_local_disk_loss: Option<bool>,
 }

-/// Stop storage controller.
 #[derive(clap::Args)]
+#[clap(about = "Stop storage controller")]
 struct StorageControllerStopCmdArgs {
-    /// If 'immediate', don't flush repository data at shutdown
-    #[clap(short = 'm')]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
    #[arg(value_enum, default_value = "fast")]
    stop_mode: StopMode,
-    /// Identifier used to distinguish storage controller instances.
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Identifier used to distinguish storage controller instances"
+    )]
    #[arg(default_value_t = 1)]
    instance_id: u8,
 }

-/// Manage storage broker.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage storage broker")]
 enum StorageBrokerCmd {
    Start(StorageBrokerStartCmdArgs),
    Stop(StorageBrokerStopCmdArgs),
 }

-/// Start broker.
 #[derive(clap::Args)]
+#[clap(about = "Start broker")]
 struct StorageBrokerStartCmdArgs {
-    /// Timeout until we fail the command.
-    #[clap(short = 't', long, default_value = "10s")]
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
 }

-/// Stop broker.
 #[derive(clap::Args)]
+#[clap(about = "stop broker")]
 struct StorageBrokerStopCmdArgs {
-    /// If 'immediate', don't flush repository data on shutdown.
-    #[clap(short = 'm')]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
    #[arg(value_enum, default_value = "fast")]
    stop_mode: StopMode,
 }

-/// Manage safekeepers.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage safekeepers")]
 enum SafekeeperCmd {
    Start(SafekeeperStartCmdArgs),
    Stop(SafekeeperStopCmdArgs),
    Restart(SafekeeperRestartCmdArgs),
 }

-/// Manage object storage.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage object storage")]
 enum EndpointStorageCmd {
    Start(EndpointStorageStartCmd),
    Stop(EndpointStorageStopCmd),
 }

-/// Start object storage.
 #[derive(clap::Args)]
+#[clap(about = "Start object storage")]
 struct EndpointStorageStartCmd {
-    /// Timeout until we fail the command.
-    #[clap(short = 't', long)]
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
 }

-/// Stop object storage.
 #[derive(clap::Args)]
+#[clap(about = "Stop object storage")]
 struct EndpointStorageStopCmd {
-    /// If 'immediate', don't flush repository data on shutdown.
-    #[clap(short = 'm')]
    #[arg(value_enum, default_value = "fast")]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
    stop_mode: StopMode,
 }

-/// Start local safekeeper.
 #[derive(clap::Args)]
+#[clap(about = "Start local safekeeper")]
 struct SafekeeperStartCmdArgs {
-    /// Safekeeper ID.
+    #[clap(help = "safekeeper id")]
    #[arg(default_value_t = NodeId(1))]
    id: NodeId,

-    /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
-    #[clap(short = 'e', long = "safekeeper-extra-opt")]
+    #[clap(
+        short = 'e',
+        long = "safekeeper-extra-opt",
+        help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
+    )]
    extra_opt: Vec<String>,

-    /// Timeout until we fail the command.
-    #[clap(short = 't', long)]
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
 }

-/// Stop local safekeeper.
 #[derive(clap::Args)]
+#[clap(about = "Stop local safekeeper")]
 struct SafekeeperStopCmdArgs {
-    /// Safekeeper ID.
+    #[clap(help = "safekeeper id")]
    #[arg(default_value_t = NodeId(1))]
    id: NodeId,

-    /// If 'immediate', don't flush repository data on shutdown.
    #[arg(value_enum, default_value = "fast")]
-    #[clap(short = 'm')]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
    stop_mode: StopMode,
 }

-/// Restart local safekeeper.
 #[derive(clap::Args)]
+#[clap(about = "Restart local safekeeper")]
 struct SafekeeperRestartCmdArgs {
-    /// Safekeeper ID.
+    #[clap(help = "safekeeper id")]
    #[arg(default_value_t = NodeId(1))]
    id: NodeId,

-    /// If 'immediate', don't flush repository data on shutdown.
    #[arg(value_enum, default_value = "fast")]
-    #[clap(short = 'm')]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
    stop_mode: StopMode,

-    /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
-    #[clap(short = 'e', long = "safekeeper-extra-opt")]
+    #[clap(
+        short = 'e',
+        long = "safekeeper-extra-opt",
+        help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
+    )]
    extra_opt: Vec<String>,

-    /// Timeout until we fail the command.
-    #[clap(short = 't', long)]
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
 }

-/// Manage Postgres instances.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage Postgres instances")]
 enum EndpointCmd {
    List(EndpointListCmdArgs),
    Create(EndpointCreateCmdArgs),
@@ -526,27 +567,33 @@ enum EndpointCmd {
    GenerateJwt(EndpointGenerateJwtCmdArgs),
 }

-/// List endpoints.
 #[derive(clap::Args)]
+#[clap(about = "List endpoints")]
 struct EndpointListCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_shard_id: Option<TenantShardId>,
 }

-/// Create a compute endpoint.
 #[derive(clap::Args)]
+#[clap(about = "Create a compute endpoint")]
 struct EndpointCreateCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: Option<TenantId>,
-    /// Postgres endpoint ID.
+
+    #[clap(help = "Postgres endpoint id")]
    endpoint_id: Option<String>,
-    /// Name of the branch the endpoint will run on.
-    #[clap(long)]
+    #[clap(long, help = "Name of the branch the endpoint will run on")]
    branch_name: Option<String>,
-    /// Specify LSN on the timeline to start from. By default, end of the timeline would be used.
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used"
+    )]
    lsn: Option<Lsn>,
    #[clap(long)]
    pg_port: Option<u16>,
@@ -557,13 +604,16 @@ struct EndpointCreateCmdArgs {
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option<NodeId>,

-    /// Don't do basebackup, create endpoint directory with only config files.
-    #[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
+    #[clap(
+        long,
+        help = "Don't do basebackup, create endpoint directory with only config files",
+        action = clap::ArgAction::Set,
+        default_value_t = false
+    )]
    config_only: bool,

-    /// Postgres version.
    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
-    #[clap(long)]
+    #[clap(long, help = "Postgres version")]
    pg_version: PgMajorVersion,

    /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
@@ -574,140 +624,170 @@ struct EndpointCreateCmdArgs {
    #[clap(long)]
    grpc: bool,

-    /// If set, the node will be a hot replica on the specified timeline.
-    #[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
+    #[clap(
+        long,
+        help = "If set, the node will be a hot replica on the specified timeline",
+        action = clap::ArgAction::Set,
+        default_value_t = false
+    )]
    hot_standby: bool,
-    /// If set, will set up the catalog for neon_superuser.
-    #[clap(long)]
+
+    #[clap(long, help = "If set, will set up the catalog for neon_superuser")]
    update_catalog: bool,
-    /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
-    /// useful for tests.
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
+    )]
    allow_multiple: bool,

-    /// Name of the privileged role for the endpoint.
-    // Only allow changing it on creation.
-    #[clap(long)]
+    /// Only allow changing it on creation
+    #[clap(long, help = "Name of the privileged role for the endpoint")]
    privileged_role_name: Option<String>,
 }

-/// Start Postgres. If the endpoint doesn't exist yet, it is created.
 #[derive(clap::Args)]
+#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")]
 struct EndpointStartCmdArgs {
-    /// Postgres endpoint ID.
+    #[clap(help = "Postgres endpoint id")]
    endpoint_id: String,
-    /// Pageserver ID.
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option<NodeId>,
-    /// Safekeepers membership generation to prefix neon.safekeepers with.
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
+    )]
    safekeepers_generation: Option<u32>,
-    /// List of safekeepers endpoint will talk to.
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
+    )]
    safekeepers: Option<String>,
-    /// Configure the remote extensions storage proxy gateway URL to request for extensions.
-    #[clap(long, alias = "remote-ext-config")]
+
+    #[clap(
+        long,
+        help = "Configure the remote extensions storage proxy gateway URL to request for extensions.",
+        alias = "remote-ext-config"
+    )]
    remote_ext_base_url: Option<String>,
-    /// If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`"
+    )]
    create_test_user: bool,
-    /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
-    /// useful for tests.
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
+    )]
    allow_multiple: bool,
-    /// Timeout until we fail the command.
-    #[clap(short = 't', long, value_parser= humantime::parse_duration)]
+
+    #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
    #[arg(default_value = "90s")]
    start_timeout: Duration,

-    /// Download LFC cache from endpoint storage on endpoint startup
-    #[clap(long, default_value = "false")]
+    #[clap(
+        long,
+        help = "Download LFC cache from endpoint storage on endpoint startup",
+        default_value = "false"
+    )]
    autoprewarm: bool,

-    /// Upload LFC cache to endpoint storage periodically
-    #[clap(long)]
+    #[clap(long, help = "Upload LFC cache to endpoint storage periodically")]
    offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,

-    /// Run in development mode, skipping VM-specific operations like process termination
-    #[clap(long, action = clap::ArgAction::SetTrue)]
+    #[clap(
+        long,
+        help = "Run in development mode, skipping VM-specific operations like process termination",
+        action = clap::ArgAction::SetTrue
+    )]
    dev: bool,
 }

-/// Reconfigure an endpoint.
 #[derive(clap::Args)]
+#[clap(about = "Reconfigure an endpoint")]
 struct EndpointReconfigureCmdArgs {
-    /// Tenant id. Represented as a hexadecimal string 32 symbols length
-    #[clap(long = "tenant-id")]
+    #[clap(
+        long = "tenant-id",
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: Option<TenantId>,
-    /// Postgres endpoint ID.
+
+    #[clap(help = "Postgres endpoint id")]
    endpoint_id: String,
-    /// Pageserver ID.
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option<NodeId>,
+
    #[clap(long)]
    safekeepers: Option<String>,
 }

-/// Refresh the endpoint's configuration by forcing it reload it's spec
 #[derive(clap::Args)]
+#[clap(about = "Refresh the endpoint's configuration by forcing it reload it's spec")]
 struct EndpointRefreshConfigurationArgs {
-    /// Postgres endpoint id
+    #[clap(help = "Postgres endpoint id")]
    endpoint_id: String,
 }

-/// Stop an endpoint.
 #[derive(clap::Args)]
+#[clap(about = "Stop an endpoint")]
 struct EndpointStopCmdArgs {
-    /// Postgres endpoint ID.
+    #[clap(help = "Postgres endpoint id")]
    endpoint_id: String,
-    /// Also delete data directory (now optional, should be default in future).
-    #[clap(long)]
+
+    #[clap(
+        long,
+        help = "Also delete data directory (now optional, should be default in future)"
+    )]
    destroy: bool,

-    /// Postgres shutdown mode, passed to `pg_ctl -m <mode>`.
-    #[clap(long)]
+    #[clap(long, help = "Postgres shutdown mode")]
    #[clap(default_value = "fast")]
    mode: EndpointTerminateMode,
 }

-/// Update the pageservers in the spec file of the compute endpoint
 #[derive(clap::Args)]
+#[clap(about = "Update the pageservers in the spec file of the compute endpoint")]
 struct EndpointUpdatePageserversCmdArgs {
-    /// Postgres endpoint id
+    #[clap(help = "Postgres endpoint id")]
    endpoint_id: String,

-    /// Specified pageserver id
-    #[clap(short = 'p', long)]
+    #[clap(short = 'p', long, help = "Specified pageserver id")]
    pageserver_id: Option<NodeId>,
 }

-/// Generate a JWT for an endpoint.
 #[derive(clap::Args)]
+#[clap(about = "Generate a JWT for an endpoint")]
 struct EndpointGenerateJwtCmdArgs {
-    /// Postgres endpoint ID.
+    #[clap(help = "Postgres endpoint id")]
    endpoint_id: String,
-    /// Scope to generate the JWT with.
-    #[clap(short = 's', long, value_parser = ComputeClaimsScope::from_str)]
+
+    #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)]
    scope: Option<ComputeClaimsScope>,
 }

-/// Manage neon_local branch name mappings.
 #[derive(clap::Subcommand)]
+#[clap(about = "Manage neon_local branch name mappings")]
 enum MappingsCmd {
    Map(MappingsMapCmdArgs),
 }

-/// Create new mapping which cannot exist already.
 #[derive(clap::Args)]
+#[clap(about = "Create new mapping which cannot exist already")]
 struct MappingsMapCmdArgs {
-    /// Tenant ID, as a 32-byte hexadecimal string.
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
+    )]
    tenant_id: TenantId,
-    /// Timeline ID, as a 32-byte hexadecimal string.
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Timeline id. Represented as a hexadecimal string 32 symbols length"
+    )]
    timeline_id: TimelineId,
-    /// Branch name to give to the timeline.
-    #[clap(long)]
+    #[clap(long, help = "Branch name to give to the timeline")]
    branch_name: String,
 }

--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -303,13 +303,6 @@ enum Command {
        #[arg(long, required = true, value_delimiter = ',')]
        new_sk_set: Vec<NodeId>,
    },
-    /// Abort ongoing safekeeper migration.
-    TimelineSafekeeperMigrateAbort {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        timeline_id: TimelineId,
-    },
 }

 #[derive(Parser)]
@@ -1403,17 +1396,6 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::TimelineSafekeeperMigrateAbort {
-            tenant_id,
-            timeline_id,
-        } => {
-            let path =
-                format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort");
-
-            storcon_client
-                .dispatch::<(), ()>(Method::POST, path, None)
-                .await?;
-        }
    }

    Ok(())
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -1,9 +1,10 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.

+use std::fmt::Display;
+
 use chrono::{DateTime, Utc};
 use jsonwebtoken::jwk::JwkSet;
 use serde::{Deserialize, Serialize, Serializer};
-use std::fmt::Display;

 use crate::privilege::Privilege;
 use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role};
@@ -48,7 +49,7 @@ pub struct ExtensionInstallResponse {
 /// Status of the LFC prewarm process. The same state machine is reused for
 /// both autoprewarm (prewarm after compute/Postgres start using the previously
 /// stored LFC state) and explicit prewarming via API.
-#[derive(Serialize, Default, Debug, Clone)]
+#[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
    /// Default value when compute boots up.
@@ -58,14 +59,7 @@ pub enum LfcPrewarmState {
    Prewarming,
    /// We found requested LFC state in the endpoint storage and
    /// completed prewarming successfully.
-    Completed {
-        total: i32,
-        prewarmed: i32,
-        skipped: i32,
-        state_download_time_ms: u32,
-        uncompress_time_ms: u32,
-        prewarm_time_ms: u32,
-    },
+    Completed,
    /// Unexpected error happened during prewarming. Note, `Not Found 404`
    /// response from the endpoint storage is explicitly excluded here
    /// because it can normally happen on the first compute start,
@@ -90,7 +84,7 @@ impl Display for LfcPrewarmState {
        match self {
            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
-            LfcPrewarmState::Completed { .. } => f.write_str("Completed"),
+            LfcPrewarmState::Completed => f.write_str("Completed"),
            LfcPrewarmState::Skipped => f.write_str("Skipped"),
            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
            LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
@@ -98,36 +92,26 @@ impl Display for LfcPrewarmState {
    }
 }

-#[derive(Serialize, Default, Debug, Clone)]
+#[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcOffloadState {
    #[default]
    NotOffloaded,
    Offloading,
-    Completed {
-        state_query_time_ms: u32,
-        compress_time_ms: u32,
-        state_upload_time_ms: u32,
-    },
+    Completed,
    Failed {
        error: String,
    },
-    /// LFC state was empty so it wasn't offloaded
    Skipped,
 }

-#[derive(Serialize, Debug, Clone)]
+#[derive(Serialize, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
+/// Response of /promote
 pub enum PromoteState {
    NotPromoted,
-    Completed {
-        lsn_wait_time_ms: u32,
-        pg_promote_time_ms: u32,
-        reconfigure_time_ms: u32,
-    },
-    Failed {
-        error: String,
-    },
+    Completed,
+    Failed { error: String },
 }

 #[derive(Deserialize, Default, Debug)]
--- a/proxy/src/serverless/rest.rs
+++ b/proxy/src/serverless/rest.rs
@@ -5,17 +5,12 @@ use std::sync::Arc;

 use bytes::Bytes;
 use http::Method;
-use http::header::{
-    ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN,
-    ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_REQUEST_HEADERS, ALLOW,
-    AUTHORIZATION, CONTENT_TYPE, HOST, ORIGIN,
-};
+use http::header::{AUTHORIZATION, CONTENT_TYPE, HOST};
 use http_body_util::combinators::BoxBody;
-use http_body_util::{BodyExt, Empty, Full};
+use http_body_util::{BodyExt, Full};
 use http_utils::error::ApiError;
 use hyper::body::Incoming;
-use hyper::http::response::Builder;
-use hyper::http::{HeaderMap, HeaderName, HeaderValue};
+use hyper::http::{HeaderName, HeaderValue};
 use hyper::{Request, Response, StatusCode};
 use indexmap::IndexMap;
 use moka::sync::Cache;
@@ -72,15 +67,6 @@ use crate::util::deserialize_json_string;

 static EMPTY_JSON_SCHEMA: &str = r#"{"schemas":[]}"#;
 const INTROSPECTION_SQL: &str = POSTGRESQL_INTROSPECTION_SQL;
-const HEADER_VALUE_ALLOW_ALL_ORIGINS: HeaderValue = HeaderValue::from_static("*");
-// CORS headers values
-const ACCESS_CONTROL_ALLOW_METHODS_VALUE: HeaderValue =
-    HeaderValue::from_static("GET, POST, PATCH, PUT, DELETE, OPTIONS");
-const ACCESS_CONTROL_MAX_AGE_VALUE: HeaderValue = HeaderValue::from_static("86400");
-const ACCESS_CONTROL_EXPOSE_HEADERS_VALUE: HeaderValue = HeaderValue::from_static(
-    "Content-Encoding, Content-Location, Content-Range, Content-Type, Date, Location, Server, Transfer-Encoding, Range-Unit",
-);
-const ACCESS_CONTROL_ALLOW_HEADERS_VALUE: HeaderValue = HeaderValue::from_static("Authorization");

 // A wrapper around the DbSchema that allows for self-referencing
 #[self_referencing]
@@ -151,8 +137,6 @@ pub struct ApiConfig {
    pub role_claim_key: String,
    #[serde(default, deserialize_with = "deserialize_comma_separated_option")]
    pub db_extra_search_path: Option<Vec<String>>,
-    #[serde(default, deserialize_with = "deserialize_comma_separated_option")]
-    pub server_cors_allowed_origins: Option<Vec<String>>,
 }

 // The DbSchemaCache is a cache of the ApiConfig and DbSchemaOwned for each endpoint
@@ -181,13 +165,7 @@ impl DbSchemaCache {
        }
    }

-    pub fn get_cached(
-        &self,
-        endpoint_id: &EndpointCacheKey,
-    ) -> Option<Arc<(ApiConfig, DbSchemaOwned)>> {
-        count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id))
-    }
-    pub async fn get_remote(
+    pub async fn get_cached_or_remote(
        &self,
        endpoint_id: &EndpointCacheKey,
        auth_header: &HeaderValue,
@@ -196,42 +174,47 @@ impl DbSchemaCache {
        ctx: &RequestContext,
        config: &'static ProxyConfig,
    ) -> Result<Arc<(ApiConfig, DbSchemaOwned)>, RestError> {
-        info!("db_schema cache miss for endpoint: {:?}", endpoint_id);
-        let remote_value = self
-            .internal_get_remote(auth_header, connection_string, client, ctx, config)
-            .await;
-        let (api_config, schema_owned) = match remote_value {
-            Ok((api_config, schema_owned)) => (api_config, schema_owned),
-            Err(e @ RestError::SchemaTooLarge) => {
-                // for the case where the schema is too large, we cache an empty dummy value
-                // all the other requests will fail without triggering the introspection query
-                let schema_owned = serde_json::from_str::<DbSchemaOwned>(EMPTY_JSON_SCHEMA)
-                    .map_err(|e| JsonDeserialize { source: e })?;
+        let cache_result = count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id));
+        match cache_result {
+            Some(v) => Ok(v),
+            None => {
+                info!("db_schema cache miss for endpoint: {:?}", endpoint_id);
+                let remote_value = self
+                    .get_remote(auth_header, connection_string, client, ctx, config)
+                    .await;
+                let (api_config, schema_owned) = match remote_value {
+                    Ok((api_config, schema_owned)) => (api_config, schema_owned),
+                    Err(e @ RestError::SchemaTooLarge) => {
+                        // for the case where the schema is too large, we cache an empty dummy value
+                        // all the other requests will fail without triggering the introspection query
+                        let schema_owned = serde_json::from_str::<DbSchemaOwned>(EMPTY_JSON_SCHEMA)
+                            .map_err(|e| JsonDeserialize { source: e })?;

-                let api_config = ApiConfig {
-                    db_schemas: vec![],
-                    db_anon_role: None,
-                    db_max_rows: None,
-                    db_allowed_select_functions: vec![],
-                    role_claim_key: String::new(),
-                    db_extra_search_path: None,
-                    server_cors_allowed_origins: None,
+                        let api_config = ApiConfig {
+                            db_schemas: vec![],
+                            db_anon_role: None,
+                            db_max_rows: None,
+                            db_allowed_select_functions: vec![],
+                            role_claim_key: String::new(),
+                            db_extra_search_path: None,
+                        };
+                        let value = Arc::new((api_config, schema_owned));
+                        count_cache_insert(CacheKind::Schema);
+                        self.0.insert(endpoint_id.clone(), value);
+                        return Err(e);
+                    }
+                    Err(e) => {
+                        return Err(e);
+                    }
                };
                let value = Arc::new((api_config, schema_owned));
                count_cache_insert(CacheKind::Schema);
-                self.0.insert(endpoint_id.clone(), value);
-                return Err(e);
+                self.0.insert(endpoint_id.clone(), value.clone());
+                Ok(value)
            }
-            Err(e) => {
-                return Err(e);
-            }
-        };
-        let value = Arc::new((api_config, schema_owned));
-        count_cache_insert(CacheKind::Schema);
-        self.0.insert(endpoint_id.clone(), value.clone());
-        Ok(value)
+        }
    }
-    async fn internal_get_remote(
+    pub async fn get_remote(
        &self,
        auth_header: &HeaderValue,
        connection_string: &str,
@@ -548,7 +531,7 @@ pub(crate) async fn handle(
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
    let result = handle_inner(cancel, config, &ctx, request, backend).await;

-    let response = match result {
+    let mut response = match result {
        Ok(r) => {
            ctx.set_success();

@@ -657,6 +640,9 @@ pub(crate) async fn handle(
        }
    };

+    response
+        .headers_mut()
+        .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*"));
    Ok(response)
 }

@@ -736,37 +722,6 @@ async fn handle_inner(
    }
 }

-fn apply_common_cors_headers(
-    response: &mut Builder,
-    request_headers: &HeaderMap,
-    allowed_origins: Option<&Vec<String>>,
-) {
-    let request_origin = request_headers
-        .get(ORIGIN)
-        .map(|v| v.to_str().unwrap_or(""));
-
-    let response_allow_origin = match (request_origin, allowed_origins) {
-        (Some(or), Some(allowed_origins)) => {
-            if allowed_origins.iter().any(|o| o == or) {
-                Some(HeaderValue::from_str(or).unwrap_or(HEADER_VALUE_ALLOW_ALL_ORIGINS))
-            } else {
-                None
-            }
-        }
-        (Some(_), None) => Some(HEADER_VALUE_ALLOW_ALL_ORIGINS),
-        _ => None,
-    };
-    if let Some(h) = response.headers_mut() {
-        h.insert(
-            ACCESS_CONTROL_EXPOSE_HEADERS,
-            ACCESS_CONTROL_EXPOSE_HEADERS_VALUE,
-        );
-        if let Some(origin) = response_allow_origin {
-            h.insert(ACCESS_CONTROL_ALLOW_ORIGIN, origin);
-        }
-    }
-}
-
 #[allow(clippy::too_many_arguments)]
 async fn handle_rest_inner(
    config: &'static ProxyConfig,
@@ -778,6 +733,12 @@ async fn handle_rest_inner(
    jwt: String,
    backend: Arc<PoolingBackend>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, RestError> {
+    // validate the jwt token
+    let jwt_parsed = backend
+        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+        .await
+        .map_err(HttpConnError::from)?;
+
    let db_schema_cache =
        config
            .rest_config
@@ -793,83 +754,28 @@ async fn handle_rest_inner(
            message: "Failed to get endpoint cache key".to_string(),
        }))?;

+    let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;
+
    let (parts, originial_body) = request.into_parts();

-    // try and get the cached entry for this endpoint
-    // it contains the api config and the introspected db schema
-    let cached_entry = db_schema_cache.get_cached(&endpoint_cache_key);
-
-    let allowed_origins = cached_entry
-        .as_ref()
-        .and_then(|arc| arc.0.server_cors_allowed_origins.as_ref());
-
-    let mut response = Response::builder();
-    apply_common_cors_headers(&mut response, &parts.headers, allowed_origins);
-
-    // handle the OPTIONS request
-    if parts.method == Method::OPTIONS {
-        let allowed_headers = parts
-            .headers
-            .get(ACCESS_CONTROL_REQUEST_HEADERS)
-            .and_then(|a| a.to_str().ok())
-            .filter(|v| !v.is_empty())
-            .map_or_else(
-                || "Authorization".to_string(),
-                |v| format!("{v}, Authorization"),
-            );
-        return response
-            .status(StatusCode::OK)
-            .header(
-                ACCESS_CONTROL_ALLOW_METHODS,
-                ACCESS_CONTROL_ALLOW_METHODS_VALUE,
-            )
-            .header(ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_MAX_AGE_VALUE)
-            .header(
-                ACCESS_CONTROL_ALLOW_HEADERS,
-                HeaderValue::from_str(&allowed_headers)
-                    .unwrap_or(ACCESS_CONTROL_ALLOW_HEADERS_VALUE),
-            )
-            .header(ALLOW, ACCESS_CONTROL_ALLOW_METHODS_VALUE)
-            .body(Empty::new().map_err(|x| match x {}).boxed())
-            .map_err(|e| {
-                RestError::SubzeroCore(InternalError {
-                    message: e.to_string(),
-                })
-            });
-    }
-
-    // validate the jwt token
-    let jwt_parsed = backend
-        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
-        .await
-        .map_err(HttpConnError::from)?;
-
    let auth_header = parts
        .headers
        .get(AUTHORIZATION)
        .ok_or(RestError::SubzeroCore(InternalError {
            message: "Authorization header is required".to_string(),
        }))?;
-    let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;

-    let entry = match cached_entry {
-        Some(e) => e,
-        None => {
-            // if not cached, get the remote entry (will run the introspection query)
-            db_schema_cache
-                .get_remote(
-                    &endpoint_cache_key,
-                    auth_header,
-                    connection_string,
-                    &mut client,
-                    ctx,
-                    config,
-                )
-                .await?
-        }
-    };
+    let entry = db_schema_cache
+        .get_cached_or_remote(
+            &endpoint_cache_key,
+            auth_header,
+            connection_string,
+            &mut client,
+            ctx,
+            config,
+        )
+        .await?;
    let (api_config, db_schema_owned) = entry.as_ref();
-
    let db_schema = db_schema_owned.borrow_schema();

    let db_schemas = &api_config.db_schemas; // list of schemas available for the api
@@ -1093,8 +999,8 @@ async fn handle_rest_inner(
    let _metrics = client.metrics(ctx); // FIXME: is everything in the context set correctly?

    // send the request to the local proxy
-    let proxy_response = make_raw_local_proxy_request(&mut client, headers, req_body).await?;
-    let (response_parts, body) = proxy_response.into_parts();
+    let response = make_raw_local_proxy_request(&mut client, headers, req_body).await?;
+    let (parts, body) = response.into_parts();

    let max_response = config.http_config.max_response_size_bytes;
    let bytes = read_body_with_limit(body, max_response)
@@ -1103,7 +1009,7 @@ async fn handle_rest_inner(

    // if the response status is greater than 399, then it is an error
    // FIXME: check if there are other error codes or shapes of the response
-    if response_parts.status.as_u16() > 399 {
+    if parts.status.as_u16() > 399 {
        // turn this postgres error from the json into PostgresError
        let postgres_error = serde_json::from_slice(&bytes)
            .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?;
@@ -1269,7 +1175,7 @@ async fn handle_rest_inner(
        .boxed();

    // build the response
-    response = response
+    let mut response = Response::builder()
        .status(StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR))
        .header(CONTENT_TYPE, http_content_type);

--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -644,7 +644,6 @@ async fn handle_tenant_timeline_safekeeper_migrate(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    // TODO(diko): it's not PS operation, there should be a different permission scope.
    check_permissions(&req, Scope::PageServerApi)?;
    maybe_rate_limit(&req, tenant_id).await;

@@ -666,23 +665,6 @@ async fn handle_tenant_timeline_safekeeper_migrate(
    json_response(StatusCode::OK, ())
 }

-async fn handle_tenant_timeline_safekeeper_migrate_abort(
-    service: Arc<Service>,
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-    // TODO(diko): it's not PS operation, there should be a different permission scope.
-    check_permissions(&req, Scope::PageServerApi)?;
-    maybe_rate_limit(&req, tenant_id).await;
-
-    service
-        .tenant_timeline_safekeeper_migrate_abort(tenant_id, timeline_id)
-        .await?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handle_tenant_timeline_lsn_lease(
    service: Arc<Service>,
    req: Request<Body>,
@@ -2629,16 +2611,6 @@ pub fn make_router(
                )
            },
        )
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort",
-            |r| {
-                tenant_service_handler(
-                    r,
-                    handle_tenant_timeline_safekeeper_migrate_abort,
-                    RequestName("v1_tenant_timeline_safekeeper_migrate_abort"),
-                )
-            },
-        )
        // LSN lease passthrough to all shards
        .post(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease",
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -1230,7 +1230,10 @@ impl Service {
            }
            // It it is the same new_sk_set, we can continue the migration (retry).
        } else {
-            if !is_migration_finished(&timeline) {
+            let prev_finished = timeline.cplane_notified_generation == timeline.generation
+                && timeline.sk_set_notified_generation == timeline.generation;
+
+            if !prev_finished {
                // The previous migration is committed, but the finish step failed.
                // Safekeepers/cplane might not know about the last membership configuration.
                // Retry the finish step to ensure smooth migration.
@@ -1542,8 +1545,6 @@ impl Service {
        timeline_id: TimelineId,
        timeline: &TimelinePersistence,
    ) -> Result<(), ApiError> {
-        tracing::info!(generation=?timeline.generation, sk_set=?timeline.sk_set, new_sk_set=?timeline.new_sk_set, "retrying finish safekeeper migration");
-
        if timeline.new_sk_set.is_some() {
            // Logical error, should never happen.
            return Err(ApiError::InternalServerError(anyhow::anyhow!(
@@ -1623,120 +1624,4 @@ impl Service {

        Ok(wal_positions[quorum_size - 1])
    }
-
-    /// Abort ongoing safekeeper migration.
-    pub(crate) async fn tenant_timeline_safekeeper_migrate_abort(
-        self: &Arc<Self>,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> Result<(), ApiError> {
-        // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
-        let _tenant_lock = trace_shared_lock(
-            &self.tenant_op_locks,
-            tenant_id,
-            TenantOperations::TimelineSafekeeperMigrate,
-        )
-        .await;
-
-        // Fetch current timeline configuration from the configuration storage.
-        let timeline = self
-            .persistence
-            .get_timeline(tenant_id, timeline_id)
-            .await?;
-
-        let Some(timeline) = timeline else {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!(
-                    "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table"
-                )
-                .into(),
-            ));
-        };
-
-        let mut generation = SafekeeperGeneration::new(timeline.generation as u32);
-
-        let Some(new_sk_set) = &timeline.new_sk_set else {
-            // No new_sk_set -> no active migration that we can abort.
-            tracing::info!("timeline has no active migration");
-
-            if !is_migration_finished(&timeline) {
-                // The last migration is committed, but the finish step failed.
-                // Safekeepers/cplane might not know about the last membership configuration.
-                // Retry the finish step to make the timeline state clean.
-                self.finish_safekeeper_migration_retry(tenant_id, timeline_id, &timeline)
-                    .await?;
-            }
-            return Ok(());
-        };
-
-        tracing::info!(sk_set=?timeline.sk_set, ?new_sk_set, ?generation, "aborting timeline migration");
-
-        let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
-        let new_safekeepers = self.get_safekeepers(new_sk_set)?;
-
-        let cur_sk_member_set =
-            Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
-
-        // Increment current generation and remove new_sk_set from the timeline to abort the migration.
-        generation = generation.next();
-
-        let mconf = membership::Configuration {
-            generation,
-            members: cur_sk_member_set,
-            new_members: None,
-        };
-
-        // Exclude safekeepers which were added during the current migration.
-        let cur_ids: HashSet<NodeId> = cur_safekeepers.iter().map(|sk| sk.get_id()).collect();
-        let exclude_safekeepers = new_safekeepers
-            .into_iter()
-            .filter(|sk| !cur_ids.contains(&sk.get_id()))
-            .collect::<Vec<_>>();
-
-        let exclude_requests = exclude_safekeepers
-            .iter()
-            .map(|sk| TimelinePendingOpPersistence {
-                sk_id: sk.skp.id,
-                tenant_id: tenant_id.to_string(),
-                timeline_id: timeline_id.to_string(),
-                generation: generation.into_inner() as i32,
-                op_kind: SafekeeperTimelineOpKind::Exclude,
-            })
-            .collect::<Vec<_>>();
-
-        let cur_sk_set = cur_safekeepers
-            .iter()
-            .map(|sk| sk.get_id())
-            .collect::<Vec<_>>();
-
-        // Persist new mconf and exclude requests.
-        self.persistence
-            .update_timeline_membership(
-                tenant_id,
-                timeline_id,
-                generation,
-                &cur_sk_set,
-                None,
-                &exclude_requests,
-            )
-            .await?;
-
-        // At this point we have already commited the abort, but still need to notify
-        // cplane/safekeepers with the new mconf. That's what finish_safekeeper_migration does.
-        self.finish_safekeeper_migration(
-            tenant_id,
-            timeline_id,
-            &cur_safekeepers,
-            &mconf,
-            &exclude_safekeepers,
-        )
-        .await?;
-
-        Ok(())
-    }
-}
-
-fn is_migration_finished(timeline: &TimelinePersistence) -> bool {
-    timeline.cplane_notified_generation == timeline.generation
-        && timeline.sk_set_notified_generation == timeline.generation
 }
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -79,6 +79,7 @@ class NeonAPI:
                elif resp.status_code == 423 and resp.json()["message"] in {
                    "endpoint is in some transitive state, could not suspend",
                    "project already has running conflicting operations, scheduling of new ones is prohibited",
+                    "snapshot is in transition",
                }:
                    retry = True
                    self.retries4xx += 1
@@ -355,6 +356,63 @@ class NeonAPI:

        return cast("dict[str, Any]", resp.json())

+    def create_snapshot(
+        self,
+        project_id: str,
+        branch_id: str,
+        lsn: str | None = None,
+        timestamp: str | None = None,
+        name: str | None = None,
+        expires_at: str | None = None,
+    ) -> dict[str, Any]:
+        params: dict[str, Any] = {
+            "lsn": lsn,
+            "timestamp": timestamp,
+            "name": name,
+            "expires_at": expires_at,
+        }
+        params = {key: value for key, value in params.items() if value is not None}
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/branches/{branch_id}/snapshot",
+            params=params,
+            json={},
+            headers={
+                "Accept": "application/json",
+            },
+        )
+        return cast("dict[str, Any]", resp.json())
+
+    def delete_snapshot(self, project_id: str, snapshot_id: str) -> dict[str, Any]:
+        resp = self.__request("DELETE", f"/projects/{project_id}/snapshots/{snapshot_id}")
+        return cast("dict[str, Any]", resp.json())
+
+    def restore_snapshot(
+        self,
+        project_id: str,
+        snapshot_id: str,
+        target_branch_id: str,
+        name: str | None = None,
+        finalize_restore: bool = True,
+    ) -> dict[str, Any]:
+        data: dict[str, Any] = {
+            "target_branch_id": target_branch_id,
+            "finalize_restore": finalize_restore,
+        }
+        if name is not None:
+            data["name"] = name
+        log.info("Restore snapshot data: %s", data)
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/snapshots/{snapshot_id}/restore",
+            json=data,
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+        )
+        return cast("dict[str, Any]", resp.json())
+
    def delete_endpoint(self, project_id: str, endpoint_id: str) -> dict[str, Any]:
        resp = self.__request("DELETE", f"/projects/{project_id}/endpoints/{endpoint_id}")
        return cast("dict[str,Any]", resp.json())
@@ -396,6 +454,14 @@ class NeonAPI:

        return cast("dict[str, Any]", resp.json())

+    def get_branch_endpoints(self, project_id: str, branch_id: str) -> dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/branches/{branch_id}/endpoints",
+            headers={"Accept": "application/json", "Content-Type": "application/json"},
+        )
+        return cast("dict[str, Any]", resp.json())
+
    def get_endpoints(self, project_id: str) -> dict[str, Any]:
        resp = self.__request(
            "GET",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2323,19 +2323,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
        response.raise_for_status()
        log.info(f"migrate_safekeepers success: {response.json()}")

-    def abort_safekeeper_migration(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ):
-        response = self.request(
-            "POST",
-            f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort",
-            headers=self.headers(TokenScope.PAGE_SERVER_API),
-        )
-        response.raise_for_status()
-        log.info(f"abort_safekeeper_migration success: {response.json()}")
-
    def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
        """
        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
--- a/test_runner/random_ops/test_random_ops.py
+++ b/test_runner/random_ops/test_random_ops.py
@@ -11,6 +11,7 @@ import time
 from datetime import UTC, datetime, timedelta
 from typing import TYPE_CHECKING, Any

+import psycopg2
 import pytest
 from fixtures.log_helper import log

@@ -22,6 +23,29 @@ if TYPE_CHECKING:
    from fixtures.pg_version import PgVersion


+class NeonSnapshot:
+    """
+    A snapshot of the Neon Branch
+    Gets the output of the API call af a snapshot creation
+    """
+
+    def __init__(self, project: NeonProject, snapshot: dict[str, Any]):
+        self.project: NeonProject = project
+        snapshot = snapshot["snapshot"]
+        self.id: str = snapshot["id"]
+        self.name: str = snapshot["name"]
+        self.created_at: datetime = datetime.fromisoformat(snapshot["created_at"])
+        self.source_branch: NeonBranch = project.branches[snapshot["source_branch_id"]]
+        project.snapshots[self.id] = self
+        self.restored: bool = False
+
+    def __str__(self) -> str:
+        return f"id: {self.id}, name: {self.name}, created_at: {self.created_at}"
+
+    def delete(self) -> None:
+        self.project.delete_snapshot(self.id)
+
+
 class NeonEndpoint:
    """
    Neon Endpoint
@@ -70,6 +94,12 @@ class NeonBranch:
    def __init__(self, project, branch: dict[str, Any], is_reset=False):
        self.id: str = branch["branch"]["id"]
        self.desc = branch
+        self.name: str | None = None
+        if "name" in branch["branch"]:
+            self.name = branch["branch"]["name"]
+        self.restored_from: str | None = None
+        if "restored_from" in branch["branch"]:
+            self.restored_from = branch["branch"]["restored_from"]
        self.project: NeonProject = project
        self.neon_api: NeonAPI = project.neon_api
        self.project_id: str = branch["branch"]["project_id"]
@@ -113,10 +143,9 @@ class NeonBranch:

    def __str__(self):
        """
-        Prints the branch's name with all the predecessors
-        (r) means the branch is a reset one
+        Prints the branch's information with all the predecessors
        """
-        return f"{self.id}{'(r)' if self.id in self.project.reset_branches else ''}, parent: {self.parent}"
+        return f"{self.id}{f'({self.name})' if self.name and self.name != self.id else ''}{f'(restored_from: {self.restored_from})' if self.restored_from else ''}, parent: {self.parent}"

    def random_time(self) -> datetime:
        min_time = max(
@@ -152,6 +181,9 @@ class NeonBranch:
        self.project.terminate_benchmark(self.id)

    def reset_to_parent(self) -> None:
+        """
+        Resets the branch to the parent branch
+        """
        for ep in self.project.endpoints.values():
            if ep.type == "read_only":
                ep.terminate_benchmark()
@@ -240,6 +272,7 @@ class NeonProject:
        # Leaf branches are the branches, which do not have children
        self.leaf_branches: dict[str, NeonBranch] = {}
        self.branches: dict[str, NeonBranch] = {}
+        self.branch_num: int = 0
        self.reset_branches: set[str] = set()
        self.main_branch: NeonBranch = NeonBranch(self, proj)
        self.main_branch.connection_parameters = self.connection_parameters
@@ -253,6 +286,8 @@ class NeonProject:
        self.limits: dict[str, Any] = self.get_limits()["limits"]
        self.read_only_endpoints_total: int = 0
        self.min_time: datetime = datetime.now(UTC)
+        self.snapshots: dict[str, NeonSnapshot] = {}
+        self.snapshot_num: int = 0

    def get_limits(self) -> dict[str, Any]:
        return self.neon_api.get_project_limits(self.id)
@@ -280,7 +315,10 @@ class NeonProject:
        return False

    def create_branch(
-        self, parent_id: str | None = None, parent_timestamp: datetime | None = None
+        self,
+        parent_id: str | None = None,
+        parent_timestamp: datetime | None = None,
+        is_reset: bool = False,
    ) -> NeonBranch | None:
        self.wait()
        if not self.check_limit_branches():
@@ -293,14 +331,14 @@ class NeonProject:
        branch_def = self.neon_api.create_branch(
            self.id, parent_id=parent_id, parent_timestamp=parent_timestamp_str
        )
-        new_branch = NeonBranch(self, branch_def)
+        new_branch = NeonBranch(self, branch_def, is_reset)
        self.wait()
        return new_branch

    def delete_branch(self, branch_id: str) -> None:
        parent = self.branches[branch_id].parent
        if not parent or branch_id == self.main_branch.id:
-            raise RuntimeError("Cannot delete the main branch")
+            raise RuntimeError("Cannot delete the main branch or a branch restored from a snapshot")
        if branch_id not in self.leaf_branches and branch_id not in self.reset_branches:
            raise RuntimeError(f"The branch {branch_id}, probably, has ancestors")
        if branch_id not in self.branches:
@@ -313,7 +351,7 @@ class NeonProject:
        if branch_id not in self.reset_branches:
            self.terminate_benchmark(branch_id)
        self.neon_api.delete_branch(self.id, branch_id)
-        if len(parent.children) == 1 and parent.id != self.main_branch.id:
+        if len(parent.children) == 1 and parent.parent is not None:
            self.leaf_branches[parent.id] = parent
        parent.children.pop(branch_id)
        if branch_id in self.leaf_branches:
@@ -333,6 +371,22 @@ class NeonProject:
            log.info("No leaf branches found")
        return target

+    def get_random_parent_branch(self) -> NeonBranch:
+        return self.branches[random.choice(list(set(self.branches.keys()) - self.reset_branches))]
+
+    def gen_branch_name(self) -> str:
+        self.branch_num += 1
+        return f"branch{self.branch_num}"
+
+    def get_random_snapshot(self) -> NeonSnapshot | None:
+        snapshot: NeonSnapshot | None = None
+        avail_snapshots = [sn for sn in self.snapshots.values() if not sn.restored]
+        if avail_snapshots:
+            snapshot = random.choice(avail_snapshots)
+        else:
+            log.info("No snapshots found")
+        return snapshot
+
    def delete_endpoint(self, endpoint_id: str) -> None:
        self.terminate_benchmark(endpoint_id)
        self.neon_api.delete_endpoint(self.id, endpoint_id)
@@ -409,6 +463,116 @@ class NeonProject:
        self.restore_num += 1
        return f"restore{self.restore_num}"

+    def gen_snapshot_name(self) -> str:
+        self.snapshot_num += 1
+        return f"snapshot{self.snapshot_num}"
+
+    def create_snapshot(
+        self,
+        lsn: str | None = None,
+        timestamp: datetime | None = None,
+    ) -> NeonSnapshot:
+        """
+        Create a new Neon snapshot for the current project
+        Two optional arguments: lsn and timestamp are mutually exclusive
+        they instruct to create a snapshot with the specific lns or timestamp
+        """
+        snapshot_name = self.gen_snapshot_name()
+        with psycopg2.connect(self.connection_uri) as conn:
+            with conn.cursor() as cur:
+                # We will check the value we set now after the snapshot restored to verify consistency
+                cur.execute(
+                    f"INSERT INTO sanity_check (name, value) VALUES "
+                    f"('snapsot_name', '{snapshot_name}') ON CONFLICT (name) DO UPDATE SET value = EXCLUDED.value"
+                )
+                conn.commit()
+                snapshot = NeonSnapshot(
+                    self,
+                    self.neon_api.create_snapshot(
+                        self.id,
+                        self.main_branch.id,
+                        lsn,
+                        timestamp.isoformat().replace("+00:00", "Z") if timestamp else None,
+                        snapshot_name,
+                    ),
+                )
+                self.wait()
+                # Now we taint the value after the snapshot was taken
+                cur.execute("UPDATE sanity_check SET value = 'tainted' || value")
+                conn.commit()
+        return snapshot
+
+    def delete_snapshot(self, snapshot_id: str) -> None:
+        """
+        Deletes the snapshot with the given id
+        """
+        self.wait()
+        self.neon_api.delete_snapshot(self.id, snapshot_id)
+        self.snapshots.pop(snapshot_id)
+        self.wait()
+
+    def restore_snapshot(self, snapshot_id: str) -> NeonBranch | None:
+        """
+        Creates a new Neon branch for the current project, then restores the snapshot
+        with the given id
+        """
+        target_branch = self.get_random_parent_branch().create_child_branch()
+        if not target_branch:
+            return None
+        self.snapshots[snapshot_id].restored = True
+        new_branch_def: dict[str, Any] = self.neon_api.restore_snapshot(
+            self.id,
+            snapshot_id,
+            target_branch.id,
+            self.gen_branch_name(),
+        )
+        self.wait()
+        new_branch_def = self.neon_api.get_branch_details(self.id, new_branch_def["branch"]["id"])
+        # The restored branch will lose the parent afterward, but it has it during the restoration.
+        # So, we delete parent_id
+        new_branch_def["branch"].pop("parent_id")
+        new_branch = NeonBranch(self, new_branch_def)
+        log.info("Restored snapshot to the branch: %s", new_branch)
+        target_branch_def = self.neon_api.get_branch_details(self.id, target_branch.id)
+        if "name" in target_branch_def["branch"]:
+            target_branch.name = target_branch_def["branch"]["name"]
+        if new_branch.connection_parameters is None:
+            if not new_branch.endpoints:
+                for ep in self.neon_api.get_branch_endpoints(self.id, new_branch.id)["endpoints"]:
+                    if ep["id"] not in self.endpoints:
+                        NeonEndpoint(self, ep)
+            new_branch.connection_parameters = self.connection_parameters.copy()
+            for ep in new_branch.endpoints.values():
+                if ep.type == "read_write":
+                    new_branch.connection_parameters["host"] = ep.host
+                    break
+            new_branch.connect_env = {
+                "PGHOST": new_branch.connection_parameters["host"],
+                "PGUSER": new_branch.connection_parameters["role"],
+                "PGDATABASE": new_branch.connection_parameters["database"],
+                "PGPASSWORD": new_branch.connection_parameters["password"],
+                "PGSSLMODE": "require",
+            }
+        with psycopg2.connect(
+            host=new_branch.connection_parameters["host"],
+            port=5432,
+            user=new_branch.connection_parameters["role"],
+            password=new_branch.connection_parameters["password"],
+            database=new_branch.connection_parameters["database"],
+        ) as conn:
+            with conn.cursor() as cur:
+                cur.execute("SELECT value FROM sanity_check WHERE name = 'snapsot_name'")
+                snapshot_name = None
+                if row := cur.fetchone():
+                    snapshot_name = row[0]
+                # We verify here that the value we select from the table matches with the snapshot name
+                # To ensure consistency
+                assert snapshot_name == self.snapshots[snapshot_id].name
+        self.wait()
+        target_branch.start_benchmark()
+        new_branch.start_benchmark()
+        return new_branch
+

@pytest.fixture()
 def setup_class(
@@ -438,9 +602,7 @@ def do_action(project: NeonProject, action: str) -> bool:
    if action == "new_branch" or action == "new_branch_random_time":
        use_random_time: bool = action == "new_branch_random_time"
        log.info("Trying to create a new branch %s", "random time" if use_random_time else "")
-        parent = project.branches[
-            random.choice(list(set(project.branches.keys()) - project.reset_branches))
-        ]
+        parent = project.get_random_parent_branch()
        child = parent.create_child_branch(parent.random_time() if use_random_time else None)
        if child is None:
            return False
@@ -479,6 +641,23 @@ def do_action(project: NeonProject, action: str) -> bool:
            return False
        log.info("Reset to parent %s", target)
        target.reset_to_parent()
+    elif action == "create_snapshot":
+        snapshot = project.create_snapshot()
+        if snapshot is None:
+            return False
+        log.info("Created snapshot %s", snapshot)
+    elif action == "restore_snapshot":
+        if (snapshot_to_restore := project.get_random_snapshot()) is None:
+            return False
+        log.info("Restoring snapshot %s", snapshot_to_restore)
+        if project.restore_snapshot(snapshot_to_restore.id) is None:
+            return False
+    elif action == "delete_snapshot":
+        snapshot_to_delete = project.get_random_snapshot()
+        if snapshot_to_delete is None:
+            return False
+        snapshot_to_delete.delete()
+        log.info("Deleted snapshot %s", snapshot_to_delete)
    else:
        raise ValueError(f"The action {action} is unknown")
    return True
@@ -512,12 +691,28 @@ def test_api_random(
        ("delete_branch", 1.2),
        ("restore_random_time", 0.9),
        ("reset_to_parent", 0.3),
+        ("create_snapshot", 0.2),
+        ("restore_snapshot", 0.1),
+        ("delete_snapshot", 0.1),
    )
    if num_ops_env := os.getenv("NUM_OPERATIONS"):
        num_operations = int(num_ops_env)
    else:
        num_operations = 250
    pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=project.main_branch.connect_env)
+    # Create a table for sanity check
+    # We are going to leve some control values there to check, e.g., after restoring a snapshot
+    pg_bin.run(
+        [
+            "psql",
+            "-c",
+            "CREATE TABLE IF NOT EXISTS sanity_check (name VARCHAR NOT NULL PRIMARY KEY, value VARCHAR)",
+        ],
+        env=project.main_branch.connect_env,
+    )
+    # To not go to the past where pgbench tables do not exist
+    time.sleep(1)
+    project.min_time = datetime.now(UTC)
    # To not go to the past where pgbench tables do not exist
    time.sleep(1)
    project.min_time = datetime.now(UTC)
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -145,7 +145,6 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
        stop_and_check_lsn(secondary, None)

    if method == PromoteMethod.COMPUTE_CTL:
-        log.info("Restarting primary to check new config")
        secondary.stop()
        # In production, compute ultimately receives new compute spec from cplane.
        secondary.respec(mode="Primary")
--- a/test_runner/regress/test_safekeeper_migration.py
+++ b/test_runner/regress/test_safekeeper_migration.py
@@ -460,91 +460,3 @@ def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder):

    ep.start(safekeeper_generation=5, safekeepers=new_sk_set2)
    assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)]
-
-
-def test_abort_safekeeper_migration(neon_env_builder: NeonEnvBuilder):
-    """
-    Test that safekeeper migration can be aborted.
-    1. Insert failpoints and ensure the abort successfully reverts the timeline state.
-    2. Check that endpoint is operational after the abort.
-    """
-    neon_env_builder.num_safekeepers = 2
-    neon_env_builder.storage_controller_config = {
-        "timelines_onto_safekeepers": True,
-        "timeline_safekeeper_count": 1,
-    }
-    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS)
-
-    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
-    assert len(mconf["sk_set"]) == 1
-    cur_sk = mconf["sk_set"][0]
-    cur_gen = 1
-
-    ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
-    ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
-    ep.safe_psql("CREATE EXTENSION neon_test_utils;")
-    ep.safe_psql("CREATE TABLE t(a int)")
-    ep.safe_psql("INSERT INTO t VALUES (1)")
-
-    another_sk = [sk.id for sk in env.safekeepers if sk.id != cur_sk][0]
-
-    failpoints = [
-        "sk-migration-after-step-3",
-        "sk-migration-after-step-4",
-        "sk-migration-after-step-5",
-        "sk-migration-after-step-7",
-    ]
-
-    for fp in failpoints:
-        env.storage_controller.configure_failpoints((fp, "return(1)"))
-
-        with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
-            env.storage_controller.migrate_safekeepers(
-                env.initial_tenant, env.initial_timeline, [another_sk]
-            )
-        cur_gen += 1
-
-        env.storage_controller.configure_failpoints((fp, "off"))
-
-        # We should have a joint mconf after the failure.
-        mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
-        assert mconf["generation"] == cur_gen
-        assert mconf["sk_set"] == [cur_sk]
-        assert mconf["new_sk_set"] == [another_sk]
-
-        env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline)
-        cur_gen += 1
-
-        # Abort should revert the timeline to the previous sk_set and increment the generation.
-        mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
-        assert mconf["generation"] == cur_gen
-        assert mconf["sk_set"] == [cur_sk]
-        assert mconf["new_sk_set"] is None
-
-        assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{cur_gen}:")
-        ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})")
-
-    # After step-8 the final mconf is committed and the migration is not abortable anymore.
-    # So the abort should not abort anything.
-    env.storage_controller.configure_failpoints(("sk-migration-after-step-8", "return(1)"))
-
-    with pytest.raises(StorageControllerApiException, match="failpoint sk-migration-after-step-8"):
-        env.storage_controller.migrate_safekeepers(
-            env.initial_tenant, env.initial_timeline, [another_sk]
-        )
-    cur_gen += 2
-
-    env.storage_controller.configure_failpoints((fp, "off"))
-
-    env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline)
-
-    # The migration is fully committed, no abort should have been performed.
-    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
-    assert mconf["generation"] == cur_gen
-    assert mconf["sk_set"] == [another_sk]
-    assert mconf["new_sk_set"] is None
-
-    ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})")
-    ep.clear_buffers()
-    assert ep.safe_psql("SELECT * FROM t") == [(i + 1,) for i in range(cur_gen) if i % 2 == 0]
Author	SHA1	Message	Date
a-masterov	f26987deef	Merge branch 'main' into amasterov/random-ops-add-snapshots	2025-07-31 12:10:43 +02:00
Alexey Masterov	7c2022c1b5	Modify weights	2025-07-23 15:54:39 +02:00
Alexey Masterov	c233deb1c2	Rename a method for consistency	2025-07-23 15:38:59 +02:00
Alexey Masterov	d550b67c5f	Reword the comment	2025-07-23 15:27:36 +02:00
Alexey Masterov	2ca2b05ab5	Remove the redundant condition	2025-07-23 15:21:21 +02:00
Alexey Masterov	5e1057b860	Return sleep before retry	2025-07-23 15:17:01 +02:00
Alexey Masterov	bb6127f495	Update the comment	2025-07-23 15:16:02 +02:00
a-masterov	a41c00e7c1	Merge branch 'main' into amasterov/random-ops-add-snapshots	2025-07-23 15:15:27 +02:00
Alexey Masterov	76832488d0	Add a log with the new branch	2025-07-23 15:09:02 +02:00
Alexey Masterov	3ce2c15c10	Cleanup	2025-07-23 14:30:55 +02:00
Alexey Masterov	1c3f49e231	fix restored_from	2025-07-23 12:57:15 +02:00
Alexey Masterov	b982cf6c84	format	2025-07-23 12:18:27 +02:00
Alexey Masterov	b1b23cdc8e	Do not mark a leaf a branch without a parent	2025-07-23 12:10:56 +02:00
Alexey Masterov	556e9cb781	Merge branch 'main' into amasterov/random-ops-add-snapshots # Conflicts: # test_runner/random_ops/test_random_ops.py	2025-07-23 10:58:56 +02:00
Alexey Masterov	8edea1dea3	Do not consider restored branches as leaf ones	2025-07-22 13:41:09 +02:00
Alexey Masterov	f5a553a8e5	Add debug	2025-07-22 11:38:09 +02:00
Alexey Masterov	7423c393c6	Remove parent_id from the restored branch	2025-07-22 11:04:08 +02:00
Alexey Masterov	c3a7158e62	Add debug	2025-07-22 10:43:28 +02:00
Alexey Masterov	848dcd7540	Add debug	2025-07-21 17:00:15 +02:00
Alexey Masterov	783dfe3cce	refactoring	2025-07-21 16:05:56 +02:00
Alexey Masterov	cdc2ea110f	Cleanup	2025-07-18 16:38:25 +02:00
Alexey Masterov	c7e1183da4	Cleanup	2025-07-18 16:37:39 +02:00
Alexey Masterov	6763925a4d	Run all the operations	2025-07-18 16:32:07 +02:00
Alexey Masterov	3bcdbe30f1	Avoid to manipulate restored snapshots	2025-07-18 16:09:46 +02:00
Alexey Masterov	22975426b7	10x more wait	2025-07-18 15:17:29 +02:00
Alexey Masterov	31c6f66a49	Wait before delete	2025-07-18 15:08:45 +02:00
Alexey Masterov	287e01fdf9	retry more	2025-07-18 15:06:52 +02:00
Alexey Masterov	91c81cc5e5	refactor	2025-07-18 14:52:39 +02:00
Alexey Masterov	a8354b0aa3	Delete projects	2025-07-18 14:44:26 +02:00
Alexey Masterov	1102e2aff0	Add connect_env	2025-07-18 14:42:28 +02:00
Alexey Masterov	f6a61c9492	Add commit	2025-07-18 14:08:15 +02:00
Alexey Masterov	cbf8e248fc	Do not delete project after failure (debug only, do not merge!)	2025-07-18 09:39:04 +02:00
Alexey Masterov	f0f30076cc	Do not delete project after failure (debug only, do not merge!)	2025-07-18 09:35:26 +02:00
Alexey Masterov	42544cf145	Add debug	2025-07-17 19:57:20 +02:00
Alexey Masterov	28b25092ad	An attempt 5	2025-07-17 19:49:49 +02:00
Alexey Masterov	b77a1fae04	An attempt 4	2025-07-17 18:58:00 +02:00
Alexey Masterov	73ed7ade70	An attempt 3	2025-07-17 18:53:09 +02:00
Alexey Masterov	74626b94a8	An attempt 2	2025-07-17 18:48:44 +02:00
Alexey Masterov	4ca6d8cecf	An attempt	2025-07-17 18:32:57 +02:00
Alexey Masterov	bf0be50df9	Add debug	2025-07-17 15:06:53 +02:00
Alexey Masterov	1adc95758e	add the database	2025-07-17 14:52:27 +02:00
Alexey Masterov	03e994f9c7	Connection parameters	2025-07-17 14:40:25 +02:00
Alexey Masterov	f0671c996e	Debug	2025-07-17 14:27:09 +02:00
Alexey Masterov	829cb5fe59	use connection parameters instead of connect URI	2025-07-17 14:25:53 +02:00
Alexey Masterov	561083524d	finalize restore by default	2025-07-17 13:58:09 +02:00
Alexey Masterov	009303e31f	Connect to the target branch, not the main one	2025-07-17 13:44:19 +02:00
Alexey Masterov	0e42cac589	Add debug	2025-07-17 12:48:08 +02:00
Alexey Masterov	f5cebcaf6a	Wait for the snapshot to complete	2025-07-17 12:34:43 +02:00
Alexey Masterov	5861d0f9b2	Add the environment	2025-07-17 12:01:50 +02:00
Alexey Masterov	dbedf11191	Add check for snapshot sanity	2025-07-17 11:50:30 +02:00
Alexey Masterov	1e20c4f2b2	format	2025-07-16 13:26:02 +02:00
Alexey Masterov	018f95115a	Retry on 423 error "snapshot is in transition"	2025-07-16 13:21:32 +02:00
Alexey Masterov	f222256225	Added a documentation for the new methods	2025-07-15 17:07:07 +02:00
Alexey Masterov	17b5f5e090	Merge remote-tracking branch 'origin/amasterov/random-ops-add' into amasterov/random-ops-add	2025-07-15 16:35:48 +02:00
Alexey Masterov	9bf5d69c01	Cleanup	2025-07-15 16:35:16 +02:00
a-masterov	f816b3d90e	Merge branch 'main' into amasterov/random-ops-add	2025-07-15 16:20:14 +02:00
Alexey Masterov	1ec1a82d3d	Start benchmark	2025-07-15 15:16:45 +02:00
Alexey Masterov	e97c1d2684	Fix the parameter error	2025-07-15 14:49:27 +02:00
a-masterov	94cfd3f22e	Merge branch 'main' into amasterov/random-ops-add	2025-07-15 12:09:32 +02:00
Alexey Masterov	f45ea8fe6b	Add snapshots	2025-07-15 12:08:38 +02:00
Alexey Masterov	1443ba65d3	Add reset_to_parent	2025-07-15 12:08:38 +02:00
Alexey Masterov	185f4de0fe	refactoring	2025-07-09 19:22:27 +02:00
Alexey Masterov	efb08f82cd	Merge branch 'main' into amasterov/random-ops-add # Conflicts: # test_runner/random_ops/test_random_ops.py	2025-07-09 18:59:43 +02:00
Alexey Masterov	c31563f551	formatting	2025-07-09 14:54:13 +02:00
Alexey Masterov	fd6c2cba01	Merge branch 'amasterov/workaround-branch-not-found-problem' into amasterov/random-ops-add # Conflicts: # test_runner/random_ops/test_random_ops.py	2025-07-09 14:53:05 +02:00
Alexey Masterov	899f4a1e77	remove the duplicate log message	2025-07-09 13:58:13 +02:00
Alexey Masterov	e95fcfa0d5	Add retrying if parent branch is not found. Add clarification and a link to Jira issue.	2025-07-09 13:54:54 +02:00
a-masterov	0ccc649299	Merge branch 'main' into amasterov/random-ops-add	2025-07-08 11:34:39 +02:00
Alexey Masterov	fe2abf3531	fix a mypy discovered error	2025-06-12 10:48:04 +02:00