Replica promotion in compute_ctl (#12183)

Add `/promote` method for `compute_ctl` promoting secondary replica to primary, depends on secondary being prewarmed. Add `compute-ctl` mode to `test_replica_promotes`, testing happy path only (no corner cases yet) Add openapi spec for `/promote` and `/lfc` handlers https://github.com/neondatabase/cloud/issues/19011 Resolves: https://github.com/neondatabase/cloud/issues/29807
2025-12-26 15:49:58 +00:00 · 2025-07-09 13:55:10 +01:00
parent 4ee0da0a20
commit e7d18bc188
12 changed files with 448 additions and 65 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1348,6 +1348,7 @@ dependencies = [
 "p256 0.13.2",
 "pageserver_page_api",
 "postgres",
+ "postgres-types",
 "postgres_initdb",
 "postgres_versioninfo",
 "regex",
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -66,7 +66,7 @@ url.workspace = true
 uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true
-
+postgres-types.workspace = true
 postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
    ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
-    LfcPrewarmState, TlsConfig,
+    LfcPrewarmState, PromoteState, TlsConfig,
 };
 use compute_api::spec::{
    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
@@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
-use tokio::task::JoinHandle;
-use tokio::{spawn, time};
+use tokio::{spawn, sync::watch, task::JoinHandle, time};
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
@@ -175,6 +174,7 @@ pub struct ComputeState {
    /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
    /// mode == ComputeMode::Primary. None otherwise
    pub terminate_flush_lsn: Option<Lsn>,
+    pub promote_state: Option<watch::Receiver<PromoteState>>,

    pub metrics: ComputeMetrics,
 }
@@ -192,6 +192,7 @@ impl ComputeState {
            lfc_prewarm_state: LfcPrewarmState::default(),
            lfc_offload_state: LfcOffloadState::default(),
            terminate_flush_lsn: None,
+            promote_state: None,
        }
    }

--- a/compute_tools/src/compute_promote.rs
+++ b/compute_tools/src/compute_promote.rs
@@ -0,0 +1,132 @@
+use crate::compute::ComputeNode;
+use anyhow::{Context, Result, bail};
+use compute_api::{
+    responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
+    spec::ComputeMode,
+};
+use std::{sync::Arc, time::Duration};
+use tokio::time::sleep;
+use utils::lsn::Lsn;
+
+impl ComputeNode {
+    /// Returns only when promote fails or succeeds. If a network error occurs
+    /// and http client disconnects, this does not stop promotion, and subsequent
+    /// calls block until promote finishes.
+    /// Called by control plane on secondary after primary endpoint is terminated
+    pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
+        let cloned = self.clone();
+        let start_promotion = || {
+            let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
+            tokio::spawn(async move {
+                tx.send(match cloned.promote_impl(safekeepers_lsn).await {
+                    Ok(_) => PromoteState::Completed,
+                    Err(err) => {
+                        tracing::error!(%err, "promoting");
+                        PromoteState::Failed {
+                            error: err.to_string(),
+                        }
+                    }
+                })
+            });
+            rx
+        };
+
+        let mut task;
+        // self.state is unlocked after block ends so we lock it in promote_impl
+        // and task.changed() is reached
+        {
+            task = self
+                .state
+                .lock()
+                .unwrap()
+                .promote_state
+                .get_or_insert_with(start_promotion)
+                .clone()
+        }
+        task.changed().await.expect("promote sender dropped");
+        task.borrow().clone()
+    }
+
+    // Why do we have to supply safekeepers?
+    // For secondary we use primary_connection_conninfo so safekeepers field is empty
+    async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
+        {
+            let state = self.state.lock().unwrap();
+            let mode = &state.pspec.as_ref().unwrap().spec.mode;
+            if *mode != ComputeMode::Replica {
+                bail!("{} is not replica", mode.to_type_str());
+            }
+
+            // we don't need to query Postgres so not self.lfc_prewarm_state()
+            match &state.lfc_prewarm_state {
+                LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
+                    bail!("prewarm not requested or pending")
+                }
+                LfcPrewarmState::Failed { error } => {
+                    tracing::warn!(%error, "replica prewarm failed")
+                }
+                _ => {}
+            }
+        }
+
+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+            .await
+            .context("connecting to postgres")?;
+
+        let primary_lsn = safekeepers_lsn.wal_flush_lsn;
+        let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
+        const RETRIES: i32 = 20;
+        for i in 0..=RETRIES {
+            let row = client
+                .query_one("SELECT pg_last_wal_replay_lsn()", &[])
+                .await
+                .context("getting last replay lsn")?;
+            let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
+            last_wal_replay_lsn = lsn.into();
+            if last_wal_replay_lsn >= primary_lsn {
+                break;
+            }
+            tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
+            sleep(Duration::from_secs(1)).await;
+        }
+        if last_wal_replay_lsn < primary_lsn {
+            bail!("didn't catch up with primary in {RETRIES} retries");
+        }
+
+        // using $1 doesn't work with ALTER SYSTEM SET
+        let safekeepers_sql = format!(
+            "ALTER SYSTEM SET neon.safekeepers='{}'",
+            safekeepers_lsn.safekeepers
+        );
+        client
+            .query(&safekeepers_sql, &[])
+            .await
+            .context("setting safekeepers")?;
+        client
+            .query("SELECT pg_reload_conf()", &[])
+            .await
+            .context("reloading postgres config")?;
+        let row = client
+            .query_one("SELECT * FROM pg_promote()", &[])
+            .await
+            .context("pg_promote")?;
+        if !row.get::<usize, bool>(0) {
+            bail!("pg_promote() returned false");
+        }
+
+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+            .await
+            .context("connecting to postgres")?;
+        let row = client
+            .query_one("SHOW transaction_read_only", &[])
+            .await
+            .context("getting transaction_read_only")?;
+        if row.get::<usize, &str>(0) == "on" {
+            bail!("replica in read only mode after promotion");
+        }
+
+        let mut state = self.state.lock().unwrap();
+        state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
+        Ok(())
+    }
+}
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -83,6 +83,87 @@ paths:
              schema:
                $ref: "#/components/schemas/DbsAndRoles"

+  /promote:
+    post:
+      tags:
+        - Promotion
+      summary: Promote secondary replica to primary
+      description: ""
+      operationId: promoteReplica
+      requestBody:
+        description: Promote requests data
+        required: true
+        content:
+          application/json:
+            schema:
+                $ref: "#/components/schemas/SafekeepersLsn"
+      responses:
+        200:
+          description: Promote succeeded or wasn't started
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PromoteState"
+        500:
+          description: Promote failed
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PromoteState"
+
+  /lfc/prewarm:
+    post:
+      summary: Request LFC Prewarm
+      parameters:
+        - name: from_endpoint
+          in: query
+          schema:
+            type: string
+      description: ""
+      operationId: lfcPrewarm
+      responses:
+        202:
+          description: LFC prewarm started
+        429:
+          description: LFC prewarm ongoing
+    get:
+      tags:
+        - Prewarm
+      summary: Get LFC prewarm state
+      description: ""
+      operationId: getLfcPrewarmState
+      responses:
+        200:
+          description: Prewarm state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LfcPrewarmState"
+
+  /lfc/offload:
+    post:
+      summary: Request LFC offload
+      description: ""
+      operationId: lfcOffload
+      responses:
+        202:
+          description: LFC offload started
+        429:
+          description: LFC offload ongoing
+    get:
+      tags:
+        - Prewarm
+      summary: Get LFC offloading state
+      description: ""
+      operationId: getLfcOffloadState
+      responses:
+        200:
+          description: Offload state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LfcOffloadState"
+
  /database_schema:
    get:
      tags:
@@ -497,6 +578,70 @@ components:
          type: string
          example: "1.0.0"

+    SafekeepersLsn:
+      type: object
+      required:
+        - safekeepers
+        - wal_flush_lsn
+      properties:
+        safekeepers:
+          description: Primary replica safekeepers
+          type: string
+        wal_flush_lsn:
+          description: Primary last WAL flush LSN
+          type: string
+
+    LfcPrewarmState:
+      type: object
+      required:
+        - status
+        - total
+        - prewarmed
+        - skipped
+      properties:
+        status:
+          description: Lfc prewarm status
+          enum: [not_prewarmed, prewarming, completed, failed]
+          type: string
+        error:
+          description: Lfc prewarm error, if any
+          type: string
+        total:
+          description: Total pages processed
+          type: integer
+        prewarmed:
+          description: Total pages prewarmed
+          type: integer
+        skipped:
+          description: Pages processed but not prewarmed
+          type: integer
+
+    LfcOffloadState:
+      type: object
+      required:
+        - status
+      properties:
+        status:
+          description: Lfc offload status
+          enum: [not_offloaded, offloading, completed, failed]
+          type: string
+        error:
+          description: Lfc offload error, if any
+          type: string
+
+    PromoteState:
+      type: object
+      required:
+        - status
+      properties:
+        status:
+          description: Promote result
+          enum: [not_promoted, completed, failed]
+          type: string
+        error:
+          description: Promote error, if any
+          type: string
+
    InstalledExtensions:
      type: object
      properties:
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
 pub(in crate::http) mod lfc;
 pub(in crate::http) mod metrics;
 pub(in crate::http) mod metrics_json;
+pub(in crate::http) mod promote;
 pub(in crate::http) mod status;
 pub(in crate::http) mod terminate;

--- a/compute_tools/src/http/routes/promote.rs
+++ b/compute_tools/src/http/routes/promote.rs
@@ -0,0 +1,14 @@
+use crate::http::JsonResponse;
+use axum::Form;
+use http::StatusCode;
+
+pub(in crate::http) async fn promote(
+    compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
+    Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
+) -> axum::response::Response {
+    let state = compute.promote(safekeepers_lsn).await;
+    if let compute_api::responses::PromoteState::Failed { error } = state {
+        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
+    }
+    JsonResponse::success(StatusCode::OK, state)
+}
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -23,7 +23,7 @@ use super::{
    middleware::authorize::Authorize,
    routes::{
        check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-        grants, insights, lfc, metrics, metrics_json, status, terminate,
+        grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
    },
 };
 use crate::compute::ComputeNode;
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
                let authenticated_router = Router::<Arc<ComputeNode>>::new()
                    .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
                    .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
+                    .route("/promote", post(promote::promote))
                    .route("/check_writability", post(check_writability::is_writable))
                    .route("/configure", post(configure::configure))
                    .route("/database_schema", get(database_schema::get_schema_dump))
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -12,6 +12,7 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod compute_prewarm;
+pub mod compute_promote;
 pub mod disk_quota;
 pub mod extension_server;
 pub mod installed_extensions;
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
    pub version: ExtVersion,
 }

-#[derive(Serialize, Default, Debug, Clone)]
+#[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
    #[default]
@@ -58,6 +58,17 @@ pub enum LfcPrewarmState {
    },
 }

+impl Display for LfcPrewarmState {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
+            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
+            LfcPrewarmState::Completed => f.write_str("Completed"),
+            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
+        }
+    }
+}
+
 #[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcOffloadState {
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
    },
 }

+#[derive(Serialize, Debug, Clone, PartialEq)]
+#[serde(tag = "status", rename_all = "snake_case")]
+/// Response of /promote
+pub enum PromoteState {
+    NotPromoted,
+    Completed,
+    Failed { error: String },
+}
+
+#[derive(Deserialize, Serialize, Default, Debug, Clone)]
+#[serde(rename_all = "snake_case")]
+/// Result of /safekeepers_lsn
+pub struct SafekeepersLsn {
+    pub safekeepers: String,
+    pub wal_flush_lsn: utils::lsn::Lsn,
+}
+
 /// Response of the /status API
 #[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -2,11 +2,12 @@ from __future__ import annotations

 import urllib.parse
 from enum import StrEnum
-from typing import TYPE_CHECKING, final
+from typing import TYPE_CHECKING, Any, final

 import requests
 from requests.adapters import HTTPAdapter
 from requests.auth import AuthBase
+from requests.exceptions import ReadTimeout
 from typing_extensions import override

 from fixtures.log_helper import log
@@ -102,6 +103,18 @@ class EndpointHttpClient(requests.Session):

        wait_until(offloaded)

+    def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
+        url = f"http://localhost:{self.external_port}/promote"
+        if disconnect:
+            try:  # send first request to start promote and disconnect
+                self.post(url, data=safekeepers_lsn, timeout=0.001)
+            except ReadTimeout:
+                pass  # wait on second request which returns on promotion finish
+        res = self.post(url, data=safekeepers_lsn)
+        res.raise_for_status()
+        json: dict[str, str] = res.json()
+        return json
+
    def database_schema(self, database: str):
        res = self.get(
            f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}",
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -1,29 +1,51 @@
 """
-File with secondary->primary promotion testing.
-
-This far, only contains a test that we don't break and that the data is persisted.
+Secondary -> primary promotion testing
 """

+from enum import StrEnum
 from typing import cast

 import psycopg2
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
-from fixtures.pg_version import PgVersion
+from fixtures.utils import USE_LFC
+from psycopg2.extensions import cursor as Cursor
 from pytest import raises


 def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None):
    ep.stop(mode="immediate-terminate")
    lsn = ep.terminate_flush_lsn
-    if expected_lsn is not None:
+    assert (lsn is not None) == (expected_lsn is not None), f"{lsn=}, {expected_lsn=}"
+    if lsn is not None:
        assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}"
-    else:
-        assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}"


-def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
+def get_lsn_triple(cur: Cursor) -> tuple[str, str, str]:
+    cur.execute(
+        """
+        SELECT pg_current_wal_insert_lsn(),
+               pg_current_wal_lsn(),
+               pg_current_wal_flush_lsn()
+        """
+    )
+    return cast("tuple[str, str, str]", cur.fetchone())
+
+
+class PromoteMethod(StrEnum):
+    COMPUTE_CTL = "compute-ctl"
+    POSTGRES = "postgres"
+
+
+METHOD_OPTIONS = [e for e in PromoteMethod]
+METHOD_IDS = [e.value for e in PromoteMethod]
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS)
+def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
    """
    Test that a replica safely promotes, and can commit data updates which
    show up when the primary boots up after the promoted secondary endpoint
@@ -38,29 +60,26 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):

    with primary.connect() as primary_conn:
        primary_cur = primary_conn.cursor()
+        primary_cur.execute("create extension neon")
        primary_cur.execute(
            "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
        )
        primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
-        primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone())
+
+        lsn_triple = get_lsn_triple(primary_cur)
        log.info(f"Primary: Current LSN after workload is {lsn_triple}")
        expected_primary_lsn: Lsn = Lsn(lsn_triple[2])
        primary_cur.execute("show neon.safekeepers")
        safekeepers = primary_cur.fetchall()[0][0]

-    wait_replica_caughtup(primary, secondary)
+    if method == PromoteMethod.COMPUTE_CTL:
+        primary.http_client().offload_lfc()
+    else:
+        wait_replica_caughtup(primary, secondary)

    with secondary.connect() as secondary_conn:
        secondary_cur = secondary_conn.cursor()
        secondary_cur.execute("select count(*) from t")
-
        assert secondary_cur.fetchone() == (100,)

        with raises(psycopg2.Error):
@@ -71,28 +90,30 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
        secondary_cur.execute("select count(*) from t")
        assert secondary_cur.fetchone() == (100,)

+    primary_endpoint_id = primary.endpoint_id
    stop_and_check_lsn(primary, expected_primary_lsn)

    # Reconnect to the secondary to make sure we get a read-write connection
    promo_conn = secondary.connect()
    promo_cur = promo_conn.cursor()
-    promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
-    promo_cur.execute("select pg_reload_conf()")
+    if method == PromoteMethod.COMPUTE_CTL:
+        client = secondary.http_client()
+        client.prewarm_lfc(primary_endpoint_id)
+        # control plane knows safekeepers, simulate it by querying primary
+        assert (lsn := primary.terminate_flush_lsn)
+        safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
+        assert client.promote(safekeepers_lsn)["status"] == "completed"
+    else:
+        promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
+        promo_cur.execute("select pg_reload_conf()")
+        promo_cur.execute("SELECT * FROM pg_promote()")
+        assert promo_cur.fetchone() == (True,)

-    promo_cur.execute("SELECT * FROM pg_promote()")
-    assert promo_cur.fetchone() == (True,)
-    promo_cur.execute(
-        """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-    )
-    log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
+    lsn_triple = get_lsn_triple(promo_cur)
+    log.info(f"Secondary: LSN after promotion is {lsn_triple}")

    # Reconnect to the secondary to make sure we get a read-write connection
-    with secondary.connect() as new_primary_conn:
-        new_primary_cur = new_primary_conn.cursor()
+    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (100,)

@@ -101,43 +122,34 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
        )
        assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]

-        new_primary_cur = new_primary_conn.cursor()
+        new_primary_cur = conn.cursor()
        new_primary_cur.execute("select payload from t")
        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (200,)
-        new_primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")

-    with secondary.connect() as second_viewpoint_conn:
-        new_primary_cur = second_viewpoint_conn.cursor()
+        lsn_triple = get_lsn_triple(new_primary_cur)
+        log.info(f"Secondary: LSN after workload is {lsn_triple}")
+        expected_promoted_lsn = Lsn(lsn_triple[2])
+
+    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
        new_primary_cur.execute("select payload from t")
        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

-    # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
-
-    # secondaries don't sync safekeepers on finish so LSN will be None
-    stop_and_check_lsn(secondary, None)
+    if method == PromoteMethod.COMPUTE_CTL:
+        # compute_ctl's /promote switches replica type to Primary so it syncs
+        # safekeepers on finish
+        stop_and_check_lsn(secondary, expected_promoted_lsn)
+    else:
+        # on testing postgres, we don't update replica type, secondaries don't
+        # sync so lsn should be None
+        stop_and_check_lsn(secondary, None)

    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")

-    with primary.connect() as new_primary:
-        new_primary_cur = new_primary.cursor()
-        new_primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone())
+    with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur:
+        lsn_triple = get_lsn_triple(new_primary_cur)
        expected_primary_lsn = Lsn(lsn_triple[2])
        log.info(f"New primary: Boot LSN is {lsn_triple}")

@@ -146,5 +158,39 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
        new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (300,)
-
    stop_and_check_lsn(primary, expected_primary_lsn)
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
+    """
+    Test that if a handler disconnects from /promote route of compute_ctl, promotion still happens
+    once, and no error is thrown
+    """
+    env: NeonEnv = neon_simple_env
+    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+
+    with primary.connect() as conn, conn.cursor() as cur:
+        cur.execute("create extension neon")
+        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
+        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
+        cur.execute("show neon.safekeepers")
+        safekeepers = cur.fetchall()[0][0]
+
+    primary.http_client().offload_lfc()
+    primary_endpoint_id = primary.endpoint_id
+    primary.stop(mode="immediate-terminate")
+    assert (lsn := primary.terminate_flush_lsn)
+
+    client = secondary.http_client()
+    client.prewarm_lfc(primary_endpoint_id)
+    safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
+    assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed"
+
+    with secondary.connect() as conn, conn.cursor() as cur:
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (100,)
+        cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (200,)