Replica promotion in compute_ctl (#12183)

Add `/promote` method for `compute_ctl` promoting secondary replica to
primary,
depends on secondary being prewarmed.
Add `compute-ctl` mode to `test_replica_promotes`, testing happy path
only (no corner cases yet)
Add openapi spec for `/promote` and `/lfc` handlers

https://github.com/neondatabase/cloud/issues/19011
Resolves: https://github.com/neondatabase/cloud/issues/29807
This commit is contained in:
Mikhail
2025-07-09 13:55:10 +01:00
committed by GitHub
parent 4ee0da0a20
commit e7d18bc188
12 changed files with 448 additions and 65 deletions

1
Cargo.lock generated
View File

@@ -1348,6 +1348,7 @@ dependencies = [
"p256 0.13.2",
"pageserver_page_api",
"postgres",
"postgres-types",
"postgres_initdb",
"postgres_versioninfo",
"regex",

View File

@@ -66,7 +66,7 @@ url.workspace = true
uuid.workspace = true
walkdir.workspace = true
x509-cert.workspace = true
postgres-types.workspace = true
postgres_versioninfo.workspace = true
postgres_initdb.workspace = true
compute_api.workspace = true

View File

@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
LfcPrewarmState, TlsConfig,
LfcPrewarmState, PromoteState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
@@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::task::JoinHandle;
use tokio::{spawn, time};
use tokio::{spawn, sync::watch, task::JoinHandle, time};
use tracing::{Instrument, debug, error, info, instrument, warn};
use url::Url;
use utils::id::{TenantId, TimelineId};
@@ -175,6 +174,7 @@ pub struct ComputeState {
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
/// mode == ComputeMode::Primary. None otherwise
pub terminate_flush_lsn: Option<Lsn>,
pub promote_state: Option<watch::Receiver<PromoteState>>,
pub metrics: ComputeMetrics,
}
@@ -192,6 +192,7 @@ impl ComputeState {
lfc_prewarm_state: LfcPrewarmState::default(),
lfc_offload_state: LfcOffloadState::default(),
terminate_flush_lsn: None,
promote_state: None,
}
}

View File

@@ -0,0 +1,132 @@
use crate::compute::ComputeNode;
use anyhow::{Context, Result, bail};
use compute_api::{
responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
spec::ComputeMode,
};
use std::{sync::Arc, time::Duration};
use tokio::time::sleep;
use utils::lsn::Lsn;
impl ComputeNode {
/// Returns only when promote fails or succeeds. If a network error occurs
/// and http client disconnects, this does not stop promotion, and subsequent
/// calls block until promote finishes.
/// Called by control plane on secondary after primary endpoint is terminated
pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
let cloned = self.clone();
let start_promotion = || {
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
tokio::spawn(async move {
tx.send(match cloned.promote_impl(safekeepers_lsn).await {
Ok(_) => PromoteState::Completed,
Err(err) => {
tracing::error!(%err, "promoting");
PromoteState::Failed {
error: err.to_string(),
}
}
})
});
rx
};
let mut task;
// self.state is unlocked after block ends so we lock it in promote_impl
// and task.changed() is reached
{
task = self
.state
.lock()
.unwrap()
.promote_state
.get_or_insert_with(start_promotion)
.clone()
}
task.changed().await.expect("promote sender dropped");
task.borrow().clone()
}
// Why do we have to supply safekeepers?
// For secondary we use primary_connection_conninfo so safekeepers field is empty
async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
{
let state = self.state.lock().unwrap();
let mode = &state.pspec.as_ref().unwrap().spec.mode;
if *mode != ComputeMode::Replica {
bail!("{} is not replica", mode.to_type_str());
}
// we don't need to query Postgres so not self.lfc_prewarm_state()
match &state.lfc_prewarm_state {
LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
bail!("prewarm not requested or pending")
}
LfcPrewarmState::Failed { error } => {
tracing::warn!(%error, "replica prewarm failed")
}
_ => {}
}
}
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let primary_lsn = safekeepers_lsn.wal_flush_lsn;
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
last_wal_replay_lsn = lsn.into();
if last_wal_replay_lsn >= primary_lsn {
break;
}
tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
sleep(Duration::from_secs(1)).await;
}
if last_wal_replay_lsn < primary_lsn {
bail!("didn't catch up with primary in {RETRIES} retries");
}
// using $1 doesn't work with ALTER SYSTEM SET
let safekeepers_sql = format!(
"ALTER SYSTEM SET neon.safekeepers='{}'",
safekeepers_lsn.safekeepers
);
client
.query(&safekeepers_sql, &[])
.await
.context("setting safekeepers")?;
client
.query("SELECT pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;
let row = client
.query_one("SELECT * FROM pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {
bail!("pg_promote() returned false");
}
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let row = client
.query_one("SHOW transaction_read_only", &[])
.await
.context("getting transaction_read_only")?;
if row.get::<usize, &str>(0) == "on" {
bail!("replica in read only mode after promotion");
}
let mut state = self.state.lock().unwrap();
state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
Ok(())
}
}

View File

@@ -83,6 +83,87 @@ paths:
schema:
$ref: "#/components/schemas/DbsAndRoles"
/promote:
post:
tags:
- Promotion
summary: Promote secondary replica to primary
description: ""
operationId: promoteReplica
requestBody:
description: Promote requests data
required: true
content:
application/json:
schema:
$ref: "#/components/schemas/SafekeepersLsn"
responses:
200:
description: Promote succeeded or wasn't started
content:
application/json:
schema:
$ref: "#/components/schemas/PromoteState"
500:
description: Promote failed
content:
application/json:
schema:
$ref: "#/components/schemas/PromoteState"
/lfc/prewarm:
post:
summary: Request LFC Prewarm
parameters:
- name: from_endpoint
in: query
schema:
type: string
description: ""
operationId: lfcPrewarm
responses:
202:
description: LFC prewarm started
429:
description: LFC prewarm ongoing
get:
tags:
- Prewarm
summary: Get LFC prewarm state
description: ""
operationId: getLfcPrewarmState
responses:
200:
description: Prewarm state
content:
application/json:
schema:
$ref: "#/components/schemas/LfcPrewarmState"
/lfc/offload:
post:
summary: Request LFC offload
description: ""
operationId: lfcOffload
responses:
202:
description: LFC offload started
429:
description: LFC offload ongoing
get:
tags:
- Prewarm
summary: Get LFC offloading state
description: ""
operationId: getLfcOffloadState
responses:
200:
description: Offload state
content:
application/json:
schema:
$ref: "#/components/schemas/LfcOffloadState"
/database_schema:
get:
tags:
@@ -497,6 +578,70 @@ components:
type: string
example: "1.0.0"
SafekeepersLsn:
type: object
required:
- safekeepers
- wal_flush_lsn
properties:
safekeepers:
description: Primary replica safekeepers
type: string
wal_flush_lsn:
description: Primary last WAL flush LSN
type: string
LfcPrewarmState:
type: object
required:
- status
- total
- prewarmed
- skipped
properties:
status:
description: Lfc prewarm status
enum: [not_prewarmed, prewarming, completed, failed]
type: string
error:
description: Lfc prewarm error, if any
type: string
total:
description: Total pages processed
type: integer
prewarmed:
description: Total pages prewarmed
type: integer
skipped:
description: Pages processed but not prewarmed
type: integer
LfcOffloadState:
type: object
required:
- status
properties:
status:
description: Lfc offload status
enum: [not_offloaded, offloading, completed, failed]
type: string
error:
description: Lfc offload error, if any
type: string
PromoteState:
type: object
required:
- status
properties:
status:
description: Promote result
enum: [not_promoted, completed, failed]
type: string
error:
description: Promote error, if any
type: string
InstalledExtensions:
type: object
properties:

View File

@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
pub(in crate::http) mod lfc;
pub(in crate::http) mod metrics;
pub(in crate::http) mod metrics_json;
pub(in crate::http) mod promote;
pub(in crate::http) mod status;
pub(in crate::http) mod terminate;

View File

@@ -0,0 +1,14 @@
use crate::http::JsonResponse;
use axum::Form;
use http::StatusCode;
pub(in crate::http) async fn promote(
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
) -> axum::response::Response {
let state = compute.promote(safekeepers_lsn).await;
if let compute_api::responses::PromoteState::Failed { error } = state {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
}
JsonResponse::success(StatusCode::OK, state)
}

View File

@@ -23,7 +23,7 @@ use super::{
middleware::authorize::Authorize,
routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, lfc, metrics, metrics_json, status, terminate,
grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
},
};
use crate::compute::ComputeNode;
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
.route("/promote", post(promote::promote))
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))

View File

@@ -12,6 +12,7 @@ pub mod logger;
pub mod catalog;
pub mod compute;
pub mod compute_prewarm;
pub mod compute_promote;
pub mod disk_quota;
pub mod extension_server;
pub mod installed_extensions;

View File

@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
pub version: ExtVersion,
}
#[derive(Serialize, Default, Debug, Clone)]
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcPrewarmState {
#[default]
@@ -58,6 +58,17 @@ pub enum LfcPrewarmState {
},
}
impl Display for LfcPrewarmState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
LfcPrewarmState::Completed => f.write_str("Completed"),
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
}
}
}
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcOffloadState {
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
},
}
#[derive(Serialize, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")]
/// Response of /promote
pub enum PromoteState {
NotPromoted,
Completed,
Failed { error: String },
}
#[derive(Deserialize, Serialize, Default, Debug, Clone)]
#[serde(rename_all = "snake_case")]
/// Result of /safekeepers_lsn
pub struct SafekeepersLsn {
pub safekeepers: String,
pub wal_flush_lsn: utils::lsn::Lsn,
}
/// Response of the /status API
#[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")]

View File

@@ -2,11 +2,12 @@ from __future__ import annotations
import urllib.parse
from enum import StrEnum
from typing import TYPE_CHECKING, final
from typing import TYPE_CHECKING, Any, final
import requests
from requests.adapters import HTTPAdapter
from requests.auth import AuthBase
from requests.exceptions import ReadTimeout
from typing_extensions import override
from fixtures.log_helper import log
@@ -102,6 +103,18 @@ class EndpointHttpClient(requests.Session):
wait_until(offloaded)
def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
url = f"http://localhost:{self.external_port}/promote"
if disconnect:
try: # send first request to start promote and disconnect
self.post(url, data=safekeepers_lsn, timeout=0.001)
except ReadTimeout:
pass # wait on second request which returns on promotion finish
res = self.post(url, data=safekeepers_lsn)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def database_schema(self, database: str):
res = self.get(
f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}",

View File

@@ -1,29 +1,51 @@
"""
File with secondary->primary promotion testing.
This far, only contains a test that we don't break and that the data is persisted.
Secondary -> primary promotion testing
"""
from enum import StrEnum
from typing import cast
import psycopg2
import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from fixtures.utils import USE_LFC
from psycopg2.extensions import cursor as Cursor
from pytest import raises
def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None):
ep.stop(mode="immediate-terminate")
lsn = ep.terminate_flush_lsn
if expected_lsn is not None:
assert (lsn is not None) == (expected_lsn is not None), f"{lsn=}, {expected_lsn=}"
if lsn is not None:
assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}"
else:
assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}"
def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
def get_lsn_triple(cur: Cursor) -> tuple[str, str, str]:
cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
return cast("tuple[str, str, str]", cur.fetchone())
class PromoteMethod(StrEnum):
COMPUTE_CTL = "compute-ctl"
POSTGRES = "postgres"
METHOD_OPTIONS = [e for e in PromoteMethod]
METHOD_IDS = [e.value for e in PromoteMethod]
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS)
def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
"""
Test that a replica safely promotes, and can commit data updates which
show up when the primary boots up after the promoted secondary endpoint
@@ -38,29 +60,26 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
with primary.connect() as primary_conn:
primary_cur = primary_conn.cursor()
primary_cur.execute("create extension neon")
primary_cur.execute(
"create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
)
primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone())
lsn_triple = get_lsn_triple(primary_cur)
log.info(f"Primary: Current LSN after workload is {lsn_triple}")
expected_primary_lsn: Lsn = Lsn(lsn_triple[2])
primary_cur.execute("show neon.safekeepers")
safekeepers = primary_cur.fetchall()[0][0]
wait_replica_caughtup(primary, secondary)
if method == PromoteMethod.COMPUTE_CTL:
primary.http_client().offload_lfc()
else:
wait_replica_caughtup(primary, secondary)
with secondary.connect() as secondary_conn:
secondary_cur = secondary_conn.cursor()
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (100,)
with raises(psycopg2.Error):
@@ -71,28 +90,30 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (100,)
primary_endpoint_id = primary.endpoint_id
stop_and_check_lsn(primary, expected_primary_lsn)
# Reconnect to the secondary to make sure we get a read-write connection
promo_conn = secondary.connect()
promo_cur = promo_conn.cursor()
promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
promo_cur.execute("select pg_reload_conf()")
if method == PromoteMethod.COMPUTE_CTL:
client = secondary.http_client()
client.prewarm_lfc(primary_endpoint_id)
# control plane knows safekeepers, simulate it by querying primary
assert (lsn := primary.terminate_flush_lsn)
safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
assert client.promote(safekeepers_lsn)["status"] == "completed"
else:
promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
promo_cur.execute("select pg_reload_conf()")
promo_cur.execute("SELECT * FROM pg_promote()")
assert promo_cur.fetchone() == (True,)
promo_cur.execute("SELECT * FROM pg_promote()")
assert promo_cur.fetchone() == (True,)
promo_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
lsn_triple = get_lsn_triple(promo_cur)
log.info(f"Secondary: LSN after promotion is {lsn_triple}")
# Reconnect to the secondary to make sure we get a read-write connection
with secondary.connect() as new_primary_conn:
new_primary_cur = new_primary_conn.cursor()
with secondary.connect() as conn, conn.cursor() as new_primary_cur:
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (100,)
@@ -101,43 +122,34 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
)
assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]
new_primary_cur = new_primary_conn.cursor()
new_primary_cur = conn.cursor()
new_primary_cur.execute("select payload from t")
assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (200,)
new_primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")
with secondary.connect() as second_viewpoint_conn:
new_primary_cur = second_viewpoint_conn.cursor()
lsn_triple = get_lsn_triple(new_primary_cur)
log.info(f"Secondary: LSN after workload is {lsn_triple}")
expected_promoted_lsn = Lsn(lsn_triple[2])
with secondary.connect() as conn, conn.cursor() as new_primary_cur:
new_primary_cur.execute("select payload from t")
assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
# wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
# secondaries don't sync safekeepers on finish so LSN will be None
stop_and_check_lsn(secondary, None)
if method == PromoteMethod.COMPUTE_CTL:
# compute_ctl's /promote switches replica type to Primary so it syncs
# safekeepers on finish
stop_and_check_lsn(secondary, expected_promoted_lsn)
else:
# on testing postgres, we don't update replica type, secondaries don't
# sync so lsn should be None
stop_and_check_lsn(secondary, None)
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")
with primary.connect() as new_primary:
new_primary_cur = new_primary.cursor()
new_primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone())
with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur:
lsn_triple = get_lsn_triple(new_primary_cur)
expected_primary_lsn = Lsn(lsn_triple[2])
log.info(f"New primary: Boot LSN is {lsn_triple}")
@@ -146,5 +158,39 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (300,)
stop_and_check_lsn(primary, expected_primary_lsn)
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
"""
Test that if a handler disconnects from /promote route of compute_ctl, promotion still happens
once, and no error is thrown
"""
env: NeonEnv = neon_simple_env
primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
with primary.connect() as conn, conn.cursor() as cur:
cur.execute("create extension neon")
cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
cur.execute("show neon.safekeepers")
safekeepers = cur.fetchall()[0][0]
primary.http_client().offload_lfc()
primary_endpoint_id = primary.endpoint_id
primary.stop(mode="immediate-terminate")
assert (lsn := primary.terminate_flush_lsn)
client = secondary.http_client()
client.prewarm_lfc(primary_endpoint_id)
safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed"
with secondary.connect() as conn, conn.cursor() as cur:
cur.execute("select count(*) from t")
assert cur.fetchone() == (100,)
cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
cur.execute("select count(*) from t")
assert cur.fetchone() == (200,)