Merge branch 'main' into thesuhas/brc-3082
@@ -1 +1 @@
SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer;
SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer;

@@ -1 +1 @@
SELECT checkpoints_req FROM pg_stat_bgwriter;
SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;

@@ -1 +1 @@
SELECT checkpoints_timed FROM pg_stat_bgwriter;
SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter;

@@ -1 +1 @@
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled;

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8
ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8
END AS lsn;

@@ -1,7 +1,7 @@
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
(SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;

@@ -1,7 +1,7 @@
SELECT
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
(SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
(SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

@@ -1,9 +1,9 @@
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
(SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;

@@ -1 +1 @@
SELECT current_setting('max_connections') as max_connections;
SELECT pg_catalog.current_setting('max_connections') AS max_connections;

@@ -1,4 +1,4 @@
SELECT datname database_name,
age(datfrozenxid) frozen_xid_age
FROM pg_database
pg_catalog.age(datfrozenxid) frozen_xid_age
FROM pg_catalog.pg_database
ORDER BY frozen_xid_age DESC LIMIT 10;

@@ -1,4 +1,4 @@
SELECT datname database_name,
mxid_age(datminmxid) min_mxid_age
FROM pg_database
pg_catalog.mxid_age(datminmxid) min_mxid_age
FROM pg_catalog.pg_database
ORDER BY min_mxid_age DESC LIMIT 10;

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8
ELSE 0
END AS lsn;

@@ -1 +1 @@
SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;
SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled;

@@ -1 +1 @@
SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;
SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state;

@@ -1,5 +1,5 @@
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total
FROM pg_catalog.pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;

@@ -3,6 +3,6 @@
-- minutes.

SELECT
x::text as duration_seconds,
x::pg_catalog.text AS duration_seconds,
neon.approximate_working_set_size_seconds(x) AS size
FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);

@@ -3,6 +3,6 @@

SELECT
x AS duration,
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM (
VALUES ('5m'), ('15m'), ('1h')
) AS t (x);

@@ -1 +1 @@
SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;

@@ -1,3 +1,3 @@
SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
FROM pg_replication_slots
SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn
FROM pg_catalog.pg_replication_slots
WHERE slot_type = 'logical';

@@ -1 +1 @@
SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';
SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size';

@@ -1,13 +1,13 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.

SELECT pg_database_size(datname) AS db_size,
SELECT pg_catalog.pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
FROM pg_stat_database
FROM pg_catalog.pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with

@@ -3,4 +3,4 @@
-- replay LSN may have advanced past the receive LSN we are using for the
-- calculation.

SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes;

@@ -1,5 +1,5 @@
SELECT
CASE
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp()))
END AS replication_delay_seconds;

@@ -1,10 +1,10 @@
SELECT
slot_name,
pg_wal_lsn_diff(
pg_catalog.pg_wal_lsn_diff(
CASE
WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
ELSE pg_current_wal_lsn()
WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn()
ELSE pg_catalog.pg_current_wal_lsn()
END,
restart_lsn)::FLOAT8 AS retained_wal
FROM pg_replication_slots
restart_lsn)::pg_catalog.FLOAT8 AS retained_wal
FROM pg_catalog.pg_replication_slots
WHERE active = false;

@@ -4,4 +4,4 @@ SELECT
WHEN wal_status = 'lost' THEN 1
ELSE 0
END AS wal_is_lost
FROM pg_replication_slots;
FROM pg_catalog.pg_replication_slots;
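Note: every hunk above makes the same change — catalog views, functions, and type names are schema-qualified with pg_catalog so the monitoring queries cannot be redirected by objects that shadow them on the session's search_path. A minimal, hypothetical illustration (not part of this commit; the shadow schema and row are invented for the example):

    -- An object in a schema listed before pg_catalog can shadow a catalog name.
    CREATE SCHEMA shadow;
    CREATE TABLE shadow.pg_settings (name text, setting text);
    INSERT INTO shadow.pg_settings VALUES ('neon.timeline_id', 'bogus');

    SET search_path = shadow, pg_catalog;

    -- Unqualified: resolves to shadow.pg_settings and returns 'bogus'.
    SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id';

    -- Qualified, as in the rewritten queries: always the real catalog view.
    SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id';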
@@ -279,7 +279,7 @@ fn main() -> Result<()> {
config,
)?;

let exit_code = compute_node.run()?;
let exit_code = compute_node.run().context("running compute node")?;

scenario.teardown();

@@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
});

let query = "
INSERT INTO health_check VALUES (1, now())
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();";
SET updated_at = pg_catalog.now();";

match client.simple_query(query).await {
Result::Ok(result) => {

@@ -32,8 +32,12 @@ use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::{spawn, sync::watch, task::JoinHandle, time};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, instrument, warn};
use url::Url;
use utils::backoff::{
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration,
};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::measured_stream::MeasuredReader;

@@ -192,6 +196,7 @@ pub struct ComputeState {
pub startup_span: Option<tracing::span::Span>,

pub lfc_prewarm_state: LfcPrewarmState,
pub lfc_prewarm_token: CancellationToken,
pub lfc_offload_state: LfcOffloadState,

/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if

@@ -217,6 +222,7 @@ impl ComputeState {
lfc_offload_state: LfcOffloadState::default(),
terminate_flush_lsn: None,
promote_state: None,
lfc_prewarm_token: CancellationToken::new(),
}
}

@@ -583,7 +589,7 @@ impl ComputeNode {
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
// Unset them via connection string options before connecting to the database.
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none";
let options = match conn_conf.get_options() {
// Allow the control plane to override any options set by the
// compute

@@ -1554,6 +1560,41 @@ impl ComputeNode {
Ok(lsn)
}

fn sync_safekeepers_with_retries(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let max_retries = 5;
let mut attempts = 0;
loop {
let result = self.sync_safekeepers(storage_auth_token.clone());
match &result {
Ok(_) => {
if attempts > 0 {
tracing::info!("sync_safekeepers succeeded after {attempts} retries");
}
return result;
}
Err(e) if attempts < max_retries => {
tracing::info!(
"sync_safekeepers failed, will retry (attempt {attempts}): {e:#}"
);
}
Err(err) => {
tracing::warn!(
"sync_safekeepers still failed after {attempts} retries, giving up: {err:?}"
);
return result;
}
}
// sleep and retry
let backoff = exponential_backoff_duration(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
std::thread::sleep(backoff);
attempts += 1;
}
}

/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip_all)]

@@ -1589,7 +1630,7 @@ impl ComputeNode {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("safekeepers synced at LSN {}", lsn);

@@ -1884,7 +1925,7 @@ impl ComputeNode {

// It doesn't matter what were the options before, here we just want
// to connect and create a new superuser role.
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0";
zenith_admin_conf.options(ZENITH_OPTIONS);

let mut client =

@@ -2339,13 +2380,13 @@ impl ComputeNode {
let result = client
.simple_query(
"SELECT
row_to_json(pg_stat_statements)
pg_catalog.row_to_json(pss)
FROM
pg_stat_statements
public.pg_stat_statements pss
WHERE
userid != 'cloud_admin'::regrole::oid
pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid
ORDER BY
(mean_exec_time + mean_plan_time) DESC
(pss.mean_exec_time + pss.mean_plan_time) DESC
LIMIT 100",
)
.await;
@@ -2473,11 +2514,11 @@ LIMIT 100",

// check the role grants first - to gracefully handle read-replicas.
let select = "SELECT privilege_type
FROM pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
JOIN pg_user users ON acl.grantee = users.usesysid
WHERE users.usename = $1
AND nspname = $2";
FROM pg_catalog.pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true
JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid
WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name
AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name";
let rows = db_client
.query(select, &[role_name, schema_name])
.await
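Note: besides table names, this query also pins the comparison operator and the parameter casts. Operators are resolved through search_path just like functions, so OPERATOR(pg_catalog.=) forces the built-in equality regardless of what the session's path contains. A small, hypothetical illustration of the syntax (not part of this commit):

    -- Explicitly schema-qualified operator: always pg_catalog's text equality.
    SELECT 'abc'::pg_catalog.text OPERATOR(pg_catalog.=) 'abc';   -- true

    -- The same form used in the query above, with the bind parameter's type pinned as well:
    --   WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name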
@@ -2546,8 +2587,9 @@ LIMIT 100",
.await
.with_context(|| format!("Failed to execute query: {query}"))?;
} else {
let query =
format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
let query = format!(
"CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}"
);
db_client
.simple_query(&query)
.await
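Note: with the maintenance connections now running under search_path='' (see the EXTRA_OPTIONS/ZENITH_OPTIONS hunks above), CREATE EXTENSION has no implicit creation schema to install into, so the target schema is spelled out. A minimal sketch of the generated statement (extension name illustrative, version omitted):

    CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public;
    -- objects installed by the extension are then referenced explicitly,
    -- e.g. public.pg_stat_statements in the statistics query above.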
@@ -2738,7 +2780,7 @@ LIMIT 100",
// 4. We start again and try to prewarm with the state from 2. instead of the previous complete state
if matches!(
prewarm_state,
LfcPrewarmState::Completed
LfcPrewarmState::Completed { .. }
| LfcPrewarmState::NotPrewarmed
| LfcPrewarmState::Skipped
) {

@@ -7,18 +7,11 @@ use http::StatusCode;
use reqwest::Client;
use std::mem::replace;
use std::sync::Arc;
use tokio::{io::AsyncReadExt, spawn};
use std::time::Instant;
use tokio::{io::AsyncReadExt, select, spawn};
use tokio_util::sync::CancellationToken;
use tracing::{error, info};

#[derive(serde::Serialize, Default)]
pub struct LfcPrewarmStateWithProgress {
#[serde(flatten)]
base: LfcPrewarmState,
total: i32,
prewarmed: i32,
skipped: i32,
}

/// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks
struct EndpointStoragePair {
url: String,

@@ -27,7 +20,7 @@ struct EndpointStoragePair {

const KEY: &str = "lfc_state";
impl EndpointStoragePair {
/// endpoint_id is set to None while prewarming from other endpoint, see replica promotion
/// endpoint_id is set to None while prewarming from other endpoint, see compute_promote.rs
/// If not None, takes precedence over pspec.spec.endpoint_id
fn from_spec_and_endpoint(
pspec: &crate::compute::ParsedSpec,

@@ -53,36 +46,8 @@ impl EndpointStoragePair {
}

impl ComputeNode {
// If prewarm failed, we want to get overall number of segments as well as done ones.
// However, this function should be reliable even if querying postgres failed.
pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress {
info!("requesting LFC prewarm state from postgres");
let mut state = LfcPrewarmStateWithProgress::default();
{
state.base = self.state.lock().unwrap().lfc_prewarm_state.clone();
}

let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await {
Ok(client) => client,
Err(err) => {
error!(%err, "connecting to postgres");
return state;
}
};
let row = match client
.query_one("select * from neon.get_prewarm_info()", &[])
.await
{
Ok(row) => row,
Err(err) => {
error!(%err, "querying LFC prewarm status");
return state;
}
};
state.total = row.try_get(0).unwrap_or_default();
state.prewarmed = row.try_get(1).unwrap_or_default();
state.skipped = row.try_get(2).unwrap_or_default();
state
pub async fn lfc_prewarm_state(&self) -> LfcPrewarmState {
self.state.lock().unwrap().lfc_prewarm_state.clone()
}

pub fn lfc_offload_state(&self) -> LfcOffloadState {

@@ -92,34 +57,35 @@ impl ComputeNode {
/// If there is a prewarm request ongoing, return `false`, `true` otherwise.
/// Has a failpoint "compute-prewarm"
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
let token: CancellationToken;
{
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
let state = &mut self.state.lock().unwrap();
token = state.lfc_prewarm_token.clone();
if let LfcPrewarmState::Prewarming =
replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming)
{
return false;
}
}
crate::metrics::LFC_PREWARMS.inc();

let cloned = self.clone();
let this = self.clone();
spawn(async move {
let state = match cloned.prewarm_impl(from_endpoint).await {
Ok(true) => LfcPrewarmState::Completed,
Ok(false) => {
info!(
"skipping LFC prewarm because LFC state is not found in endpoint storage"
);
LfcPrewarmState::Skipped
}
let prewarm_state = match this.prewarm_impl(from_endpoint, token).await {
Ok(state) => state,
Err(err) => {
crate::metrics::LFC_PREWARM_ERRORS.inc();
error!(%err, "could not prewarm LFC");
LfcPrewarmState::Failed {
error: format!("{err:#}"),
}
let error = format!("{err:#}");
LfcPrewarmState::Failed { error }
}
};

cloned.state.lock().unwrap().lfc_prewarm_state = state;
let state = &mut this.state.lock().unwrap();
if let LfcPrewarmState::Cancelled = prewarm_state {
state.lfc_prewarm_token = CancellationToken::new();
}
state.lfc_prewarm_state = prewarm_state;
});
true
}

@@ -131,55 +97,101 @@ impl ComputeNode {
}

/// Request LFC state from endpoint storage and load corresponding pages into Postgres.
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
async fn prewarm_impl(
&self,
from_endpoint: Option<String>,
token: CancellationToken,
) -> Result<LfcPrewarmState> {
let EndpointStoragePair {
url,
token: storage_token,
} = self.endpoint_storage_pair(from_endpoint)?;

#[cfg(feature = "testing")]
fail::fail_point!("compute-prewarm", |_| {
bail!("prewarm configured to fail because of a failpoint")
});
fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint"));

info!(%url, "requesting LFC state from endpoint storage");
let request = Client::new().get(&url).bearer_auth(token);
let res = request.send().await.context("querying endpoint storage")?;
match res.status() {
let mut now = Instant::now();
let request = Client::new().get(&url).bearer_auth(storage_token);
let response = select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
response = request.send() => response
}
.context("querying endpoint storage")?;

match response.status() {
StatusCode::OK => (),
StatusCode::NOT_FOUND => {
return Ok(false);
}
StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped),
status => bail!("{status} querying endpoint storage"),
}
let state_download_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();

let mut uncompressed = Vec::new();
let lfc_state = res
.bytes()
.await
.context("getting request body from endpoint storage")?;
ZstdDecoder::new(lfc_state.iter().as_slice())
.read_to_end(&mut uncompressed)
.await
.context("decoding LFC state")?;
let lfc_state = select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
lfc_state = response.bytes() => lfc_state
}
.context("getting request body from endpoint storage")?;

let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice());
select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
read = decoder.read_to_end(&mut uncompressed) => read
}
.context("decoding LFC state")?;
let uncompress_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();

let uncompressed_len = uncompressed.len();
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}");

info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");

ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
// Client connection and prewarm info querying are fast and therefore don't need
// cancellation
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?
.query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
.await
.context("loading LFC state into postgres")
.map(|_| ())?;
.context("connecting to postgres")?;
let pg_token = client.cancel_token();

Ok(true)
let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed];
select! {
res = client.query_one("select neon.prewarm_local_cache($1)", &params) => res,
_ = token.cancelled() => {
pg_token.cancel_query(postgres::NoTls).await
.context("cancelling neon.prewarm_local_cache()")?;
return Ok(LfcPrewarmState::Cancelled)
}
}
.context("loading LFC state into postgres")
.map(|_| ())?;
let prewarm_time_ms = now.elapsed().as_millis() as u32;

let row = client
.query_one("select * from neon.get_prewarm_info()", &[])
.await
.context("querying prewarm info")?;
let total = row.try_get(0).unwrap_or_default();
let prewarmed = row.try_get(1).unwrap_or_default();
let skipped = row.try_get(2).unwrap_or_default();

Ok(LfcPrewarmState::Completed {
total,
prewarmed,
skipped,
state_download_time_ms,
uncompress_time_ms,
prewarm_time_ms,
})
}

/// If offload request is ongoing, return false, true otherwise
pub fn offload_lfc(self: &Arc<Self>) -> bool {
{
let state = &mut self.state.lock().unwrap().lfc_offload_state;
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
if matches!(
replace(state, LfcOffloadState::Offloading),
LfcOffloadState::Offloading
) {
return false;
}
}

@@ -191,7 +203,10 @@ impl ComputeNode {
pub async fn offload_lfc_async(self: &Arc<Self>) {
{
let state = &mut self.state.lock().unwrap().lfc_offload_state;
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
if matches!(
replace(state, LfcOffloadState::Offloading),
LfcOffloadState::Offloading
) {
return;
}
}

@@ -200,23 +215,23 @@ impl ComputeNode {

async fn offload_lfc_with_state_update(&self) {
crate::metrics::LFC_OFFLOADS.inc();

let Err(err) = self.offload_lfc_impl().await else {
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
return;
};

crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "could not offload LFC state to endpoint storage");
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
error: format!("{err:#}"),
let state = match self.offload_lfc_impl().await {
Ok(state) => state,
Err(err) => {
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "could not offload LFC");
let error = format!("{err:#}");
LfcOffloadState::Failed { error }
}
};
self.state.lock().unwrap().lfc_offload_state = state;
}

async fn offload_lfc_impl(&self) -> Result<()> {
async fn offload_lfc_impl(&self) -> Result<LfcOffloadState> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
info!(%url, "requesting LFC state from Postgres");

let mut now = Instant::now();
let row = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?

@@ -228,26 +243,41 @@ impl ComputeNode {
.context("deserializing LFC state")?;
let Some(state) = state else {
info!(%url, "empty LFC state, not exporting");
return Ok(());
return Ok(LfcOffloadState::Skipped);
};
let state_query_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();

let mut compressed = Vec::new();
ZstdEncoder::new(state)
.read_to_end(&mut compressed)
.await
.context("compressing LFC state")?;
let compress_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();

let compressed_len = compressed.len();
info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage");
info!(%url, "downloaded LFC state, compressed size {compressed_len}");

let request = Client::new().put(url).bearer_auth(token).body(compressed);
match request.send().await {
Ok(res) if res.status() == StatusCode::OK => Ok(()),
Ok(res) => bail!(
"Request to endpoint storage failed with status: {}",
res.status()
),
Err(err) => Err(err).context("writing to endpoint storage"),
let response = request
.send()
.await
.context("writing to endpoint storage")?;
let state_upload_time_ms = now.elapsed().as_millis() as u32;
let status = response.status();
if status != StatusCode::OK {
bail!("request to endpoint storage failed: {status}");
}

Ok(LfcOffloadState::Completed {
compress_time_ms,
state_query_time_ms,
state_upload_time_ms,
})
}

pub fn cancel_prewarm(self: &Arc<Self>) {
self.state.lock().unwrap().lfc_prewarm_token.cancel();
}
}
@@ -1,32 +1,24 @@
use crate::compute::ComputeNode;
use anyhow::{Context, Result, bail};
use anyhow::{Context, bail};
use compute_api::responses::{LfcPrewarmState, PromoteConfig, PromoteState};
use compute_api::spec::ComputeMode;
use itertools::Itertools;
use std::collections::HashMap;
use std::{sync::Arc, time::Duration};
use tokio::time::sleep;
use std::time::Instant;
use tracing::info;
use utils::lsn::Lsn;

impl ComputeNode {
/// Returns only when promote fails or succeeds. If a network error occurs
/// and http client disconnects, this does not stop promotion, and subsequent
/// calls block until promote finishes.
/// Returns only when promote fails or succeeds. If http client calling this function
/// disconnects, this does not stop promotion, and subsequent calls block until promote finishes.
/// Called by control plane on secondary after primary endpoint is terminated
/// Has a failpoint "compute-promotion"
pub async fn promote(self: &Arc<Self>, cfg: PromoteConfig) -> PromoteState {
let cloned = self.clone();
let promote_fn = async move || {
let Err(err) = cloned.promote_impl(cfg).await else {
return PromoteState::Completed;
};
tracing::error!(%err, "promoting");
PromoteState::Failed {
error: format!("{err:#}"),
pub async fn promote(self: &std::sync::Arc<Self>, cfg: PromoteConfig) -> PromoteState {
let this = self.clone();
let promote_fn = async move || match this.promote_impl(cfg).await {
Ok(state) => state,
Err(err) => {
tracing::error!(%err, "promoting replica");
let error = format!("{err:#}");
PromoteState::Failed { error }
}
};

let start_promotion = || {
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
tokio::spawn(async move { tx.send(promote_fn().await) });

@@ -34,36 +26,31 @@ impl ComputeNode {
};

let mut task;
// self.state is unlocked after block ends so we lock it in promote_impl
// and task.changed() is reached
// promote_impl locks self.state so we need to unlock it before calling task.changed()
{
task = self
.state
.lock()
.unwrap()
.promote_state
.get_or_insert_with(start_promotion)
.clone()
let promote_state = &mut self.state.lock().unwrap().promote_state;
task = promote_state.get_or_insert_with(start_promotion).clone()
}
if task.changed().await.is_err() {
let error = "promote sender dropped".to_string();
return PromoteState::Failed { error };
}
task.changed().await.expect("promote sender dropped");
task.borrow().clone()
}

async fn promote_impl(&self, mut cfg: PromoteConfig) -> Result<()> {
async fn promote_impl(&self, cfg: PromoteConfig) -> anyhow::Result<PromoteState> {
{
let state = self.state.lock().unwrap();
let mode = &state.pspec.as_ref().unwrap().spec.mode;
if *mode != ComputeMode::Replica {
bail!("{} is not replica", mode.to_type_str());
if *mode != compute_api::spec::ComputeMode::Replica {
bail!("compute mode \"{}\" is not replica", mode.to_type_str());
}

// we don't need to query Postgres so not self.lfc_prewarm_state()
match &state.lfc_prewarm_state {
LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
bail!("prewarm not requested or pending")
status @ (LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming) => {
bail!("compute {status}")
}
LfcPrewarmState::Failed { error } => {
tracing::warn!(%error, "replica prewarm failed")
tracing::warn!(%error, "compute prewarm failed")
}
_ => {}
}

@@ -72,26 +59,29 @@ impl ComputeNode {
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let mut now = Instant::now();

let primary_lsn = cfg.wal_flush_lsn;
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
let mut standby_lsn = utils::lsn::Lsn::INVALID;
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
.query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[])
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
last_wal_replay_lsn = lsn.into();
if last_wal_replay_lsn >= primary_lsn {
standby_lsn = lsn.into();
if standby_lsn >= primary_lsn {
break;
}
info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
sleep(Duration::from_secs(1)).await;
info!(%standby_lsn, %primary_lsn, "catching up, try {i}");
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
if last_wal_replay_lsn < primary_lsn {
if standby_lsn < primary_lsn {
bail!("didn't catch up with primary in {RETRIES} retries");
}
let lsn_wait_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();

// using $1 doesn't work with ALTER SYSTEM SET
let safekeepers_sql = format!(

@@ -103,26 +93,32 @@ impl ComputeNode {
.await
.context("setting safekeepers")?;
client
.query("SELECT pg_reload_conf()", &[])
.query(
"ALTER SYSTEM SET synchronous_standby_names=walproposer",
&[],
)
.await
.context("setting synchronous_standby_names")?;
client
.query("SELECT pg_catalog.pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;

#[cfg(feature = "testing")]
fail::fail_point!("compute-promotion", |_| {
bail!("promotion configured to fail because of a failpoint")
});
fail::fail_point!("compute-promotion", |_| bail!(
"compute-promotion failpoint"
));

let row = client
.query_one("SELECT * FROM pg_promote()", &[])
.query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {
bail!("pg_promote() returned false");
bail!("pg_promote() failed");
}
let pg_promote_time_ms = now.elapsed().as_millis() as u32;
let now = Instant::now();

let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let row = client
.query_one("SHOW transaction_read_only", &[])
.await

@@ -131,36 +127,47 @@ impl ComputeNode {
bail!("replica in read only mode after promotion");
}

// Already checked validity in http handler
#[allow(unused_mut)]
let mut new_pspec = crate::compute::ParsedSpec::try_from(cfg.spec).expect("invalid spec");
{
let mut state = self.state.lock().unwrap();
let spec = &mut state.pspec.as_mut().unwrap().spec;
spec.mode = ComputeMode::Primary;
let new_conf = cfg.spec.cluster.postgresql_conf.as_mut().unwrap();
let existing_conf = spec.cluster.postgresql_conf.as_ref().unwrap();
Self::merge_spec(new_conf, existing_conf);

// Local setup has different ports for pg process (port=) for primary and secondary.
// Primary is stopped so we need secondary's "port" value
#[cfg(feature = "testing")]
{
let old_spec = &state.pspec.as_ref().unwrap().spec;
let Some(old_conf) = old_spec.cluster.postgresql_conf.as_ref() else {
bail!("pspec.spec.cluster.postgresql_conf missing for endpoint");
};
let set: std::collections::HashMap<&str, &str> = old_conf
.split_terminator('\n')
.map(|e| e.split_once("=").expect("invalid item"))
.collect();

let Some(new_conf) = new_pspec.spec.cluster.postgresql_conf.as_mut() else {
bail!("pspec.spec.cluster.postgresql_conf missing for supplied config");
};
new_conf.push_str(&format!("port={}\n", set["port"]));
}

tracing::debug!("applied spec: {:#?}", new_pspec.spec);
if self.params.lakebase_mode {
ComputeNode::set_spec(&self.params, &mut state, new_pspec);
} else {
state.pspec = Some(new_pspec);
}
}

info!("applied new spec, reconfiguring as primary");
self.reconfigure()
}
self.reconfigure()?;
let reconfigure_time_ms = now.elapsed().as_millis() as u32;

/// Merge old and new Postgres conf specs to apply on secondary.
/// Change new spec's port and safekeepers since they are supplied
/// differenly
fn merge_spec(new_conf: &mut String, existing_conf: &str) {
let mut new_conf_set: HashMap<&str, &str> = new_conf
.split_terminator('\n')
.map(|e| e.split_once("=").expect("invalid item"))
.collect();
new_conf_set.remove("neon.safekeepers");

let existing_conf_set: HashMap<&str, &str> = existing_conf
.split_terminator('\n')
.map(|e| e.split_once("=").expect("invalid item"))
.collect();
new_conf_set.insert("port", existing_conf_set["port"]);
*new_conf = new_conf_set
.iter()
.map(|(k, v)| format!("{k}={v}"))
.join("\n");
Ok(PromoteState::Completed {
lsn_wait_time_ms,
pg_promote_time_ms,
reconfigure_time_ms,
})
}
}

@@ -65,14 +65,19 @@ pub fn write_postgres_conf(
writeln!(file, "{conf}")?;
}

// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info {
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_connection_info.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}

let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard

@@ -110,7 +115,7 @@ pub fn write_postgres_conf(
if let Some(libpq_urls) = libpq_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
"# derived from compute spec's pageserver_connection_info field"
)?;
writeln!(
file,

@@ -120,24 +125,16 @@ pub fn write_postgres_conf(
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}

if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_conninfo.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
} else {
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}

// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
}

if !spec.safekeeper_connstrings.is_empty() {
@@ -139,6 +139,15 @@ paths:
      application/json:
        schema:
          $ref: "#/components/schemas/LfcPrewarmState"
    delete:
      tags:
        - Prewarm
      summary: Cancel ongoing LFC prewarm
      description: ""
      operationId: cancelLfcPrewarm
      responses:
        202:
          description: Prewarm cancelled

  /lfc/offload:
    post:

@@ -608,9 +617,6 @@ components:
      type: object
      required:
        - status
        - total
        - prewarmed
        - skipped
      properties:
        status:
          description: LFC prewarm status

@@ -628,6 +634,15 @@ components:
        skipped:
          description: Pages processed but not prewarmed
          type: integer
        state_download_time_ms:
          description: Time it takes to download LFC state to compute
          type: integer
        uncompress_time_ms:
          description: Time it takes to uncompress LFC state
          type: integer
        prewarm_time_ms:
          description: Time it takes to prewarm LFC state in Postgres
          type: integer

    LfcOffloadState:
      type: object

@@ -636,11 +651,21 @@ components:
      properties:
        status:
          description: LFC offload status
          enum: [not_offloaded, offloading, completed, failed]
          enum: [not_offloaded, offloading, completed, skipped, failed]
          type: string
        error:
          description: LFC offload error, if any
          type: string
        state_query_time_ms:
          description: Time it takes to get LFC state from Postgres
          type: integer
        compress_time_ms:
          description: Time it takes to compress LFC state
          type: integer
        state_upload_time_ms:
          description: Time it takes to upload LFC state to endpoint storage
          type: integer

    PromoteState:
      type: object

@@ -654,6 +679,15 @@ components:
        error:
          description: Promote error, if any
          type: string
        lsn_wait_time_ms:
          description: Time it takes for secondary to catch up with primary WAL flush LSN
          type: integer
        pg_promote_time_ms:
          description: Time it takes to call pg_promote on secondary
          type: integer
        reconfigure_time_ms:
          description: Time it takes to reconfigure promoted secondary
          type: integer

    SetRoleGrantsRequest:
      type: object
@@ -1,12 +1,11 @@
use crate::compute_prewarm::LfcPrewarmStateWithProgress;
use crate::http::JsonResponse;
use axum::response::{IntoResponse, Response};
use axum::{Json, http::StatusCode};
use axum_extra::extract::OptionalQuery;
use compute_api::responses::LfcOffloadState;
use compute_api::responses::{LfcOffloadState, LfcPrewarmState};
type Compute = axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>;

pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json<LfcPrewarmStateWithProgress> {
pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json<LfcPrewarmState> {
Json(compute.lfc_prewarm_state().await)
}

@@ -46,3 +45,8 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response {
)
}
}

pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode {
compute.cancel_prewarm();
StatusCode::ACCEPTED
}

@@ -1,11 +1,22 @@
use crate::http::JsonResponse;
use axum::extract::Json;
use compute_api::responses::PromoteConfig;
use http::StatusCode;

pub(in crate::http) async fn promote(
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
Json(cfg): Json<compute_api::responses::PromoteConfig>,
Json(cfg): Json<PromoteConfig>,
) -> axum::response::Response {
// Return early at the cost of extra parsing spec
let pspec = match crate::compute::ParsedSpec::try_from(cfg.spec) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
};

let cfg = PromoteConfig {
spec: pspec.spec,
wal_flush_lsn: cfg.wal_flush_lsn,
};
let state = compute.promote(cfg).await;
if let compute_api::responses::PromoteState::Failed { error: _ } = state {
return JsonResponse::create_response(StatusCode::INTERNAL_SERVER_ERROR, state);

@@ -99,7 +99,12 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
);

let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
.route(
"/lfc/prewarm",
get(lfc::prewarm_state)
.post(lfc::prewarm)
.delete(lfc::cancel_prewarm),
)
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
.route("/promote", post(promote::promote))
.route("/check_writability", post(check_writability::is_writable))
@@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
.query(
"SELECT datname FROM pg_catalog.pg_database
WHERE datallowconn
AND datconnlimit <> - 2
AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4)
LIMIT 500",
&[],
)

@@ -67,7 +67,7 @@ pub async fn get_installed_extensions(

let extensions: Vec<(String, String, i32)> = client
.query(
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
"SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension",
&[],
)
.await?

@@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> {
self.client
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
.await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?;
self.client
.simple_query(
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
@@ -15,17 +15,17 @@ DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name);
END LOOP;

FOR role_name IN SELECT rolname FROM pg_roles
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles
WHERE
NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name);
END LOOP;
END $$;
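Note: the migration above also switches from concatenating quote_ident() output into the EXECUTE string to a single pg_catalog.format() template with %I. Both forms quote the identifier safely; a small illustration of the equivalence (the identifier is invented for the example, not taken from this commit):

    SELECT 'ALTER ROLE ' || pg_catalog.quote_ident('odd "role" name') || ' NOBYPASSRLS' AS concatenated,
           pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', 'odd "role" name') AS templated;
    -- both quote the identifier as "odd ""role"" name"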
@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
END IF;
END $$;

@@ -5,9 +5,9 @@ DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name);
END LOOP;
END $$;

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
END IF;

@@ -2,7 +2,7 @@ DO $$
DECLARE
bypassrls boolean;
BEGIN
SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser';
SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser';
IF NOT bypassrls THEN
RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
END IF;

@@ -4,8 +4,8 @@ DECLARE
BEGIN
FOR role IN
SELECT rolname AS name, rolinherit AS inherit
FROM pg_roles
WHERE pg_has_role(rolname, 'neon_superuser', 'member')
FROM pg_catalog.pg_roles
WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
IF NOT role.inherit THEN
RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);

@@ -14,12 +14,12 @@ BEGIN

FOR role IN
SELECT rolname AS name, rolbypassrls AS bypassrls
FROM pg_roles
WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT starts_with(rolname, 'pg_')
FROM pg_catalog.pg_roles
WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT pg_catalog.starts_with(rolname, 'pg_')
LOOP
IF role.bypassrls THEN
RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name);
RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name);
END IF;
END LOOP;
END $$;

@@ -1,10 +1,10 @@
DO $$
BEGIN
IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN
IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN
RETURN;
END IF;

IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
END IF;
END $$;

@@ -2,12 +2,12 @@ DO $$
DECLARE
monitor record;
BEGIN
SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
admin_option AS admin
INTO monitor
FROM pg_auth_members
WHERE roleid = 'pg_monitor'::regrole
AND member = 'neon_superuser'::regrole;
FROM pg_catalog.pg_auth_members
WHERE roleid = 'pg_monitor'::pg_catalog.regrole
AND member = 'neon_superuser'::pg_catalog.regrole;

IF monitor IS NULL THEN
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';

@@ -2,11 +2,11 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute'))
SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute'))
INTO can_execute
FROM pg_proc
FROM pg_catalog.pg_proc
WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
AND pronamespace = 'pg_catalog'::regnamespace;
AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace;
IF NOT can_execute THEN
RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
END IF;

@@ -2,9 +2,9 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT has_function_privilege('neon_superuser', oid, 'execute')
SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')
INTO can_execute
FROM pg_proc
FROM pg_catalog.pg_proc
WHERE proname = 'pg_show_replication_origin_status'
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN

@@ -2,10 +2,10 @@ DO $$
DECLARE
signal_backend record;
BEGIN
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
admin_option AS admin
INTO signal_backend
FROM pg_auth_members
FROM pg_catalog.pg_auth_members
WHERE roleid = 'pg_signal_backend'::regrole
AND member = 'neon_superuser'::regrole;

@@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
// like `postgres_exporter` use it to query Postgres statistics.
// Use explicit 8 bytes type casts to match Rust types.
let stats = cli.query_one(
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
coalesce(sum(sessions), 0)::bigint AS total_sessions
FROM pg_stat_database
"SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time,
pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions
FROM pg_catalog.pg_stat_database
WHERE datname NOT IN (
'postgres',
'template0',

@@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
let mut last_active: Option<DateTime<Utc>> = None;
// Get all running client backends except ourself, use RFC3339 DateTime format.
let backends = cli.query(
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
"SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change
FROM pg_stat_activity
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text
AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid()
AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors?
&[],
);

@@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async(
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
(SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid
FROM
pg_catalog.pg_database;",
&[],

@@ -82,7 +82,7 @@ impl ComputeNode {
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);

drop_subscriptions_done = match
client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await {
client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await {
Ok(result) => !result.is_empty(),
Err(e) =>
{

@@ -1142,7 +1142,9 @@ async fn get_operations<'a>(
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("pg_stat_statements") {
return Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public",
),
comment: Some(String::from("create system extensions")),
})));
}

@@ -1150,11 +1152,13 @@ async fn get_operations<'a>(
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public",
),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.

@@ -1178,7 +1182,7 @@ async fn get_operations<'a>(
},
Operation {
query: String::from(
"UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
"UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false",
),
comment: Some(String::from("compat/fix: make neon relocatable")),
},
@@ -3,16 +3,17 @@ BEGIN
|
||||
IF NOT EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_tables
|
||||
WHERE tablename = 'health_check'
|
||||
WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name
|
||||
AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
|
||||
)
|
||||
THEN
|
||||
CREATE TABLE health_check (
|
||||
id serial primary key,
|
||||
updated_at timestamptz default now()
|
||||
CREATE TABLE public.health_check (
|
||||
id pg_catalog.int4 primary key generated by default as identity,
|
||||
updated_at pg_catalog.timestamptz default pg_catalog.now()
|
||||
);
|
||||
INSERT INTO health_check VALUES (1, now())
|
||||
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET updated_at = now();
|
||||
SET updated_at = pg_catalog.now();
|
||||
END IF;
|
||||
END
|
||||
$$
|
||||
@@ -1,12 +0,0 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
query varchar;
|
||||
BEGIN
|
||||
FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
|
||||
FROM pg_proc p
|
||||
JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
|
||||
WHERE nsp.nspname = 'anon' LOOP
|
||||
EXECUTE query;
|
||||
END LOOP;
|
||||
END
|
||||
$$;
|
||||
@@ -1,6 +1,6 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name)
|
||||
THEN
|
||||
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
|
||||
@@ -4,14 +4,14 @@ $$
|
||||
IF EXISTS(
|
||||
SELECT nspname
|
||||
FROM pg_catalog.pg_namespace
|
||||
WHERE nspname = 'public'
|
||||
WHERE nspname OPERATOR(pg_catalog.=) 'public'
|
||||
) AND
|
||||
current_setting('server_version_num')::int / 10000 >= 15
|
||||
pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15
|
||||
THEN
|
||||
IF EXISTS(
|
||||
SELECT rolname
|
||||
FROM pg_catalog.pg_roles
|
||||
WHERE rolname = 'web_access'
|
||||
WHERE rolname OPERATOR(pg_catalog.=) 'web_access'
|
||||
)
|
||||
THEN
|
||||
GRANT CREATE ON SCHEMA public TO web_access;
|
||||
@@ -20,7 +20,7 @@ $$
|
||||
IF EXISTS(
|
||||
SELECT nspname
|
||||
FROM pg_catalog.pg_namespace
|
||||
WHERE nspname = 'public'
|
||||
WHERE nspname OPERATOR(pg_catalog.=) 'public'
|
||||
)
|
||||
THEN
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
|
||||
|
||||
@@ -2,11 +2,17 @@ DO ${outer_tag}$
|
||||
DECLARE
|
||||
subname TEXT;
|
||||
BEGIN
|
||||
LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
|
||||
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
|
||||
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
|
||||
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
|
||||
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
|
||||
LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE;
|
||||
FOR subname IN
|
||||
SELECT pg_subscription.subname
|
||||
FROM pg_catalog.pg_subscription
|
||||
WHERE subdbid OPERATOR(pg_catalog.=) (
|
||||
SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name
|
||||
)
|
||||
LOOP
|
||||
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname);
|
||||
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
|
||||
EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname);
|
||||
END LOOP;
|
||||
END;
|
||||
${outer_tag}$;
|
||||
|
||||
@@ -3,19 +3,19 @@ BEGIN
|
||||
IF NOT EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_tables
|
||||
WHERE tablename = 'drop_subscriptions_done'
|
||||
AND schemaname = 'neon'
|
||||
WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name
|
||||
AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name
|
||||
)
|
||||
THEN
|
||||
CREATE TABLE neon.drop_subscriptions_done
|
||||
(id serial primary key, timeline_id text);
|
||||
(id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text);
|
||||
END IF;
|
||||
|
||||
-- preserve the timeline_id of the last drop_subscriptions run
|
||||
-- to ensure that the cleanup of a timeline is executed only once.
|
||||
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
|
||||
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
|
||||
INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id'))
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET timeline_id = current_setting('neon.timeline_id');
|
||||
SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text;
|
||||
END
|
||||
$$
|
||||
|
||||
@@ -15,15 +15,15 @@ BEGIN
|
||||
WHERE schema_name IN ('public')
|
||||
LOOP
|
||||
FOR grantor IN EXECUTE
|
||||
format(
|
||||
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
|
||||
pg_catalog.format(
|
||||
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s',
|
||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
quote_literal({role_name})
|
||||
)
|
||||
LOOP
|
||||
EXECUTE format('SET LOCAL ROLE %I', grantor);
|
||||
EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor);
|
||||
|
||||
revoke_query := format(
|
||||
revoke_query := pg_catalog.format(
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
|
||||
schema,
|
||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
|
||||
@@ -5,17 +5,17 @@ DO ${outer_tag}$
|
||||
IF EXISTS(
|
||||
SELECT nspname
|
||||
FROM pg_catalog.pg_namespace
|
||||
WHERE nspname = 'public'
|
||||
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
|
||||
)
|
||||
THEN
|
||||
SELECT nspowner::regrole::text
|
||||
FROM pg_catalog.pg_namespace
|
||||
WHERE nspname = 'public'
|
||||
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text
|
||||
INTO schema_owner;
|
||||
|
||||
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
|
||||
IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text
|
||||
THEN
|
||||
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
|
||||
EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner});
|
||||
END IF;
|
||||
END IF;
|
||||
END
|
||||
|
||||
@@ -3,10 +3,10 @@ DO ${outer_tag}$
|
||||
IF EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname = {datname}
|
||||
WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name
|
||||
)
|
||||
THEN
|
||||
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
|
||||
EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname});
|
||||
END IF;
|
||||
END
|
||||
${outer_tag}$;
|
||||
|
||||
@@ -71,8 +71,9 @@ const DEFAULT_PG_VERSION_NUM: &str = "17";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
/// Neon CLI.
|
||||
#[derive(clap::Parser)]
|
||||
#[command(version = GIT_VERSION, about, name = "Neon CLI")]
|
||||
#[command(version = GIT_VERSION, name = "Neon CLI")]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: NeonLocalCmd,
|
||||
@@ -107,30 +108,31 @@ enum NeonLocalCmd {
|
||||
Stop(StopCmdArgs),
|
||||
}
|
||||
|
||||
/// Initialize a new Neon repository, preparing configs for services to start with.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")]
|
||||
struct InitCmdArgs {
|
||||
#[clap(long, help("How many pageservers to create (default 1)"))]
|
||||
/// How many pageservers to create (default 1).
|
||||
#[clap(long)]
|
||||
num_pageservers: Option<u16>,
|
||||
|
||||
#[clap(long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
#[clap(long, help("Force initialization even if the repository is not empty"))]
|
||||
/// Force initialization even if the repository is not empty.
|
||||
#[clap(long, default_value = "must-not-exist")]
|
||||
#[arg(value_parser)]
|
||||
#[clap(default_value = "must-not-exist")]
|
||||
force: InitForceMode,
|
||||
}
|
||||
|
||||
/// Start pageserver and safekeepers.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start pageserver and safekeepers")]
|
||||
struct StartCmdArgs {
|
||||
#[clap(long = "start-timeout", default_value = "10s")]
|
||||
timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
/// Stop pageserver and safekeepers.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop pageserver and safekeepers")]
|
||||
struct StopCmdArgs {
|
||||
#[arg(value_enum)]
|
||||
#[clap(long, default_value_t = StopMode::Fast)]
|
||||
@@ -143,8 +145,8 @@ enum StopMode {
|
||||
Immediate,
|
||||
}
|
||||
|
||||
/// Manage tenants.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage tenants")]
|
||||
enum TenantCmd {
|
||||
List,
|
||||
Create(TenantCreateCmdArgs),
|
||||
@@ -155,38 +157,36 @@ enum TenantCmd {
|
||||
|
||||
#[derive(clap::Args)]
|
||||
struct TenantCreateCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: Option<TenantId>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Use a specific timeline id when creating a tenant and its initial timeline"
|
||||
)]
|
||||
/// Use a specific timeline id when creating a tenant and its initial timeline.
|
||||
#[clap(long)]
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
#[clap(short = 'c')]
|
||||
config: Vec<String>,
|
||||
|
||||
/// Postgres version to use for the initial timeline.
|
||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||
#[clap(long, help = "Postgres version to use for the initial timeline")]
|
||||
#[clap(long)]
|
||||
pg_version: PgMajorVersion,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified"
|
||||
)]
|
||||
/// Use this tenant in future CLI commands where tenant_id is needed, but not specified.
|
||||
#[clap(long)]
|
||||
set_default: bool,
|
||||
|
||||
#[clap(long, help = "Number of shards in the new tenant")]
|
||||
/// Number of shards in the new tenant.
|
||||
#[clap(long)]
|
||||
#[arg(default_value_t = 0)]
|
||||
shard_count: u8,
|
||||
#[clap(long, help = "Sharding stripe size in pages")]
|
||||
/// Sharding stripe size in pages.
|
||||
#[clap(long)]
|
||||
shard_stripe_size: Option<u32>,
|
||||
|
||||
#[clap(long, help = "Placement policy shards in this tenant")]
|
||||
/// Placement policy shards in this tenant.
|
||||
#[clap(long)]
|
||||
#[arg(value_parser = parse_placement_policy)]
|
||||
placement_policy: Option<PlacementPolicy>,
|
||||
}
|
||||
@@ -195,44 +195,35 @@ fn parse_placement_policy(s: &str) -> anyhow::Result<PlacementPolicy> {
|
||||
Ok(serde_json::from_str::<PlacementPolicy>(s)?)
|
||||
}
|
||||
|
||||
/// Set a particular tenant as default in future CLI commands where tenant_id is needed, but not
|
||||
/// specified.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(
|
||||
about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"
|
||||
)]
|
||||
struct TenantSetDefaultCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: TenantId,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
struct TenantConfigCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: Option<TenantId>,
|
||||
|
||||
#[clap(short = 'c')]
|
||||
config: Vec<String>,
|
||||
}
|
||||
|
||||
/// Import a tenant that is present in remote storage, and create branches for its timelines.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(
|
||||
about = "Import a tenant that is present in remote storage, and create branches for its timelines"
|
||||
)]
|
||||
struct TenantImportCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: TenantId,
|
||||
}
|
||||
|
||||
/// Manage timelines.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage timelines")]
|
||||
enum TimelineCmd {
|
||||
List(TimelineListCmdArgs),
|
||||
Branch(TimelineBranchCmdArgs),
|
||||
@@ -240,98 +231,87 @@ enum TimelineCmd {
|
||||
Import(TimelineImportCmdArgs),
|
||||
}
|
||||
|
||||
/// List all timelines available to this pageserver.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "List all timelines available to this pageserver")]
|
||||
struct TimelineListCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
}
|
||||
|
||||
/// Create a new timeline, branching off from another timeline.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Create a new timeline, branching off from another timeline")]
|
||||
struct TimelineBranchCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: Option<TenantId>,
|
||||
|
||||
#[clap(long, help = "New timeline's ID")]
|
||||
/// New timeline's ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long)]
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
#[clap(long, help = "Human-readable alias for the new timeline")]
|
||||
/// Human-readable alias for the new timeline.
|
||||
#[clap(long)]
|
||||
branch_name: String,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name."
|
||||
)]
|
||||
/// Use last Lsn of another timeline (and its data) as base when creating the new timeline. The
|
||||
/// timeline gets resolved by its branch name.
|
||||
#[clap(long)]
|
||||
ancestor_branch_name: Option<String>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "When using another timeline as base, use a specific Lsn in it instead of the latest one"
|
||||
)]
|
||||
/// When using another timeline as base, use a specific Lsn in it instead of the latest one.
|
||||
#[clap(long)]
|
||||
ancestor_start_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// Create a new blank timeline.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Create a new blank timeline")]
|
||||
struct TimelineCreateCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: Option<TenantId>,
|
||||
|
||||
#[clap(long, help = "New timeline's ID")]
|
||||
/// New timeline's ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long)]
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
#[clap(long, help = "Human-readable alias for the new timeline")]
|
||||
/// Human-readable alias for the new timeline.
|
||||
#[clap(long)]
|
||||
branch_name: String,
|
||||
|
||||
/// Postgres version.
|
||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||
#[clap(long, help = "Postgres version")]
|
||||
#[clap(long)]
|
||||
pg_version: PgMajorVersion,
|
||||
}
|
||||
|
||||
/// Import a timeline from a basebackup directory.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Import timeline from a basebackup directory")]
|
||||
struct TimelineImportCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: Option<TenantId>,
|
||||
|
||||
#[clap(long, help = "New timeline's ID")]
|
||||
/// New timeline's ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long)]
|
||||
timeline_id: TimelineId,
|
||||
|
||||
#[clap(long, help = "Human-readable alias for the new timeline")]
|
||||
/// Human-readable alias for the new timeline.
|
||||
#[clap(long)]
|
||||
branch_name: String,
|
||||
|
||||
#[clap(long, help = "Basebackup tarfile to import")]
|
||||
/// Basebackup tarfile to import.
|
||||
#[clap(long)]
|
||||
base_tarfile: PathBuf,
|
||||
|
||||
#[clap(long, help = "Lsn the basebackup starts at")]
|
||||
/// LSN the basebackup starts at.
|
||||
#[clap(long)]
|
||||
base_lsn: Lsn,
|
||||
|
||||
#[clap(long, help = "Wal to add after base")]
|
||||
/// WAL to add after base.
|
||||
#[clap(long)]
|
||||
wal_tarfile: Option<PathBuf>,
|
||||
|
||||
#[clap(long, help = "Lsn the basebackup ends at")]
|
||||
/// LSN the basebackup ends at.
|
||||
#[clap(long)]
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// Postgres version of the basebackup being imported.
|
||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||
#[clap(long, help = "Postgres version of the backup being imported")]
|
||||
#[clap(long)]
|
||||
pg_version: PgMajorVersion,
|
||||
}
|
||||
|
||||
/// Manage pageservers.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage pageservers")]
|
||||
enum PageserverCmd {
|
||||
Status(PageserverStatusCmdArgs),
|
||||
Start(PageserverStartCmdArgs),
|
||||
@@ -339,223 +319,202 @@ enum PageserverCmd {
|
||||
Restart(PageserverRestartCmdArgs),
|
||||
}
|
||||
|
||||
/// Show status of a local pageserver.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Show status of a local pageserver")]
|
||||
struct PageserverStatusCmdArgs {
|
||||
#[clap(long = "id", help = "pageserver id")]
|
||||
/// Pageserver ID.
|
||||
#[clap(long = "id")]
|
||||
pageserver_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
/// Start local pageserver.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start local pageserver")]
|
||||
struct PageserverStartCmdArgs {
|
||||
#[clap(long = "id", help = "pageserver id")]
|
||||
/// Pageserver ID.
|
||||
#[clap(long = "id")]
|
||||
pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long)]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
/// Stop local pageserver.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop local pageserver")]
|
||||
struct PageserverStopCmdArgs {
|
||||
#[clap(long = "id", help = "pageserver id")]
|
||||
/// Pageserver ID.
|
||||
#[clap(long = "id")]
|
||||
pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
/// If 'immediate', don't flush repository data at shutdown
|
||||
#[clap(short = 'm')]
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
stop_mode: StopMode,
|
||||
}
|
||||
|
||||
/// Restart local pageserver.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Restart local pageserver")]
|
||||
struct PageserverRestartCmdArgs {
|
||||
#[clap(long = "id", help = "pageserver id")]
|
||||
/// Pageserver ID.
|
||||
#[clap(long = "id")]
|
||||
pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long)]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
/// Manage storage controller.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage storage controller")]
|
||||
enum StorageControllerCmd {
|
||||
Start(StorageControllerStartCmdArgs),
|
||||
Stop(StorageControllerStopCmdArgs),
|
||||
}
|
||||
|
||||
/// Start storage controller.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start storage controller")]
|
||||
struct StorageControllerStartCmdArgs {
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long)]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Identifier used to distinguish storage controller instances"
|
||||
)]
|
||||
/// Identifier used to distinguish storage controller instances.
|
||||
#[clap(long)]
|
||||
#[arg(default_value_t = 1)]
|
||||
instance_id: u8,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
|
||||
)]
|
||||
/// Base port for the storage controller instance identified by instance-id (defaults to
|
||||
/// pageserver cplane api).
|
||||
#[clap(long)]
|
||||
base_port: Option<u16>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Whether the storage controller should handle pageserver-reported local disk loss events."
|
||||
)]
|
||||
/// Whether the storage controller should handle pageserver-reported local disk loss events.
|
||||
#[clap(long)]
|
||||
handle_ps_local_disk_loss: Option<bool>,
|
||||
}
|
||||
|
||||
/// Stop storage controller.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop storage controller")]
|
||||
struct StorageControllerStopCmdArgs {
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
/// If 'immediate', don't flush repository data at shutdown
|
||||
#[clap(short = 'm')]
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
stop_mode: StopMode,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Identifier used to distinguish storage controller instances"
|
||||
)]
|
||||
/// Identifier used to distinguish storage controller instances.
|
||||
#[clap(long)]
|
||||
#[arg(default_value_t = 1)]
|
||||
instance_id: u8,
|
||||
}
|
||||
|
||||
/// Manage storage broker.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage storage broker")]
|
||||
enum StorageBrokerCmd {
|
||||
Start(StorageBrokerStartCmdArgs),
|
||||
Stop(StorageBrokerStopCmdArgs),
|
||||
}
|
||||
|
||||
/// Start broker.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start broker")]
|
||||
struct StorageBrokerStartCmdArgs {
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "10s")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long, default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
/// Stop broker.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "stop broker")]
|
||||
struct StorageBrokerStopCmdArgs {
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
/// If 'immediate', don't flush repository data on shutdown.
|
||||
#[clap(short = 'm')]
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
stop_mode: StopMode,
|
||||
}
|
||||
|
||||
/// Manage safekeepers.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage safekeepers")]
|
||||
enum SafekeeperCmd {
|
||||
Start(SafekeeperStartCmdArgs),
|
||||
Stop(SafekeeperStopCmdArgs),
|
||||
Restart(SafekeeperRestartCmdArgs),
|
||||
}
|
||||
|
||||
/// Manage object storage.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage object storage")]
|
||||
enum EndpointStorageCmd {
|
||||
Start(EndpointStorageStartCmd),
|
||||
Stop(EndpointStorageStopCmd),
|
||||
}
|
||||
|
||||
/// Start object storage.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start object storage")]
|
||||
struct EndpointStorageStartCmd {
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long)]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
/// Stop object storage.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop object storage")]
|
||||
struct EndpointStorageStopCmd {
|
||||
/// If 'immediate', don't flush repository data on shutdown.
|
||||
#[clap(short = 'm')]
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
stop_mode: StopMode,
|
||||
}
|
||||
|
||||
/// Start local safekeeper.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start local safekeeper")]
|
||||
struct SafekeeperStartCmdArgs {
|
||||
#[clap(help = "safekeeper id")]
|
||||
/// Safekeeper ID.
|
||||
#[arg(default_value_t = NodeId(1))]
|
||||
id: NodeId,
|
||||
|
||||
#[clap(
|
||||
short = 'e',
|
||||
long = "safekeeper-extra-opt",
|
||||
help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
|
||||
)]
|
||||
/// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
|
||||
#[clap(short = 'e', long = "safekeeper-extra-opt")]
|
||||
extra_opt: Vec<String>,
|
||||
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long)]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
/// Stop local safekeeper.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop local safekeeper")]
|
||||
struct SafekeeperStopCmdArgs {
|
||||
#[clap(help = "safekeeper id")]
|
||||
/// Safekeeper ID.
|
||||
#[arg(default_value_t = NodeId(1))]
|
||||
id: NodeId,
|
||||
|
||||
/// If 'immediate', don't flush repository data on shutdown.
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
#[clap(short = 'm')]
|
||||
stop_mode: StopMode,
|
||||
}
|
||||
|
||||
/// Restart local safekeeper.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Restart local safekeeper")]
|
||||
struct SafekeeperRestartCmdArgs {
|
||||
#[clap(help = "safekeeper id")]
|
||||
/// Safekeeper ID.
|
||||
#[arg(default_value_t = NodeId(1))]
|
||||
id: NodeId,
|
||||
|
||||
/// If 'immediate', don't flush repository data on shutdown.
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
#[clap(short = 'm')]
|
||||
stop_mode: StopMode,
|
||||
|
||||
#[clap(
|
||||
short = 'e',
|
||||
long = "safekeeper-extra-opt",
|
||||
help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
|
||||
)]
|
||||
/// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
|
||||
#[clap(short = 'e', long = "safekeeper-extra-opt")]
|
||||
extra_opt: Vec<String>,
|
||||
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long)]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
/// Manage Postgres instances.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage Postgres instances")]
|
||||
enum EndpointCmd {
|
||||
List(EndpointListCmdArgs),
|
||||
Create(EndpointCreateCmdArgs),
|
||||
@@ -567,33 +526,27 @@ enum EndpointCmd {
|
||||
GenerateJwt(EndpointGenerateJwtCmdArgs),
|
||||
}
|
||||
|
||||
/// List endpoints.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "List endpoints")]
|
||||
struct EndpointListCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
}
|
||||
|
||||
/// Create a compute endpoint.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Create a compute endpoint")]
|
||||
struct EndpointCreateCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: Option<TenantId>,
|
||||
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
/// Postgres endpoint ID.
|
||||
endpoint_id: Option<String>,
|
||||
#[clap(long, help = "Name of the branch the endpoint will run on")]
|
||||
/// Name of the branch the endpoint will run on.
|
||||
#[clap(long)]
|
||||
branch_name: Option<String>,
|
||||
#[clap(
|
||||
long,
|
||||
help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used"
|
||||
)]
|
||||
/// Specify LSN on the timeline to start from. By default, end of the timeline would be used.
|
||||
#[clap(long)]
|
||||
lsn: Option<Lsn>,
|
||||
#[clap(long)]
|
||||
pg_port: Option<u16>,
|
||||
@@ -604,16 +557,13 @@ struct EndpointCreateCmdArgs {
|
||||
#[clap(long = "pageserver-id")]
|
||||
endpoint_pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Don't do basebackup, create endpoint directory with only config files",
|
||||
action = clap::ArgAction::Set,
|
||||
default_value_t = false
|
||||
)]
|
||||
/// Don't do basebackup, create endpoint directory with only config files.
|
||||
#[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
|
||||
config_only: bool,
|
||||
|
||||
/// Postgres version.
|
||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||
#[clap(long, help = "Postgres version")]
|
||||
#[clap(long)]
|
||||
pg_version: PgMajorVersion,
|
||||
|
||||
/// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
|
||||
@@ -624,170 +574,140 @@ struct EndpointCreateCmdArgs {
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "If set, the node will be a hot replica on the specified timeline",
|
||||
action = clap::ArgAction::Set,
|
||||
default_value_t = false
|
||||
)]
|
||||
/// If set, the node will be a hot replica on the specified timeline.
|
||||
#[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
|
||||
hot_standby: bool,
|
||||
|
||||
#[clap(long, help = "If set, will set up the catalog for neon_superuser")]
|
||||
/// If set, will set up the catalog for neon_superuser.
|
||||
#[clap(long)]
|
||||
update_catalog: bool,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
|
||||
)]
|
||||
/// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
|
||||
/// useful for tests.
|
||||
#[clap(long)]
|
||||
allow_multiple: bool,
|
||||
|
||||
/// Only allow changing it on creation
|
||||
#[clap(long, help = "Name of the privileged role for the endpoint")]
|
||||
/// Name of the privileged role for the endpoint.
|
||||
// Only allow changing it on creation.
|
||||
#[clap(long)]
|
||||
privileged_role_name: Option<String>,
|
||||
}
|
||||
|
||||
/// Start Postgres. If the endpoint doesn't exist yet, it is created.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")]
|
||||
struct EndpointStartCmdArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
/// Postgres endpoint ID.
|
||||
endpoint_id: String,
|
||||
/// Pageserver ID.
|
||||
#[clap(long = "pageserver-id")]
|
||||
endpoint_pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
|
||||
)]
|
||||
/// Safekeepers membership generation to prefix neon.safekeepers with.
|
||||
#[clap(long)]
|
||||
safekeepers_generation: Option<u32>,
|
||||
#[clap(
|
||||
long,
|
||||
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
|
||||
)]
|
||||
/// List of safekeepers endpoint will talk to.
|
||||
#[clap(long)]
|
||||
safekeepers: Option<String>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Configure the remote extensions storage proxy gateway URL to request for extensions.",
|
||||
alias = "remote-ext-config"
|
||||
)]
|
||||
/// Configure the remote extensions storage proxy gateway URL to request for extensions.
|
||||
#[clap(long, alias = "remote-ext-config")]
|
||||
remote_ext_base_url: Option<String>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`"
|
||||
)]
|
||||
/// If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`
|
||||
#[clap(long)]
|
||||
create_test_user: bool,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
|
||||
)]
|
||||
/// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
|
||||
/// useful for tests.
|
||||
#[clap(long)]
|
||||
allow_multiple: bool,
|
||||
|
||||
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
|
||||
/// Timeout until we fail the command.
|
||||
#[clap(short = 't', long, value_parser= humantime::parse_duration)]
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Download LFC cache from endpoint storage on endpoint startup",
|
||||
default_value = "false"
|
||||
)]
|
||||
/// Download LFC cache from endpoint storage on endpoint startup
|
||||
#[clap(long, default_value = "false")]
|
||||
autoprewarm: bool,
|
||||
|
||||
#[clap(long, help = "Upload LFC cache to endpoint storage periodically")]
|
||||
/// Upload LFC cache to endpoint storage periodically
|
||||
#[clap(long)]
|
||||
offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Run in development mode, skipping VM-specific operations like process termination",
|
||||
action = clap::ArgAction::SetTrue
|
||||
)]
|
||||
/// Run in development mode, skipping VM-specific operations like process termination
|
||||
#[clap(long, action = clap::ArgAction::SetTrue)]
|
||||
dev: bool,
|
||||
}
|
||||
|
||||
/// Reconfigure an endpoint.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Reconfigure an endpoint")]
|
||||
struct EndpointReconfigureCmdArgs {
|
||||
#[clap(
|
||||
long = "tenant-id",
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant id. Represented as a hexadecimal string 32 symbols length
|
||||
#[clap(long = "tenant-id")]
|
||||
tenant_id: Option<TenantId>,
|
||||
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
/// Postgres endpoint ID.
|
||||
endpoint_id: String,
|
||||
/// Pageserver ID.
|
||||
#[clap(long = "pageserver-id")]
|
||||
endpoint_pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(long)]
|
||||
safekeepers: Option<String>,
|
||||
}
|
||||
|
||||
/// Refresh the endpoint's configuration by forcing it to reload its spec
#[derive(clap::Args)]
#[clap(about = "Refresh the endpoint's configuration by forcing it to reload its spec")]
|
||||
struct EndpointRefreshConfigurationArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
/// Postgres endpoint id
|
||||
endpoint_id: String,
|
||||
}
|
||||
|
||||
/// Stop an endpoint.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop an endpoint")]
|
||||
struct EndpointStopCmdArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
/// Postgres endpoint ID.
|
||||
endpoint_id: String,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Also delete data directory (now optional, should be default in future)"
|
||||
)]
|
||||
/// Also delete data directory (now optional, should be default in future).
|
||||
#[clap(long)]
|
||||
destroy: bool,
|
||||
|
||||
#[clap(long, help = "Postgres shutdown mode")]
|
||||
/// Postgres shutdown mode, passed to `pg_ctl -m <mode>`.
|
||||
#[clap(long)]
|
||||
#[clap(default_value = "fast")]
|
||||
mode: EndpointTerminateMode,
|
||||
}
|
||||
|
||||
/// Update the pageservers in the spec file of the compute endpoint
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Update the pageservers in the spec file of the compute endpoint")]
|
||||
struct EndpointUpdatePageserversCmdArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
/// Postgres endpoint id
|
||||
endpoint_id: String,
|
||||
|
||||
#[clap(short = 'p', long, help = "Specified pageserver id")]
|
||||
/// Specified pageserver id
|
||||
#[clap(short = 'p', long)]
|
||||
pageserver_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
/// Generate a JWT for an endpoint.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Generate a JWT for an endpoint")]
|
||||
struct EndpointGenerateJwtCmdArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
/// Postgres endpoint ID.
|
||||
endpoint_id: String,
|
||||
|
||||
#[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)]
|
||||
/// Scope to generate the JWT with.
|
||||
#[clap(short = 's', long, value_parser = ComputeClaimsScope::from_str)]
|
||||
scope: Option<ComputeClaimsScope>,
|
||||
}
|
||||
|
||||
/// Manage neon_local branch name mappings.
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage neon_local branch name mappings")]
|
||||
enum MappingsCmd {
|
||||
Map(MappingsMapCmdArgs),
|
||||
}
|
||||
|
||||
/// Create new mapping which cannot exist already.
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Create new mapping which cannot exist already")]
|
||||
struct MappingsMapCmdArgs {
|
||||
#[clap(
|
||||
long,
|
||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long)]
|
||||
tenant_id: TenantId,
|
||||
#[clap(
|
||||
long,
|
||||
help = "Timeline id. Represented as a hexadecimal string 32 symbols length"
|
||||
)]
|
||||
/// Timeline ID, as a 32-byte hexadecimal string.
|
||||
#[clap(long)]
|
||||
timeline_id: TimelineId,
|
||||
#[clap(long, help = "Branch name to give to the timeline")]
|
||||
/// Branch name to give to the timeline.
|
||||
#[clap(long)]
|
||||
branch_name: String,
|
||||
}
|
||||
|
||||
|
||||
@@ -303,6 +303,13 @@ enum Command {
|
||||
#[arg(long, required = true, value_delimiter = ',')]
|
||||
new_sk_set: Vec<NodeId>,
|
||||
},
|
||||
/// Abort ongoing safekeeper migration.
|
||||
TimelineSafekeeperMigrateAbort {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
timeline_id: TimelineId,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -1396,6 +1403,17 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TimelineSafekeeperMigrateAbort {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} => {
|
||||
let path =
|
||||
format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort");
|
||||
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::POST, path, None)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
docs/rfcs/2025-07-07-node-deletion-api-improvement.md (new file, 246 lines)
@@ -0,0 +1,246 @@
|
||||
# Node deletion API improvement
|
||||
|
||||
Created on 2025-07-07
|
||||
Implemented on _TBD_
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes improvements to the storage controller API for gracefully deleting pageserver
|
||||
nodes.
|
||||
|
||||
## Motivation
|
||||
|
||||
The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333)
|
||||
has several limitations:
|
||||
|
||||
- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and
  that we cannot reach via SSH to stop the pageserver). This issue has been resolved by the
  tombstone mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036)
- The node deletion process is not graceful, i.e. it just imitates a node failure
|
||||
|
||||
In this context, "graceful" node deletion means that users do not experience any disruption or
|
||||
negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers
|
||||
can handle the workload and all requirements are met). To achieve this, the system must perform
|
||||
live migration of all tenant shards from the node being deleted while the node is still running
|
||||
and continue processing all incoming requests. The node is removed only after all tenant shards
|
||||
have been safely migrated.
|
||||
|
||||
Although live migrations can be achieved with the drain functionality, it leads to incorrect shard
|
||||
placement, such as not matching availability zones. This results in unnecessary work to optimize
|
||||
the placement that was just recently performed.
|
||||
|
||||
If we delete a node before its tenant shards have been fully moved, the new node will not have all
the data it needs (e.g. heatmaps) ready. User requests served by the new node will therefore be
much slower at first, and with many tenant shards this slowdown affects a large number of users.
|
||||
|
||||
Graceful node deletion is more complicated and can introduce new issues. It takes longer, because
live migration of each tenant shard can last several minutes. Using non-blocking accessors may
also cause the deletion to wait while other processes hold the inner state lock. It also gets
trickier because we need to handle other requests, such as drain and fill, at the same time.
|
||||
|
||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
||||
|
||||
- storage controller
|
||||
- pageserver (indirectly)
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
### Tombstones
|
||||
|
||||
To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced
as part of the node's stored information. Each node has a separate `NodeLifecycle` field with two
|
||||
possible states: `Active` and `Deleted`. When node deletion completes, the database row is not
|
||||
deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted`
|
||||
lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach
|
||||
and register functionality must be aware of tombstones. Additionally, new debug handlers are
|
||||
available for listing and deleting tombstones via the `/debug/v1/tombstone` path.
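
To make the filtering concrete, here is a minimal sketch of how tombstoned rows could be hidden from ordinary listings while remaining visible to the `/debug/v1/tombstone` handlers. The `NodeRow` struct, its field names, and the string-valued lifecycle column are assumptions made for the example, not the storage controller's actual schema.

```rust
/// Hypothetical shape of a node row as read from the storage controller
/// database; only the fields relevant to tombstoning are shown.
struct NodeRow {
    node_id: u64,
    lifecycle: String, // "active" or "deleted" in this sketch
}

/// Most handlers behave as if tombstoned rows do not exist.
fn visible_nodes(rows: &[NodeRow]) -> Vec<&NodeRow> {
    rows.iter().filter(|r| r.lifecycle != "deleted").collect()
}

/// The `/debug/v1/tombstone` listing is the inverse: it returns exactly the
/// rows that ordinary handlers hide, so operators can inspect or purge them.
fn tombstoned_nodes(rows: &[NodeRow]) -> Vec<&NodeRow> {
    rows.iter().filter(|r| r.lifecycle == "deleted").collect()
}
```

In the actual code the same effect is presumably achieved with the `NodeLifecycle` column and an enum rather than strings; the point is only that deletion flips a flag instead of removing the row.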
|
||||
|
||||
### Gracefulness
|
||||
|
||||
The problem of making node deletion graceful is complex and involves several challenges:
|
||||
|
||||
- **Cancellable**: The operation must be cancellable to allow administrators to abort the process
|
||||
if needed, e.g. if run by mistake.
|
||||
- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node
|
||||
deletion process. We need clear policies for handling concurrent operations: what happens when a
|
||||
drain/fill request arrives while deletion is in progress, and what happens when a delete request
|
||||
arrives while drain/fill is in progress.
|
||||
- **Persistent**: If the storage controller restarts during this long-running operation, we must
|
||||
preserve progress and automatically resume the deletion process after the storage controller
|
||||
restarts.
|
||||
- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled
|
||||
for deletion, as this would move shards to irrelevant locations. The drain process expects the
|
||||
node to return, so it only moves shards to backup locations, not to their preferred AZs. It also
|
||||
leaves secondary locations unmoved. This could result in unnecessary load on the storage
|
||||
controller and inefficient resource utilization.
|
||||
- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when
|
||||
time constraints or emergency situations require it, bypassing the normal graceful migration
|
||||
process.
|
||||
|
||||
See below for a detailed breakdown of the proposed changes and mechanisms.
|
||||
|
||||
#### Node lifecycle
|
||||
|
||||
New `NodeLifecycle` enum and a matching database field with these values:
|
||||
- `Active`: The normal state. All operations are allowed.
|
||||
- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or
|
||||
will happen later, but the node will eventually be removed. All operations are allowed.
|
||||
- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought
|
||||
back. The only action left is to remove its record from the database. Any attempt to register a
|
||||
node in this state will fail.
|
||||
|
||||
This state persists across storage controller restarts.
|
||||
|
||||
**State transition**
|
||||
```
|
||||
+--------------------+
|
||||
+---| Active |<---------------------+
|
||||
| +--------------------+ |
|
||||
| ^ |
|
||||
| start_node_delete | cancel_node_delete |
|
||||
v | |
|
||||
+----------------------------------+ |
|
||||
| ScheduledForDeletion | |
|
||||
+----------------------------------+ |
|
||||
| |
|
||||
| node_register |
|
||||
| |
|
||||
| delete_node (at the finish) |
|
||||
| |
|
||||
v |
|
||||
+---------+ tombstone_delete +----------+
|
||||
| Deleted |-------------------------------->| no row |
|
||||
+---------+ +----------+
|
||||
```
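
As a reading aid, a small Rust sketch of the per-row state machine above. The enum matches the RFC's variant names, while the event names and the `apply` function are illustrative only; `tombstone_delete` (dropping the row) and registration of a brand-new node are outside this per-row state machine, so they are not encoded here.

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum NodeLifecycle {
    Active,
    ScheduledForDeletion,
    Deleted,
}

#[derive(Clone, Copy, Debug)]
enum LifecycleEvent {
    StartNodeDelete,
    CancelNodeDelete,
    /// `delete_node` finished migrating everything off the node.
    DeleteFinished,
}

/// Allowed transitions from the diagram above; anything else is rejected.
/// In particular there is no edge out of `Deleted` other than removing the
/// database row entirely (`tombstone_delete`).
fn apply(state: NodeLifecycle, event: LifecycleEvent) -> Option<NodeLifecycle> {
    use LifecycleEvent::*;
    use NodeLifecycle::*;
    match (state, event) {
        (Active, StartNodeDelete) => Some(ScheduledForDeletion),
        (ScheduledForDeletion, CancelNodeDelete) => Some(Active),
        (ScheduledForDeletion, DeleteFinished) => Some(Deleted),
        _ => None,
    }
}
```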
|
||||
|
||||
#### NodeSchedulingPolicy::Deleting
|
||||
|
||||
A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is
|
||||
running for the node right now. Only one node can have the `Deleting` policy at a time.
|
||||
|
||||
The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage
|
||||
controller restart, any node previously marked as `Deleting` will have its scheduling policy reset
|
||||
to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is
|
||||
actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state.
|
||||
|
||||
`NodeSchedulingPolicy` transition details:
|
||||
1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`.
|
||||
2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the
|
||||
policy to its previous value. The policy is persisted in storcon DB.
|
||||
3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since
|
||||
`NodeLifecycle::Deleted` prevents any further access to this field.
|
||||
|
||||
The deletion process cannot be initiated for nodes currently undergoing deployment-related
|
||||
operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered
|
||||
once the node transitions to either the `Active` or `Pause` state.
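
A sketch of the two policy rules described in this section: restart handling and the precondition for starting a deletion. The variant list mirrors the policies named in the RFC; treat the exact set and the helper names as assumptions.

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum NodeSchedulingPolicy {
    Active,
    Pause,
    PauseForRestart,
    Draining,
    Filling,
    Deleting,
}

/// Deletion only starts from a quiescent scheduling state; nodes in the
/// middle of deploy-related operations are left alone until they settle.
fn can_start_deletion(policy: NodeSchedulingPolicy) -> bool {
    matches!(
        policy,
        NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause
    )
}

/// After a storage controller restart, a persisted `Deleting` policy is
/// reset to `Pause`; the deletion task re-acquires `Deleting` when it is
/// actually resumed.
fn policy_after_restart(persisted: NodeSchedulingPolicy) -> NodeSchedulingPolicy {
    match persisted {
        NodeSchedulingPolicy::Deleting => NodeSchedulingPolicy::Pause,
        other => other,
    }
}
```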
|
||||
|
||||
#### OperationTracker
|
||||
|
||||
A replacement for `Option<OperationHandler> ongoing_operation`, the `OperationTracker` is a
|
||||
dedicated service state object responsible for managing all long-running node operations (drain,
|
||||
fill, delete) with robust concurrency control.
|
||||
|
||||
Key responsibilities:
|
||||
- Orchestrates the execution of operations
|
||||
- Supports cancellation of currently running operations
|
||||
- Enforces operation constraints, e.g. allowing only single drain/fill operation at a time
|
||||
- Persists deletion state, enabling recovery of pending deletions across restarts
|
||||
- Ensures thread safety across concurrent requests
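
For orientation, a minimal stand-in for the proposed `OperationTracker`; the real object would also persist deletion state and integrate with cancellation, which this sketch leaves out. Names and signatures are assumptions.

```rust
use std::sync::Mutex;

/// The long-running background operations the tracker can own.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Operation {
    Drain(u64),
    Fill(u64),
    Delete(u64),
}

/// At most one drain/fill/delete runs at a time; callers either claim the
/// slot or learn which operation is already running.
struct OperationTracker {
    active: Mutex<Option<Operation>>,
}

impl OperationTracker {
    fn new() -> Self {
        Self { active: Mutex::new(None) }
    }

    /// Try to start an operation; on conflict, report the current one so the
    /// caller can decide whether to reject (drain vs. drain) or pre-empt
    /// (drain vs. delete) it.
    fn try_start(&self, op: Operation) -> Result<(), Operation> {
        let mut slot = self.active.lock().unwrap();
        if let Some(current) = *slot {
            return Err(current);
        }
        *slot = Some(op);
        Ok(())
    }

    /// Mark the current operation as finished or cancelled, freeing the slot.
    fn finish(&self) {
        *self.active.lock().unwrap() = None;
    }
}
```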
|
||||
|
||||
#### Attached tenant shard processing
|
||||
|
||||
When deleting a node, handle each attached tenant shard as follows:
|
||||
|
||||
1. Pick the best node to become the new attached (the candidate).
|
||||
2. If the candidate already has this shard as a secondary:
|
||||
- Create a new secondary for the shard on another suitable node.
|
||||
Otherwise:
|
||||
- Create a secondary for the shard on the candidate node.
|
||||
3. Wait until all secondaries are ready and pre-warmed.
|
||||
4. Promote the candidate's secondary to attached.
|
||||
5. Remove the secondary from the node being deleted.
|
||||
|
||||
This process safely moves all attached shards before deleting the node.
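
The same steps, written out as a control-flow sketch. The `ShardMigration` trait is a placeholder for the storage controller's existing reconciliation and secondary-warming machinery; all names and signatures here are invented for illustration. The secondary-shard procedure in the next subsection follows the same pattern, minus the promotion step.

```rust
/// Placeholder for the storage controller's migration machinery; the trait
/// exists only to make the ordering of the steps explicit.
trait ShardMigration {
    fn pick_attach_candidate(&self, shard: u32, deleting_node: u64) -> u64;
    fn pick_secondary_candidate(&self, shard: u32, avoid: u64) -> u64;
    fn has_secondary_on(&self, shard: u32, node: u64) -> bool;
    fn create_secondary(&mut self, shard: u32, node: u64);
    fn wait_secondaries_ready(&self, shard: u32);
    fn promote_to_attached(&mut self, shard: u32, node: u64);
    fn remove_location(&mut self, shard: u32, node: u64);
}

/// Steps 1-5 above for one attached shard living on the node being deleted.
fn migrate_attached_shard(m: &mut impl ShardMigration, shard: u32, deleting_node: u64) {
    // 1. Pick the node that should end up attached.
    let candidate = m.pick_attach_candidate(shard, deleting_node);

    // 2. Make sure the shard keeps a secondary after the cutover: if the
    //    candidate already holds one, place a replacement secondary on some
    //    other suitable node; otherwise create the secondary on the candidate.
    if m.has_secondary_on(shard, candidate) {
        let other = m.pick_secondary_candidate(shard, candidate);
        m.create_secondary(shard, other);
    } else {
        m.create_secondary(shard, candidate);
    }

    // 3. Wait until the secondaries are resident and pre-warmed.
    m.wait_secondaries_ready(shard);

    // 4. Cut over: the candidate's secondary becomes the attached location.
    m.promote_to_attached(shard, candidate);

    // 5. Finally drop the location on the node being deleted.
    m.remove_location(shard, deleting_node);
}
```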
|
||||
|
||||
#### Secondary tenant shard processing
|
||||
|
||||
When deleting a node, handle each secondary tenant shard as follows:
|
||||
|
||||
1. Choose the best node to become the new secondary.
|
||||
2. Create a secondary for the shard on that node.
|
||||
3. Wait until the new secondary is ready.
|
||||
4. Remove the secondary from the node being deleted.
|
||||
|
||||
This ensures all secondary shards are safely moved before deleting the node.
|
||||
|
||||
### Reliability, failure modes and corner cases
|
||||
|
||||
In case of a storage controller failure and following restart, the system behavior depends on the
|
||||
`NodeLifecycle` state:
|
||||
|
||||
- If `NodeLifecycle` is `Active`: No action is taken for this node.
|
||||
- If `NodeLifecycle` is `Deleted`: The node will not be re-added.
|
||||
- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for
|
||||
this node.
|
||||
|
||||
In case of a pageserver node failure during deletion, the behavior depends on the `force` flag:
|
||||
- If `force` is set: The node deletion will proceed regardless of the node's availability.
|
||||
- If `force` is not set: The deletion will be retried a limited number of times. If the node
|
||||
remains unavailable, the deletion process will pause and automatically resume when the node
|
||||
becomes healthy again.
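
A sketch of the failure policy for a single unreachable pageserver during deletion, under the assumption that "a limited number of times" is a configurable retry budget; the names are illustrative.

```rust
/// What the deletion task does next after failing to reach the node.
#[derive(Debug, PartialEq, Eq)]
enum NextStep {
    /// `force` deletions ignore node availability entirely.
    ProceedAnyway,
    /// Retry the failed call against the node.
    Retry,
    /// Park the deletion; it resumes when the node is reported healthy again.
    WaitUntilHealthy,
}

fn on_node_unreachable(force: bool, attempts: u32, max_attempts: u32) -> NextStep {
    if force {
        NextStep::ProceedAnyway
    } else if attempts < max_attempts {
        NextStep::Retry
    } else {
        NextStep::WaitUntilHealthy
    }
}
```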
|
||||
|
||||
### Operations concurrency
|
||||
|
||||
The following sections describe the behavior when different types of requests arrive at the storage
|
||||
controller and how they interact with ongoing operations.
|
||||
|
||||
#### Delete request
|
||||
|
||||
Handler: `PUT /control/v1/node/:node_id/delete`
|
||||
|
||||
1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`:
|
||||
- Return `200 OK`: there is already an ongoing deletion request for this node
|
||||
2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion`
|
||||
3. Persist current scheduling policy
|
||||
4. If there is no active operation (drain/fill/delete):
|
||||
- Run deletion process for this node
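
The same steps as straight-line code. `DeleteApi` stands in for the storage controller's service state; the method names and the returned status codes are assumptions made to keep the sketch self-contained.

```rust
/// Placeholder for the pieces of service state the handler touches.
trait DeleteApi {
    fn is_scheduled_for_deletion(&self, node: u64) -> bool;
    fn persist_scheduled_for_deletion(&mut self, node: u64);
    fn persist_current_scheduling_policy(&mut self, node: u64);
    fn has_active_operation(&self) -> bool;
    fn spawn_delete_task(&mut self, node: u64);
}

/// `PUT /control/v1/node/:node_id/delete`, following steps 1-4 above.
/// Returns the HTTP status code the handler would answer with.
fn handle_node_delete(api: &mut impl DeleteApi, node: u64) -> u16 {
    // 1. Idempotent: a repeated delete request for the same node is a no-op.
    if api.is_scheduled_for_deletion(node) {
        return 200;
    }
    // 2.-3. Record the intent and the pre-deletion scheduling policy durably
    //       first, so a storage controller restart can resume (or cleanly
    //       cancel) the deletion later.
    api.persist_scheduled_for_deletion(node);
    api.persist_current_scheduling_policy(node);
    // 4. Only kick off the background task if no drain/fill/delete is already
    //    running; otherwise the deletion is picked up once that operation ends.
    if !api.has_active_operation() {
        api.spawn_delete_task(node);
    }
    200
}
```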
|
||||
|
||||
#### Cancel delete request
|
||||
|
||||
Handler: `DELETE /control/v1/node/:node_id/delete`
|
||||
|
||||
1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`:
|
||||
- Return `404 Not Found`: there is no current deletion request for this node
|
||||
2. If the active operation is deleting this node, cancel it
|
||||
3. Update & persist lifecycle to `NodeLifecycle::Active`
|
||||
4. Restore the last scheduling policy from persistence
|
||||
|
||||
#### Drain/fill request
|
||||
|
||||
1. If there are already ongoing drain/fill processes:
|
||||
- Return `409 Conflict`: queueing of drain/fill processes is not supported
|
||||
2. If there is an ongoing delete process:
|
||||
- Cancel it and wait until it is cancelled
|
||||
3. Run the drain/fill process
|
||||
4. After the drain/fill process is cancelled or finished:
|
||||
- Try to find another candidate to delete and run the deletion process for that node
|
||||
|
||||
#### Drain/fill cancel request
|
||||
|
||||
1. If the active operation is not the related process:
|
||||
- Return `400 Bad Request`: cancellation request is incorrect, operations are not the same
|
||||
2. Cancel the active operation
|
||||
3. Try to find another candidate to delete and run the deletion process for that node
|
||||
|
||||
## Definition of Done
|
||||
|
||||
- [x] Fix flaky node scenario and introduce related debug handlers
|
||||
- [ ] Node deletion intent is persistent - a node will eventually be deleted after a deletion
  request, regardless of draining/filling requests and restarts
|
||||
- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to
|
||||
recommended locations
|
||||
- [ ] Deploying does not break due to long deletions - drain/fill operations override deletion
|
||||
process and deletion resumes after drain/fill completes
|
||||
- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a
|
||||
pageserver node does not respond)
|
||||
- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli
|
||||
@@ -1,10 +1,9 @@
|
||||
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
|
||||
|
||||
use std::fmt::Display;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use jsonwebtoken::jwk::JwkSet;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use std::fmt::Display;
|
||||
|
||||
use crate::privilege::Privilege;
|
||||
use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role};
|
||||
@@ -49,7 +48,7 @@ pub struct ExtensionInstallResponse {
|
||||
/// Status of the LFC prewarm process. The same state machine is reused for
|
||||
/// both autoprewarm (prewarm after compute/Postgres start using the previously
|
||||
/// stored LFC state) and explicit prewarming via API.
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcPrewarmState {
|
||||
/// Default value when compute boots up.
|
||||
@@ -59,7 +58,14 @@ pub enum LfcPrewarmState {
|
||||
Prewarming,
|
||||
/// We found requested LFC state in the endpoint storage and
|
||||
/// completed prewarming successfully.
|
||||
Completed,
|
||||
Completed {
|
||||
total: i32,
|
||||
prewarmed: i32,
|
||||
skipped: i32,
|
||||
state_download_time_ms: u32,
|
||||
uncompress_time_ms: u32,
|
||||
prewarm_time_ms: u32,
|
||||
},
|
||||
/// Unexpected error happened during prewarming. Note, `Not Found 404`
|
||||
/// response from the endpoint storage is explicitly excluded here
|
||||
/// because it can normally happen on the first compute start,
|
||||
@@ -68,11 +74,15 @@ pub enum LfcPrewarmState {
|
||||
/// We tried to fetch the corresponding LFC state from the endpoint storage,
|
||||
/// but received `Not Found 404`. This should normally happen only during the
|
||||
/// first endpoint start after creation with `autoprewarm: true`.
|
||||
/// This may also happen if LFC is turned off or not initialized
|
||||
///
|
||||
/// During the orchestrated prewarm via API, when a caller explicitly
|
||||
/// provides the LFC state key to prewarm from, it's the caller responsibility
|
||||
/// to handle this status as an error state in this case.
|
||||
Skipped,
|
||||
/// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query
|
||||
/// has started working before cancellation
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl Display for LfcPrewarmState {
|
||||
@@ -80,32 +90,44 @@ impl Display for LfcPrewarmState {
|
||||
match self {
|
||||
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
|
||||
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
|
||||
LfcPrewarmState::Completed => f.write_str("Completed"),
|
||||
LfcPrewarmState::Completed { .. } => f.write_str("Completed"),
|
||||
LfcPrewarmState::Skipped => f.write_str("Skipped"),
|
||||
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
||||
LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcOffloadState {
|
||||
#[default]
|
||||
NotOffloaded,
|
||||
Offloading,
|
||||
Completed,
|
||||
Completed {
|
||||
state_query_time_ms: u32,
|
||||
compress_time_ms: u32,
|
||||
state_upload_time_ms: u32,
|
||||
},
|
||||
Failed {
|
||||
error: String,
|
||||
},
|
||||
/// LFC state was empty so it wasn't offloaded
|
||||
Skipped,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq)]
|
||||
#[derive(Serialize, Debug, Clone)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
/// Response of /promote
|
||||
pub enum PromoteState {
|
||||
NotPromoted,
|
||||
Completed,
|
||||
Failed { error: String },
|
||||
Completed {
|
||||
lsn_wait_time_ms: u32,
|
||||
pg_promote_time_ms: u32,
|
||||
reconfigure_time_ms: u32,
|
||||
},
|
||||
Failed {
|
||||
error: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Default, Debug)]
|
||||
|
||||
@@ -458,7 +458,7 @@ pub(crate) enum LocalProxyConnError {
|
||||
impl ReportableError for HttpConnError {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
HttpConnError::ConnectError(_) => ErrorKind::Compute,
|
||||
HttpConnError::ConnectError(e) => e.get_error_kind(),
|
||||
HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
|
||||
HttpConnError::PostgresConnectionError(p) => match p.as_db_error() {
|
||||
// user provided a wrong database name
|
||||
|
||||
@@ -5,12 +5,17 @@ use std::sync::Arc;
|
||||
|
||||
use bytes::Bytes;
|
||||
use http::Method;
|
||||
use http::header::{AUTHORIZATION, CONTENT_TYPE, HOST};
|
||||
use http::header::{
|
||||
ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN,
|
||||
ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_REQUEST_HEADERS, ALLOW,
|
||||
AUTHORIZATION, CONTENT_TYPE, HOST, ORIGIN,
|
||||
};
|
||||
use http_body_util::combinators::BoxBody;
|
||||
use http_body_util::{BodyExt, Full};
|
||||
use http_body_util::{BodyExt, Empty, Full};
|
||||
use http_utils::error::ApiError;
|
||||
use hyper::body::Incoming;
|
||||
use hyper::http::{HeaderName, HeaderValue};
|
||||
use hyper::http::response::Builder;
|
||||
use hyper::http::{HeaderMap, HeaderName, HeaderValue};
|
||||
use hyper::{Request, Response, StatusCode};
|
||||
use indexmap::IndexMap;
|
||||
use moka::sync::Cache;
|
||||
@@ -67,6 +72,15 @@ use crate::util::deserialize_json_string;
|
||||
|
||||
static EMPTY_JSON_SCHEMA: &str = r#"{"schemas":[]}"#;
|
||||
const INTROSPECTION_SQL: &str = POSTGRESQL_INTROSPECTION_SQL;
|
||||
const HEADER_VALUE_ALLOW_ALL_ORIGINS: HeaderValue = HeaderValue::from_static("*");
|
||||
// CORS headers values
|
||||
const ACCESS_CONTROL_ALLOW_METHODS_VALUE: HeaderValue =
|
||||
HeaderValue::from_static("GET, POST, PATCH, PUT, DELETE, OPTIONS");
|
||||
const ACCESS_CONTROL_MAX_AGE_VALUE: HeaderValue = HeaderValue::from_static("86400");
|
||||
const ACCESS_CONTROL_EXPOSE_HEADERS_VALUE: HeaderValue = HeaderValue::from_static(
|
||||
"Content-Encoding, Content-Location, Content-Range, Content-Type, Date, Location, Server, Transfer-Encoding, Range-Unit",
|
||||
);
|
||||
const ACCESS_CONTROL_ALLOW_HEADERS_VALUE: HeaderValue = HeaderValue::from_static("Authorization");
|
||||
|
||||
// A wrapper around the DbSchema that allows for self-referencing
|
||||
#[self_referencing]
|
||||
@@ -137,6 +151,8 @@ pub struct ApiConfig {
|
||||
pub role_claim_key: String,
|
||||
#[serde(default, deserialize_with = "deserialize_comma_separated_option")]
|
||||
pub db_extra_search_path: Option<Vec<String>>,
|
||||
#[serde(default, deserialize_with = "deserialize_comma_separated_option")]
|
||||
pub server_cors_allowed_origins: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
// The DbSchemaCache is a cache of the ApiConfig and DbSchemaOwned for each endpoint
|
||||
@@ -165,7 +181,13 @@ impl DbSchemaCache {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_cached_or_remote(
|
||||
pub fn get_cached(
|
||||
&self,
|
||||
endpoint_id: &EndpointCacheKey,
|
||||
) -> Option<Arc<(ApiConfig, DbSchemaOwned)>> {
|
||||
count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id))
|
||||
}
|
||||
pub async fn get_remote(
|
||||
&self,
|
||||
endpoint_id: &EndpointCacheKey,
|
||||
auth_header: &HeaderValue,
|
||||
@@ -174,47 +196,42 @@ impl DbSchemaCache {
|
||||
ctx: &RequestContext,
|
||||
config: &'static ProxyConfig,
|
||||
) -> Result<Arc<(ApiConfig, DbSchemaOwned)>, RestError> {
|
||||
let cache_result = count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id));
|
||||
match cache_result {
|
||||
Some(v) => Ok(v),
|
||||
None => {
|
||||
info!("db_schema cache miss for endpoint: {:?}", endpoint_id);
|
||||
let remote_value = self
|
||||
.get_remote(auth_header, connection_string, client, ctx, config)
|
||||
.await;
|
||||
let (api_config, schema_owned) = match remote_value {
|
||||
Ok((api_config, schema_owned)) => (api_config, schema_owned),
|
||||
Err(e @ RestError::SchemaTooLarge) => {
|
||||
// for the case where the schema is too large, we cache an empty dummy value
|
||||
// all the other requests will fail without triggering the introspection query
|
||||
let schema_owned = serde_json::from_str::<DbSchemaOwned>(EMPTY_JSON_SCHEMA)
|
||||
.map_err(|e| JsonDeserialize { source: e })?;
|
||||
info!("db_schema cache miss for endpoint: {:?}", endpoint_id);
|
||||
let remote_value = self
|
||||
.internal_get_remote(auth_header, connection_string, client, ctx, config)
|
||||
.await;
|
||||
let (api_config, schema_owned) = match remote_value {
|
||||
Ok((api_config, schema_owned)) => (api_config, schema_owned),
|
||||
Err(e @ RestError::SchemaTooLarge) => {
|
||||
// for the case where the schema is too large, we cache an empty dummy value
|
||||
// all the other requests will fail without triggering the introspection query
|
||||
let schema_owned = serde_json::from_str::<DbSchemaOwned>(EMPTY_JSON_SCHEMA)
|
||||
.map_err(|e| JsonDeserialize { source: e })?;
|
||||
|
||||
let api_config = ApiConfig {
|
||||
db_schemas: vec![],
|
||||
db_anon_role: None,
|
||||
db_max_rows: None,
|
||||
db_allowed_select_functions: vec![],
|
||||
role_claim_key: String::new(),
|
||||
db_extra_search_path: None,
|
||||
};
|
||||
let value = Arc::new((api_config, schema_owned));
|
||||
count_cache_insert(CacheKind::Schema);
|
||||
self.0.insert(endpoint_id.clone(), value);
|
||||
return Err(e);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
let api_config = ApiConfig {
|
||||
db_schemas: vec![],
|
||||
db_anon_role: None,
|
||||
db_max_rows: None,
|
||||
db_allowed_select_functions: vec![],
|
||||
role_claim_key: String::new(),
|
||||
db_extra_search_path: None,
|
||||
server_cors_allowed_origins: None,
|
||||
};
|
||||
let value = Arc::new((api_config, schema_owned));
|
||||
count_cache_insert(CacheKind::Schema);
|
||||
self.0.insert(endpoint_id.clone(), value.clone());
|
||||
Ok(value)
|
||||
self.0.insert(endpoint_id.clone(), value);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
let value = Arc::new((api_config, schema_owned));
|
||||
count_cache_insert(CacheKind::Schema);
|
||||
self.0.insert(endpoint_id.clone(), value.clone());
|
||||
Ok(value)
|
||||
}
|
||||
pub async fn get_remote(
|
||||
async fn internal_get_remote(
|
||||
&self,
|
||||
auth_header: &HeaderValue,
|
||||
connection_string: &str,
|
||||
@@ -531,7 +548,7 @@ pub(crate) async fn handle(
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
|
||||
let result = handle_inner(cancel, config, &ctx, request, backend).await;
|
||||
|
||||
let mut response = match result {
|
||||
let response = match result {
|
||||
Ok(r) => {
|
||||
ctx.set_success();
|
||||
|
||||
@@ -640,9 +657,6 @@ pub(crate) async fn handle(
|
||||
}
|
||||
};
|
||||
|
||||
response
|
||||
.headers_mut()
|
||||
.insert("Access-Control-Allow-Origin", HeaderValue::from_static("*"));
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
@@ -722,6 +736,37 @@ async fn handle_inner(
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_common_cors_headers(
|
||||
response: &mut Builder,
|
||||
request_headers: &HeaderMap,
|
||||
allowed_origins: Option<&Vec<String>>,
|
||||
) {
|
||||
let request_origin = request_headers
|
||||
.get(ORIGIN)
|
||||
.map(|v| v.to_str().unwrap_or(""));
|
||||
|
||||
let response_allow_origin = match (request_origin, allowed_origins) {
|
||||
(Some(or), Some(allowed_origins)) => {
|
||||
if allowed_origins.iter().any(|o| o == or) {
|
||||
Some(HeaderValue::from_str(or).unwrap_or(HEADER_VALUE_ALLOW_ALL_ORIGINS))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
(Some(_), None) => Some(HEADER_VALUE_ALLOW_ALL_ORIGINS),
|
||||
_ => None,
|
||||
};
|
||||
if let Some(h) = response.headers_mut() {
|
||||
h.insert(
|
||||
ACCESS_CONTROL_EXPOSE_HEADERS,
|
||||
ACCESS_CONTROL_EXPOSE_HEADERS_VALUE,
|
||||
);
|
||||
if let Some(origin) = response_allow_origin {
|
||||
h.insert(ACCESS_CONTROL_ALLOW_ORIGIN, origin);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn handle_rest_inner(
|
||||
config: &'static ProxyConfig,
|
||||
@@ -733,12 +778,6 @@ async fn handle_rest_inner(
|
||||
jwt: String,
|
||||
backend: Arc<PoolingBackend>,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, RestError> {
|
||||
// validate the jwt token
|
||||
let jwt_parsed = backend
|
||||
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
|
||||
.await
|
||||
.map_err(HttpConnError::from)?;
|
||||
|
||||
let db_schema_cache =
|
||||
config
|
||||
.rest_config
|
||||
@@ -754,28 +793,83 @@ async fn handle_rest_inner(
|
||||
message: "Failed to get endpoint cache key".to_string(),
|
||||
}))?;
|
||||
|
||||
let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;
|
||||
|
||||
let (parts, originial_body) = request.into_parts();
|
||||
|
||||
// try and get the cached entry for this endpoint
|
||||
// it contains the api config and the introspected db schema
|
||||
let cached_entry = db_schema_cache.get_cached(&endpoint_cache_key);
|
||||
|
||||
let allowed_origins = cached_entry
|
||||
.as_ref()
|
||||
.and_then(|arc| arc.0.server_cors_allowed_origins.as_ref());
|
||||
|
||||
let mut response = Response::builder();
|
||||
apply_common_cors_headers(&mut response, &parts.headers, allowed_origins);
|
||||
|
||||
// handle the OPTIONS request
|
||||
if parts.method == Method::OPTIONS {
|
||||
let allowed_headers = parts
|
||||
.headers
|
||||
.get(ACCESS_CONTROL_REQUEST_HEADERS)
|
||||
.and_then(|a| a.to_str().ok())
|
||||
.filter(|v| !v.is_empty())
|
||||
.map_or_else(
|
||||
|| "Authorization".to_string(),
|
||||
|v| format!("{v}, Authorization"),
|
||||
);
|
||||
return response
|
||||
.status(StatusCode::OK)
|
||||
.header(
|
||||
ACCESS_CONTROL_ALLOW_METHODS,
|
||||
ACCESS_CONTROL_ALLOW_METHODS_VALUE,
|
||||
)
|
||||
.header(ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_MAX_AGE_VALUE)
|
||||
.header(
|
||||
ACCESS_CONTROL_ALLOW_HEADERS,
|
||||
HeaderValue::from_str(&allowed_headers)
|
||||
.unwrap_or(ACCESS_CONTROL_ALLOW_HEADERS_VALUE),
|
||||
)
|
||||
.header(ALLOW, ACCESS_CONTROL_ALLOW_METHODS_VALUE)
|
||||
.body(Empty::new().map_err(|x| match x {}).boxed())
|
||||
.map_err(|e| {
|
||||
RestError::SubzeroCore(InternalError {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
// validate the jwt token
|
||||
let jwt_parsed = backend
|
||||
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
|
||||
.await
|
||||
.map_err(HttpConnError::from)?;
|
||||
|
||||
let auth_header = parts
|
||||
.headers
|
||||
.get(AUTHORIZATION)
|
||||
.ok_or(RestError::SubzeroCore(InternalError {
|
||||
message: "Authorization header is required".to_string(),
|
||||
}))?;
|
||||
let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;
|
||||
|
||||
let entry = db_schema_cache
|
||||
.get_cached_or_remote(
|
||||
&endpoint_cache_key,
|
||||
auth_header,
|
||||
connection_string,
|
||||
&mut client,
|
||||
ctx,
|
||||
config,
|
||||
)
|
||||
.await?;
|
||||
let entry = match cached_entry {
|
||||
Some(e) => e,
|
||||
None => {
|
||||
// if not cached, get the remote entry (will run the introspection query)
|
||||
db_schema_cache
|
||||
.get_remote(
|
||||
&endpoint_cache_key,
|
||||
auth_header,
|
||||
connection_string,
|
||||
&mut client,
|
||||
ctx,
|
||||
config,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
let (api_config, db_schema_owned) = entry.as_ref();
|
||||
|
||||
let db_schema = db_schema_owned.borrow_schema();
|
||||
|
||||
let db_schemas = &api_config.db_schemas; // list of schemas available for the api
|
||||
@@ -999,8 +1093,8 @@ async fn handle_rest_inner(
|
||||
let _metrics = client.metrics(ctx); // FIXME: is everything in the context set correctly?
|
||||
|
||||
// send the request to the local proxy
|
||||
let response = make_raw_local_proxy_request(&mut client, headers, req_body).await?;
|
||||
let (parts, body) = response.into_parts();
|
||||
let proxy_response = make_raw_local_proxy_request(&mut client, headers, req_body).await?;
|
||||
let (response_parts, body) = proxy_response.into_parts();
|
||||
|
||||
let max_response = config.http_config.max_response_size_bytes;
|
||||
let bytes = read_body_with_limit(body, max_response)
|
||||
@@ -1009,7 +1103,7 @@ async fn handle_rest_inner(
|
||||
|
||||
// if the response status is greater than 399, then it is an error
|
||||
// FIXME: check if there are other error codes or shapes of the response
|
||||
if parts.status.as_u16() > 399 {
|
||||
if response_parts.status.as_u16() > 399 {
|
||||
// turn this postgres error from the json into PostgresError
|
||||
let postgres_error = serde_json::from_slice(&bytes)
|
||||
.map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?;
|
||||
@@ -1175,7 +1269,7 @@ async fn handle_rest_inner(
|
||||
.boxed();
|
||||
|
||||
// build the response
|
||||
let mut response = Response::builder()
|
||||
response = response
|
||||
.status(StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR))
|
||||
.header(CONTENT_TYPE, http_content_type);
|
||||
|
||||
|
||||
@@ -612,19 +612,25 @@ pub async fn handle_request(
|
||||
}
|
||||
}
|
||||
|
||||
let max_term = statuses
|
||||
.iter()
|
||||
.map(|(status, _)| status.acceptor_state.term)
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Find the most advanced safekeeper
|
||||
let (status, i) = statuses
|
||||
.into_iter()
|
||||
.max_by_key(|(status, _)| {
|
||||
(
|
||||
status.acceptor_state.epoch,
|
||||
status.flush_lsn,
|
||||
/* BEGIN_HADRON */
|
||||
// We need to pull from the SK with the highest term.
|
||||
// This is because another compute may come online and vote the same highest term again on the other two SKs.
|
||||
// Then, there will be 2 computes running on the same term.
|
||||
status.acceptor_state.term,
|
||||
/* END_HADRON */
|
||||
status.flush_lsn,
|
||||
status.commit_lsn,
|
||||
)
|
||||
})
|
||||
@@ -634,6 +640,22 @@ pub async fn handle_request(
|
||||
assert!(status.tenant_id == request.tenant_id);
|
||||
assert!(status.timeline_id == request.timeline_id);
|
||||
|
||||
// TODO(diko): This is a hadron-only check to make sure that we pull the timeline
|
||||
// from the safekeeper with the highest term during timeline restore.
|
||||
// We could avoid returning the error by calling bump_term after pull_timeline.
|
||||
// However, this is not a big deal because we retry the pull_timeline requests.
|
||||
// The check should be removed together with removing custom hadron logic for
|
||||
// safekeeper restore.
|
||||
if wait_for_peer_timeline_status && status.acceptor_state.term != max_term {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!(
|
||||
"choosen safekeeper {} has term {}, but the most advanced term is {}",
|
||||
safekeeper_host, status.acceptor_state.term, max_term
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
|
||||
match pull_timeline(
|
||||
status,
|
||||
safekeeper_host,
|
||||
|
||||
@@ -195,12 +195,14 @@ impl StateSK {
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
let result = self.state_mut().membership_switch(to).await?;
|
||||
let flush_lsn = self.flush_lsn();
|
||||
let last_log_term = self.state().acceptor_state.get_last_log_term(flush_lsn);
|
||||
|
||||
Ok(TimelineMembershipSwitchResponse {
|
||||
previous_conf: result.previous_conf,
|
||||
current_conf: result.current_conf,
|
||||
last_log_term: self.state().acceptor_state.term,
|
||||
flush_lsn: self.flush_lsn(),
|
||||
last_log_term,
|
||||
flush_lsn,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -644,6 +644,7 @@ async fn handle_tenant_timeline_safekeeper_migrate(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
// TODO(diko): it's not PS operation, there should be a different permission scope.
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
maybe_rate_limit(&req, tenant_id).await;
|
||||
|
||||
@@ -665,6 +666,23 @@ async fn handle_tenant_timeline_safekeeper_migrate(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_safekeeper_migrate_abort(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
// TODO(diko): it's not PS operation, there should be a different permission scope.
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
maybe_rate_limit(&req, tenant_id).await;
|
||||
|
||||
service
|
||||
.tenant_timeline_safekeeper_migrate_abort(tenant_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_lsn_lease(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -2611,6 +2629,16 @@ pub fn make_router(
|
||||
)
|
||||
},
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_safekeeper_migrate_abort,
|
||||
RequestName("v1_tenant_timeline_safekeeper_migrate_abort"),
|
||||
)
|
||||
},
|
||||
)
|
||||
// LSN lease passthrough to all shards
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease",
|
||||
|
||||
@@ -471,11 +471,17 @@ impl Persistence {
|
||||
&self,
|
||||
input_node_id: NodeId,
|
||||
input_https_port: Option<u16>,
|
||||
input_grpc_addr: Option<String>,
|
||||
input_grpc_port: Option<u16>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
self.update_node(
|
||||
input_node_id,
|
||||
listen_https_port.eq(input_https_port.map(|x| x as i32)),
|
||||
(
|
||||
listen_https_port.eq(input_https_port.map(|x| x as i32)),
|
||||
listen_grpc_addr.eq(input_grpc_addr),
|
||||
listen_grpc_port.eq(input_grpc_port.map(|x| x as i32)),
|
||||
),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -7813,7 +7813,7 @@ impl Service {
|
||||
register_req.listen_https_port,
|
||||
register_req.listen_pg_addr,
|
||||
register_req.listen_pg_port,
|
||||
register_req.listen_grpc_addr,
|
||||
register_req.listen_grpc_addr.clone(),
|
||||
register_req.listen_grpc_port,
|
||||
register_req.availability_zone_id.clone(),
|
||||
self.config.use_https_pageserver_api,
|
||||
@@ -7848,6 +7848,8 @@ impl Service {
|
||||
.update_node_on_registration(
|
||||
register_req.node_id,
|
||||
register_req.listen_https_port,
|
||||
register_req.listen_grpc_addr,
|
||||
register_req.listen_grpc_port,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
|
||||
@@ -24,12 +24,12 @@ use pageserver_api::controller_api::{
|
||||
};
|
||||
use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
|
||||
use safekeeper_api::PgVersionId;
|
||||
use safekeeper_api::Term;
|
||||
use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration};
|
||||
use safekeeper_api::models::{
|
||||
PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest,
|
||||
TimelineMembershipSwitchResponse,
|
||||
};
|
||||
use safekeeper_api::{INITIAL_TERM, Term};
|
||||
use safekeeper_client::mgmt_api;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -1230,10 +1230,7 @@ impl Service {
|
||||
}
|
||||
// If it is the same new_sk_set, we can continue the migration (retry).
|
||||
} else {
|
||||
let prev_finished = timeline.cplane_notified_generation == timeline.generation
|
||||
&& timeline.sk_set_notified_generation == timeline.generation;
|
||||
|
||||
if !prev_finished {
|
||||
if !is_migration_finished(&timeline) {
|
||||
// The previous migration is committed, but the finish step failed.
|
||||
// Safekeepers/cplane might not know about the last membership configuration.
|
||||
// Retry the finish step to ensure smooth migration.
|
||||
@@ -1298,13 +1295,7 @@ impl Service {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut sync_position = (INITIAL_TERM, Lsn::INVALID);
|
||||
for res in results.into_iter().flatten() {
|
||||
let sk_position = (res.last_log_term, res.flush_lsn);
|
||||
if sync_position < sk_position {
|
||||
sync_position = sk_position;
|
||||
}
|
||||
}
|
||||
let sync_position = Self::get_sync_position(&results)?;
|
||||
|
||||
tracing::info!(
|
||||
%generation,
|
||||
@@ -1551,6 +1542,8 @@ impl Service {
|
||||
timeline_id: TimelineId,
|
||||
timeline: &TimelinePersistence,
|
||||
) -> Result<(), ApiError> {
|
||||
tracing::info!(generation=?timeline.generation, sk_set=?timeline.sk_set, new_sk_set=?timeline.new_sk_set, "retrying finish safekeeper migration");
|
||||
|
||||
if timeline.new_sk_set.is_some() {
|
||||
// Logical error, should never happen.
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
@@ -1598,4 +1591,152 @@ impl Service {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get membership switch responses from all safekeepers and return the sync position.
|
||||
///
|
||||
/// Sync position is a position equal or greater than the commit position.
|
||||
/// It is guaranteed that all WAL entries with (last_log_term, flush_lsn)
|
||||
/// greater than the sync position are not committed (= not on a quorum).
|
||||
///
|
||||
/// Returns error if there is no quorum of successful responses.
|
||||
fn get_sync_position(
|
||||
responses: &[mgmt_api::Result<TimelineMembershipSwitchResponse>],
|
||||
) -> Result<(Term, Lsn), ApiError> {
|
||||
let quorum_size = responses.len() / 2 + 1;
|
||||
|
||||
let mut wal_positions = responses
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|res| (res.last_log_term, res.flush_lsn))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Should be already checked if the responses are from tenant_timeline_set_membership_quorum.
|
||||
if wal_positions.len() < quorum_size {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"not enough successful responses to get sync position: {}/{}",
|
||||
wal_positions.len(),
|
||||
quorum_size,
|
||||
)));
|
||||
}
|
||||
|
||||
wal_positions.sort();
|
||||
|
||||
Ok(wal_positions[quorum_size - 1])
|
||||
}
|
||||
|
||||
/// Abort ongoing safekeeper migration.
|
||||
pub(crate) async fn tenant_timeline_safekeeper_migrate_abort(
|
||||
self: &Arc<Self>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(), ApiError> {
|
||||
// TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
|
||||
let _tenant_lock = trace_shared_lock(
|
||||
&self.tenant_op_locks,
|
||||
tenant_id,
|
||||
TenantOperations::TimelineSafekeeperMigrate,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Fetch current timeline configuration from the configuration storage.
|
||||
let timeline = self
|
||||
.persistence
|
||||
.get_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let Some(timeline) = timeline else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!(
|
||||
"timeline {tenant_id}/{timeline_id} doesn't exist in timelines table"
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
};
|
||||
|
||||
let mut generation = SafekeeperGeneration::new(timeline.generation as u32);
|
||||
|
||||
let Some(new_sk_set) = &timeline.new_sk_set else {
|
||||
// No new_sk_set -> no active migration that we can abort.
|
||||
tracing::info!("timeline has no active migration");
|
||||
|
||||
if !is_migration_finished(&timeline) {
|
||||
// The last migration is committed, but the finish step failed.
|
||||
// Safekeepers/cplane might not know about the last membership configuration.
|
||||
// Retry the finish step to make the timeline state clean.
|
||||
self.finish_safekeeper_migration_retry(tenant_id, timeline_id, &timeline)
|
||||
.await?;
|
||||
}
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
tracing::info!(sk_set=?timeline.sk_set, ?new_sk_set, ?generation, "aborting timeline migration");
|
||||
|
||||
let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
|
||||
let new_safekeepers = self.get_safekeepers(new_sk_set)?;
|
||||
|
||||
let cur_sk_member_set =
|
||||
Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// Increment current generation and remove new_sk_set from the timeline to abort the migration.
|
||||
generation = generation.next();
|
||||
|
||||
let mconf = membership::Configuration {
|
||||
generation,
|
||||
members: cur_sk_member_set,
|
||||
new_members: None,
|
||||
};
|
||||
|
||||
// Exclude safekeepers which were added during the current migration.
|
||||
let cur_ids: HashSet<NodeId> = cur_safekeepers.iter().map(|sk| sk.get_id()).collect();
|
||||
let exclude_safekeepers = new_safekeepers
|
||||
.into_iter()
|
||||
.filter(|sk| !cur_ids.contains(&sk.get_id()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let exclude_requests = exclude_safekeepers
|
||||
.iter()
|
||||
.map(|sk| TimelinePendingOpPersistence {
|
||||
sk_id: sk.skp.id,
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
generation: generation.into_inner() as i32,
|
||||
op_kind: SafekeeperTimelineOpKind::Exclude,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let cur_sk_set = cur_safekeepers
|
||||
.iter()
|
||||
.map(|sk| sk.get_id())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Persist new mconf and exclude requests.
|
||||
self.persistence
|
||||
.update_timeline_membership(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
generation,
|
||||
&cur_sk_set,
|
||||
None,
|
||||
&exclude_requests,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// At this point we have already committed the abort, but still need to notify
|
||||
// cplane/safekeepers with the new mconf. That's what finish_safekeeper_migration does.
|
||||
self.finish_safekeeper_migration(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&cur_safekeepers,
|
||||
&mconf,
|
||||
&exclude_safekeepers,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn is_migration_finished(timeline: &TimelinePersistence) -> bool {
|
||||
timeline.cplane_notified_generation == timeline.generation
|
||||
&& timeline.sk_set_notified_generation == timeline.generation
|
||||
}
|
||||
|
||||
@@ -78,20 +78,26 @@ class EndpointHttpClient(requests.Session):
|
||||
json: dict[str, str] = res.json()
|
||||
return json
|
||||
|
||||
def prewarm_lfc(self, from_endpoint_id: str | None = None):
|
||||
def prewarm_lfc(self, from_endpoint_id: str | None = None) -> dict[str, str]:
|
||||
"""
|
||||
Prewarm LFC cache from given endpoint and wait till it finishes or errors
|
||||
"""
|
||||
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
|
||||
self.post(self.prewarm_url, params=params).raise_for_status()
|
||||
self.prewarm_lfc_wait()
|
||||
return self.prewarm_lfc_wait()
|
||||
|
||||
def prewarm_lfc_wait(self):
|
||||
def cancel_prewarm_lfc(self):
|
||||
"""
|
||||
Cancel LFC prewarm if any is ongoing
|
||||
"""
|
||||
self.delete(self.prewarm_url).raise_for_status()
|
||||
|
||||
def prewarm_lfc_wait(self) -> dict[str, str]:
|
||||
"""
|
||||
Wait till LFC prewarm returns with error or success.
|
||||
If prewarm was not requested before calling this function, it will error
|
||||
"""
|
||||
statuses = "failed", "completed", "skipped"
|
||||
statuses = "failed", "completed", "skipped", "cancelled"
|
||||
|
||||
def prewarmed():
|
||||
json = self.prewarm_lfc_status()
|
||||
@@ -101,6 +107,7 @@ class EndpointHttpClient(requests.Session):
|
||||
wait_until(prewarmed, timeout=60)
|
||||
res = self.prewarm_lfc_status()
|
||||
assert res["status"] != "failed", res
|
||||
return res
|
||||
|
||||
def offload_lfc_status(self) -> dict[str, str]:
|
||||
res = self.get(self.offload_url)
|
||||
@@ -108,29 +115,31 @@ class EndpointHttpClient(requests.Session):
|
||||
json: dict[str, str] = res.json()
|
||||
return json
|
||||
|
||||
def offload_lfc(self):
|
||||
def offload_lfc(self) -> dict[str, str]:
|
||||
"""
|
||||
Offload LFC cache to endpoint storage and wait till offload finishes or errors
|
||||
"""
|
||||
self.post(self.offload_url).raise_for_status()
|
||||
self.offload_lfc_wait()
|
||||
return self.offload_lfc_wait()
|
||||
|
||||
def offload_lfc_wait(self):
|
||||
def offload_lfc_wait(self) -> dict[str, str]:
|
||||
"""
|
||||
Wait till LFC offload returns with error or success.
|
||||
If offload was not requested before calling this function, it will error
|
||||
"""
|
||||
statuses = "failed", "completed", "skipped"
|
||||
|
||||
def offloaded():
|
||||
json = self.offload_lfc_status()
|
||||
status, err = json["status"], json.get("error")
|
||||
assert status in ["failed", "completed"], f"{status}, {err=}"
|
||||
assert status in statuses, f"{status}, {err=}"
|
||||
|
||||
wait_until(offloaded, timeout=60)
|
||||
res = self.offload_lfc_status()
|
||||
assert res["status"] != "failed", res
|
||||
return res
|
||||
|
||||
def promote(self, promote_spec: dict[str, Any], disconnect: bool = False):
|
||||
def promote(self, promote_spec: dict[str, Any], disconnect: bool = False) -> dict[str, str]:
|
||||
url = f"http://localhost:{self.external_port}/promote"
|
||||
if disconnect:
|
||||
try: # send first request to start promote and disconnect
|
||||
|
||||
@@ -262,7 +262,6 @@ class PgProtocol:
|
||||
# pooler does not support statement_timeout
|
||||
# Check if the hostname contains the string 'pooler'
|
||||
hostname = result.get("host", "")
|
||||
log.info(f"Hostname: {hostname}")
|
||||
options = result.get("options", "")
|
||||
if "statement_timeout" not in options and "pooler" not in hostname:
|
||||
options = f"-cstatement_timeout=120s {options}"
|
||||
@@ -2314,6 +2313,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
timeline_id: TimelineId,
|
||||
new_sk_set: list[int],
|
||||
):
|
||||
log.info(f"migrate_safekeepers({tenant_id}, {timeline_id}, {new_sk_set})")
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate",
|
||||
@@ -2323,6 +2323,19 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
response.raise_for_status()
|
||||
log.info(f"migrate_safekeepers success: {response.json()}")
|
||||
|
||||
def abort_safekeeper_migration(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
):
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort",
|
||||
headers=self.headers(TokenScope.PAGE_SERVER_API),
|
||||
)
|
||||
response.raise_for_status()
|
||||
log.info(f"abort_safekeeper_migration success: {response.json()}")
|
||||
|
||||
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
|
||||
"""
|
||||
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
|
||||
|
||||
@@ -863,7 +863,6 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
|
||||
assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Lakebase mode")
|
||||
def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that when the pageserver detects corruption during image layer creation,
|
||||
@@ -890,7 +889,9 @@ def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder):
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload = Workload(
|
||||
env, tenant_id, timeline_id, endpoint_opts={"config_lines": ["neon.lakebase_mode=true"]}
|
||||
)
|
||||
workload.init()
|
||||
|
||||
# Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import random
|
||||
import threading
|
||||
from enum import StrEnum
|
||||
from threading import Thread
|
||||
from time import sleep
|
||||
from typing import Any
|
||||
|
||||
@@ -47,19 +47,23 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
|
||||
# With autoprewarm, we need to be sure LFC was offloaded after all writes
|
||||
# finish, so we sleep. Otherwise we'll have fewer prewarmed pages than we want
|
||||
sleep(AUTOOFFLOAD_INTERVAL_SECS)
|
||||
client.offload_lfc_wait()
|
||||
return
|
||||
offload_res = client.offload_lfc_wait()
|
||||
log.info(offload_res)
|
||||
return offload_res
|
||||
|
||||
if method == PrewarmMethod.COMPUTE_CTL:
|
||||
status = client.prewarm_lfc_status()
|
||||
assert status["status"] == "not_prewarmed"
|
||||
assert "error" not in status
|
||||
client.offload_lfc()
|
||||
offload_res = client.offload_lfc()
|
||||
log.info(offload_res)
|
||||
assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
|
||||
|
||||
parsed = prom_parse(client)
|
||||
desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
|
||||
assert parsed == desired, f"{parsed=} != {desired=}"
|
||||
return
|
||||
|
||||
return offload_res
|
||||
|
||||
raise AssertionError(f"{method} not in PrewarmMethod")
|
||||
|
||||
@@ -68,21 +72,30 @@ def prewarm_endpoint(
|
||||
method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
|
||||
):
|
||||
if method == PrewarmMethod.AUTOPREWARM:
|
||||
client.prewarm_lfc_wait()
|
||||
prewarm_res = client.prewarm_lfc_wait()
|
||||
log.info(prewarm_res)
|
||||
elif method == PrewarmMethod.COMPUTE_CTL:
|
||||
client.prewarm_lfc()
|
||||
prewarm_res = client.prewarm_lfc()
|
||||
log.info(prewarm_res)
|
||||
return prewarm_res
|
||||
elif method == PrewarmMethod.POSTGRES:
|
||||
cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,))
|
||||
|
||||
|
||||
def check_prewarmed(
|
||||
def check_prewarmed_contains(
|
||||
method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
|
||||
):
|
||||
if method == PrewarmMethod.AUTOPREWARM:
|
||||
assert client.prewarm_lfc_status() == desired_status
|
||||
prewarm_status = client.prewarm_lfc_status()
|
||||
for k in desired_status:
|
||||
assert desired_status[k] == prewarm_status[k]
|
||||
|
||||
assert prom_parse(client)[PREWARM_LABEL] == 1
|
||||
elif method == PrewarmMethod.COMPUTE_CTL:
|
||||
assert client.prewarm_lfc_status() == desired_status
|
||||
prewarm_status = client.prewarm_lfc_status()
|
||||
for k in desired_status:
|
||||
assert desired_status[k] == prewarm_status[k]
|
||||
|
||||
desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0}
|
||||
assert prom_parse(client) == desired
|
||||
|
||||
@@ -149,9 +162,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
|
||||
log.info(f"Used LFC size: {lfc_used_pages}")
|
||||
pg_cur.execute("select * from neon.get_prewarm_info()")
|
||||
total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
|
||||
log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
|
||||
progress = (prewarmed + skipped) * 100 // total
|
||||
log.info(f"Prewarm progress: {progress}%")
|
||||
assert lfc_used_pages > 10000
|
||||
assert total > 0
|
||||
assert prewarmed > 0
|
||||
@@ -161,7 +171,54 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
|
||||
assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
|
||||
|
||||
desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
|
||||
check_prewarmed(method, client, desired)
|
||||
check_prewarmed_contains(method, client, desired)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
def test_lfc_prewarm_cancel(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test we can cancel LFC prewarm and prewarm successfully after
|
||||
"""
|
||||
env = neon_simple_env
|
||||
n_records = 1000000
|
||||
cfg = [
|
||||
"autovacuum = off",
|
||||
"shared_buffers=1MB",
|
||||
"neon.max_file_cache_size=1GB",
|
||||
"neon.file_cache_size_limit=1GB",
|
||||
"neon.file_cache_prewarm_limit=1000",
|
||||
]
|
||||
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
|
||||
|
||||
pg_conn = endpoint.connect()
|
||||
pg_cur = pg_conn.cursor()
|
||||
pg_cur.execute("create schema neon; create extension neon with schema neon")
|
||||
pg_cur.execute("create database lfc")
|
||||
|
||||
lfc_conn = endpoint.connect(dbname="lfc")
|
||||
lfc_cur = lfc_conn.cursor()
|
||||
log.info(f"Inserting {n_records} rows")
|
||||
lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))")
|
||||
lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
|
||||
log.info(f"Inserted {n_records} rows")
|
||||
|
||||
client = endpoint.http_client()
|
||||
method = PrewarmMethod.COMPUTE_CTL
|
||||
offload_lfc(method, client, pg_cur)
|
||||
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
|
||||
thread = Thread(target=lambda: prewarm_endpoint(method, client, pg_cur, None))
|
||||
thread.start()
|
||||
# wait 2 seconds to ensure we cancel prewarm SQL query
|
||||
sleep(2)
|
||||
client.cancel_prewarm_lfc()
|
||||
thread.join()
|
||||
assert client.prewarm_lfc_status()["status"] == "cancelled"
|
||||
|
||||
prewarm_endpoint(method, client, pg_cur, None)
|
||||
assert client.prewarm_lfc_status()["status"] == "completed"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
@@ -178,9 +235,8 @@ def test_lfc_prewarm_empty(neon_simple_env: NeonEnv):
|
||||
cur = conn.cursor()
|
||||
cur.execute("create schema neon; create extension neon with schema neon")
|
||||
method = PrewarmMethod.COMPUTE_CTL
|
||||
offload_lfc(method, client, cur)
|
||||
prewarm_endpoint(method, client, cur, None)
|
||||
assert client.prewarm_lfc_status()["status"] == "skipped"
|
||||
assert offload_lfc(method, client, cur)["status"] == "skipped"
|
||||
assert prewarm_endpoint(method, client, cur, None)["status"] == "skipped"
|
||||
|
||||
|
||||
# autoprewarm isn't needed as we prewarm manually
|
||||
@@ -251,11 +307,11 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
|
||||
|
||||
workload_threads = []
|
||||
for _ in range(n_threads):
|
||||
t = threading.Thread(target=workload)
|
||||
t = Thread(target=workload)
|
||||
workload_threads.append(t)
|
||||
t.start()
|
||||
|
||||
prewarm_thread = threading.Thread(target=prewarm)
|
||||
prewarm_thread = Thread(target=prewarm)
|
||||
prewarm_thread.start()
|
||||
|
||||
def prewarmed():
|
||||
|
||||
@@ -145,6 +145,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
|
||||
stop_and_check_lsn(secondary, None)
|
||||
|
||||
if method == PromoteMethod.COMPUTE_CTL:
|
||||
log.info("Restarting primary to check new config")
|
||||
secondary.stop()
|
||||
# In production, compute ultimately receives new compute spec from cplane.
|
||||
secondary.respec(mode="Primary")
|
||||
|
||||
@@ -286,3 +286,265 @@ def test_sk_generation_aware_tombstones(neon_env_builder: NeonEnvBuilder):
|
||||
assert re.match(r".*Timeline .* deleted.*", exc.value.response.text)
|
||||
# The timeline should remain deleted.
|
||||
expect_deleted(second_sk)
|
||||
|
||||
|
||||
def test_safekeeper_migration_stale_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that safekeeper migration handles stale timeline correctly by migrating to
|
||||
a safekeeper with a stale timeline.
|
||||
1. Check that we are waiting for the stale timeline to catch up with the commit lsn.
|
||||
The migration might fail if there is no compute to advance the WAL.
|
||||
2. Check that we rely on last_log_term (and not the current term) when waiting for the
|
||||
sync_position on step 7.
|
||||
3. Check that migration succeeds if the compute is running.
|
||||
"""
|
||||
neon_env_builder.num_safekeepers = 2
|
||||
neon_env_builder.storage_controller_config = {
|
||||
"timelines_onto_safekeepers": True,
|
||||
"timeline_safekeeper_count": 1,
|
||||
}
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS)
|
||||
env.storage_controller.allowed_errors.append(".*not enough successful .* to reach quorum.*")
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
|
||||
active_sk = env.get_safekeeper(mconf["sk_set"][0])
|
||||
other_sk = [sk for sk in env.safekeepers if sk.id != active_sk.id][0]
|
||||
|
||||
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
||||
ep.start(safekeeper_generation=1, safekeepers=[active_sk.id])
|
||||
ep.safe_psql("CREATE TABLE t(a int)")
|
||||
ep.safe_psql("INSERT INTO t VALUES (0)")
|
||||
|
||||
# Pull the timeline to other_sk, so other_sk now has a "stale" timeline on it.
|
||||
other_sk.pull_timeline([active_sk], env.initial_tenant, env.initial_timeline)
|
||||
|
||||
# Advance the WAL on active_sk.
|
||||
ep.safe_psql("INSERT INTO t VALUES (1)")
|
||||
|
||||
# The test is more tricky if we have the same last_log_term but different term/flush_lsn.
|
||||
# Stop the active_sk during the endpoint shutdown because otherwise compute_ctl runs
|
||||
# sync_safekeepers and advances last_log_term on active_sk.
|
||||
active_sk.stop()
|
||||
ep.stop(mode="immediate")
|
||||
active_sk.start()
|
||||
|
||||
active_sk_status = active_sk.http_client().timeline_status(
|
||||
env.initial_tenant, env.initial_timeline
|
||||
)
|
||||
other_sk_status = other_sk.http_client().timeline_status(
|
||||
env.initial_tenant, env.initial_timeline
|
||||
)
|
||||
|
||||
# other_sk should have the same last_log_term, but a stale flush_lsn.
|
||||
assert active_sk_status.last_log_term == other_sk_status.last_log_term
|
||||
assert active_sk_status.flush_lsn > other_sk_status.flush_lsn
|
||||
|
||||
commit_lsn = active_sk_status.flush_lsn
|
||||
|
||||
# Bump the term on other_sk to make it higher than active_sk.
|
||||
# This is to make sure we don't use current term instead of last_log_term in the algorithm.
|
||||
other_sk.http_client().term_bump(
|
||||
env.initial_tenant, env.initial_timeline, active_sk_status.term + 100
|
||||
)
|
||||
|
||||
# TODO(diko): now it fails because the timeline on other_sk is stale and there is no compute
|
||||
# to catch it up with active_sk. It might be fixed in https://databricks.atlassian.net/browse/LKB-946
|
||||
# if we delete stale timelines before starting the migration.
|
||||
# But the rest of the test is still valid: we should not lose committed WAL after the migration.
|
||||
with pytest.raises(
|
||||
StorageControllerApiException, match="not enough successful .* to reach quorum"
|
||||
):
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, [other_sk.id]
|
||||
)
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["new_sk_set"] == [other_sk.id]
|
||||
assert mconf["sk_set"] == [active_sk.id]
|
||||
assert mconf["generation"] == 2
|
||||
|
||||
# Start the endpoint, so it advances the WAL on other_sk.
|
||||
ep.start(safekeeper_generation=2, safekeepers=[active_sk.id, other_sk.id])
|
||||
# Now the migration should succeed.
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, [other_sk.id]
|
||||
)
|
||||
|
||||
# Check that we didn't lose committed WAL.
|
||||
assert (
|
||||
other_sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline).flush_lsn
|
||||
>= commit_lsn
|
||||
)
|
||||
assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)]
|
||||
|
||||
|
||||
def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that we pull the timeline from the most advanced safekeeper during the
|
||||
migration and do not lose committed WAL.
|
||||
"""
|
||||
neon_env_builder.num_safekeepers = 4
|
||||
neon_env_builder.storage_controller_config = {
|
||||
"timelines_onto_safekeepers": True,
|
||||
"timeline_safekeeper_count": 3,
|
||||
}
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS)
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
|
||||
sk_set = mconf["sk_set"]
|
||||
assert len(sk_set) == 3
|
||||
|
||||
other_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0]
|
||||
|
||||
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
||||
ep.start(safekeeper_generation=1, safekeepers=sk_set)
|
||||
ep.safe_psql("CREATE TABLE t(a int)")
|
||||
ep.safe_psql("INSERT INTO t VALUES (0)")
|
||||
|
||||
# Stop one sk, so we have a lagging WAL on it.
|
||||
env.get_safekeeper(sk_set[0]).stop()
|
||||
# Advance the WAL on the other sks.
|
||||
ep.safe_psql("INSERT INTO t VALUES (1)")
|
||||
|
||||
# Stop other sks to make sure compute_ctl doesn't advance the last_log_term on them during shutdown.
|
||||
for sk_id in sk_set[1:]:
|
||||
env.get_safekeeper(sk_id).stop()
|
||||
ep.stop(mode="immediate")
|
||||
for sk_id in sk_set:
|
||||
env.get_safekeeper(sk_id).start()
|
||||
|
||||
# Bump the term on the lagging sk to make sure we don't use it to choose the most advanced sk.
|
||||
env.get_safekeeper(sk_set[0]).http_client().term_bump(
|
||||
env.initial_tenant, env.initial_timeline, 100
|
||||
)
|
||||
|
||||
def get_commit_lsn(sk_set: list[int]):
|
||||
flush_lsns = []
|
||||
last_log_terms = []
|
||||
for sk_id in sk_set:
|
||||
sk = env.get_safekeeper(sk_id)
|
||||
status = sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline)
|
||||
flush_lsns.append(status.flush_lsn)
|
||||
last_log_terms.append(status.last_log_term)
|
||||
|
||||
# In this test we assume that all sks have the same last_log_term.
|
||||
assert len(set(last_log_terms)) == 1
|
||||
|
||||
flush_lsns.sort(reverse=True)
|
||||
commit_lsn = flush_lsns[len(sk_set) // 2]
|
||||
|
||||
log.info(f"sk_set: {sk_set}, flush_lsns: {flush_lsns}, commit_lsn: {commit_lsn}")
|
||||
return commit_lsn
|
||||
|
||||
commit_lsn_before_migration = get_commit_lsn(sk_set)
|
||||
|
||||
# Make two migrations, so the lagging sk stays in the sk_set, but other sks are replaced.
|
||||
new_sk_set1 = [sk_set[0], sk_set[1], other_sk] # remove sk_set[2], add other_sk
|
||||
new_sk_set2 = [sk_set[0], other_sk, sk_set[2]] # remove sk_set[1], add sk_set[2] back
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, new_sk_set1
|
||||
)
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, new_sk_set2
|
||||
)
|
||||
|
||||
commit_lsn_after_migration = get_commit_lsn(new_sk_set2)
|
||||
|
||||
# We should not lose committed WAL.
|
||||
# If we have chosen the lagging sk to pull the timeline from, this might fail.
|
||||
assert commit_lsn_before_migration <= commit_lsn_after_migration
|
||||
|
||||
ep.start(safekeeper_generation=5, safekeepers=new_sk_set2)
|
||||
assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)]
|
||||
|
||||
|
||||
def test_abort_safekeeper_migration(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that safekeeper migration can be aborted.
|
||||
1. Insert failpoints and ensure the abort successfully reverts the timeline state.
|
||||
2. Check that endpoint is operational after the abort.
|
||||
"""
|
||||
neon_env_builder.num_safekeepers = 2
|
||||
neon_env_builder.storage_controller_config = {
|
||||
"timelines_onto_safekeepers": True,
|
||||
"timeline_safekeeper_count": 1,
|
||||
}
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS)
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert len(mconf["sk_set"]) == 1
|
||||
cur_sk = mconf["sk_set"][0]
|
||||
cur_gen = 1
|
||||
|
||||
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
||||
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
|
||||
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
|
||||
ep.safe_psql("CREATE TABLE t(a int)")
|
||||
ep.safe_psql("INSERT INTO t VALUES (1)")
|
||||
|
||||
another_sk = [sk.id for sk in env.safekeepers if sk.id != cur_sk][0]
|
||||
|
||||
failpoints = [
|
||||
"sk-migration-after-step-3",
|
||||
"sk-migration-after-step-4",
|
||||
"sk-migration-after-step-5",
|
||||
"sk-migration-after-step-7",
|
||||
]
|
||||
|
||||
for fp in failpoints:
|
||||
env.storage_controller.configure_failpoints((fp, "return(1)"))
|
||||
|
||||
with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, [another_sk]
|
||||
)
|
||||
cur_gen += 1
|
||||
|
||||
env.storage_controller.configure_failpoints((fp, "off"))
|
||||
|
||||
# We should have a joint mconf after the failure.
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["generation"] == cur_gen
|
||||
assert mconf["sk_set"] == [cur_sk]
|
||||
assert mconf["new_sk_set"] == [another_sk]
|
||||
|
||||
env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline)
|
||||
cur_gen += 1
|
||||
|
||||
# Abort should revert the timeline to the previous sk_set and increment the generation.
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["generation"] == cur_gen
|
||||
assert mconf["sk_set"] == [cur_sk]
|
||||
assert mconf["new_sk_set"] is None
|
||||
|
||||
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{cur_gen}:")
|
||||
ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})")
|
||||
|
||||
# After step-8 the final mconf is committed and the migration is not abortable anymore.
|
||||
# So the abort should not abort anything.
|
||||
env.storage_controller.configure_failpoints(("sk-migration-after-step-8", "return(1)"))
|
||||
|
||||
with pytest.raises(StorageControllerApiException, match="failpoint sk-migration-after-step-8"):
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, [another_sk]
|
||||
)
|
||||
cur_gen += 2
|
||||
|
||||
env.storage_controller.configure_failpoints((fp, "off"))
|
||||
|
||||
env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline)
|
||||
|
||||
# The migration is fully committed, no abort should have been performed.
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["generation"] == cur_gen
|
||||
assert mconf["sk_set"] == [another_sk]
|
||||
assert mconf["new_sk_set"] is None
|
||||
|
||||
ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})")
|
||||
ep.clear_buffers()
|
||||
assert ep.safe_psql("SELECT * FROM t") == [(i + 1,) for i in range(cur_gen) if i % 2 == 0]
|
||||
|
||||
@@ -2742,7 +2742,6 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
|
||||
wait_until(unevicted)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Lakebase mode")
|
||||
def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that the timeline disk usage circuit breaker works as expected. We test that:
|
||||
@@ -2762,7 +2761,12 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Create a timeline and endpoint
|
||||
env.create_branch("test_timeline_disk_usage_limit")
|
||||
endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_timeline_disk_usage_limit",
|
||||
config_lines=[
|
||||
"neon.lakebase_mode=true",
|
||||
],
|
||||
)
|
||||
|
||||
# Install the neon extension in the test database. We need it to query perf counter metrics.
|
||||
with closing(endpoint.connect()) as conn:
|
||||
|
||||
Submodule vendor/postgres-v14 (vendored) updated: c9f9fdd011...2155cb165d
Submodule vendor/postgres-v15 (vendored) updated: aaaeff2550...2aaab3bb4a
Submodule vendor/postgres-v16 (vendored) updated: 9b9cb4b3e3...a42351fcd4
Submodule vendor/postgres-v17 (vendored) updated: fa1788475e...1e01fcea2a
vendor/revisions.json (vendored): 8 lines changed
@@ -1,18 +1,18 @@
{
    "v17": [
        "17.5",
        "fa1788475e3146cc9c7c6a1b74f48fd296898fcd"
        "1e01fcea2a6b38180021aa83e0051d95286d9096"
    ],
    "v16": [
        "16.9",
        "9b9cb4b3e33347aea8f61e606bb6569979516de5"
        "a42351fcd41ea01edede1daed65f651e838988fc"
    ],
    "v15": [
        "15.13",
        "aaaeff2550d5deba58847f112af9b98fa3a58b00"
        "2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85"
    ],
    "v14": [
        "14.18",
        "c9f9fdd0113b52c0bd535afdb09d3a543aeee25f"
        "2155cb165d05f617eb2c8ad7e43367189b627703"
    ]
}