diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql index a4b946e8e2..28c868ff72 100644 --- a/compute/etc/sql_exporter/checkpoints_req.17.sql +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -1 +1 @@ -SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; +SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql index eb8427c883..421448c0de 100644 --- a/compute/etc/sql_exporter/checkpoints_req.sql +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -1 +1 @@ -SELECT checkpoints_req FROM pg_stat_bgwriter; +SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql index c50853134c..bfa9b1b3d6 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.sql +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -1 +1 @@ -SELECT checkpoints_timed FROM pg_stat_bgwriter; +SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql index d97d625d4c..3fe638e489 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql @@ -1 +1 @@ -SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled; +SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled; diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql index be02b8a094..9a042547f0 100644 --- a/compute/etc/sql_exporter/compute_current_lsn.sql +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8 + ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql index f2454235b7..2224c02d8d 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -1,7 +1,7 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. 
We only WAL-log the completely built -- snapshot files - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + (SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql index 73a9c11405..17cf1228d3 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql @@ -1,7 +1,7 @@ SELECT - (SELECT current_setting('neon.timeline_id')) AS timeline_id, + (SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; + (SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql index 16da899de2..33ca1137dc 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql @@ -1,9 +1,9 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. 
We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) - FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name + (SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) + FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name ) AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_max_connections.sql b/compute/etc/sql_exporter/compute_max_connections.sql index 99a49483a6..1613c962a2 100644 --- a/compute/etc/sql_exporter/compute_max_connections.sql +++ b/compute/etc/sql_exporter/compute_max_connections.sql @@ -1 +1 @@ -SELECT current_setting('max_connections') as max_connections; +SELECT pg_catalog.current_setting('max_connections') AS max_connections; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql index d2281fdd42..e613939e71 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - age(datfrozenxid) frozen_xid_age -FROM pg_database + pg_catalog.age(datfrozenxid) frozen_xid_age +FROM pg_catalog.pg_database ORDER BY frozen_xid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql index ed57894b3a..7949bacfff 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - mxid_age(datminmxid) min_mxid_age -FROM pg_database + pg_catalog.mxid_age(datminmxid) min_mxid_age +FROM pg_catalog.pg_database ORDER BY min_mxid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql index 318b31ab41..fb96056881 100644 --- a/compute/etc/sql_exporter/compute_receive_lsn.sql +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8 ELSE 0 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql index 50740cb5df..e380a7acc7 100644 --- a/compute/etc/sql_exporter/compute_subscriptions_count.sql +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -1 +1 @@ -SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; +SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql index 6824480fdb..480c4fb439 100644 --- a/compute/etc/sql_exporter/connection_counts.sql +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -1 +1 @@ -SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; +SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git 
a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index fe0360ab5c..59205e6ed3 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1,5 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total -FROM pg_database +SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total +FROM pg_catalog.pg_database -- Ignore invalid databases, as we will likely have problems with -- getting their size from the Pageserver. WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql index 35fa42c34c..02cb2b4649 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -3,6 +3,6 @@ -- minutes. SELECT - x::text as duration_seconds, + x::pg_catalog.text AS duration_seconds, neon.approximate_working_set_size_seconds(x) AS size FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql index 46c7d1610c..aab93d433a 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -3,6 +3,6 @@ SELECT x AS duration, - neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM ( VALUES ('5m'), ('15m'), ('1h') ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql index 378904c1fe..41c11e0adc 100644 --- a/compute/etc/sql_exporter/lfc_cache_size_limit.sql +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -1 +1 @@ -SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; +SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql index 1b1c038501..8964d0d8ff 100644 --- a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -1,3 +1,3 @@ -SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn -FROM pg_replication_slots +SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn +FROM pg_catalog.pg_replication_slots WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql index 2d2355a9a7..d44fdebe38 100644 --- a/compute/etc/sql_exporter/max_cluster_size.sql +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -1 +1 @@ -SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; +SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 12e6c4ae59..1a1e54a7c6 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,13 
+1,13 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. -SELECT pg_database_size(datname) AS db_size, +SELECT pg_catalog.pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, tup_updated AS updated, tup_deleted AS deleted, datname -FROM pg_stat_database +FROM pg_catalog.pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database -- Ignore invalid databases, as we will likely have problems with diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql index 60a6981acd..d3b7aa724b 100644 --- a/compute/etc/sql_exporter/replication_delay_bytes.sql +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -3,4 +3,4 @@ -- replay LSN may have advanced past the receive LSN we are using for the -- calculation. -SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; +SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql index a76809ad74..af4dd3fd90 100644 --- a/compute/etc/sql_exporter/replication_delay_seconds.sql +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -1,5 +1,5 @@ SELECT CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp())) END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql index 3e2aadfc28..ccb3504d58 100644 --- a/compute/etc/sql_exporter/retained_wal.sql +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -1,10 +1,10 @@ SELECT slot_name, - pg_wal_lsn_diff( + pg_catalog.pg_wal_lsn_diff( CASE - WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() - ELSE pg_current_wal_lsn() + WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn() + ELSE pg_catalog.pg_current_wal_lsn() END, - restart_lsn)::FLOAT8 AS retained_wal -FROM pg_replication_slots + restart_lsn)::pg_catalog.FLOAT8 AS retained_wal +FROM pg_catalog.pg_replication_slots WHERE active = false; diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql index 5521270851..5a94cc3373 100644 --- a/compute/etc/sql_exporter/wal_is_lost.sql +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -4,4 +4,4 @@ SELECT WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost -FROM pg_replication_slots; +FROM pg_catalog.pg_replication_slots; diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2b4802f309..f383683ef8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -279,7 +279,7 @@ fn main() -> Result<()> { config, )?; - let exit_code = compute_node.run()?; + let exit_code = compute_node.run().context("running compute node")?; scenario.teardown(); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index e4207876ac..2458fe3c11 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -24,9 +24,9 @@ pub async fn 
check_writability(compute: &ComputeNode) -> Result<()> { }); let query = " - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now();"; + SET updated_at = pg_catalog.now();"; match client.simple_query(query).await { Result::Ok(result) => { diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1033670e2b..c0f0289e06 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -32,8 +32,12 @@ use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; use tokio::{spawn, sync::watch, task::JoinHandle, time}; +use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration, +}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; @@ -192,6 +196,7 @@ pub struct ComputeState { pub startup_span: Option, pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_prewarm_token: CancellationToken, pub lfc_offload_state: LfcOffloadState, /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if @@ -217,6 +222,7 @@ impl ComputeState { lfc_offload_state: LfcOffloadState::default(), terminate_flush_lsn: None, promote_state: None, + lfc_prewarm_token: CancellationToken::new(), } } @@ -583,7 +589,7 @@ impl ComputeNode { // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. - const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none"; + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none"; let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute @@ -1554,6 +1560,41 @@ impl ComputeNode { Ok(lsn) } + fn sync_safekeepers_with_retries(&self, storage_auth_token: Option) -> Result { + let max_retries = 5; + let mut attempts = 0; + loop { + let result = self.sync_safekeepers(storage_auth_token.clone()); + match &result { + Ok(_) => { + if attempts > 0 { + tracing::info!("sync_safekeepers succeeded after {attempts} retries"); + } + return result; + } + Err(e) if attempts < max_retries => { + tracing::info!( + "sync_safekeepers failed, will retry (attempt {attempts}): {e:#}" + ); + } + Err(err) => { + tracing::warn!( + "sync_safekeepers still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + let backoff = exponential_backoff_duration( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + std::thread::sleep(backoff); + attempts += 1; + } + } + /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. #[instrument(skip_all)] @@ -1589,7 +1630,7 @@ impl ComputeNode { lsn } else { info!("starting safekeepers syncing"); - self.sync_safekeepers(pspec.storage_auth_token.clone()) + self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")? 
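The new `sync_safekeepers_with_retries` wraps `sync_safekeepers` in a bounded retry loop and sleeps with the capped exponential backoff from `utils::backoff` between attempts. A minimal self-contained sketch of the same pattern follows; the constants and the `backoff_duration` formula are stand-ins for `DEFAULT_BASE_BACKOFF_SECONDS`, `DEFAULT_MAX_BACKOFF_SECONDS`, and `exponential_backoff_duration`, whose exact values and signature live in the `utils` crate.

```rust
use std::time::Duration;

// Stand-ins for utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
// the real constants live in the `utils` crate.
const BASE_BACKOFF_SECONDS: f64 = 0.1;
const MAX_BACKOFF_SECONDS: f64 = 3.0;

/// Capped exponential backoff: base * 2^attempt, clamped to `max` seconds.
fn backoff_duration(attempt: u32, base: f64, max: f64) -> Duration {
    Duration::from_secs_f64((base * 2f64.powi(attempt as i32)).min(max))
}

/// Retry a fallible blocking operation up to `max_retries` additional times,
/// sleeping between attempts -- the same shape as sync_safekeepers_with_retries.
fn retry_with_backoff<T, E: std::fmt::Display>(
    max_retries: u32,
    mut op: impl FnMut() -> Result<T, E>,
) -> Result<T, E> {
    let mut attempt = 0;
    loop {
        match op() {
            Ok(v) => return Ok(v),
            Err(e) if attempt < max_retries => {
                eprintln!("attempt {attempt} failed, will retry: {e}");
                std::thread::sleep(backoff_duration(attempt, BASE_BACKOFF_SECONDS, MAX_BACKOFF_SECONDS));
                attempt += 1;
            }
            Err(e) => return Err(e),
        }
    }
}

fn main() {
    let mut calls = 0;
    // Fails twice, then succeeds -- a transient-error stand-in for sync_safekeepers.
    let result = retry_with_backoff(5, || {
        calls += 1;
        if calls < 3 { Err("safekeeper sync failed") } else { Ok(calls) }
    });
    println!("succeeded after {calls} calls: {result:?}");
}
```

The blocking `std::thread::sleep` matches the call site, which appears to run on the synchronous startup path rather than inside the async runtime.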
}; info!("safekeepers synced at LSN {}", lsn); @@ -1884,7 +1925,7 @@ impl ComputeNode { // It doesn't matter what were the options before, here we just want // to connect and create a new superuser role. - const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0"; zenith_admin_conf.options(ZENITH_OPTIONS); let mut client = @@ -2339,13 +2380,13 @@ impl ComputeNode { let result = client .simple_query( "SELECT - row_to_json(pg_stat_statements) + pg_catalog.row_to_json(pss) FROM - pg_stat_statements + public.pg_stat_statements pss WHERE - userid != 'cloud_admin'::regrole::oid + pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid ORDER BY - (mean_exec_time + mean_plan_time) DESC + (pss.mean_exec_time + pss.mean_plan_time) DESC LIMIT 100", ) .await; @@ -2473,11 +2514,11 @@ LIMIT 100", // check the role grants first - to gracefully handle read-replicas. let select = "SELECT privilege_type - FROM pg_namespace - JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true - JOIN pg_user users ON acl.grantee = users.usesysid - WHERE users.usename = $1 - AND nspname = $2"; + FROM pg_catalog.pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true + JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid + WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name + AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name"; let rows = db_client .query(select, &[role_name, schema_name]) .await @@ -2546,8 +2587,9 @@ LIMIT 100", .await .with_context(|| format!("Failed to execute query: {query}"))?; } else { - let query = - format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + let query = format!( + "CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}" + ); db_client .simple_query(&query) .await @@ -2738,7 +2780,7 @@ LIMIT 100", // 4. We start again and try to prewarm with the state from 2. instead of the previous complete state if matches!( prewarm_state, - LfcPrewarmState::Completed + LfcPrewarmState::Completed { .. 
} | LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Skipped ) { diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 97e62c1c80..c7051d35d0 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -7,18 +7,11 @@ use http::StatusCode; use reqwest::Client; use std::mem::replace; use std::sync::Arc; -use tokio::{io::AsyncReadExt, spawn}; +use std::time::Instant; +use tokio::{io::AsyncReadExt, select, spawn}; +use tokio_util::sync::CancellationToken; use tracing::{error, info}; -#[derive(serde::Serialize, Default)] -pub struct LfcPrewarmStateWithProgress { - #[serde(flatten)] - base: LfcPrewarmState, - total: i32, - prewarmed: i32, - skipped: i32, -} - /// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks struct EndpointStoragePair { url: String, @@ -27,7 +20,7 @@ struct EndpointStoragePair { const KEY: &str = "lfc_state"; impl EndpointStoragePair { - /// endpoint_id is set to None while prewarming from other endpoint, see replica promotion + /// endpoint_id is set to None while prewarming from other endpoint, see compute_promote.rs /// If not None, takes precedence over pspec.spec.endpoint_id fn from_spec_and_endpoint( pspec: &crate::compute::ParsedSpec, @@ -53,36 +46,8 @@ impl EndpointStoragePair { } impl ComputeNode { - // If prewarm failed, we want to get overall number of segments as well as done ones. - // However, this function should be reliable even if querying postgres failed. - pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress { - info!("requesting LFC prewarm state from postgres"); - let mut state = LfcPrewarmStateWithProgress::default(); - { - state.base = self.state.lock().unwrap().lfc_prewarm_state.clone(); - } - - let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await { - Ok(client) => client, - Err(err) => { - error!(%err, "connecting to postgres"); - return state; - } - }; - let row = match client - .query_one("select * from neon.get_prewarm_info()", &[]) - .await - { - Ok(row) => row, - Err(err) => { - error!(%err, "querying LFC prewarm status"); - return state; - } - }; - state.total = row.try_get(0).unwrap_or_default(); - state.prewarmed = row.try_get(1).unwrap_or_default(); - state.skipped = row.try_get(2).unwrap_or_default(); - state + pub async fn lfc_prewarm_state(&self) -> LfcPrewarmState { + self.state.lock().unwrap().lfc_prewarm_state.clone() } pub fn lfc_offload_state(&self) -> LfcOffloadState { @@ -92,34 +57,35 @@ impl ComputeNode { /// If there is a prewarm request ongoing, return `false`, `true` otherwise. 
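The rewritten `prewarm_lfc`/`prewarm_impl` below clone a `CancellationToken` out of `ComputeState` while holding the state lock and then race every long step (HTTP download, decompression, `neon.prewarm_local_cache`) against `token.cancelled()` with `select!`; the new `DELETE /lfc/prewarm` route triggers the cancel. A reduced sketch of that shape, with a sleep standing in for the real work and the struct and function names invented for illustration:

```rust
use std::sync::{Arc, Mutex};
use std::time::Duration;
use tokio::select;
use tokio_util::sync::CancellationToken;

#[derive(Default)]
struct State {
    // One token per prewarm "session"; cancel() aborts the in-flight one.
    prewarm_token: CancellationToken,
}

async fn prewarm(state: Arc<Mutex<State>>) -> &'static str {
    // Clone the token while holding the lock; the guard is dropped before any await.
    let token = state.lock().unwrap().prewarm_token.clone();

    // Stand-in for the real work (download + decompress + neon.prewarm_local_cache).
    select! {
        _ = token.cancelled() => "cancelled",
        _ = tokio::time::sleep(Duration::from_secs(5)) => "completed",
    }
}

#[tokio::main]
async fn main() {
    let state = Arc::new(Mutex::new(State::default()));
    let task = tokio::spawn(prewarm(state.clone()));

    // Give the spawned task a moment to pick up its token (demo only).
    tokio::time::sleep(Duration::from_millis(50)).await;

    // Corresponds to `DELETE /lfc/prewarm`: cancel the current token and
    // install a fresh one so later prewarm requests start clean.
    {
        let mut guard = state.lock().unwrap();
        guard.prewarm_token.cancel();
        guard.prewarm_token = CancellationToken::new();
    }

    println!("prewarm {}", task.await.unwrap());
}
```

Cloning the token up front keeps the std `Mutex` guard from being held across an `.await`, which is the same reason the real code scopes its lock so tightly.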
/// Has a failpoint "compute-prewarm" pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { + let token: CancellationToken; { - let state = &mut self.state.lock().unwrap().lfc_prewarm_state; - if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) { + let state = &mut self.state.lock().unwrap(); + token = state.lfc_prewarm_token.clone(); + if let LfcPrewarmState::Prewarming = + replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming) + { return false; } } crate::metrics::LFC_PREWARMS.inc(); - let cloned = self.clone(); + let this = self.clone(); spawn(async move { - let state = match cloned.prewarm_impl(from_endpoint).await { - Ok(true) => LfcPrewarmState::Completed, - Ok(false) => { - info!( - "skipping LFC prewarm because LFC state is not found in endpoint storage" - ); - LfcPrewarmState::Skipped - } + let prewarm_state = match this.prewarm_impl(from_endpoint, token).await { + Ok(state) => state, Err(err) => { crate::metrics::LFC_PREWARM_ERRORS.inc(); error!(%err, "could not prewarm LFC"); - LfcPrewarmState::Failed { - error: format!("{err:#}"), - } + let error = format!("{err:#}"); + LfcPrewarmState::Failed { error } } }; - cloned.state.lock().unwrap().lfc_prewarm_state = state; + let state = &mut this.state.lock().unwrap(); + if let LfcPrewarmState::Cancelled = prewarm_state { + state.lfc_prewarm_token = CancellationToken::new(); + } + state.lfc_prewarm_state = prewarm_state; }); true } @@ -131,55 +97,101 @@ impl ComputeNode { } /// Request LFC state from endpoint storage and load corresponding pages into Postgres. - /// Returns a result with `false` if the LFC state is not found in endpoint storage. - async fn prewarm_impl(&self, from_endpoint: Option) -> Result { - let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; + async fn prewarm_impl( + &self, + from_endpoint: Option, + token: CancellationToken, + ) -> Result { + let EndpointStoragePair { + url, + token: storage_token, + } = self.endpoint_storage_pair(from_endpoint)?; #[cfg(feature = "testing")] - fail::fail_point!("compute-prewarm", |_| { - bail!("prewarm configured to fail because of a failpoint") - }); + fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint")); info!(%url, "requesting LFC state from endpoint storage"); - let request = Client::new().get(&url).bearer_auth(token); - let res = request.send().await.context("querying endpoint storage")?; - match res.status() { + let mut now = Instant::now(); + let request = Client::new().get(&url).bearer_auth(storage_token); + let response = select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + response = request.send() => response + } + .context("querying endpoint storage")?; + + match response.status() { StatusCode::OK => (), - StatusCode::NOT_FOUND => { - return Ok(false); - } + StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped), status => bail!("{status} querying endpoint storage"), } + let state_download_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); let mut uncompressed = Vec::new(); - let lfc_state = res - .bytes() - .await - .context("getting request body from endpoint storage")?; - ZstdDecoder::new(lfc_state.iter().as_slice()) - .read_to_end(&mut uncompressed) - .await - .context("decoding LFC state")?; + let lfc_state = select! 
{ + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + lfc_state = response.bytes() => lfc_state + } + .context("getting request body from endpoint storage")?; + + let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice()); + select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + read = decoder.read_to_end(&mut uncompressed) => read + } + .context("decoding LFC state")?; + let uncompress_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}"); - info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres"); - - ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + // Client connection and prewarm info querying are fast and therefore don't need + // cancellation + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await - .context("connecting to postgres")? - .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) - .await - .context("loading LFC state into postgres") - .map(|_| ())?; + .context("connecting to postgres")?; + let pg_token = client.cancel_token(); - Ok(true) + let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed]; + select! { + res = client.query_one("select neon.prewarm_local_cache($1)", ¶ms) => res, + _ = token.cancelled() => { + pg_token.cancel_query(postgres::NoTls).await + .context("cancelling neon.prewarm_local_cache()")?; + return Ok(LfcPrewarmState::Cancelled) + } + } + .context("loading LFC state into postgres") + .map(|_| ())?; + let prewarm_time_ms = now.elapsed().as_millis() as u32; + + let row = client + .query_one("select * from neon.get_prewarm_info()", &[]) + .await + .context("querying prewarm info")?; + let total = row.try_get(0).unwrap_or_default(); + let prewarmed = row.try_get(1).unwrap_or_default(); + let skipped = row.try_get(2).unwrap_or_default(); + + Ok(LfcPrewarmState::Completed { + total, + prewarmed, + skipped, + state_download_time_ms, + uncompress_time_ms, + prewarm_time_ms, + }) } /// If offload request is ongoing, return false, true otherwise pub fn offload_lfc(self: &Arc) -> bool { { let state = &mut self.state.lock().unwrap().lfc_offload_state; - if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading { + if matches!( + replace(state, LfcOffloadState::Offloading), + LfcOffloadState::Offloading + ) { return false; } } @@ -191,7 +203,10 @@ impl ComputeNode { pub async fn offload_lfc_async(self: &Arc) { { let state = &mut self.state.lock().unwrap().lfc_offload_state; - if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading { + if matches!( + replace(state, LfcOffloadState::Offloading), + LfcOffloadState::Offloading + ) { return; } } @@ -200,23 +215,23 @@ impl ComputeNode { async fn offload_lfc_with_state_update(&self) { crate::metrics::LFC_OFFLOADS.inc(); - - let Err(err) = self.offload_lfc_impl().await else { - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; - return; - }; - - crate::metrics::LFC_OFFLOAD_ERRORS.inc(); - error!(%err, "could not offload LFC state to endpoint storage"); - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { - error: format!("{err:#}"), + let state = match self.offload_lfc_impl().await { + Ok(state) => state, + Err(err) => { + crate::metrics::LFC_OFFLOAD_ERRORS.inc(); + error!(%err, "could not offload LFC"); + let error = format!("{err:#}"); + 
LfcOffloadState::Failed { error } + } }; + self.state.lock().unwrap().lfc_offload_state = state; } - async fn offload_lfc_impl(&self) -> Result<()> { + async fn offload_lfc_impl(&self) -> Result { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from Postgres"); + let mut now = Instant::now(); let row = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")? @@ -228,26 +243,41 @@ impl ComputeNode { .context("deserializing LFC state")?; let Some(state) = state else { info!(%url, "empty LFC state, not exporting"); - return Ok(()); + return Ok(LfcOffloadState::Skipped); }; + let state_query_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); let mut compressed = Vec::new(); ZstdEncoder::new(state) .read_to_end(&mut compressed) .await .context("compressing LFC state")?; + let compress_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); let compressed_len = compressed.len(); - info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); + info!(%url, "downloaded LFC state, compressed size {compressed_len}"); let request = Client::new().put(url).bearer_auth(token).body(compressed); - match request.send().await { - Ok(res) if res.status() == StatusCode::OK => Ok(()), - Ok(res) => bail!( - "Request to endpoint storage failed with status: {}", - res.status() - ), - Err(err) => Err(err).context("writing to endpoint storage"), + let response = request + .send() + .await + .context("writing to endpoint storage")?; + let state_upload_time_ms = now.elapsed().as_millis() as u32; + let status = response.status(); + if status != StatusCode::OK { + bail!("request to endpoint storage failed: {status}"); } + + Ok(LfcOffloadState::Completed { + compress_time_ms, + state_query_time_ms, + state_upload_time_ms, + }) + } + + pub fn cancel_prewarm(self: &Arc) { + self.state.lock().unwrap().lfc_prewarm_token.cancel(); } } diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs index a34368c531..15b5fcfb46 100644 --- a/compute_tools/src/compute_promote.rs +++ b/compute_tools/src/compute_promote.rs @@ -1,32 +1,24 @@ use crate::compute::ComputeNode; -use anyhow::{Context, Result, bail}; +use anyhow::{Context, bail}; use compute_api::responses::{LfcPrewarmState, PromoteConfig, PromoteState}; -use compute_api::spec::ComputeMode; -use itertools::Itertools; -use std::collections::HashMap; -use std::{sync::Arc, time::Duration}; -use tokio::time::sleep; +use std::time::Instant; use tracing::info; -use utils::lsn::Lsn; impl ComputeNode { - /// Returns only when promote fails or succeeds. If a network error occurs - /// and http client disconnects, this does not stop promotion, and subsequent - /// calls block until promote finishes. + /// Returns only when promote fails or succeeds. If http client calling this function + /// disconnects, this does not stop promotion, and subsequent calls block until promote finishes. 
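`promote()` below turns promotion into a single-flight operation: the first caller stores a `watch::Receiver` in `ComputeState` and spawns the real work exactly once, and every caller (including ones that retry after a dropped HTTP request) awaits the same channel. A hedged sketch of that shape, with a `String` result standing in for `PromoteState` and the shared slot simplified to an `Arc<Mutex<Option<...>>>`:

```rust
use std::sync::{Arc, Mutex};
use tokio::sync::watch;

type Shared = Arc<Mutex<Option<watch::Receiver<String>>>>;

async fn promote(shared: Shared) -> String {
    let mut rx = {
        let mut slot = shared.lock().unwrap();
        slot.get_or_insert_with(|| {
            // First caller: spawn the real work exactly once.
            let (tx, rx) = watch::channel("not_promoted".to_string());
            tokio::spawn(async move {
                // Stand-in for promote_impl().
                tokio::time::sleep(std::time::Duration::from_millis(200)).await;
                let _ = tx.send("completed".to_string());
            });
            rx
        })
        .clone()
    }; // lock released before awaiting

    if rx.changed().await.is_err() {
        return "failed: promote sender dropped".to_string();
    }
    rx.borrow().clone()
}

#[tokio::main]
async fn main() {
    let shared: Shared = Arc::new(Mutex::new(None));
    // Two "HTTP handlers" racing: both observe the same single promotion.
    let (a, b) = tokio::join!(promote(shared.clone()), promote(shared.clone()));
    println!("{a} / {b}");
}
```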
/// Called by control plane on secondary after primary endpoint is terminated /// Has a failpoint "compute-promotion" - pub async fn promote(self: &Arc, cfg: PromoteConfig) -> PromoteState { - let cloned = self.clone(); - let promote_fn = async move || { - let Err(err) = cloned.promote_impl(cfg).await else { - return PromoteState::Completed; - }; - tracing::error!(%err, "promoting"); - PromoteState::Failed { - error: format!("{err:#}"), + pub async fn promote(self: &std::sync::Arc, cfg: PromoteConfig) -> PromoteState { + let this = self.clone(); + let promote_fn = async move || match this.promote_impl(cfg).await { + Ok(state) => state, + Err(err) => { + tracing::error!(%err, "promoting replica"); + let error = format!("{err:#}"); + PromoteState::Failed { error } } }; - let start_promotion = || { let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted); tokio::spawn(async move { tx.send(promote_fn().await) }); @@ -34,36 +26,31 @@ impl ComputeNode { }; let mut task; - // self.state is unlocked after block ends so we lock it in promote_impl - // and task.changed() is reached + // promote_impl locks self.state so we need to unlock it before calling task.changed() { - task = self - .state - .lock() - .unwrap() - .promote_state - .get_or_insert_with(start_promotion) - .clone() + let promote_state = &mut self.state.lock().unwrap().promote_state; + task = promote_state.get_or_insert_with(start_promotion).clone() + } + if task.changed().await.is_err() { + let error = "promote sender dropped".to_string(); + return PromoteState::Failed { error }; } - task.changed().await.expect("promote sender dropped"); task.borrow().clone() } - async fn promote_impl(&self, mut cfg: PromoteConfig) -> Result<()> { + async fn promote_impl(&self, cfg: PromoteConfig) -> anyhow::Result { { let state = self.state.lock().unwrap(); let mode = &state.pspec.as_ref().unwrap().spec.mode; - if *mode != ComputeMode::Replica { - bail!("{} is not replica", mode.to_type_str()); + if *mode != compute_api::spec::ComputeMode::Replica { + bail!("compute mode \"{}\" is not replica", mode.to_type_str()); } - - // we don't need to query Postgres so not self.lfc_prewarm_state() match &state.lfc_prewarm_state { - LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => { - bail!("prewarm not requested or pending") + status @ (LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming) => { + bail!("compute {status}") } LfcPrewarmState::Failed { error } => { - tracing::warn!(%error, "replica prewarm failed") + tracing::warn!(%error, "compute prewarm failed") } _ => {} } @@ -72,26 +59,29 @@ impl ComputeNode { let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")?; + let mut now = Instant::now(); let primary_lsn = cfg.wal_flush_lsn; - let mut last_wal_replay_lsn: Lsn = Lsn::INVALID; + let mut standby_lsn = utils::lsn::Lsn::INVALID; const RETRIES: i32 = 20; for i in 0..=RETRIES { let row = client - .query_one("SELECT pg_last_wal_replay_lsn()", &[]) + .query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[]) .await .context("getting last replay lsn")?; let lsn: u64 = row.get::(0).into(); - last_wal_replay_lsn = lsn.into(); - if last_wal_replay_lsn >= primary_lsn { + standby_lsn = lsn.into(); + if standby_lsn >= primary_lsn { break; } - info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}"); - sleep(Duration::from_secs(1)).await; + info!(%standby_lsn, %primary_lsn, "catching up, try {i}"); + 
tokio::time::sleep(std::time::Duration::from_secs(1)).await; } - if last_wal_replay_lsn < primary_lsn { + if standby_lsn < primary_lsn { bail!("didn't catch up with primary in {RETRIES} retries"); } + let lsn_wait_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); // using $1 doesn't work with ALTER SYSTEM SET let safekeepers_sql = format!( @@ -103,26 +93,32 @@ impl ComputeNode { .await .context("setting safekeepers")?; client - .query("SELECT pg_reload_conf()", &[]) + .query( + "ALTER SYSTEM SET synchronous_standby_names=walproposer", + &[], + ) + .await + .context("setting synchronous_standby_names")?; + client + .query("SELECT pg_catalog.pg_reload_conf()", &[]) .await .context("reloading postgres config")?; #[cfg(feature = "testing")] - fail::fail_point!("compute-promotion", |_| { - bail!("promotion configured to fail because of a failpoint") - }); + fail::fail_point!("compute-promotion", |_| bail!( + "compute-promotion failpoint" + )); let row = client - .query_one("SELECT * FROM pg_promote()", &[]) + .query_one("SELECT * FROM pg_catalog.pg_promote()", &[]) .await .context("pg_promote")?; if !row.get::(0) { - bail!("pg_promote() returned false"); + bail!("pg_promote() failed"); } + let pg_promote_time_ms = now.elapsed().as_millis() as u32; + let now = Instant::now(); - let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) - .await - .context("connecting to postgres")?; let row = client .query_one("SHOW transaction_read_only", &[]) .await @@ -131,36 +127,47 @@ impl ComputeNode { bail!("replica in read only mode after promotion"); } + // Already checked validity in http handler + #[allow(unused_mut)] + let mut new_pspec = crate::compute::ParsedSpec::try_from(cfg.spec).expect("invalid spec"); { let mut state = self.state.lock().unwrap(); - let spec = &mut state.pspec.as_mut().unwrap().spec; - spec.mode = ComputeMode::Primary; - let new_conf = cfg.spec.cluster.postgresql_conf.as_mut().unwrap(); - let existing_conf = spec.cluster.postgresql_conf.as_ref().unwrap(); - Self::merge_spec(new_conf, existing_conf); + + // Local setup has different ports for pg process (port=) for primary and secondary. + // Primary is stopped so we need secondary's "port" value + #[cfg(feature = "testing")] + { + let old_spec = &state.pspec.as_ref().unwrap().spec; + let Some(old_conf) = old_spec.cluster.postgresql_conf.as_ref() else { + bail!("pspec.spec.cluster.postgresql_conf missing for endpoint"); + }; + let set: std::collections::HashMap<&str, &str> = old_conf + .split_terminator('\n') + .map(|e| e.split_once("=").expect("invalid item")) + .collect(); + + let Some(new_conf) = new_pspec.spec.cluster.postgresql_conf.as_mut() else { + bail!("pspec.spec.cluster.postgresql_conf missing for supplied config"); + }; + new_conf.push_str(&format!("port={}\n", set["port"])); + } + + tracing::debug!("applied spec: {:#?}", new_pspec.spec); + if self.params.lakebase_mode { + ComputeNode::set_spec(&self.params, &mut state, new_pspec); + } else { + state.pspec = Some(new_pspec); + } } + info!("applied new spec, reconfiguring as primary"); - self.reconfigure() - } + self.reconfigure()?; + let reconfigure_time_ms = now.elapsed().as_millis() as u32; - /// Merge old and new Postgres conf specs to apply on secondary. 
- /// Change new spec's port and safekeepers since they are supplied - /// differenly - fn merge_spec(new_conf: &mut String, existing_conf: &str) { - let mut new_conf_set: HashMap<&str, &str> = new_conf - .split_terminator('\n') - .map(|e| e.split_once("=").expect("invalid item")) - .collect(); - new_conf_set.remove("neon.safekeepers"); - - let existing_conf_set: HashMap<&str, &str> = existing_conf - .split_terminator('\n') - .map(|e| e.split_once("=").expect("invalid item")) - .collect(); - new_conf_set.insert("port", existing_conf_set["port"]); - *new_conf = new_conf_set - .iter() - .map(|(k, v)| format!("{k}={v}")) - .join("\n"); + Ok(PromoteState::Completed { + lsn_wait_time_ms, + pg_promote_time_ms, + reconfigure_time_ms, + }) } } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e7dde5c5f5..56602ab620 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -65,14 +65,19 @@ pub fn write_postgres_conf( writeln!(file, "{conf}")?; } - // Stripe size GUC should be defined prior to connection string - if let Some(stripe_size) = spec.shard_stripe_size { - writeln!(file, "neon.stripe_size={stripe_size}")?; - } // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; writeln!(file)?; if let Some(conninfo) = &spec.pageserver_connection_info { + // Stripe size GUC should be defined prior to connection string + if let Some(stripe_size) = conninfo.stripe_size { + writeln!( + file, + "# from compute spec's pageserver_connection_info.stripe_size field" + )?; + writeln!(file, "neon.stripe_size={stripe_size}")?; + } + let mut libpq_urls: Option> = Some(Vec::new()); let num_shards = if conninfo.shard_count.0 == 0 { 1 // unsharded, treat it as a single shard @@ -110,7 +115,7 @@ pub fn write_postgres_conf( if let Some(libpq_urls) = libpq_urls { writeln!( file, - "# derived from compute spec's pageserver_conninfo field" + "# derived from compute spec's pageserver_connection_info field" )?; writeln!( file, @@ -120,24 +125,16 @@ pub fn write_postgres_conf( } else { writeln!(file, "# no neon.pageserver_connstring")?; } - - if let Some(stripe_size) = conninfo.stripe_size { - writeln!( - file, - "# from compute spec's pageserver_conninfo.stripe_size field" - )?; - writeln!(file, "neon.stripe_size={stripe_size}")?; - } } else { - if let Some(s) = &spec.pageserver_connstring { - writeln!(file, "# from compute spec's pageserver_connstring field")?; - writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; - } - + // Stripe size GUC should be defined prior to connection string if let Some(stripe_size) = spec.shard_stripe_size { writeln!(file, "# from compute spec's shard_stripe_size field")?; writeln!(file, "neon.stripe_size={stripe_size}")?; } + if let Some(s) = &spec.pageserver_connstring { + writeln!(file, "# from compute spec's pageserver_connstring field")?; + writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; + } } if !spec.safekeeper_connstrings.is_empty() { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index ab729d62b5..82e61acfdc 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -139,6 +139,15 @@ paths: application/json: schema: $ref: "#/components/schemas/LfcPrewarmState" + delete: + tags: + - Prewarm + summary: Cancel ongoing LFC prewarm + description: "" + operationId: cancelLfcPrewarm + responses: + 202: + description: Prewarm cancelled /lfc/offload: post: @@ -608,9 +617,6 @@ 
components: type: object required: - status - - total - - prewarmed - - skipped properties: status: description: LFC prewarm status @@ -628,6 +634,15 @@ components: skipped: description: Pages processed but not prewarmed type: integer + state_download_time_ms: + description: Time it takes to download LFC state to compute + type: integer + uncompress_time_ms: + description: Time it takes to uncompress LFC state + type: integer + prewarm_time_ms: + description: Time it takes to prewarm LFC state in Postgres + type: integer LfcOffloadState: type: object @@ -636,11 +651,21 @@ components: properties: status: description: LFC offload status - enum: [not_offloaded, offloading, completed, failed] + enum: [not_offloaded, offloading, completed, skipped, failed] type: string error: description: LFC offload error, if any type: string + state_query_time_ms: + description: Time it takes to get LFC state from Postgres + type: integer + compress_time_ms: + description: Time it takes to compress LFC state + type: integer + state_upload_time_ms: + description: Time it takes to upload LFC state to endpoint storage + type: integer + PromoteState: type: object @@ -654,6 +679,15 @@ components: error: description: Promote error, if any type: string + lsn_wait_time_ms: + description: Time it takes for secondary to catch up with primary WAL flush LSN + type: integer + pg_promote_time_ms: + description: Time it takes to call pg_promote on secondary + type: integer + reconfigure_time_ms: + description: Time it takes to reconfigure promoted secondary + type: integer SetRoleGrantsRequest: type: object diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs index e98bd781a2..6ad216778e 100644 --- a/compute_tools/src/http/routes/lfc.rs +++ b/compute_tools/src/http/routes/lfc.rs @@ -1,12 +1,11 @@ -use crate::compute_prewarm::LfcPrewarmStateWithProgress; use crate::http::JsonResponse; use axum::response::{IntoResponse, Response}; use axum::{Json, http::StatusCode}; use axum_extra::extract::OptionalQuery; -use compute_api::responses::LfcOffloadState; +use compute_api::responses::{LfcOffloadState, LfcPrewarmState}; type Compute = axum::extract::State>; -pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { +pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { Json(compute.lfc_prewarm_state().await) } @@ -46,3 +45,8 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response { ) } } + +pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode { + compute.cancel_prewarm(); + StatusCode::ACCEPTED +} diff --git a/compute_tools/src/http/routes/promote.rs b/compute_tools/src/http/routes/promote.rs index 7ca3464b63..865de0da91 100644 --- a/compute_tools/src/http/routes/promote.rs +++ b/compute_tools/src/http/routes/promote.rs @@ -1,11 +1,22 @@ use crate::http::JsonResponse; use axum::extract::Json; +use compute_api::responses::PromoteConfig; use http::StatusCode; pub(in crate::http) async fn promote( compute: axum::extract::State>, - Json(cfg): Json, + Json(cfg): Json, ) -> axum::response::Response { + // Return early at the cost of extra parsing spec + let pspec = match crate::compute::ParsedSpec::try_from(cfg.spec) { + Ok(p) => p, + Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), + }; + + let cfg = PromoteConfig { + spec: pspec.spec, + wal_flush_lsn: cfg.wal_flush_lsn, + }; let state = compute.promote(cfg).await; if let compute_api::responses::PromoteState::Failed { error: _ } = state { return 
JsonResponse::create_response(StatusCode::INTERNAL_SERVER_ERROR, state); diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 2fd3121f4f..869fdef11d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -99,7 +99,12 @@ impl From<&Server> for Router> { ); let authenticated_router = Router::>::new() - .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route( + "/lfc/prewarm", + get(lfc::prewarm_state) + .post(lfc::prewarm) + .delete(lfc::cancel_prewarm), + ) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/promote", post(promote::promote)) .route("/check_writability", post(check_writability::is_writable)) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 5f60b711c8..a9ddef58e5 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result, PostgresError> { .query( "SELECT datname FROM pg_catalog.pg_database WHERE datallowconn - AND datconnlimit <> - 2 + AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4) LIMIT 500", &[], ) @@ -67,7 +67,7 @@ pub async fn get_installed_extensions( let extensions: Vec<(String, String, i32)> = client .query( - "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", + "SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension", &[], ) .await? diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 88d870df97..c8f911b5a6 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> { self.client .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") .await?; - self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?; self.client .simple_query( "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql index 367356e6eb..6e28e8d32c 100644 --- a/compute_tools/src/migrations/0002-alter_roles.sql +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -15,17 +15,17 @@ DO $$ DECLARE role_name text; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member') + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name); END LOOP; - FOR role_name IN SELECT rolname FROM pg_roles + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE - NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_') + NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', 
quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql index adf159dc06..d67d6457c6 100644 --- a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql +++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}'; END IF; END $$; diff --git a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql index 47129d65b8..7f74d4ee28 100644 --- a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql +++ b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql @@ -5,9 +5,9 @@ DO $$ DECLARE role_name TEXT; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index 84fcb36391..714bdc735a 100644 --- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}'; EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}'; END IF; diff --git a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql index 0c81cef1c4..b5b209ef5e 100644 --- a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql @@ -2,7 +2,7 @@ DO $$ DECLARE bypassrls boolean; BEGIN - SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser'; + SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser'; IF NOT bypassrls THEN RAISE EXCEPTION 'neon_superuser cannot bypass RLS'; END IF; 
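The migration and exporter changes above follow the same rule as the `-c search_path=''` switch in `compute.rs`: never depend on the session's search_path, and qualify system objects with `pg_catalog` and user objects with their schema. A small sketch of what that looks like from the Rust side, mirroring the `checker.rs` health-check query; the connection parameters are placeholders and the `public.health_check` table is assumed to exist already:

```rust
use tokio_postgres::NoTls;

#[tokio::main]
async fn main() -> Result<(), tokio_postgres::Error> {
    // Placeholder connection parameters.
    let (client, connection) =
        tokio_postgres::connect("host=localhost user=cloud_admin dbname=postgres", NoTls).await?;
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            eprintln!("connection error: {e}");
        }
    });

    // Same effect as the new `-c search_path=''` connection option.
    client.simple_query("SET search_path = '';").await?;

    // With an empty search_path the unqualified name `health_check` would not
    // resolve at all, so both the user table and the catalog function are
    // written out in full.
    client
        .simple_query(
            "INSERT INTO public.health_check VALUES (1, pg_catalog.now())
             ON CONFLICT (id) DO UPDATE SET updated_at = pg_catalog.now();",
        )
        .await?;
    Ok(())
}
```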
diff --git a/compute_tools/src/migrations/tests/0002-alter_roles.sql b/compute_tools/src/migrations/tests/0002-alter_roles.sql index 433f7b34f7..1755c9088c 100644 --- a/compute_tools/src/migrations/tests/0002-alter_roles.sql +++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql @@ -4,8 +4,8 @@ DECLARE BEGIN FOR role IN SELECT rolname AS name, rolinherit AS inherit - FROM pg_roles - WHERE pg_has_role(rolname, 'neon_superuser', 'member') + FROM pg_catalog.pg_roles + WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') LOOP IF NOT role.inherit THEN RAISE EXCEPTION '% cannot inherit', quote_ident(role.name); @@ -14,12 +14,12 @@ BEGIN FOR role IN SELECT rolname AS name, rolbypassrls AS bypassrls - FROM pg_roles - WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member') - AND NOT starts_with(rolname, 'pg_') + FROM pg_catalog.pg_roles + WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') + AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP IF role.bypassrls THEN - RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name); + RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name); END IF; END LOOP; END $$; diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql index b164d61295..498770f4fa 100644 --- a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,10 +1,10 @@ DO $$ BEGIN - IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN + IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN RETURN; END IF; - IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN + IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription'; END IF; END $$; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql index 3464a2b1cf..ec04cfe199 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql @@ -2,12 +2,12 @@ DO $$ DECLARE monitor record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, admin_option AS admin INTO monitor - FROM pg_auth_members - WHERE roleid = 'pg_monitor'::regrole - AND member = 'neon_superuser'::regrole; + FROM pg_catalog.pg_auth_members + WHERE roleid = 'pg_monitor'::pg_catalog.regrole + AND member = 'neon_superuser'::pg_catalog.regrole; IF monitor IS NULL THEN RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor'; diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index af7f50e95d..f3b28d76c9 100644 --- a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -2,11 +2,11 
@@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute')) + SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')) INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') - AND pronamespace = 'pg_catalog'::regnamespace; + AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace; IF NOT can_execute THEN RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; END IF; diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql index e55dcdc3b6..197211300b 100644 --- a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql @@ -2,9 +2,9 @@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT has_function_privilege('neon_superuser', oid, 'execute') + SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute') INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname = 'pg_show_replication_origin_status' AND pronamespace = 'pg_catalog'::regnamespace; IF NOT can_execute THEN diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql index e62b742d30..0f772d67bd 100644 --- a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql @@ -2,10 +2,10 @@ DO $$ DECLARE signal_backend record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, admin_option AS admin INTO signal_backend - FROM pg_auth_members + FROM pg_catalog.pg_auth_members WHERE roleid = 'pg_signal_backend'::regrole AND member = 'neon_superuser'::regrole; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index e164f15dba..78ac423a9b 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> { // like `postgres_exporter` use it to query Postgres statistics. // Use explicit 8 bytes type casts to match Rust types. let stats = cli.query_one( - "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time, - coalesce(sum(sessions), 0)::bigint AS total_sessions - FROM pg_stat_database + "SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time, + pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions + FROM pg_catalog.pg_stat_database WHERE datname NOT IN ( 'postgres', 'template0', @@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result> = None; // Get all running client backends except ourself, use RFC3339 DateTime format. 
let backends = cli.query( - "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change + "SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change FROM pg_stat_activity - WHERE backend_type = 'client backend' - AND pid != pg_backend_pid() - AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? + WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text + AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid() + AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors? &[], ); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 09bbe89b41..4e16a75181 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async( .query_raw::( "SELECT datname AS name, - (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, + (SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner, NOT datallowconn AS restrict_conn, - datconnlimit = - 2 AS invalid + datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid FROM pg_catalog.pg_database;", &[], diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 00f34c1f0e..90c0e234d5 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -82,7 +82,7 @@ impl ComputeNode { info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); drop_subscriptions_done = match - client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await { + client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await { Ok(result) => !result.is_empty(), Err(e) => { @@ -1142,7 +1142,9 @@ async fn get_operations<'a>( if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { if libs.contains("pg_stat_statements") { return Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public", + ), comment: Some(String::from("create system extensions")), }))); } @@ -1150,11 +1152,13 @@ async fn get_operations<'a>( Ok(Box::new(empty())) } ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"), comment: Some(String::from("create pgaudit extensions")), }))), ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public", + ), comment: Some(String::from("create pgauditlogtofile extensions")), }))), // Disable pgaudit logging for postgres database. 
@@ -1178,7 +1182,7 @@ async fn get_operations<'a>( }, Operation { query: String::from( - "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'", + "UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false", ), comment: Some(String::from("compat/fix: make neon relocatable")), }, diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql index 7c60690c78..dd27105e16 100644 --- a/compute_tools/src/sql/add_availabilitycheck_tables.sql +++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql @@ -3,16 +3,17 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'health_check' + WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name + AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN - CREATE TABLE health_check ( - id serial primary key, - updated_at timestamptz default now() + CREATE TABLE public.health_check ( + id pg_catalog.int4 primary key generated by default as identity, + updated_at pg_catalog.timestamptz default pg_catalog.now() ); - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now(); + SET updated_at = pg_catalog.now(); END IF; END $$ \ No newline at end of file diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql deleted file mode 100644 index 3d7b15c590..0000000000 --- a/compute_tools/src/sql/anon_ext_fn_reassign.sql +++ /dev/null @@ -1,12 +0,0 @@ -DO $$ -DECLARE - query varchar; -BEGIN - FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};' - FROM pg_proc p - JOIN pg_namespace nsp ON p.pronamespace = nsp.oid - WHERE nsp.nspname = 'anon' LOOP - EXECUTE query; - END LOOP; -END -$$; diff --git a/compute_tools/src/sql/create_privileged_role.sql b/compute_tools/src/sql/create_privileged_role.sql index ac2521445f..a682089cce 100644 --- a/compute_tools/src/sql/create_privileged_role.sql +++ b/compute_tools/src/sql/create_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}') + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name) THEN CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data; END IF; diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql index 58ebb0690b..d572332270 100644 --- a/compute_tools/src/sql/default_grants.sql +++ b/compute_tools/src/sql/default_grants.sql @@ -4,14 +4,14 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public' ) AND - current_setting('server_version_num')::int / 10000 >= 15 + pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15 THEN IF EXISTS( SELECT rolname FROM pg_catalog.pg_roles - WHERE rolname = 'web_access' + WHERE rolname OPERATOR(pg_catalog.=) 'web_access' ) THEN GRANT CREATE ON SCHEMA public TO web_access; @@ -20,7 +20,7 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 
'public' ) THEN ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index f5d9420130..68b3f8b729 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,11 +2,17 @@ DO ${outer_tag}$ DECLARE subname TEXT; BEGIN - LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; - FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP - EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); - EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); - EXECUTE format('DROP SUBSCRIPTION %I;', subname); + LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE; + FOR subname IN + SELECT pg_subscription.subname + FROM pg_catalog.pg_subscription + WHERE subdbid OPERATOR(pg_catalog.=) ( + SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name + ) + LOOP + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname); + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); + EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; ${outer_tag}$; diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql index 4bb291876f..1a8876ad61 100644 --- a/compute_tools/src/sql/finalize_drop_subscriptions.sql +++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql @@ -3,19 +3,19 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'drop_subscriptions_done' - AND schemaname = 'neon' + WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name + AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name ) THEN CREATE TABLE neon.drop_subscriptions_done - (id serial primary key, timeline_id text); + (id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text); END IF; -- preserve the timeline_id of the last drop_subscriptions run -- to ensure that the cleanup of a timeline is executed only once. -- use upsert to avoid the table bloat in case of cascade branching (branch of a branch) - INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id')) + INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id')) ON CONFLICT (id) DO UPDATE - SET timeline_id = current_setting('neon.timeline_id'); + SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text; END $$ diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index 734607be02..2ed0f94bad 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -15,15 +15,15 @@ BEGIN WHERE schema_name IN ('public') LOOP FOR grantor IN EXECUTE - format( - 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s', + pg_catalog.format( + 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s', -- N.B. 
this has to be properly dollar-escaped with `pg_quote_dollar()` quote_literal({role_name}) ) LOOP - EXECUTE format('SET LOCAL ROLE %I', grantor); + EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor); - revoke_query := format( + revoke_query := pg_catalog.format( 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I', schema, -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index dc502c6d2d..41bd0d4689 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -5,17 +5,17 @@ DO ${outer_tag}$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN SELECT nspowner::regrole::text FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text INTO schema_owner; - IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' + IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text THEN - EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); + EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 36dc648beb..03225d5e64 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -3,10 +3,10 @@ DO ${outer_tag}$ IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname} + WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name ) THEN - EXECUTE format('ALTER DATABASE %I is_template false', {datname}); + EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname}); END IF; END ${outer_tag}$; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 95500b0b18..2b81c3957c 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -71,8 +71,9 @@ const DEFAULT_PG_VERSION_NUM: &str = "17"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; +/// Neon CLI. #[derive(clap::Parser)] -#[command(version = GIT_VERSION, about, name = "Neon CLI")] +#[command(version = GIT_VERSION, name = "Neon CLI")] struct Cli { #[command(subcommand)] command: NeonLocalCmd, @@ -107,30 +108,31 @@ enum NeonLocalCmd { Stop(StopCmdArgs), } +/// Initialize a new Neon repository, preparing configs for services to start with. #[derive(clap::Args)] -#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")] struct InitCmdArgs { - #[clap(long, help("How many pageservers to create (default 1)"))] + /// How many pageservers to create (default 1). + #[clap(long)] num_pageservers: Option, #[clap(long)] config: Option, - #[clap(long, help("Force initialization even if the repository is not empty"))] + /// Force initialization even if the repository is not empty. + #[clap(long, default_value = "must-not-exist")] #[arg(value_parser)] - #[clap(default_value = "must-not-exist")] force: InitForceMode, } +/// Start pageserver and safekeepers. 
#[derive(clap::Args)] -#[clap(about = "Start pageserver and safekeepers")] struct StartCmdArgs { #[clap(long = "start-timeout", default_value = "10s")] timeout: humantime::Duration, } +/// Stop pageserver and safekeepers. #[derive(clap::Args)] -#[clap(about = "Stop pageserver and safekeepers")] struct StopCmdArgs { #[arg(value_enum)] #[clap(long, default_value_t = StopMode::Fast)] @@ -143,8 +145,8 @@ enum StopMode { Immediate, } +/// Manage tenants. #[derive(clap::Subcommand)] -#[clap(about = "Manage tenants")] enum TenantCmd { List, Create(TenantCreateCmdArgs), @@ -155,38 +157,36 @@ enum TenantCmd { #[derive(clap::Args)] struct TenantCreateCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - #[clap( - long, - help = "Use a specific timeline id when creating a tenant and its initial timeline" - )] + /// Use a specific timeline id when creating a tenant and its initial timeline. + #[clap(long)] timeline_id: Option, #[clap(short = 'c')] config: Vec, + /// Postgres version to use for the initial timeline. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version to use for the initial timeline")] + #[clap(long)] pg_version: PgMajorVersion, - #[clap( - long, - help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified" - )] + /// Use this tenant in future CLI commands where tenant_id is needed, but not specified. + #[clap(long)] set_default: bool, - #[clap(long, help = "Number of shards in the new tenant")] + /// Number of shards in the new tenant. + #[clap(long)] #[arg(default_value_t = 0)] shard_count: u8, - #[clap(long, help = "Sharding stripe size in pages")] + /// Sharding stripe size in pages. + #[clap(long)] shard_stripe_size: Option, - #[clap(long, help = "Placement policy shards in this tenant")] + /// Placement policy shards in this tenant. + #[clap(long)] #[arg(value_parser = parse_placement_policy)] placement_policy: Option, } @@ -195,44 +195,35 @@ fn parse_placement_policy(s: &str) -> anyhow::Result { Ok(serde_json::from_str::(s)?) } +/// Set a particular tenant as default in future CLI commands where tenant_id is needed, but not +/// specified. #[derive(clap::Args)] -#[clap( - about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified" -)] struct TenantSetDefaultCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: TenantId, } #[derive(clap::Args)] struct TenantConfigCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, #[clap(short = 'c')] config: Vec, } +/// Import a tenant that is present in remote storage, and create branches for its timelines. #[derive(clap::Args)] -#[clap( - about = "Import a tenant that is present in remote storage, and create branches for its timelines" -)] struct TenantImportCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: TenantId, } +/// Manage timelines. 
#[derive(clap::Subcommand)] -#[clap(about = "Manage timelines")] enum TimelineCmd { List(TimelineListCmdArgs), Branch(TimelineBranchCmdArgs), @@ -240,98 +231,87 @@ enum TimelineCmd { Import(TimelineImportCmdArgs), } +/// List all timelines available to this pageserver. #[derive(clap::Args)] -#[clap(about = "List all timelines available to this pageserver")] struct TimelineListCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_shard_id: Option, } +/// Create a new timeline, branching off from another timeline. #[derive(clap::Args)] -#[clap(about = "Create a new timeline, branching off from another timeline")] struct TimelineBranchCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(long, help = "New timeline's ID")] + /// New timeline's ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: Option, - - #[clap(long, help = "Human-readable alias for the new timeline")] + /// Human-readable alias for the new timeline. + #[clap(long)] branch_name: String, - - #[clap( - long, - help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name." - )] + /// Use last Lsn of another timeline (and its data) as base when creating the new timeline. The + /// timeline gets resolved by its branch name. + #[clap(long)] ancestor_branch_name: Option, - - #[clap( - long, - help = "When using another timeline as base, use a specific Lsn in it instead of the latest one" - )] + /// When using another timeline as base, use a specific Lsn in it instead of the latest one. + #[clap(long)] ancestor_start_lsn: Option, } +/// Create a new blank timeline. #[derive(clap::Args)] -#[clap(about = "Create a new blank timeline")] struct TimelineCreateCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(long, help = "New timeline's ID")] + /// New timeline's ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: Option, - - #[clap(long, help = "Human-readable alias for the new timeline")] + /// Human-readable alias for the new timeline. + #[clap(long)] branch_name: String, + /// Postgres version. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version")] + #[clap(long)] pg_version: PgMajorVersion, } +/// Import a timeline from a basebackup directory. #[derive(clap::Args)] -#[clap(about = "Import timeline from a basebackup directory")] struct TimelineImportCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(long, help = "New timeline's ID")] + /// New timeline's ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: TimelineId, - - #[clap(long, help = "Human-readable alias for the new timeline")] + /// Human-readable alias for the new timeline. + #[clap(long)] branch_name: String, - - #[clap(long, help = "Basebackup tarfile to import")] + /// Basebackup tarfile to import. 
+ #[clap(long)] base_tarfile: PathBuf, - - #[clap(long, help = "Lsn the basebackup starts at")] + /// LSN the basebackup starts at. + #[clap(long)] base_lsn: Lsn, - - #[clap(long, help = "Wal to add after base")] + /// WAL to add after base. + #[clap(long)] wal_tarfile: Option, - - #[clap(long, help = "Lsn the basebackup ends at")] + /// LSN the basebackup ends at. + #[clap(long)] end_lsn: Option, + /// Postgres version of the basebackup being imported. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version of the backup being imported")] + #[clap(long)] pg_version: PgMajorVersion, } +/// Manage pageservers. #[derive(clap::Subcommand)] -#[clap(about = "Manage pageservers")] enum PageserverCmd { Status(PageserverStatusCmdArgs), Start(PageserverStartCmdArgs), @@ -339,223 +319,202 @@ enum PageserverCmd { Restart(PageserverRestartCmdArgs), } +/// Show status of a local pageserver. #[derive(clap::Args)] -#[clap(about = "Show status of a local pageserver")] struct PageserverStatusCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, } +/// Start local pageserver. #[derive(clap::Args)] -#[clap(about = "Start local pageserver")] struct PageserverStartCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, - - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop local pageserver. #[derive(clap::Args)] -#[clap(about = "Stop local pageserver")] struct PageserverStopCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, - - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + /// If 'immediate', don't flush repository data at shutdown + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] stop_mode: StopMode, } +/// Restart local pageserver. #[derive(clap::Args)] -#[clap(about = "Restart local pageserver")] struct PageserverRestartCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, - - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Manage storage controller. #[derive(clap::Subcommand)] -#[clap(about = "Manage storage controller")] enum StorageControllerCmd { Start(StorageControllerStartCmdArgs), Stop(StorageControllerStopCmdArgs), } +/// Start storage controller. #[derive(clap::Args)] -#[clap(about = "Start storage controller")] struct StorageControllerStartCmdArgs { - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, - - #[clap( - long, - help = "Identifier used to distinguish storage controller instances" - )] + /// Identifier used to distinguish storage controller instances. 
+ #[clap(long)] #[arg(default_value_t = 1)] instance_id: u8, - - #[clap( - long, - help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)" - )] + /// Base port for the storage controller instance identified by instance-id (defaults to + /// pageserver cplane api). + #[clap(long)] base_port: Option, - #[clap( - long, - help = "Whether the storage controller should handle pageserver-reported local disk loss events." - )] + /// Whether the storage controller should handle pageserver-reported local disk loss events. + #[clap(long)] handle_ps_local_disk_loss: Option, } +/// Stop storage controller. #[derive(clap::Args)] -#[clap(about = "Stop storage controller")] struct StorageControllerStopCmdArgs { - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + /// If 'immediate', don't flush repository data at shutdown + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] stop_mode: StopMode, - - #[clap( - long, - help = "Identifier used to distinguish storage controller instances" - )] + /// Identifier used to distinguish storage controller instances. + #[clap(long)] #[arg(default_value_t = 1)] instance_id: u8, } +/// Manage storage broker. #[derive(clap::Subcommand)] -#[clap(about = "Manage storage broker")] enum StorageBrokerCmd { Start(StorageBrokerStartCmdArgs), Stop(StorageBrokerStopCmdArgs), } +/// Start broker. #[derive(clap::Args)] -#[clap(about = "Start broker")] struct StorageBrokerStartCmdArgs { - #[clap(short = 't', long, help = "timeout until we fail the command")] - #[arg(default_value = "10s")] + /// Timeout until we fail the command. + #[clap(short = 't', long, default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop broker. #[derive(clap::Args)] -#[clap(about = "stop broker")] struct StorageBrokerStopCmdArgs { - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + /// If 'immediate', don't flush repository data on shutdown. + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] stop_mode: StopMode, } +/// Manage safekeepers. #[derive(clap::Subcommand)] -#[clap(about = "Manage safekeepers")] enum SafekeeperCmd { Start(SafekeeperStartCmdArgs), Stop(SafekeeperStopCmdArgs), Restart(SafekeeperRestartCmdArgs), } +/// Manage object storage. #[derive(clap::Subcommand)] -#[clap(about = "Manage object storage")] enum EndpointStorageCmd { Start(EndpointStorageStartCmd), Stop(EndpointStorageStopCmd), } +/// Start object storage. #[derive(clap::Args)] -#[clap(about = "Start object storage")] struct EndpointStorageStartCmd { - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop object storage. #[derive(clap::Args)] -#[clap(about = "Stop object storage")] struct EndpointStorageStopCmd { + /// If 'immediate', don't flush repository data on shutdown. + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] stop_mode: StopMode, } +/// Start local safekeeper. #[derive(clap::Args)] -#[clap(about = "Start local safekeeper")] struct SafekeeperStartCmdArgs { - #[clap(help = "safekeeper id")] + /// Safekeeper ID. 
#[arg(default_value_t = NodeId(1))] id: NodeId, - #[clap( - short = 'e', - long = "safekeeper-extra-opt", - help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" - )] + /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo. + #[clap(short = 'e', long = "safekeeper-extra-opt")] extra_opt: Vec, - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop local safekeeper. #[derive(clap::Args)] -#[clap(about = "Stop local safekeeper")] struct SafekeeperStopCmdArgs { - #[clap(help = "safekeeper id")] + /// Safekeeper ID. #[arg(default_value_t = NodeId(1))] id: NodeId, + /// If 'immediate', don't flush repository data on shutdown. #[arg(value_enum, default_value = "fast")] - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + #[clap(short = 'm')] stop_mode: StopMode, } +/// Restart local safekeeper. #[derive(clap::Args)] -#[clap(about = "Restart local safekeeper")] struct SafekeeperRestartCmdArgs { - #[clap(help = "safekeeper id")] + /// Safekeeper ID. #[arg(default_value_t = NodeId(1))] id: NodeId, + /// If 'immediate', don't flush repository data on shutdown. #[arg(value_enum, default_value = "fast")] - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + #[clap(short = 'm')] stop_mode: StopMode, - #[clap( - short = 'e', - long = "safekeeper-extra-opt", - help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" - )] + /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo. + #[clap(short = 'e', long = "safekeeper-extra-opt")] extra_opt: Vec, - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Manage Postgres instances. #[derive(clap::Subcommand)] -#[clap(about = "Manage Postgres instances")] enum EndpointCmd { List(EndpointListCmdArgs), Create(EndpointCreateCmdArgs), @@ -567,33 +526,27 @@ enum EndpointCmd { GenerateJwt(EndpointGenerateJwtCmdArgs), } +/// List endpoints. #[derive(clap::Args)] -#[clap(about = "List endpoints")] struct EndpointListCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_shard_id: Option, } +/// Create a compute endpoint. #[derive(clap::Args)] -#[clap(about = "Create a compute endpoint")] struct EndpointCreateCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: Option, - #[clap(long, help = "Name of the branch the endpoint will run on")] + /// Name of the branch the endpoint will run on. + #[clap(long)] branch_name: Option, - #[clap( - long, - help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used" - )] + /// Specify LSN on the timeline to start from. By default, end of the timeline would be used. 
+ #[clap(long)] lsn: Option, #[clap(long)] pg_port: Option, @@ -604,16 +557,13 @@ struct EndpointCreateCmdArgs { #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap( - long, - help = "Don't do basebackup, create endpoint directory with only config files", - action = clap::ArgAction::Set, - default_value_t = false - )] + /// Don't do basebackup, create endpoint directory with only config files. + #[clap(long, action = clap::ArgAction::Set, default_value_t = false)] config_only: bool, + /// Postgres version. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version")] + #[clap(long)] pg_version: PgMajorVersion, /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings. @@ -624,170 +574,140 @@ struct EndpointCreateCmdArgs { #[clap(long)] grpc: bool, - #[clap( - long, - help = "If set, the node will be a hot replica on the specified timeline", - action = clap::ArgAction::Set, - default_value_t = false - )] + /// If set, the node will be a hot replica on the specified timeline. + #[clap(long, action = clap::ArgAction::Set, default_value_t = false)] hot_standby: bool, - - #[clap(long, help = "If set, will set up the catalog for neon_superuser")] + /// If set, will set up the catalog for neon_superuser. + #[clap(long)] update_catalog: bool, - - #[clap( - long, - help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." - )] + /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but + /// useful for tests. + #[clap(long)] allow_multiple: bool, - /// Only allow changing it on creation - #[clap(long, help = "Name of the privileged role for the endpoint")] + /// Name of the privileged role for the endpoint. + // Only allow changing it on creation. + #[clap(long)] privileged_role_name: Option, } +/// Start Postgres. If the endpoint doesn't exist yet, it is created. #[derive(clap::Args)] -#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")] struct EndpointStartCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, + /// Pageserver ID. #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - - #[clap( - long, - help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations." - )] + /// Safekeepers membership generation to prefix neon.safekeepers with. + #[clap(long)] safekeepers_generation: Option, - #[clap( - long, - help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override." - )] + /// List of safekeepers endpoint will talk to. + #[clap(long)] safekeepers: Option, - - #[clap( - long, - help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", - alias = "remote-ext-config" - )] + /// Configure the remote extensions storage proxy gateway URL to request for extensions. + #[clap(long, alias = "remote-ext-config")] remote_ext_base_url: Option, - - #[clap( - long, - help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`" - )] + /// If set, will create test user `user` and `neondb` database. 
Requires `update-catalog = true` + #[clap(long)] create_test_user: bool, - - #[clap( - long, - help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." - )] + /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but + /// useful for tests. + #[clap(long)] allow_multiple: bool, - - #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long, value_parser= humantime::parse_duration)] #[arg(default_value = "90s")] start_timeout: Duration, - #[clap( - long, - help = "Download LFC cache from endpoint storage on endpoint startup", - default_value = "false" - )] + /// Download LFC cache from endpoint storage on endpoint startup + #[clap(long, default_value = "false")] autoprewarm: bool, - #[clap(long, help = "Upload LFC cache to endpoint storage periodically")] + /// Upload LFC cache to endpoint storage periodically + #[clap(long)] offload_lfc_interval_seconds: Option, - #[clap( - long, - help = "Run in development mode, skipping VM-specific operations like process termination", - action = clap::ArgAction::SetTrue - )] + /// Run in development mode, skipping VM-specific operations like process termination + #[clap(long, action = clap::ArgAction::SetTrue)] dev: bool, } +/// Reconfigure an endpoint. #[derive(clap::Args)] -#[clap(about = "Reconfigure an endpoint")] struct EndpointReconfigureCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant id. Represented as a hexadecimal string 32 symbols length + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, + /// Pageserver ID. #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap(long)] safekeepers: Option, } +/// Refresh the endpoint's configuration by forcing it reload it's spec #[derive(clap::Args)] -#[clap(about = "Refresh the endpoint's configuration by forcing it reload it's spec")] struct EndpointRefreshConfigurationArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint id endpoint_id: String, } +/// Stop an endpoint. #[derive(clap::Args)] -#[clap(about = "Stop an endpoint")] struct EndpointStopCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, - - #[clap( - long, - help = "Also delete data directory (now optional, should be default in future)" - )] + /// Also delete data directory (now optional, should be default in future). + #[clap(long)] destroy: bool, - #[clap(long, help = "Postgres shutdown mode")] + /// Postgres shutdown mode, passed to `pg_ctl -m `. + #[clap(long)] #[clap(default_value = "fast")] mode: EndpointTerminateMode, } +/// Update the pageservers in the spec file of the compute endpoint #[derive(clap::Args)] -#[clap(about = "Update the pageservers in the spec file of the compute endpoint")] struct EndpointUpdatePageserversCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint id endpoint_id: String, - #[clap(short = 'p', long, help = "Specified pageserver id")] + /// Specified pageserver id + #[clap(short = 'p', long)] pageserver_id: Option, } +/// Generate a JWT for an endpoint. 
#[derive(clap::Args)] -#[clap(about = "Generate a JWT for an endpoint")] struct EndpointGenerateJwtCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, - - #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)] + /// Scope to generate the JWT with. + #[clap(short = 's', long, value_parser = ComputeClaimsScope::from_str)] scope: Option, } +/// Manage neon_local branch name mappings. #[derive(clap::Subcommand)] -#[clap(about = "Manage neon_local branch name mappings")] enum MappingsCmd { Map(MappingsMapCmdArgs), } +/// Create new mapping which cannot exist already. #[derive(clap::Args)] -#[clap(about = "Create new mapping which cannot exist already")] struct MappingsMapCmdArgs { - #[clap( - long, - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long)] tenant_id: TenantId, - #[clap( - long, - help = "Timeline id. Represented as a hexadecimal string 32 symbols length" - )] + /// Timeline ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: TimelineId, - #[clap(long, help = "Branch name to give to the timeline")] + /// Branch name to give to the timeline. + #[clap(long)] branch_name: String, } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index a4d1030488..635b1858ec 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -303,6 +303,13 @@ enum Command { #[arg(long, required = true, value_delimiter = ',')] new_sk_set: Vec, }, + /// Abort ongoing safekeeper migration. + TimelineSafekeeperMigrateAbort { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + timeline_id: TimelineId, + }, } #[derive(Parser)] @@ -1396,6 +1403,17 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TimelineSafekeeperMigrateAbort { + tenant_id, + timeline_id, + } => { + let path = + format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort"); + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/docs/rfcs/2025-07-07-node-deletion-api-improvement.md b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md new file mode 100644 index 0000000000..47dadaee35 --- /dev/null +++ b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md @@ -0,0 +1,246 @@ +# Node deletion API improvement + +Created on 2025-07-07 +Implemented on _TBD_ + +## Summary + +This RFC describes improvements to the storage controller API for gracefully deleting pageserver +nodes. + +## Motivation + +The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333) +has several limitations: + +- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and +we cannot reach via SSH to stop the pageserver). This issue has been resolved by tombstone +mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036) +- Process of node deletion is not graceful, i.e. it just imitates a node failure + +In this context, "graceful" node deletion means that users do not experience any disruption or +negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers +can handle the workload and all requirements are met). 
To achieve this, the system must perform +live migration of all tenant shards from the node being deleted while the node is still running +and continue processing all incoming requests. The node is removed only after all tenant shards +have been safely migrated. + +Although live migrations can be achieved with the drain functionality, that approach leads to incorrect shard +placement, such as not matching availability zones. This results in unnecessary work to optimize +the placement that was just recently performed. + +If we delete a node before its tenant shards are fully moved, the new node won't have all the +needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at +first. If there are many tenant shards, this slowdown affects a huge number of users. + +Graceful node deletion is more complicated and can introduce new issues. It takes longer because +live migration of each tenant shard can last several minutes. Using non-blocking accessors may +also cause deletion to wait if other processes are holding the inner state lock. It also gets trickier +because we need to handle other requests, like drain and fill, at the same time. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- storage controller +- pageserver (indirectly) + +## Proposed implementation + +### Tombstones + +To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced +as part of the node's stored information. Each node has a separate `NodeLifecycle` field with two +possible states: `Active` and `Deleted`. When node deletion completes, the database row is not +deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted` +lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach +and register functionality must be aware of tombstones. Additionally, new debug handlers are +available for listing and deleting tombstones via the `/debug/v1/tombstone` path. + +### Gracefulness + +The problem of making node deletion graceful is complex and involves several challenges: + +- **Cancellable**: The operation must be cancellable to allow administrators to abort the process +if needed, e.g. if run by mistake. +- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node +deletion process. We need clear policies for handling concurrent operations: what happens when a +drain/fill request arrives while deletion is in progress, and what happens when a delete request +arrives while drain/fill is in progress. +- **Persistent**: If the storage controller restarts during this long-running operation, we must +preserve progress and automatically resume the deletion process after the storage controller +restarts. +- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled +for deletion, as this would move shards to irrelevant locations. The drain process expects the +node to return, so it only moves shards to backup locations, not to their preferred AZs. It also +leaves secondary locations unmoved. This could result in unnecessary load on the storage +controller and inefficient resource utilization. +- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when +time constraints or emergency situations require it, bypassing the normal graceful migration +process. + +See below for a detailed breakdown of the proposed changes and mechanisms.
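+
+As a rough illustration of the lifecycle rules detailed in the next section, here is a minimal
+Rust sketch. The `NodeLifecycle` variants are taken from this RFC; the `transition_allowed`
+helper is hypothetical and does not correspond to actual storage controller code.
+
+```rust
+// Sketch only: variant names follow this RFC; the helper and assertions are
+// illustrative, not the real storage controller implementation.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+enum NodeLifecycle {
+    Active,
+    ScheduledForDeletion,
+    Deleted,
+}
+
+/// Checks a transition against the state machine in the "Node lifecycle"
+/// section below. `Deleted` is terminal: the only remaining action is
+/// removing the row itself (tombstone deletion), not a lifecycle change.
+fn transition_allowed(from: NodeLifecycle, to: NodeLifecycle) -> bool {
+    use NodeLifecycle::*;
+    matches!(
+        (from, to),
+        (Active, ScheduledForDeletion)        // start_node_delete
+            | (ScheduledForDeletion, Active)  // cancel_node_delete
+            | (ScheduledForDeletion, Deleted) // delete_node (at the finish)
+    )
+}
+
+fn main() {
+    use NodeLifecycle::*;
+    assert!(transition_allowed(Active, ScheduledForDeletion));
+    assert!(transition_allowed(ScheduledForDeletion, Active));
+    // Tombstoned nodes can never be brought back or re-registered.
+    assert!(!transition_allowed(Deleted, Active));
+}
+```
+
+In the real implementation every transition is also persisted in the storage controller database,
+so the deletion intent survives restarts (see the state diagram and `NodeSchedulingPolicy` notes
+below).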
+ +#### Node lifecycle + +New `NodeLifecycle` enum and a matching database field with these values: +- `Active`: The normal state. All operations are allowed. +- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or +will happen later, but the node will eventually be removed. All operations are allowed. +- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought +back. The only action left is to remove its record from the database. Any attempt to register a +node in this state will fail. + +This state persists across storage controller restarts. + +**State transition** +``` + +--------------------+ + +---| Active |<---------------------+ + | +--------------------+ | + | ^ | + | start_node_delete | cancel_node_delete | + v | | + +----------------------------------+ | + | ScheduledForDeletion | | + +----------------------------------+ | + | | + | node_register | + | | + | delete_node (at the finish) | + | | + v | + +---------+ tombstone_delete +----------+ + | Deleted |-------------------------------->| no row | + +---------+ +----------+ +``` + +#### NodeSchedulingPolicy::Deleting + +A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is +running for the node right now. Only one node can have the `Deleting` policy at a time. + +The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage +controller restart, any node previously marked as `Deleting` will have its scheduling policy reset +to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is +actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state. + +`NodeSchedulingPolicy` transition details: +1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`. +2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the +policy to its previous value. The policy is persisted in storcon DB. +3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since +`NodeLifecycle::Deleted` prevents any further access to this field. + +The deletion process cannot be initiated for nodes currently undergoing deployment-related +operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered +once the node transitions to either the `Active` or `Pause` state. + +#### OperationTracker + +A replacement for `Option ongoing_operation`, the `OperationTracker` is a +dedicated service state object responsible for managing all long-running node operations (drain, +fill, delete) with robust concurrency control. + +Key responsibilities: +- Orchestrates the execution of operations +- Supports cancellation of currently running operations +- Enforces operation constraints, e.g. allowing only single drain/fill operation at a time +- Persists deletion state, enabling recovery of pending deletions across restarts +- Ensures thread safety across concurrent requests + +#### Attached tenant shard processing + +When deleting a node, handle each attached tenant shard as follows: + +1. Pick the best node to become the new attached (the candidate). +2. If the candidate already has this shard as a secondary: + - Create a new secondary for the shard on another suitable node. + Otherwise: + - Create a secondary for the shard on the candidate node. +3. Wait until all secondaries are ready and pre-warmed. +4. 
Promote the candidate's secondary to attached. +5. Remove the secondary from the node being deleted. + +This process safely moves all attached shards before deleting the node. + +#### Secondary tenant shard processing + +When deleting a node, handle each secondary tenant shard as follows: + +1. Choose the best node to become the new secondary. +2. Create a secondary for the shard on that node. +3. Wait until the new secondary is ready. +4. Remove the secondary from the node being deleted. + +This ensures all secondary shards are safely moved before deleting the node. + +### Reliability, failure modes and corner cases + +In case of a storage controller failure and following restart, the system behavior depends on the +`NodeLifecycle` state: + +- If `NodeLifecycle` is `Active`: No action is taken for this node. +- If `NodeLifecycle` is `Deleted`: The node will not be re-added. +- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for +this node. + +In case of a pageserver node failure during deletion, the behavior depends on the `force` flag: +- If `force` is set: The node deletion will proceed regardless of the node's availability. +- If `force` is not set: The deletion will be retried a limited number of times. If the node +remains unavailable, the deletion process will pause and automatically resume when the node +becomes healthy again. + +### Operations concurrency + +The following sections describe the behavior when different types of requests arrive at the storage +controller and how they interact with ongoing operations. + +#### Delete request + +Handler: `PUT /control/v1/node/:node_id/delete` + +1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`: + - Return `200 OK`: there is already an ongoing deletion request for this node +2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion` +3. Persist current scheduling policy +4. If there is no active operation (drain/fill/delete): + - Run deletion process for this node + +#### Cancel delete request + +Handler: `DELETE /control/v1/node/:node_id/delete` + +1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`: + - Return `404 Not Found`: there is no current deletion request for this node +2. If the active operation is deleting this node, cancel it +3. Update & persist lifecycle to `NodeLifecycle::Active` +4. Restore the last scheduling policy from persistence + +#### Drain/fill request + +1. If there are already ongoing drain/fill processes: + - Return `409 Conflict`: queueing of drain/fill processes is not supported +2. If there is an ongoing delete process: + - Cancel it and wait until it is cancelled +3. Run the drain/fill process +4. After the drain/fill process is cancelled or finished: + - Try to find another candidate to delete and run the deletion process for that node + +#### Drain/fill cancel request + +1. If the active operation is not the related process: + - Return `400 Bad Request`: cancellation request is incorrect, operations are not the same +2. Cancel the active operation +3. 
Try to find another candidate to delete and run the deletion process for that node + +## Definition of Done + +- [x] Fix flaky node scenario and introduce related debug handlers +- [ ] Node deletion intent is persistent - a node will be eventually deleted after a deletion +request regardless of draining/filling requests and restarts +- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to +recommended locations +- [ ] Deploying does not break due to long deletions - drain/fill operations override deletion +process and deletion resumes after drain/fill completes +- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a +pageserver node does not respond) +- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a27301e45e..a61d418dd1 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -1,10 +1,9 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. -use std::fmt::Display; - use chrono::{DateTime, Utc}; use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; +use std::fmt::Display; use crate::privilege::Privilege; use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}; @@ -49,7 +48,7 @@ pub struct ExtensionInstallResponse { /// Status of the LFC prewarm process. The same state machine is reused for /// both autoprewarm (prewarm after compute/Postgres start using the previously /// stored LFC state) and explicit prewarming via API. -#[derive(Serialize, Default, Debug, Clone, PartialEq)] +#[derive(Serialize, Default, Debug, Clone)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcPrewarmState { /// Default value when compute boots up. @@ -59,7 +58,14 @@ pub enum LfcPrewarmState { Prewarming, /// We found requested LFC state in the endpoint storage and /// completed prewarming successfully. - Completed, + Completed { + total: i32, + prewarmed: i32, + skipped: i32, + state_download_time_ms: u32, + uncompress_time_ms: u32, + prewarm_time_ms: u32, + }, /// Unexpected error happened during prewarming. Note, `Not Found 404` /// response from the endpoint storage is explicitly excluded here /// because it can normally happen on the first compute start, @@ -68,11 +74,15 @@ pub enum LfcPrewarmState { /// We tried to fetch the corresponding LFC state from the endpoint storage, /// but received `Not Found 404`. This should normally happen only during the /// first endpoint start after creation with `autoprewarm: true`. + /// This may also happen if LFC is turned off or not initialized /// /// During the orchestrated prewarm via API, when a caller explicitly /// provides the LFC state key to prewarm from, it's the caller responsibility /// to handle this status as an error state in this case. Skipped, + /// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query + /// has started working before cancellation + Cancelled, } impl Display for LfcPrewarmState { @@ -80,32 +90,44 @@ impl Display for LfcPrewarmState { match self { LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"), LfcPrewarmState::Prewarming => f.write_str("Prewarming"), - LfcPrewarmState::Completed => f.write_str("Completed"), + LfcPrewarmState::Completed { .. 
} => f.write_str("Completed"), LfcPrewarmState::Skipped => f.write_str("Skipped"), LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), + LfcPrewarmState::Cancelled => f.write_str("Cancelled"), } } } -#[derive(Serialize, Default, Debug, Clone, PartialEq)] +#[derive(Serialize, Default, Debug, Clone)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcOffloadState { #[default] NotOffloaded, Offloading, - Completed, + Completed { + state_query_time_ms: u32, + compress_time_ms: u32, + state_upload_time_ms: u32, + }, Failed { error: String, }, + /// LFC state was empty so it wasn't offloaded + Skipped, } -#[derive(Serialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Debug, Clone)] #[serde(tag = "status", rename_all = "snake_case")] -/// Response of /promote pub enum PromoteState { NotPromoted, - Completed, - Failed { error: String }, + Completed { + lsn_wait_time_ms: u32, + pg_promote_time_ms: u32, + reconfigure_time_ms: u32, + }, + Failed { + error: String, + }, } #[derive(Deserialize, Default, Debug)] diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 511bdc4e42..5b356c8460 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -458,7 +458,7 @@ pub(crate) enum LocalProxyConnError { impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { - HttpConnError::ConnectError(_) => ErrorKind::Compute, + HttpConnError::ConnectError(e) => e.get_error_kind(), HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => match p.as_db_error() { // user provided a wrong database name diff --git a/proxy/src/serverless/rest.rs b/proxy/src/serverless/rest.rs index 0c3d2c958d..9f98e87272 100644 --- a/proxy/src/serverless/rest.rs +++ b/proxy/src/serverless/rest.rs @@ -5,12 +5,17 @@ use std::sync::Arc; use bytes::Bytes; use http::Method; -use http::header::{AUTHORIZATION, CONTENT_TYPE, HOST}; +use http::header::{ + ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, + ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_REQUEST_HEADERS, ALLOW, + AUTHORIZATION, CONTENT_TYPE, HOST, ORIGIN, +}; use http_body_util::combinators::BoxBody; -use http_body_util::{BodyExt, Full}; +use http_body_util::{BodyExt, Empty, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; -use hyper::http::{HeaderName, HeaderValue}; +use hyper::http::response::Builder; +use hyper::http::{HeaderMap, HeaderName, HeaderValue}; use hyper::{Request, Response, StatusCode}; use indexmap::IndexMap; use moka::sync::Cache; @@ -67,6 +72,15 @@ use crate::util::deserialize_json_string; static EMPTY_JSON_SCHEMA: &str = r#"{"schemas":[]}"#; const INTROSPECTION_SQL: &str = POSTGRESQL_INTROSPECTION_SQL; +const HEADER_VALUE_ALLOW_ALL_ORIGINS: HeaderValue = HeaderValue::from_static("*"); +// CORS headers values +const ACCESS_CONTROL_ALLOW_METHODS_VALUE: HeaderValue = + HeaderValue::from_static("GET, POST, PATCH, PUT, DELETE, OPTIONS"); +const ACCESS_CONTROL_MAX_AGE_VALUE: HeaderValue = HeaderValue::from_static("86400"); +const ACCESS_CONTROL_EXPOSE_HEADERS_VALUE: HeaderValue = HeaderValue::from_static( + "Content-Encoding, Content-Location, Content-Range, Content-Type, Date, Location, Server, Transfer-Encoding, Range-Unit", +); +const ACCESS_CONTROL_ALLOW_HEADERS_VALUE: HeaderValue = HeaderValue::from_static("Authorization"); // A wrapper around the DbSchema that allows for self-referencing #[self_referencing] @@ 
-137,6 +151,8 @@ pub struct ApiConfig { pub role_claim_key: String, #[serde(default, deserialize_with = "deserialize_comma_separated_option")] pub db_extra_search_path: Option>, + #[serde(default, deserialize_with = "deserialize_comma_separated_option")] + pub server_cors_allowed_origins: Option>, } // The DbSchemaCache is a cache of the ApiConfig and DbSchemaOwned for each endpoint @@ -165,7 +181,13 @@ impl DbSchemaCache { } } - pub async fn get_cached_or_remote( + pub fn get_cached( + &self, + endpoint_id: &EndpointCacheKey, + ) -> Option> { + count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id)) + } + pub async fn get_remote( &self, endpoint_id: &EndpointCacheKey, auth_header: &HeaderValue, @@ -174,47 +196,42 @@ impl DbSchemaCache { ctx: &RequestContext, config: &'static ProxyConfig, ) -> Result, RestError> { - let cache_result = count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id)); - match cache_result { - Some(v) => Ok(v), - None => { - info!("db_schema cache miss for endpoint: {:?}", endpoint_id); - let remote_value = self - .get_remote(auth_header, connection_string, client, ctx, config) - .await; - let (api_config, schema_owned) = match remote_value { - Ok((api_config, schema_owned)) => (api_config, schema_owned), - Err(e @ RestError::SchemaTooLarge) => { - // for the case where the schema is too large, we cache an empty dummy value - // all the other requests will fail without triggering the introspection query - let schema_owned = serde_json::from_str::(EMPTY_JSON_SCHEMA) - .map_err(|e| JsonDeserialize { source: e })?; + info!("db_schema cache miss for endpoint: {:?}", endpoint_id); + let remote_value = self + .internal_get_remote(auth_header, connection_string, client, ctx, config) + .await; + let (api_config, schema_owned) = match remote_value { + Ok((api_config, schema_owned)) => (api_config, schema_owned), + Err(e @ RestError::SchemaTooLarge) => { + // for the case where the schema is too large, we cache an empty dummy value + // all the other requests will fail without triggering the introspection query + let schema_owned = serde_json::from_str::(EMPTY_JSON_SCHEMA) + .map_err(|e| JsonDeserialize { source: e })?; - let api_config = ApiConfig { - db_schemas: vec![], - db_anon_role: None, - db_max_rows: None, - db_allowed_select_functions: vec![], - role_claim_key: String::new(), - db_extra_search_path: None, - }; - let value = Arc::new((api_config, schema_owned)); - count_cache_insert(CacheKind::Schema); - self.0.insert(endpoint_id.clone(), value); - return Err(e); - } - Err(e) => { - return Err(e); - } + let api_config = ApiConfig { + db_schemas: vec![], + db_anon_role: None, + db_max_rows: None, + db_allowed_select_functions: vec![], + role_claim_key: String::new(), + db_extra_search_path: None, + server_cors_allowed_origins: None, }; let value = Arc::new((api_config, schema_owned)); count_cache_insert(CacheKind::Schema); - self.0.insert(endpoint_id.clone(), value.clone()); - Ok(value) + self.0.insert(endpoint_id.clone(), value); + return Err(e); } - } + Err(e) => { + return Err(e); + } + }; + let value = Arc::new((api_config, schema_owned)); + count_cache_insert(CacheKind::Schema); + self.0.insert(endpoint_id.clone(), value.clone()); + Ok(value) } - pub async fn get_remote( + async fn internal_get_remote( &self, auth_header: &HeaderValue, connection_string: &str, @@ -531,7 +548,7 @@ pub(crate) async fn handle( ) -> Result>, ApiError> { let result = handle_inner(cancel, config, &ctx, request, backend).await; - let mut response = match result { + let 
response = match result { Ok(r) => { ctx.set_success(); @@ -640,9 +657,6 @@ pub(crate) async fn handle( } }; - response - .headers_mut() - .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -722,6 +736,37 @@ async fn handle_inner( } } +fn apply_common_cors_headers( + response: &mut Builder, + request_headers: &HeaderMap, + allowed_origins: Option<&Vec>, +) { + let request_origin = request_headers + .get(ORIGIN) + .map(|v| v.to_str().unwrap_or("")); + + let response_allow_origin = match (request_origin, allowed_origins) { + (Some(or), Some(allowed_origins)) => { + if allowed_origins.iter().any(|o| o == or) { + Some(HeaderValue::from_str(or).unwrap_or(HEADER_VALUE_ALLOW_ALL_ORIGINS)) + } else { + None + } + } + (Some(_), None) => Some(HEADER_VALUE_ALLOW_ALL_ORIGINS), + _ => None, + }; + if let Some(h) = response.headers_mut() { + h.insert( + ACCESS_CONTROL_EXPOSE_HEADERS, + ACCESS_CONTROL_EXPOSE_HEADERS_VALUE, + ); + if let Some(origin) = response_allow_origin { + h.insert(ACCESS_CONTROL_ALLOW_ORIGIN, origin); + } + } +} + #[allow(clippy::too_many_arguments)] async fn handle_rest_inner( config: &'static ProxyConfig, @@ -733,12 +778,6 @@ async fn handle_rest_inner( jwt: String, backend: Arc, ) -> Result>, RestError> { - // validate the jwt token - let jwt_parsed = backend - .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) - .await - .map_err(HttpConnError::from)?; - let db_schema_cache = config .rest_config @@ -754,28 +793,83 @@ async fn handle_rest_inner( message: "Failed to get endpoint cache key".to_string(), }))?; - let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; - let (parts, originial_body) = request.into_parts(); + // try and get the cached entry for this endpoint + // it contains the api config and the introspected db schema + let cached_entry = db_schema_cache.get_cached(&endpoint_cache_key); + + let allowed_origins = cached_entry + .as_ref() + .and_then(|arc| arc.0.server_cors_allowed_origins.as_ref()); + + let mut response = Response::builder(); + apply_common_cors_headers(&mut response, &parts.headers, allowed_origins); + + // handle the OPTIONS request + if parts.method == Method::OPTIONS { + let allowed_headers = parts + .headers + .get(ACCESS_CONTROL_REQUEST_HEADERS) + .and_then(|a| a.to_str().ok()) + .filter(|v| !v.is_empty()) + .map_or_else( + || "Authorization".to_string(), + |v| format!("{v}, Authorization"), + ); + return response + .status(StatusCode::OK) + .header( + ACCESS_CONTROL_ALLOW_METHODS, + ACCESS_CONTROL_ALLOW_METHODS_VALUE, + ) + .header(ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_MAX_AGE_VALUE) + .header( + ACCESS_CONTROL_ALLOW_HEADERS, + HeaderValue::from_str(&allowed_headers) + .unwrap_or(ACCESS_CONTROL_ALLOW_HEADERS_VALUE), + ) + .header(ALLOW, ACCESS_CONTROL_ALLOW_METHODS_VALUE) + .body(Empty::new().map_err(|x| match x {}).boxed()) + .map_err(|e| { + RestError::SubzeroCore(InternalError { + message: e.to_string(), + }) + }); + } + + // validate the jwt token + let jwt_parsed = backend + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) + .await + .map_err(HttpConnError::from)?; + let auth_header = parts .headers .get(AUTHORIZATION) .ok_or(RestError::SubzeroCore(InternalError { message: "Authorization header is required".to_string(), }))?; + let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; - let entry = db_schema_cache - .get_cached_or_remote( - &endpoint_cache_key, - auth_header, - connection_string, - &mut client, - ctx, - config, - ) - .await?; + let entry = match 
cached_entry { + Some(e) => e, + None => { + // if not cached, get the remote entry (will run the introspection query) + db_schema_cache + .get_remote( + &endpoint_cache_key, + auth_header, + connection_string, + &mut client, + ctx, + config, + ) + .await? + } + }; let (api_config, db_schema_owned) = entry.as_ref(); + let db_schema = db_schema_owned.borrow_schema(); let db_schemas = &api_config.db_schemas; // list of schemas available for the api @@ -999,8 +1093,8 @@ async fn handle_rest_inner( let _metrics = client.metrics(ctx); // FIXME: is everything in the context set correctly? // send the request to the local proxy - let response = make_raw_local_proxy_request(&mut client, headers, req_body).await?; - let (parts, body) = response.into_parts(); + let proxy_response = make_raw_local_proxy_request(&mut client, headers, req_body).await?; + let (response_parts, body) = proxy_response.into_parts(); let max_response = config.http_config.max_response_size_bytes; let bytes = read_body_with_limit(body, max_response) .await? @@ -1009,7 +1103,7 @@ async fn handle_rest_inner( // if the response status is greater than 399, then it is an error // FIXME: check if there are other error codes or shapes of the response - if parts.status.as_u16() > 399 { + if response_parts.status.as_u16() > 399 { // turn this postgres error from the json into PostgresError let postgres_error = serde_json::from_slice(&bytes) .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; @@ -1175,7 +1269,7 @@ async fn handle_rest_inner( .boxed(); // build the response - let mut response = Response::builder() + response = response .status(StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)) .header(CONTENT_TYPE, http_content_type); diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 43232db950..4febc7656e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -612,19 +612,25 @@ pub async fn handle_request( } } + let max_term = statuses + .iter() + .map(|(status, _)| status.acceptor_state.term) + .max() + .unwrap(); + // Find the most advanced safekeeper let (status, i) = statuses .into_iter() .max_by_key(|(status, _)| { ( status.acceptor_state.epoch, + status.flush_lsn, /* BEGIN_HADRON */ // We need to pull from the SK with the highest term. // This is because another compute may come online and vote the same highest term again on the other two SKs. // Then, there will be 2 computes running on the same term. status.acceptor_state.term, /* END_HADRON */ - status.flush_lsn, status.commit_lsn, ) }) @@ -634,6 +640,22 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); + // TODO(diko): This is a hadron-only check to make sure that we pull the timeline + // from the safekeeper with the highest term during timeline restore. + // We could avoid returning the error by calling bump_term after pull_timeline. + // However, this is not a big deal because we retry the pull_timeline requests. + // The check should be removed together with the custom hadron logic for + // safekeeper restore. 
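+ // Note: the donor chosen above maximizes (epoch, flush_lsn), so its acceptor term can + // still be below `max_term`. When `wait_for_peer_timeline_status` is set, the check below + // fails fast in that case and relies on pull_timeline retries instead of restoring from a + // safekeeper that is not on the highest term.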
+ if wait_for_peer_timeline_status && status.acceptor_state.term != max_term { + return Err(ApiError::PreconditionFailed( + format!( + "chosen safekeeper {} has term {}, but the most advanced term is {}", + safekeeper_host, status.acceptor_state.term, max_term + ) + .into(), + )); + } + match pull_timeline( status, safekeeper_host, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e083a49428..25ac8e5bd3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -195,12 +195,14 @@ impl StateSK { to: Configuration, ) -> Result { let result = self.state_mut().membership_switch(to).await?; + let flush_lsn = self.flush_lsn(); + let last_log_term = self.state().acceptor_state.get_last_log_term(flush_lsn); Ok(TimelineMembershipSwitchResponse { previous_conf: result.previous_conf, current_conf: result.current_conf, - last_log_term: self.state().acceptor_state.term, - flush_lsn: self.flush_lsn(), + last_log_term, + flush_lsn, }) } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index ff73719adb..b40da4fd65 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -644,6 +644,7 @@ async fn handle_tenant_timeline_safekeeper_migrate( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + // TODO(diko): it's not a PS operation, there should be a different permission scope. check_permissions(&req, Scope::PageServerApi)?; maybe_rate_limit(&req, tenant_id).await; @@ -665,6 +666,23 @@ async fn handle_tenant_timeline_safekeeper_migrate( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_safekeeper_migrate_abort( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + // TODO(diko): it's not a PS operation, there should be a different permission scope. 
+ check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + service + .tenant_timeline_safekeeper_migrate_abort(tenant_id, timeline_id) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_lsn_lease( service: Arc, req: Request, @@ -2611,6 +2629,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_safekeeper_migrate_abort, + RequestName("v1_tenant_timeline_safekeeper_migrate_abort"), + ) + }, + ) // LSN lease passthrough to all shards .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease", diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 619b5f69b8..c61ef9ff5d 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -471,11 +471,17 @@ impl Persistence { &self, input_node_id: NodeId, input_https_port: Option, + input_grpc_addr: Option, + input_grpc_port: Option, ) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; self.update_node( input_node_id, - listen_https_port.eq(input_https_port.map(|x| x as i32)), + ( + listen_https_port.eq(input_https_port.map(|x| x as i32)), + listen_grpc_addr.eq(input_grpc_addr), + listen_grpc_port.eq(input_grpc_port.map(|x| x as i32)), + ), ) .await } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 37380b8fbe..2c7e4ee2de 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7813,7 +7813,7 @@ impl Service { register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.listen_grpc_addr, + register_req.listen_grpc_addr.clone(), register_req.listen_grpc_port, register_req.availability_zone_id.clone(), self.config.use_https_pageserver_api, @@ -7848,6 +7848,8 @@ impl Service { .update_node_on_registration( register_req.node_id, register_req.listen_https_port, + register_req.listen_grpc_addr, + register_req.listen_grpc_port, ) .await? } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index a60ebb85c6..689d341b6a 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -24,12 +24,12 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; +use safekeeper_api::Term; use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; use safekeeper_api::models::{ PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse, }; -use safekeeper_api::{INITIAL_TERM, Term}; use safekeeper_client::mgmt_api; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -1230,10 +1230,7 @@ impl Service { } // It it is the same new_sk_set, we can continue the migration (retry). } else { - let prev_finished = timeline.cplane_notified_generation == timeline.generation - && timeline.sk_set_notified_generation == timeline.generation; - - if !prev_finished { + if !is_migration_finished(&timeline) { // The previous migration is committed, but the finish step failed. // Safekeepers/cplane might not know about the last membership configuration. // Retry the finish step to ensure smooth migration. 
@@ -1298,13 +1295,7 @@ impl Service { ) .await?; - let mut sync_position = (INITIAL_TERM, Lsn::INVALID); - for res in results.into_iter().flatten() { - let sk_position = (res.last_log_term, res.flush_lsn); - if sync_position < sk_position { - sync_position = sk_position; - } - } + let sync_position = Self::get_sync_position(&results)?; tracing::info!( %generation, @@ -1551,6 +1542,8 @@ impl Service { timeline_id: TimelineId, timeline: &TimelinePersistence, ) -> Result<(), ApiError> { + tracing::info!(generation=?timeline.generation, sk_set=?timeline.sk_set, new_sk_set=?timeline.new_sk_set, "retrying finish safekeeper migration"); + if timeline.new_sk_set.is_some() { // Logical error, should never happen. return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -1598,4 +1591,152 @@ impl Service { Ok(()) } + + /// Get membership switch responses from all safekeepers and return the sync position. + /// + /// Sync position is a position equal or greater than the commit position. + /// It is guaranteed that all WAL entries with (last_log_term, flush_lsn) + /// greater than the sync position are not committed (= not on a quorum). + /// + /// Returns error if there is no quorum of successful responses. + fn get_sync_position( + responses: &[mgmt_api::Result], + ) -> Result<(Term, Lsn), ApiError> { + let quorum_size = responses.len() / 2 + 1; + + let mut wal_positions = responses + .iter() + .flatten() + .map(|res| (res.last_log_term, res.flush_lsn)) + .collect::>(); + + // Should be already checked if the responses are from tenant_timeline_set_membership_quorum. + if wal_positions.len() < quorum_size { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful responses to get sync position: {}/{}", + wal_positions.len(), + quorum_size, + ))); + } + + wal_positions.sort(); + + Ok(wal_positions[quorum_size - 1]) + } + + /// Abort ongoing safekeeper migration. + pub(crate) async fn tenant_timeline_safekeeper_migrate_abort( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result<(), ApiError> { + // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks. + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineSafekeeperMigrate, + ) + .await; + + // Fetch current timeline configuration from the configuration storage. + let timeline = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + + let Some(timeline) = timeline else { + return Err(ApiError::NotFound( + anyhow::anyhow!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table" + ) + .into(), + )); + }; + + let mut generation = SafekeeperGeneration::new(timeline.generation as u32); + + let Some(new_sk_set) = &timeline.new_sk_set else { + // No new_sk_set -> no active migration that we can abort. + tracing::info!("timeline has no active migration"); + + if !is_migration_finished(&timeline) { + // The last migration is committed, but the finish step failed. + // Safekeepers/cplane might not know about the last membership configuration. + // Retry the finish step to make the timeline state clean. 
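+ // Nothing to abort in this branch: the retry below only re-delivers the already + // committed membership configuration to cplane/safekeepers before returning.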
+ self.finish_safekeeper_migration_retry(tenant_id, timeline_id, &timeline) + .await?; + } + return Ok(()); + }; + + tracing::info!(sk_set=?timeline.sk_set, ?new_sk_set, ?generation, "aborting timeline migration"); + + let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; + let new_safekeepers = self.get_safekeepers(new_sk_set)?; + + let cur_sk_member_set = + Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?; + + // Increment current generation and remove new_sk_set from the timeline to abort the migration. + generation = generation.next(); + + let mconf = membership::Configuration { + generation, + members: cur_sk_member_set, + new_members: None, + }; + + // Exclude safekeepers which were added during the current migration. + let cur_ids: HashSet = cur_safekeepers.iter().map(|sk| sk.get_id()).collect(); + let exclude_safekeepers = new_safekeepers + .into_iter() + .filter(|sk| !cur_ids.contains(&sk.get_id())) + .collect::>(); + + let exclude_requests = exclude_safekeepers + .iter() + .map(|sk| TimelinePendingOpPersistence { + sk_id: sk.skp.id, + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: generation.into_inner() as i32, + op_kind: SafekeeperTimelineOpKind::Exclude, + }) + .collect::>(); + + let cur_sk_set = cur_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>(); + + // Persist new mconf and exclude requests. + self.persistence + .update_timeline_membership( + tenant_id, + timeline_id, + generation, + &cur_sk_set, + None, + &exclude_requests, + ) + .await?; + + // At this point we have already committed the abort, but still need to notify + // cplane/safekeepers with the new mconf. That's what finish_safekeeper_migration does. + self.finish_safekeeper_migration( + tenant_id, + timeline_id, + &cur_safekeepers, + &mconf, + &exclude_safekeepers, + ) + .await?; + + Ok(()) + } +} + +fn is_migration_finished(timeline: &TimelinePersistence) -> bool { + timeline.cplane_notified_generation == timeline.generation + && timeline.sk_set_notified_generation == timeline.generation } diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index d235ac2143..c77a372017 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -78,20 +78,26 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def prewarm_lfc(self, from_endpoint_id: str | None = None): + def prewarm_lfc(self, from_endpoint_id: str | None = None) -> dict[str, str]: """ Prewarm LFC cache from given endpoint and wait till it finishes or errors """ params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict() self.post(self.prewarm_url, params=params).raise_for_status() - self.prewarm_lfc_wait() + return self.prewarm_lfc_wait() - def prewarm_lfc_wait(self): + def cancel_prewarm_lfc(self): + """ + Cancel LFC prewarm if any is ongoing + """ + self.delete(self.prewarm_url).raise_for_status() + + def prewarm_lfc_wait(self) -> dict[str, str]: """ Wait till LFC prewarm returns with error or success. 
If prewarm was not requested before calling this function, it will error """ - statuses = "failed", "completed", "skipped" + statuses = "failed", "completed", "skipped", "cancelled" def prewarmed(): json = self.prewarm_lfc_status() @@ -101,6 +107,7 @@ class EndpointHttpClient(requests.Session): wait_until(prewarmed, timeout=60) res = self.prewarm_lfc_status() assert res["status"] != "failed", res + return res def offload_lfc_status(self) -> dict[str, str]: res = self.get(self.offload_url) @@ -108,29 +115,31 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def offload_lfc(self): + def offload_lfc(self) -> dict[str, str]: """ Offload LFC cache to endpoint storage and wait till offload finishes or errors """ self.post(self.offload_url).raise_for_status() - self.offload_lfc_wait() + return self.offload_lfc_wait() - def offload_lfc_wait(self): + def offload_lfc_wait(self) -> dict[str, str]: """ Wait till LFC offload returns with error or success. If offload was not requested before calling this function, it will error """ + statuses = "failed", "completed", "skipped" def offloaded(): json = self.offload_lfc_status() status, err = json["status"], json.get("error") - assert status in ["failed", "completed"], f"{status}, {err=}" + assert status in statuses, f"{status}, {err=}" wait_until(offloaded, timeout=60) res = self.offload_lfc_status() assert res["status"] != "failed", res + return res - def promote(self, promote_spec: dict[str, Any], disconnect: bool = False): + def promote(self, promote_spec: dict[str, Any], disconnect: bool = False) -> dict[str, str]: url = f"http://localhost:{self.external_port}/promote" if disconnect: try: # send first request to start promote and disconnect diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b5148cbfdc..41213d374a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -262,7 +262,6 @@ class PgProtocol: # pooler does not support statement_timeout # Check if the hostname contains the string 'pooler' hostname = result.get("host", "") - log.info(f"Hostname: {hostname}") options = result.get("options", "") if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" @@ -2314,6 +2313,7 @@ class NeonStorageController(MetricsGetter, LogUtils): timeline_id: TimelineId, new_sk_set: list[int], ): + log.info(f"migrate_safekeepers({tenant_id}, {timeline_id}, {new_sk_set})") response = self.request( "POST", f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate", @@ -2323,6 +2323,19 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"migrate_safekeepers success: {response.json()}") + def abort_safekeeper_migration( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort", + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"abort_safekeeper_migration success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index a780aa2e52..94c18ac548 100644 --- 
a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -863,7 +863,6 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") -@pytest.mark.skip(reason="Lakebase mode") def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): """ Test that when the pageserver detects corruption during image layer creation, @@ -890,7 +889,9 @@ def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): timeline_id = env.initial_timeline pageserver_http = env.pageserver.http_client() - workload = Workload(env, tenant_id, timeline_id) + workload = Workload( + env, tenant_id, timeline_id, endpoint_opts={"config_lines": ["neon.lakebase_mode=true"]} + ) workload.init() # Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index 2bbe8c3e97..a96f18177c 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,6 +1,6 @@ import random -import threading from enum import StrEnum +from threading import Thread from time import sleep from typing import Any @@ -47,19 +47,23 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) # With autoprewarm, we need to be sure LFC was offloaded after all writes # finish, so we sleep. Otherwise we'll have less prewarmed pages than we want sleep(AUTOOFFLOAD_INTERVAL_SECS) - client.offload_lfc_wait() - return + offload_res = client.offload_lfc_wait() + log.info(offload_res) + return offload_res if method == PrewarmMethod.COMPUTE_CTL: status = client.prewarm_lfc_status() assert status["status"] == "not_prewarmed" assert "error" not in status - client.offload_lfc() + offload_res = client.offload_lfc() + log.info(offload_res) assert client.prewarm_lfc_status()["status"] == "not_prewarmed" + parsed = prom_parse(client) desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} assert parsed == desired, f"{parsed=} != {desired=}" - return + + return offload_res raise AssertionError(f"{method} not in PrewarmMethod") @@ -68,21 +72,30 @@ def prewarm_endpoint( method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None ): if method == PrewarmMethod.AUTOPREWARM: - client.prewarm_lfc_wait() + prewarm_res = client.prewarm_lfc_wait() + log.info(prewarm_res) elif method == PrewarmMethod.COMPUTE_CTL: - client.prewarm_lfc() + prewarm_res = client.prewarm_lfc() + log.info(prewarm_res) + return prewarm_res elif method == PrewarmMethod.POSTGRES: cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,)) -def check_prewarmed( +def check_prewarmed_contains( method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int] ): if method == PrewarmMethod.AUTOPREWARM: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + assert prom_parse(client)[PREWARM_LABEL] == 1 elif method == PrewarmMethod.COMPUTE_CTL: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0} assert prom_parse(client) == desired @@ -149,9 
+162,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): log.info(f"Used LFC size: {lfc_used_pages}") pg_cur.execute("select * from neon.get_prewarm_info()") total, prewarmed, skipped, _ = pg_cur.fetchall()[0] - log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}") - progress = (prewarmed + skipped) * 100 // total - log.info(f"Prewarm progress: {progress}%") assert lfc_used_pages > 10000 assert total > 0 assert prewarmed > 0 @@ -161,7 +171,54 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} - check_prewarmed(method, client, desired) + check_prewarmed_contains(method, client, desired) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm_cancel(neon_simple_env: NeonEnv): + """ + Test we can cancel LFC prewarm and prewarm successfully after + """ + env = neon_simple_env + n_records = 1000000 + cfg = [ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ] + endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create schema neon; create extension neon with schema neon") + pg_cur.execute("create database lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + client = endpoint.http_client() + method = PrewarmMethod.COMPUTE_CTL + offload_lfc(method, client, pg_cur) + + endpoint.stop() + endpoint.start() + + thread = Thread(target=lambda: prewarm_endpoint(method, client, pg_cur, None)) + thread.start() + # wait 2 seconds to ensure we cancel prewarm SQL query + sleep(2) + client.cancel_prewarm_lfc() + thread.join() + assert client.prewarm_lfc_status()["status"] == "cancelled" + + prewarm_endpoint(method, client, pg_cur, None) + assert client.prewarm_lfc_status()["status"] == "completed" @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") @@ -178,9 +235,8 @@ def test_lfc_prewarm_empty(neon_simple_env: NeonEnv): cur = conn.cursor() cur.execute("create schema neon; create extension neon with schema neon") method = PrewarmMethod.COMPUTE_CTL - offload_lfc(method, client, cur) - prewarm_endpoint(method, client, cur, None) - assert client.prewarm_lfc_status()["status"] == "skipped" + assert offload_lfc(method, client, cur)["status"] == "skipped" + assert prewarm_endpoint(method, client, cur, None)["status"] == "skipped" # autoprewarm isn't needed as we prewarm manually @@ -251,11 +307,11 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet workload_threads = [] for _ in range(n_threads): - t = threading.Thread(target=workload) + t = Thread(target=workload) workload_threads.append(t) t.start() - prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread = Thread(target=prewarm) prewarm_thread.start() def prewarmed(): diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py index 9415d6886c..26165b40cd 100644 --- a/test_runner/regress/test_replica_promotes.py 
+++ b/test_runner/regress/test_replica_promotes.py @@ -145,6 +145,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod): stop_and_check_lsn(secondary, None) if method == PromoteMethod.COMPUTE_CTL: + log.info("Restarting primary to check new config") secondary.stop() # In production, compute ultimately receives new compute spec from cplane. secondary.respec(mode="Primary") diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index 2ceeea37a5..ba067b97de 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -286,3 +286,265 @@ def test_sk_generation_aware_tombstones(neon_env_builder: NeonEnvBuilder): assert re.match(r".*Timeline .* deleted.*", exc.value.response.text) # The timeline should remain deleted. expect_deleted(second_sk) + + +def test_safekeeper_migration_stale_timeline(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper migration handles stale timeline correctly by migrating to + a safekeeper with a stale timeline. + 1. Check that we are waiting for the stale timeline to catch up with the commit lsn. + The migration might fail if there is no compute to advance the WAL. + 2. Check that we rely on last_log_term (and not the current term) when waiting for the + sync_position on step 7. + 3. Check that migration succeeds if the compute is running. + """ + neon_env_builder.num_safekeepers = 2 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + env.storage_controller.allowed_errors.append(".*not enough successful .* to reach quorum.*") + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + active_sk = env.get_safekeeper(mconf["sk_set"][0]) + other_sk = [sk for sk in env.safekeepers if sk.id != active_sk.id][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=[active_sk.id]) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Pull the timeline to other_sk, so other_sk now has a "stale" timeline on it. + other_sk.pull_timeline([active_sk], env.initial_tenant, env.initial_timeline) + + # Advance the WAL on active_sk. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # The test is more tricky if we have the same last_log_term but different term/flush_lsn. + # Stop the active_sk during the endpoint shutdown because otherwise compute_ctl runs + # sync_safekeepers and advances last_log_term on active_sk. + active_sk.stop() + ep.stop(mode="immediate") + active_sk.start() + + active_sk_status = active_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + other_sk_status = other_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + + # other_sk should have the same last_log_term, but a stale flush_lsn. + assert active_sk_status.last_log_term == other_sk_status.last_log_term + assert active_sk_status.flush_lsn > other_sk_status.flush_lsn + + commit_lsn = active_sk_status.flush_lsn + + # Bump the term on other_sk to make it higher than active_sk. + # This is to make sure we don't use current term instead of last_log_term in the algorithm. 
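+ # term_bump should only raise other_sk's acceptor term; it appends no WAL, so + # last_log_term/flush_lsn on other_sk stay unchanged and it remains the stale copy.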
+ other_sk.http_client().term_bump( + env.initial_tenant, env.initial_timeline, active_sk_status.term + 100 + ) + + # TODO(diko): now it fails because the timeline on other_sk is stale and there is no compute + # to catch it up with active_sk. It might be fixed in https://databricks.atlassian.net/browse/LKB-946 + # if we delete stale timelines before starting the migration. + # But the rest of the test is still valid: we should not lose committed WAL after the migration. + with pytest.raises( + StorageControllerApiException, match="not enough successful .* to reach quorum" + ): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["new_sk_set"] == [other_sk.id] + assert mconf["sk_set"] == [active_sk.id] + assert mconf["generation"] == 2 + + # Start the endpoint, so it advances the WAL on other_sk. + ep.start(safekeeper_generation=2, safekeepers=[active_sk.id, other_sk.id]) + # Now the migration should succeed. + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + # Check that we didn't lose committed WAL. + assert ( + other_sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline).flush_lsn + >= commit_lsn + ) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] + + +def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder): + """ + Test that we pull the timeline from the most advanced safekeeper during the + migration and do not lose committed WAL. + """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 3, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + sk_set = mconf["sk_set"] + assert len(sk_set) == 3 + + other_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=sk_set) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Stop one sk, so we have a lagging WAL on it. + env.get_safekeeper(sk_set[0]).stop() + # Advance the WAL on the other sks. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # Stop other sks to make sure compute_ctl doesn't advance the last_log_term on them during shutdown. + for sk_id in sk_set[1:]: + env.get_safekeeper(sk_id).stop() + ep.stop(mode="immediate") + for sk_id in sk_set: + env.get_safekeeper(sk_id).start() + + # Bump the term on the lagging sk to make sure we don't use it to choose the most advanced sk. + env.get_safekeeper(sk_set[0]).http_client().term_bump( + env.initial_tenant, env.initial_timeline, 100 + ) + + def get_commit_lsn(sk_set: list[int]): + flush_lsns = [] + last_log_terms = [] + for sk_id in sk_set: + sk = env.get_safekeeper(sk_id) + status = sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline) + flush_lsns.append(status.flush_lsn) + last_log_terms.append(status.last_log_term) + + # In this test we assume that all sks have the same last_log_term. 
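+ # The commit LSN is the highest LSN present on a quorum: with flush LSNs sorted in + # descending order, index len(sk_set) // 2 is acknowledged by a majority (e.g. the + # 2nd highest of 3).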
+ assert len(set(last_log_terms)) == 1 + + flush_lsns.sort(reverse=True) + commit_lsn = flush_lsns[len(sk_set) // 2] + + log.info(f"sk_set: {sk_set}, flush_lsns: {flush_lsns}, commit_lsn: {commit_lsn}") + return commit_lsn + + commit_lsn_before_migration = get_commit_lsn(sk_set) + + # Make two migrations, so the lagging sk stays in the sk_set, but other sks are replaced. + new_sk_set1 = [sk_set[0], sk_set[1], other_sk] # remove sk_set[2], add other_sk + new_sk_set2 = [sk_set[0], other_sk, sk_set[2]] # remove sk_set[1], add sk_set[2] back + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set1 + ) + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set2 + ) + + commit_lsn_after_migration = get_commit_lsn(new_sk_set2) + + # We should not lose committed WAL. + # If we had chosen the lagging sk to pull the timeline from, this might fail. + assert commit_lsn_before_migration <= commit_lsn_after_migration + + ep.start(safekeeper_generation=5, safekeepers=new_sk_set2) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] + + +def test_abort_safekeeper_migration(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper migration can be aborted. + 1. Insert failpoints and ensure the abort successfully reverts the timeline state. + 2. Check that the endpoint is operational after the abort. + """ + neon_env_builder.num_safekeepers = 2 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert len(mconf["sk_set"]) == 1 + cur_sk = mconf["sk_set"][0] + cur_gen = 1 + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"]) + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (1)") + + another_sk = [sk.id for sk in env.safekeepers if sk.id != cur_sk][0] + + failpoints = [ + "sk-migration-after-step-3", + "sk-migration-after-step-4", + "sk-migration-after-step-5", + "sk-migration-after-step-7", + ] + + for fp in failpoints: + env.storage_controller.configure_failpoints((fp, "return(1)")) + + with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [another_sk] + ) + cur_gen += 1 + + env.storage_controller.configure_failpoints((fp, "off")) + + # We should have a joint mconf after the failure. + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == cur_gen + assert mconf["sk_set"] == [cur_sk] + assert mconf["new_sk_set"] == [another_sk] + + env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline) + cur_gen += 1 + + # Abort should revert the timeline to the previous sk_set and increment the generation. 
+ mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == cur_gen + assert mconf["sk_set"] == [cur_sk] + assert mconf["new_sk_set"] is None + + assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{cur_gen}:") + ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})") + + # After step-8 the final mconf is committed and the migration is not abortable anymore. + # So the abort should not abort anything. + env.storage_controller.configure_failpoints(("sk-migration-after-step-8", "return(1)")) + + with pytest.raises(StorageControllerApiException, match="failpoint sk-migration-after-step-8"): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [another_sk] + ) + cur_gen += 2 + + env.storage_controller.configure_failpoints((fp, "off")) + + env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline) + + # The migration is fully committed, no abort should have been performed. + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == cur_gen + assert mconf["sk_set"] == [another_sk] + assert mconf["new_sk_set"] is None + + ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})") + ep.clear_buffers() + assert ep.safe_psql("SELECT * FROM t") == [(i + 1,) for i in range(cur_gen) if i % 2 == 0] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 668bdc7e51..a685992b51 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2742,7 +2742,6 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde wait_until(unevicted) -@pytest.mark.skip(reason="Lakebase mode") def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): """ Test that the timeline disk usage circuit breaker works as expected. We test that: @@ -2762,7 +2761,12 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): # Create a timeline and endpoint env.create_branch("test_timeline_disk_usage_limit") - endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit") + endpoint = env.endpoints.create_start( + "test_timeline_disk_usage_limit", + config_lines=[ + "neon.lakebase_mode=true", + ], + ) # Install the neon extension in the test database. We need it to query perf counter metrics. 
with closing(endpoint.connect()) as conn: diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c9f9fdd011..2155cb165d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c9f9fdd0113b52c0bd535afdb09d3a543aeee25f +Subproject commit 2155cb165d05f617eb2c8ad7e43367189b627703 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aaaeff2550..2aaab3bb4a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aaaeff2550d5deba58847f112af9b98fa3a58b00 +Subproject commit 2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9b9cb4b3e3..a42351fcd4 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9b9cb4b3e33347aea8f61e606bb6569979516de5 +Subproject commit a42351fcd41ea01edede1daed65f651e838988fc diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index fa1788475e..1e01fcea2a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit fa1788475e3146cc9c7c6a1b74f48fd296898fcd +Subproject commit 1e01fcea2a6b38180021aa83e0051d95286d9096 diff --git a/vendor/revisions.json b/vendor/revisions.json index 7212c9f7c7..c02c748a72 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "fa1788475e3146cc9c7c6a1b74f48fd296898fcd" + "1e01fcea2a6b38180021aa83e0051d95286d9096" ], "v16": [ "16.9", - "9b9cb4b3e33347aea8f61e606bb6569979516de5" + "a42351fcd41ea01edede1daed65f651e838988fc" ], "v15": [ "15.13", - "aaaeff2550d5deba58847f112af9b98fa3a58b00" + "2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85" ], "v14": [ "14.18", - "c9f9fdd0113b52c0bd535afdb09d3a543aeee25f" + "2155cb165d05f617eb2c8ad7e43367189b627703" ] }