diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql index a4b946e8e2..28c868ff72 100644 --- a/compute/etc/sql_exporter/checkpoints_req.17.sql +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -1 +1 @@ -SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; +SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql index eb8427c883..421448c0de 100644 --- a/compute/etc/sql_exporter/checkpoints_req.sql +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -1 +1 @@ -SELECT checkpoints_req FROM pg_stat_bgwriter; +SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql index c50853134c..bfa9b1b3d6 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.sql +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -1 +1 @@ -SELECT checkpoints_timed FROM pg_stat_bgwriter; +SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql index d97d625d4c..3fe638e489 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql @@ -1 +1 @@ -SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled; +SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled; diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql index be02b8a094..9a042547f0 100644 --- a/compute/etc/sql_exporter/compute_current_lsn.sql +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8 + ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql index f2454235b7..2224c02d8d 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -1,7 +1,7 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. 
We only WAL-log the completely built -- snapshot files - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + (SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql index 73a9c11405..17cf1228d3 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql @@ -1,7 +1,7 @@ SELECT - (SELECT current_setting('neon.timeline_id')) AS timeline_id, + (SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; + (SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql index 16da899de2..33ca1137dc 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql @@ -1,9 +1,9 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. 
We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) - FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name + (SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) + FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name ) AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_max_connections.sql b/compute/etc/sql_exporter/compute_max_connections.sql index 99a49483a6..1613c962a2 100644 --- a/compute/etc/sql_exporter/compute_max_connections.sql +++ b/compute/etc/sql_exporter/compute_max_connections.sql @@ -1 +1 @@ -SELECT current_setting('max_connections') as max_connections; +SELECT pg_catalog.current_setting('max_connections') AS max_connections; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql index d2281fdd42..e613939e71 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - age(datfrozenxid) frozen_xid_age -FROM pg_database + pg_catalog.age(datfrozenxid) frozen_xid_age +FROM pg_catalog.pg_database ORDER BY frozen_xid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql index ed57894b3a..7949bacfff 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - mxid_age(datminmxid) min_mxid_age -FROM pg_database + pg_catalog.mxid_age(datminmxid) min_mxid_age +FROM pg_catalog.pg_database ORDER BY min_mxid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql index 318b31ab41..fb96056881 100644 --- a/compute/etc/sql_exporter/compute_receive_lsn.sql +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8 ELSE 0 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql index 50740cb5df..e380a7acc7 100644 --- a/compute/etc/sql_exporter/compute_subscriptions_count.sql +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -1 +1 @@ -SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; +SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql index 6824480fdb..480c4fb439 100644 --- a/compute/etc/sql_exporter/connection_counts.sql +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -1 +1 @@ -SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; +SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git 
a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index fe0360ab5c..59205e6ed3 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1,5 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total -FROM pg_database +SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total +FROM pg_catalog.pg_database -- Ignore invalid databases, as we will likely have problems with -- getting their size from the Pageserver. WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql index 35fa42c34c..02cb2b4649 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -3,6 +3,6 @@ -- minutes. SELECT - x::text as duration_seconds, + x::pg_catalog.text AS duration_seconds, neon.approximate_working_set_size_seconds(x) AS size FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql index 46c7d1610c..aab93d433a 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -3,6 +3,6 @@ SELECT x AS duration, - neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM ( VALUES ('5m'), ('15m'), ('1h') ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql index 378904c1fe..41c11e0adc 100644 --- a/compute/etc/sql_exporter/lfc_cache_size_limit.sql +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -1 +1 @@ -SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; +SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql index 1b1c038501..8964d0d8ff 100644 --- a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -1,3 +1,3 @@ -SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn -FROM pg_replication_slots +SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn +FROM pg_catalog.pg_replication_slots WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql index 2d2355a9a7..d44fdebe38 100644 --- a/compute/etc/sql_exporter/max_cluster_size.sql +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -1 +1 @@ -SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; +SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 12e6c4ae59..1a1e54a7c6 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,13 
+1,13 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. -SELECT pg_database_size(datname) AS db_size, +SELECT pg_catalog.pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, tup_updated AS updated, tup_deleted AS deleted, datname -FROM pg_stat_database +FROM pg_catalog.pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database -- Ignore invalid databases, as we will likely have problems with diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql index 60a6981acd..d3b7aa724b 100644 --- a/compute/etc/sql_exporter/replication_delay_bytes.sql +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -3,4 +3,4 @@ -- replay LSN may have advanced past the receive LSN we are using for the -- calculation. -SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; +SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql index a76809ad74..af4dd3fd90 100644 --- a/compute/etc/sql_exporter/replication_delay_seconds.sql +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -1,5 +1,5 @@ SELECT CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp())) END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql index 3e2aadfc28..ccb3504d58 100644 --- a/compute/etc/sql_exporter/retained_wal.sql +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -1,10 +1,10 @@ SELECT slot_name, - pg_wal_lsn_diff( + pg_catalog.pg_wal_lsn_diff( CASE - WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() - ELSE pg_current_wal_lsn() + WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn() + ELSE pg_catalog.pg_current_wal_lsn() END, - restart_lsn)::FLOAT8 AS retained_wal -FROM pg_replication_slots + restart_lsn)::pg_catalog.FLOAT8 AS retained_wal +FROM pg_catalog.pg_replication_slots WHERE active = false; diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql index 5521270851..5a94cc3373 100644 --- a/compute/etc/sql_exporter/wal_is_lost.sql +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -4,4 +4,4 @@ SELECT WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost -FROM pg_replication_slots; +FROM pg_catalog.pg_replication_slots; diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2b4802f309..f383683ef8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -279,7 +279,7 @@ fn main() -> Result<()> { config, )?; - let exit_code = compute_node.run()?; + let exit_code = compute_node.run().context("running compute node")?; scenario.teardown(); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index e4207876ac..2458fe3c11 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -24,9 +24,9 @@ pub async fn 
check_writability(compute: &ComputeNode) -> Result<()> { }); let query = " - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now();"; + SET updated_at = pg_catalog.now();"; match client.simple_query(query).await { Result::Ok(result) => { diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5086f2f998..1606816b3b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -32,8 +32,12 @@ use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; use tokio::{spawn, sync::watch, task::JoinHandle, time}; +use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration, +}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; @@ -194,6 +198,7 @@ pub struct ComputeState { pub startup_span: Option, pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_prewarm_token: CancellationToken, pub lfc_offload_state: LfcOffloadState, /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if @@ -219,6 +224,7 @@ impl ComputeState { lfc_offload_state: LfcOffloadState::default(), terminate_flush_lsn: None, promote_state: None, + lfc_prewarm_token: CancellationToken::new(), } } @@ -585,7 +591,7 @@ impl ComputeNode { // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. - const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none"; + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none"; let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute @@ -1559,6 +1565,41 @@ impl ComputeNode { Ok(lsn) } + fn sync_safekeepers_with_retries(&self, storage_auth_token: Option) -> Result { + let max_retries = 5; + let mut attempts = 0; + loop { + let result = self.sync_safekeepers(storage_auth_token.clone()); + match &result { + Ok(_) => { + if attempts > 0 { + tracing::info!("sync_safekeepers succeeded after {attempts} retries"); + } + return result; + } + Err(e) if attempts < max_retries => { + tracing::info!( + "sync_safekeepers failed, will retry (attempt {attempts}): {e:#}" + ); + } + Err(err) => { + tracing::warn!( + "sync_safekeepers still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + let backoff = exponential_backoff_duration( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + std::thread::sleep(backoff); + attempts += 1; + } + } + /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. #[instrument(skip_all)] @@ -1594,7 +1635,7 @@ impl ComputeNode { lsn } else { info!("starting safekeepers syncing"); - self.sync_safekeepers(pspec.storage_auth_token.clone()) + self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")? 
}; info!("safekeepers synced at LSN {}", lsn); @@ -1889,7 +1930,7 @@ impl ComputeNode { // It doesn't matter what were the options before, here we just want // to connect and create a new superuser role. - const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0"; zenith_admin_conf.options(ZENITH_OPTIONS); let mut client = @@ -2344,13 +2385,13 @@ impl ComputeNode { let result = client .simple_query( "SELECT - row_to_json(pg_stat_statements) + pg_catalog.row_to_json(pss) FROM - pg_stat_statements + public.pg_stat_statements pss WHERE - userid != 'cloud_admin'::regrole::oid + pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid ORDER BY - (mean_exec_time + mean_plan_time) DESC + (pss.mean_exec_time + pss.mean_plan_time) DESC LIMIT 100", ) .await; @@ -2478,11 +2519,11 @@ LIMIT 100", // check the role grants first - to gracefully handle read-replicas. let select = "SELECT privilege_type - FROM pg_namespace - JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true - JOIN pg_user users ON acl.grantee = users.usesysid - WHERE users.usename = $1 - AND nspname = $2"; + FROM pg_catalog.pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true + JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid + WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name + AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name"; let rows = db_client .query(select, &[role_name, schema_name]) .await @@ -2551,8 +2592,9 @@ LIMIT 100", .await .with_context(|| format!("Failed to execute query: {query}"))?; } else { - let query = - format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + let query = format!( + "CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}" + ); db_client .simple_query(&query) .await diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 97e62c1c80..82cb28f1ac 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -7,7 +7,8 @@ use http::StatusCode; use reqwest::Client; use std::mem::replace; use std::sync::Arc; -use tokio::{io::AsyncReadExt, spawn}; +use tokio::{io::AsyncReadExt, select, spawn}; +use tokio_util::sync::CancellationToken; use tracing::{error, info}; #[derive(serde::Serialize, Default)] @@ -92,34 +93,35 @@ impl ComputeNode { /// If there is a prewarm request ongoing, return `false`, `true` otherwise. 
/// Has a failpoint "compute-prewarm" pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { + let token: CancellationToken; { - let state = &mut self.state.lock().unwrap().lfc_prewarm_state; - if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) { + let state = &mut self.state.lock().unwrap(); + token = state.lfc_prewarm_token.clone(); + if let LfcPrewarmState::Prewarming = + replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming) + { return false; } } crate::metrics::LFC_PREWARMS.inc(); - let cloned = self.clone(); + let this = self.clone(); spawn(async move { - let state = match cloned.prewarm_impl(from_endpoint).await { - Ok(true) => LfcPrewarmState::Completed, - Ok(false) => { - info!( - "skipping LFC prewarm because LFC state is not found in endpoint storage" - ); - LfcPrewarmState::Skipped - } + let prewarm_state = match this.prewarm_impl(from_endpoint, token).await { + Ok(state) => state, Err(err) => { crate::metrics::LFC_PREWARM_ERRORS.inc(); error!(%err, "could not prewarm LFC"); - LfcPrewarmState::Failed { - error: format!("{err:#}"), - } + let error = format!("{err:#}"); + LfcPrewarmState::Failed { error } } }; - cloned.state.lock().unwrap().lfc_prewarm_state = state; + let state = &mut this.state.lock().unwrap(); + if let LfcPrewarmState::Cancelled = prewarm_state { + state.lfc_prewarm_token = CancellationToken::new(); + } + state.lfc_prewarm_state = prewarm_state; }); true } @@ -132,47 +134,70 @@ impl ComputeNode { /// Request LFC state from endpoint storage and load corresponding pages into Postgres. /// Returns a result with `false` if the LFC state is not found in endpoint storage. - async fn prewarm_impl(&self, from_endpoint: Option) -> Result { - let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; + async fn prewarm_impl( + &self, + from_endpoint: Option, + token: CancellationToken, + ) -> Result { + let EndpointStoragePair { + url, + token: storage_token, + } = self.endpoint_storage_pair(from_endpoint)?; #[cfg(feature = "testing")] - fail::fail_point!("compute-prewarm", |_| { - bail!("prewarm configured to fail because of a failpoint") - }); + fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint")); info!(%url, "requesting LFC state from endpoint storage"); - let request = Client::new().get(&url).bearer_auth(token); - let res = request.send().await.context("querying endpoint storage")?; - match res.status() { + let request = Client::new().get(&url).bearer_auth(storage_token); + let response = select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + response = request.send() => response + } + .context("querying endpoint storage")?; + + match response.status() { StatusCode::OK => (), - StatusCode::NOT_FOUND => { - return Ok(false); - } + StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped), status => bail!("{status} querying endpoint storage"), } let mut uncompressed = Vec::new(); - let lfc_state = res - .bytes() - .await - .context("getting request body from endpoint storage")?; - ZstdDecoder::new(lfc_state.iter().as_slice()) - .read_to_end(&mut uncompressed) - .await - .context("decoding LFC state")?; + let lfc_state = select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + lfc_state = response.bytes() => lfc_state + } + .context("getting request body from endpoint storage")?; + + let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice()); + select! 
{ + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + read = decoder.read_to_end(&mut uncompressed) => read + } + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}"); - info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres"); - - ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + // Client connection and prewarm info querying are fast and therefore don't need + // cancellation + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await - .context("connecting to postgres")? - .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) - .await - .context("loading LFC state into postgres") - .map(|_| ())?; + .context("connecting to postgres")?; + let pg_token = client.cancel_token(); - Ok(true) + let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed]; + select! { + res = client.query_one("select neon.prewarm_local_cache($1)", ¶ms) => res, + _ = token.cancelled() => { + pg_token.cancel_query(postgres::NoTls).await + .context("cancelling neon.prewarm_local_cache()")?; + return Ok(LfcPrewarmState::Cancelled) + } + } + .context("loading LFC state into postgres") + .map(|_| ())?; + + Ok(LfcPrewarmState::Completed) } /// If offload request is ongoing, return false, true otherwise @@ -200,20 +225,20 @@ impl ComputeNode { async fn offload_lfc_with_state_update(&self) { crate::metrics::LFC_OFFLOADS.inc(); - - let Err(err) = self.offload_lfc_impl().await else { - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; - return; + let state = match self.offload_lfc_impl().await { + Ok(state) => state, + Err(err) => { + crate::metrics::LFC_OFFLOAD_ERRORS.inc(); + error!(%err, "could not offload LFC"); + let error = format!("{err:#}"); + LfcOffloadState::Failed { error } + } }; - crate::metrics::LFC_OFFLOAD_ERRORS.inc(); - error!(%err, "could not offload LFC state to endpoint storage"); - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { - error: format!("{err:#}"), - }; + self.state.lock().unwrap().lfc_offload_state = state; } - async fn offload_lfc_impl(&self) -> Result<()> { + async fn offload_lfc_impl(&self) -> Result { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from Postgres"); @@ -228,7 +253,7 @@ impl ComputeNode { .context("deserializing LFC state")?; let Some(state) = state else { info!(%url, "empty LFC state, not exporting"); - return Ok(()); + return Ok(LfcOffloadState::Skipped); }; let mut compressed = Vec::new(); @@ -242,7 +267,7 @@ impl ComputeNode { let request = Client::new().put(url).bearer_auth(token).body(compressed); match request.send().await { - Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed), Ok(res) => bail!( "Request to endpoint storage failed with status: {}", res.status() @@ -250,4 +275,8 @@ impl ComputeNode { Err(err) => Err(err).context("writing to endpoint storage"), } } + + pub fn cancel_prewarm(self: &Arc) { + self.state.lock().unwrap().lfc_prewarm_token.cancel(); + } } diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs index a34368c531..29195b60e9 100644 --- a/compute_tools/src/compute_promote.rs +++ b/compute_tools/src/compute_promote.rs @@ -78,7 +78,7 @@ impl ComputeNode { const RETRIES: i32 = 20; for i in 0..=RETRIES { let row = 
client - .query_one("SELECT pg_last_wal_replay_lsn()", &[]) + .query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[]) .await .context("getting last replay lsn")?; let lsn: u64 = row.get::(0).into(); @@ -103,7 +103,7 @@ impl ComputeNode { .await .context("setting safekeepers")?; client - .query("SELECT pg_reload_conf()", &[]) + .query("SELECT pg_catalog.pg_reload_conf()", &[]) .await .context("reloading postgres config")?; @@ -113,7 +113,7 @@ impl ComputeNode { }); let row = client - .query_one("SELECT * FROM pg_promote()", &[]) + .query_one("SELECT * FROM pg_catalog.pg_promote()", &[]) .await .context("pg_promote")?; if !row.get::(0) { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index ab729d62b5..27e610a87d 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -139,6 +139,15 @@ paths: application/json: schema: $ref: "#/components/schemas/LfcPrewarmState" + delete: + tags: + - Prewarm + summary: Cancel ongoing LFC prewarm + description: "" + operationId: cancelLfcPrewarm + responses: + 202: + description: Prewarm cancelled /lfc/offload: post: @@ -636,7 +645,7 @@ components: properties: status: description: LFC offload status - enum: [not_offloaded, offloading, completed, failed] + enum: [not_offloaded, offloading, completed, skipped, failed] type: string error: description: LFC offload error, if any diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs index e98bd781a2..7483198723 100644 --- a/compute_tools/src/http/routes/lfc.rs +++ b/compute_tools/src/http/routes/lfc.rs @@ -46,3 +46,8 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response { ) } } + +pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode { + compute.cancel_prewarm(); + StatusCode::ACCEPTED +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 2fd3121f4f..869fdef11d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -99,7 +99,12 @@ impl From<&Server> for Router> { ); let authenticated_router = Router::>::new() - .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route( + "/lfc/prewarm", + get(lfc::prewarm_state) + .post(lfc::prewarm) + .delete(lfc::cancel_prewarm), + ) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/promote", post(promote::promote)) .route("/check_writability", post(check_writability::is_writable)) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 5f60b711c8..a9ddef58e5 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result, PostgresError> { .query( "SELECT datname FROM pg_catalog.pg_database WHERE datallowconn - AND datconnlimit <> - 2 + AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4) LIMIT 500", &[], ) @@ -67,7 +67,7 @@ pub async fn get_installed_extensions( let extensions: Vec<(String, String, i32)> = client .query( - "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", + "SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension", &[], ) .await? 
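The prewarm changes above race every await point against a `tokio_util::sync::CancellationToken`, so the new `DELETE /lfc/prewarm` route can abort an in-flight download simply by cancelling the token. A minimal, self-contained sketch of that pattern, using a hypothetical `fetch_state` helper rather than the actual `prewarm_impl`:

```rust
use anyhow::Context;
use tokio::select;
use tokio_util::sync::CancellationToken;

/// Outcome of a cancellable download, mirroring the Completed/Cancelled split.
enum FetchOutcome {
    Completed(Vec<u8>),
    Cancelled,
}

/// Race every await point against `token.cancelled()`, so whoever holds a clone
/// of the token can abort the operation between (or during) network calls.
async fn fetch_state(url: &str, token: CancellationToken) -> anyhow::Result<FetchOutcome> {
    let request = reqwest::Client::new().get(url);

    let response = select! {
        _ = token.cancelled() => return Ok(FetchOutcome::Cancelled),
        res = request.send() => res.context("sending request")?,
    };

    let body = select! {
        _ = token.cancelled() => return Ok(FetchOutcome::Cancelled),
        bytes = response.bytes() => bytes.context("reading body")?,
    };

    Ok(FetchOutcome::Completed(body.to_vec()))
}
```

The caller keeps a clone of the token (as `ComputeState::lfc_prewarm_token` does) and calls `token.cancel()` from the HTTP handler; whichever `select!` branch completes first wins, so cancellation takes effect at the next await point.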
diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 88d870df97..c8f911b5a6 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> { self.client .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") .await?; - self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?; self.client .simple_query( "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql index 367356e6eb..6e28e8d32c 100644 --- a/compute_tools/src/migrations/0002-alter_roles.sql +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -15,17 +15,17 @@ DO $$ DECLARE role_name text; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member') + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name); END LOOP; - FOR role_name IN SELECT rolname FROM pg_roles + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE - NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_') + NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql index adf159dc06..d67d6457c6 100644 --- a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql +++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}'; END IF; END $$; diff --git a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql index 47129d65b8..7f74d4ee28 100644 --- a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql +++ b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql @@ -5,9 +5,9 @@ DO $$ DECLARE role_name TEXT; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + FOR 
role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index 84fcb36391..714bdc735a 100644 --- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}'; EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}'; END IF; diff --git a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql index 0c81cef1c4..b5b209ef5e 100644 --- a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql @@ -2,7 +2,7 @@ DO $$ DECLARE bypassrls boolean; BEGIN - SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser'; + SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser'; IF NOT bypassrls THEN RAISE EXCEPTION 'neon_superuser cannot bypass RLS'; END IF; diff --git a/compute_tools/src/migrations/tests/0002-alter_roles.sql b/compute_tools/src/migrations/tests/0002-alter_roles.sql index 433f7b34f7..1755c9088c 100644 --- a/compute_tools/src/migrations/tests/0002-alter_roles.sql +++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql @@ -4,8 +4,8 @@ DECLARE BEGIN FOR role IN SELECT rolname AS name, rolinherit AS inherit - FROM pg_roles - WHERE pg_has_role(rolname, 'neon_superuser', 'member') + FROM pg_catalog.pg_roles + WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') LOOP IF NOT role.inherit THEN RAISE EXCEPTION '% cannot inherit', quote_ident(role.name); @@ -14,12 +14,12 @@ BEGIN FOR role IN SELECT rolname AS name, rolbypassrls AS bypassrls - FROM pg_roles - WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member') - AND NOT starts_with(rolname, 'pg_') + FROM pg_catalog.pg_roles + WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') + AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP IF role.bypassrls THEN - RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name); + RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name); END IF; END LOOP; END $$; diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql index b164d61295..498770f4fa 100644 --- a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql +++ 
b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,10 +1,10 @@ DO $$ BEGIN - IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN + IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN RETURN; END IF; - IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN + IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription'; END IF; END $$; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql index 3464a2b1cf..ec04cfe199 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql @@ -2,12 +2,12 @@ DO $$ DECLARE monitor record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, admin_option AS admin INTO monitor - FROM pg_auth_members - WHERE roleid = 'pg_monitor'::regrole - AND member = 'neon_superuser'::regrole; + FROM pg_catalog.pg_auth_members + WHERE roleid = 'pg_monitor'::pg_catalog.regrole + AND member = 'neon_superuser'::pg_catalog.regrole; IF monitor IS NULL THEN RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor'; diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index af7f50e95d..f3b28d76c9 100644 --- a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -2,11 +2,11 @@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute')) + SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')) INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') - AND pronamespace = 'pg_catalog'::regnamespace; + AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace; IF NOT can_execute THEN RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; END IF; diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql index e55dcdc3b6..197211300b 100644 --- a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql @@ -2,9 +2,9 @@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT has_function_privilege('neon_superuser', oid, 'execute') + SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute') INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname = 'pg_show_replication_origin_status' AND pronamespace = 'pg_catalog'::regnamespace; IF NOT can_execute THEN diff --git 
a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql index e62b742d30..0f772d67bd 100644 --- a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql @@ -2,10 +2,10 @@ DO $$ DECLARE signal_backend record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, admin_option AS admin INTO signal_backend - FROM pg_auth_members + FROM pg_catalog.pg_auth_members WHERE roleid = 'pg_signal_backend'::regrole AND member = 'neon_superuser'::regrole; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index ef14507436..8520d7221a 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -449,9 +449,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> { // like `postgres_exporter` use it to query Postgres statistics. // Use explicit 8 bytes type casts to match Rust types. let stats = cli.query_one( - "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time, - coalesce(sum(sessions), 0)::bigint AS total_sessions - FROM pg_stat_database + "SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time, + pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions + FROM pg_catalog.pg_stat_database WHERE datname NOT IN ( 'postgres', 'template0', @@ -487,11 +487,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result> = None; // Get all running client backends except ourself, use RFC3339 DateTime format. let backends = cli.query( - "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change + "SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change FROM pg_stat_activity - WHERE backend_type = 'client backend' - AND pid != pg_backend_pid() - AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? + WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text + AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid() + AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors? 
&[], ); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 09bbe89b41..4e16a75181 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async( .query_raw::( "SELECT datname AS name, - (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, + (SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner, NOT datallowconn AS restrict_conn, - datconnlimit = - 2 AS invalid + datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid FROM pg_catalog.pg_database;", &[], diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 00f34c1f0e..90c0e234d5 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -82,7 +82,7 @@ impl ComputeNode { info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); drop_subscriptions_done = match - client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await { + client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await { Ok(result) => !result.is_empty(), Err(e) => { @@ -1142,7 +1142,9 @@ async fn get_operations<'a>( if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { if libs.contains("pg_stat_statements") { return Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public", + ), comment: Some(String::from("create system extensions")), }))); } @@ -1150,11 +1152,13 @@ async fn get_operations<'a>( Ok(Box::new(empty())) } ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"), comment: Some(String::from("create pgaudit extensions")), }))), ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public", + ), comment: Some(String::from("create pgauditlogtofile extensions")), }))), // Disable pgaudit logging for postgres database. 
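The `get_database_stats` query above keeps the explicit 8-byte casts so the columns deserialize directly into Rust's `f64`/`i64`. A standalone sketch of that mapping with `tokio_postgres`, using a simplified query and a hypothetical connection string rather than the monitor's actual setup:

```rust
use tokio_postgres::NoTls;

/// The explicit ::float8 / ::int8 casts guarantee 8-byte wire values that map
/// directly onto Rust's f64 and i64; without them, sum(bigint) would come back
/// as numeric and `row.get` would panic on the type mismatch.
async fn database_stats(client: &tokio_postgres::Client) -> anyhow::Result<(f64, i64)> {
    let row = client
        .query_one(
            "SELECT COALESCE(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time, \
                    COALESCE(pg_catalog.sum(sessions), 0)::pg_catalog.int8 AS total_sessions \
             FROM pg_catalog.pg_stat_database",
            &[],
        )
        .await?;
    Ok((
        row.get::<_, f64>("total_active_time"),
        row.get::<_, i64>("total_sessions"),
    ))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Hypothetical connection string; adjust to the local setup.
    let (client, connection) =
        tokio_postgres::connect("host=localhost user=postgres", NoTls).await?;
    tokio::spawn(connection);
    let (active_time, sessions) = database_stats(&client).await?;
    println!("total_active_time={active_time} total_sessions={sessions}");
    Ok(())
}
```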
@@ -1178,7 +1182,7 @@ async fn get_operations<'a>( }, Operation { query: String::from( - "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'", + "UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false", ), comment: Some(String::from("compat/fix: make neon relocatable")), }, diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql index 7c60690c78..dd27105e16 100644 --- a/compute_tools/src/sql/add_availabilitycheck_tables.sql +++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql @@ -3,16 +3,17 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'health_check' + WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name + AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN - CREATE TABLE health_check ( - id serial primary key, - updated_at timestamptz default now() + CREATE TABLE public.health_check ( + id pg_catalog.int4 primary key generated by default as identity, + updated_at pg_catalog.timestamptz default pg_catalog.now() ); - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now(); + SET updated_at = pg_catalog.now(); END IF; END $$ \ No newline at end of file diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql index 3d7b15c590..714a6db1e9 100644 --- a/compute_tools/src/sql/anon_ext_fn_reassign.sql +++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql @@ -2,10 +2,11 @@ DO $$ DECLARE query varchar; BEGIN - FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};' - FROM pg_proc p - JOIN pg_namespace nsp ON p.pronamespace = nsp.oid - WHERE nsp.nspname = 'anon' LOOP + FOR query IN + SELECT pg_catalog.format('ALTER FUNCTION %I(%s) OWNER TO {db_owner};', p.oid::regproc, pg_catalog.pg_get_function_identity_arguments(p.oid)) + FROM pg_catalog.pg_proc p + WHERE p.pronamespace OPERATOR(pg_catalog.=) 'anon'::regnamespace::oid + LOOP EXECUTE query; END LOOP; END diff --git a/compute_tools/src/sql/create_privileged_role.sql b/compute_tools/src/sql/create_privileged_role.sql index ac2521445f..a682089cce 100644 --- a/compute_tools/src/sql/create_privileged_role.sql +++ b/compute_tools/src/sql/create_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}') + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name) THEN CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data; END IF; diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql index 58ebb0690b..d572332270 100644 --- a/compute_tools/src/sql/default_grants.sql +++ b/compute_tools/src/sql/default_grants.sql @@ -4,14 +4,14 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public' ) AND - current_setting('server_version_num')::int / 10000 >= 15 + pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15 THEN IF EXISTS( SELECT rolname FROM pg_catalog.pg_roles - WHERE 
rolname = 'web_access' + WHERE rolname OPERATOR(pg_catalog.=) 'web_access' ) THEN GRANT CREATE ON SCHEMA public TO web_access; @@ -20,7 +20,7 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public' ) THEN ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index f5d9420130..68b3f8b729 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,11 +2,17 @@ DO ${outer_tag}$ DECLARE subname TEXT; BEGIN - LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; - FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP - EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); - EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); - EXECUTE format('DROP SUBSCRIPTION %I;', subname); + LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE; + FOR subname IN + SELECT pg_subscription.subname + FROM pg_catalog.pg_subscription + WHERE subdbid OPERATOR(pg_catalog.=) ( + SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name + ) + LOOP + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname); + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); + EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; ${outer_tag}$; diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql index 4bb291876f..1a8876ad61 100644 --- a/compute_tools/src/sql/finalize_drop_subscriptions.sql +++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql @@ -3,19 +3,19 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'drop_subscriptions_done' - AND schemaname = 'neon' + WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name + AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name ) THEN CREATE TABLE neon.drop_subscriptions_done - (id serial primary key, timeline_id text); + (id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text); END IF; -- preserve the timeline_id of the last drop_subscriptions run -- to ensure that the cleanup of a timeline is executed only once. 
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch) - INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id')) + INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id')) ON CONFLICT (id) DO UPDATE - SET timeline_id = current_setting('neon.timeline_id'); + SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text; END $$ diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index 734607be02..2ed0f94bad 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -15,15 +15,15 @@ BEGIN WHERE schema_name IN ('public') LOOP FOR grantor IN EXECUTE - format( - 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s', + pg_catalog.format( + 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s', -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` quote_literal({role_name}) ) LOOP - EXECUTE format('SET LOCAL ROLE %I', grantor); + EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor); - revoke_query := format( + revoke_query := pg_catalog.format( 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I', schema, -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index dc502c6d2d..41bd0d4689 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -5,17 +5,17 @@ DO ${outer_tag}$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN SELECT nspowner::regrole::text FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text INTO schema_owner; - IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' + IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text THEN - EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); + EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 36dc648beb..03225d5e64 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -3,10 +3,10 @@ DO ${outer_tag}$ IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname} + WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name ) THEN - EXECUTE format('ALTER DATABASE %I is_template false', {datname}); + EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname}); END IF; END ${outer_tag}$; diff --git a/docs/rfcs/2025-07-07-node-deletion-api-improvement.md b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md new file mode 100644 index 0000000000..47dadaee35 --- /dev/null +++ b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md @@ -0,0 +1,246 @@ +# Node deletion API improvement + +Created on 2025-07-07 +Implemented on _TBD_ + +## Summary + +This RFC describes improvements 
to the storage controller API for gracefully deleting pageserver +nodes. + +## Motivation + +The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333) +has several limitations: + +- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and +which we cannot reach via SSH to stop the pageserver). This issue has been resolved by the tombstone +mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036) +- The process of node deletion is not graceful, i.e. it just imitates a node failure + +In this context, "graceful" node deletion means that users do not experience any disruption or +negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers +can handle the workload and all requirements are met). To achieve this, the system must perform +live migration of all tenant shards from the node being deleted while the node is still running +and continue processing all incoming requests. The node is removed only after all tenant shards +have been safely migrated. + +Although live migration can be achieved with the drain functionality, it leads to incorrect shard +placement, such as not matching availability zones. This results in unnecessary work to optimize +the placement that was just recently performed. + +If we delete a node before its tenant shards are fully moved, the new node won't have all the +needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at +first. If there are many tenant shards, this slowdown affects a huge number of users. + +Graceful node deletion is more complicated and can introduce new issues. It takes longer because +live migration of each tenant shard can last several minutes. Using non-blocking accessors may +also cause deletion to wait if other processes are holding the inner state lock. It also gets trickier +because we need to handle other requests, like drain and fill, at the same time. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- storage controller +- pageserver (indirectly) + +## Proposed implementation + +### Tombstones + +To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced +as part of the node's stored information. Each node has a separate `NodeLifecycle` field with two +possible states: `Active` and `Deleted`. When node deletion completes, the database row is not +deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted` +lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach +and register functionality must be aware of tombstones. Additionally, new debug handlers are +available for listing and deleting tombstones via the `/debug/v1/tombstone` path. + +### Gracefulness + +The problem of making node deletion graceful is complex and involves several challenges: + +- **Cancellable**: The operation must be cancellable to allow administrators to abort the process +if needed, e.g. if run by mistake. +- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node +deletion process. We need clear policies for handling concurrent operations: what happens when a +drain/fill request arrives while deletion is in progress, and what happens when a delete request +arrives while drain/fill is in progress.
+- **Persistent**: If the storage controller restarts during this long-running operation, we must +preserve progress and automatically resume the deletion process after the storage controller +restarts. +- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled +for deletion, as this would move shards to irrelevant locations. The drain process expects the +node to return, so it only moves shards to backup locations, not to their preferred AZs. It also +leaves secondary locations unmoved. This could result in unnecessary load on the storage +controller and inefficient resource utilization. +- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when +time constraints or emergency situations require it, bypassing the normal graceful migration +process. + +See below for a detailed breakdown of the proposed changes and mechanisms. + +#### Node lifecycle + +New `NodeLifecycle` enum and a matching database field with these values: +- `Active`: The normal state. All operations are allowed. +- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or +will happen later, but the node will eventually be removed. All operations are allowed. +- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought +back. The only action left is to remove its record from the database. Any attempt to register a +node in this state will fail. + +This state persists across storage controller restarts. + +**State transition** +``` + +--------------------+ + +---| Active |<---------------------+ + | +--------------------+ | + | ^ | + | start_node_delete | cancel_node_delete | + v | | + +----------------------------------+ | + | ScheduledForDeletion | | + +----------------------------------+ | + | | + | node_register | + | | + | delete_node (at the finish) | + | | + v | + +---------+ tombstone_delete +----------+ + | Deleted |-------------------------------->| no row | + +---------+ +----------+ +``` + +#### NodeSchedulingPolicy::Deleting + +A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is +running for the node right now. Only one node can have the `Deleting` policy at a time. + +The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage +controller restart, any node previously marked as `Deleting` will have its scheduling policy reset +to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is +actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state. + +`NodeSchedulingPolicy` transition details: +1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`. +2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the +policy to its previous value. The policy is persisted in storcon DB. +3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since +`NodeLifecycle::Deleted` prevents any further access to this field. + +The deletion process cannot be initiated for nodes currently undergoing deployment-related +operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered +once the node transitions to either the `Active` or `Pause` state. 
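+
+Below is a minimal sketch of how these transitions could fit together. The enum variants come from
+this RFC; the `NodeState` struct, its fields, and the helper methods are hypothetical and do not
+reflect the actual storage controller code.
+
+```rust
+#[derive(Clone, Copy, PartialEq, Debug)]
+enum NodeLifecycle {
+    Active,
+    ScheduledForDeletion,
+    Deleted,
+}
+
+#[derive(Clone, Copy, PartialEq, Debug)]
+enum NodeSchedulingPolicy {
+    Active,
+    Pause,
+    PauseForRestart,
+    Draining,
+    Filling,
+    Deleting,
+}
+
+struct NodeState {
+    lifecycle: NodeLifecycle,
+    scheduling: NodeSchedulingPolicy,
+    // Policy persisted when the deletion operation starts; restored on cancellation.
+    policy_before_delete: Option<NodeSchedulingPolicy>,
+}
+
+impl NodeState {
+    // `start_node_delete` edge of the diagram: mark the node for deletion.
+    fn start_node_delete(&mut self) {
+        self.lifecycle = NodeLifecycle::ScheduledForDeletion;
+    }
+
+    // The background deletion operation may only begin once the node is in
+    // `Active` or `Pause`; it then takes the `Deleting` scheduling policy.
+    fn begin_delete_operation(&mut self) -> bool {
+        let can_start = matches!(
+            self.scheduling,
+            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause
+        );
+        if can_start {
+            self.policy_before_delete = Some(self.scheduling);
+            self.scheduling = NodeSchedulingPolicy::Deleting;
+        }
+        can_start
+    }
+
+    // `cancel_node_delete` edge: the lifecycle goes back to `Active` and the
+    // previously persisted scheduling policy is restored.
+    fn cancel_node_delete(&mut self) {
+        self.lifecycle = NodeLifecycle::Active;
+        if let Some(prev) = self.policy_before_delete.take() {
+            self.scheduling = prev;
+        }
+    }
+
+    // `delete_node` (at the finish): once `Deleted`, the scheduling policy is
+    // never consulted again; only the tombstone row remains.
+    fn finish_node_delete(&mut self) {
+        self.lifecycle = NodeLifecycle::Deleted;
+    }
+}
+```
+
+In the real implementation both fields are persisted in the storage controller database, so the
+same transitions survive a restart; the sketch only illustrates the ordering of the state changes.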
+ +#### OperationTracker + +A replacement for `Option ongoing_operation`, the `OperationTracker` is a +dedicated service state object responsible for managing all long-running node operations (drain, +fill, delete) with robust concurrency control. + +Key responsibilities: +- Orchestrates the execution of operations +- Supports cancellation of currently running operations +- Enforces operation constraints, e.g. allowing only a single drain/fill operation at a time +- Persists deletion state, enabling recovery of pending deletions across restarts +- Ensures thread safety across concurrent requests + +#### Attached tenant shard processing + +When deleting a node, handle each attached tenant shard as follows: + +1. Pick the best node to become the new attached (the candidate). +2. If the candidate already has this shard as a secondary: + - Create a new secondary for the shard on another suitable node. + Otherwise: + - Create a secondary for the shard on the candidate node. +3. Wait until all secondaries are ready and pre-warmed. +4. Promote the candidate's secondary to attached. +5. Remove the secondary from the node being deleted. + +This process safely moves all attached shards before deleting the node. + +#### Secondary tenant shard processing + +When deleting a node, handle each secondary tenant shard as follows: + +1. Choose the best node to become the new secondary. +2. Create a secondary for the shard on that node. +3. Wait until the new secondary is ready. +4. Remove the secondary from the node being deleted. + +This ensures all secondary shards are safely moved before deleting the node. + +### Reliability, failure modes and corner cases + +In case of a storage controller failure and a subsequent restart, the system behavior depends on the +`NodeLifecycle` state: + +- If `NodeLifecycle` is `Active`: No action is taken for this node. +- If `NodeLifecycle` is `Deleted`: The node will not be re-added. +- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for +this node. + +In case of a pageserver node failure during deletion, the behavior depends on the `force` flag: +- If `force` is set: The node deletion will proceed regardless of the node's availability. +- If `force` is not set: The deletion will be retried a limited number of times. If the node +remains unavailable, the deletion process will pause and automatically resume when the node +becomes healthy again. + +### Operations concurrency + +The following sections describe the behavior when different types of requests arrive at the storage +controller and how they interact with ongoing operations. + +#### Delete request + +Handler: `PUT /control/v1/node/:node_id/delete` + +1. If the node lifecycle is `NodeLifecycle::ScheduledForDeletion`: + - Return `200 OK`: there is already an ongoing deletion request for this node +2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion` +3. Persist current scheduling policy +4. If there is no active operation (drain/fill/delete): + - Run the deletion process for this node + +#### Cancel delete request + +Handler: `DELETE /control/v1/node/:node_id/delete` + +1. If the node lifecycle is not `NodeLifecycle::ScheduledForDeletion`: + - Return `404 Not Found`: there is no current deletion request for this node +2. If the active operation is deleting this node, cancel it +3. Update & persist lifecycle to `NodeLifecycle::Active` +4. Restore the last scheduling policy from persistence + +#### Drain/fill request + +1.
If there are already ongoing drain/fill processes: + - Return `409 Conflict`: queueing of drain/fill processes is not supported +2. If there is an ongoing delete process: + - Cancel it and wait until it is cancelled +3. Run the drain/fill process +4. After the drain/fill process is cancelled or finished: + - Try to find another candidate to delete and run the deletion process for that node + +#### Drain/fill cancel request + +1. If the active operation is not the related process: + - Return `400 Bad Request`: cancellation request is incorrect, operations are not the same +2. Cancel the active operation +3. Try to find another candidate to delete and run the deletion process for that node + +## Definition of Done + +- [x] Fix flaky node scenario and introduce related debug handlers +- [ ] Node deletion intent is persistent - a node will be eventually deleted after a deletion +request regardless of draining/filling requests and restarts +- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to +recommended locations +- [ ] Deploying does not break due to long deletions - drain/fill operations override deletion +process and deletion resumes after drain/fill completes +- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a +pageserver node does not respond) +- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a27301e45e..a918644e4c 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -68,11 +68,15 @@ pub enum LfcPrewarmState { /// We tried to fetch the corresponding LFC state from the endpoint storage, /// but received `Not Found 404`. This should normally happen only during the /// first endpoint start after creation with `autoprewarm: true`. + /// This may also happen if LFC is turned off or not initialized /// /// During the orchestrated prewarm via API, when a caller explicitly /// provides the LFC state key to prewarm from, it's the caller responsibility /// to handle this status as an error state in this case. Skipped, + /// LFC prewarm was cancelled. 
Some pages in LFC cache may be prewarmed if query + /// has started working before cancellation + Cancelled, } impl Display for LfcPrewarmState { @@ -83,6 +87,7 @@ impl Display for LfcPrewarmState { LfcPrewarmState::Completed => f.write_str("Completed"), LfcPrewarmState::Skipped => f.write_str("Skipped"), LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), + LfcPrewarmState::Cancelled => f.write_str("Cancelled"), } } } @@ -97,6 +102,7 @@ pub enum LfcOffloadState { Failed { error: String, }, + Skipped, } #[derive(Serialize, Debug, Clone, PartialEq)] diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 9f88ea6b11..9c90beb379 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -341,6 +341,34 @@ extern "C-unwind" fn log_internal( } } +/* BEGIN_HADRON */ +extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + if api.is_null() { + return; + } + (*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers); + } +} + +extern "C" fn update_safekeeper_status_for_metrics( + wp: *mut WalProposer, + sk_index: u32, + status: u8, +) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + if api.is_null() { + return; + } + (*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status); + } +} +/* END_HADRON */ + #[derive(Debug, PartialEq)] pub enum Level { Debug5, @@ -414,6 +442,10 @@ pub(crate) fn create_api() -> walproposer_api { finish_sync_safekeepers: Some(finish_sync_safekeepers), process_safekeeper_feedback: Some(process_safekeeper_feedback), log_internal: Some(log_internal), + /* BEGIN_HADRON */ + reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics), + update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics), + /* END_HADRON */ } } @@ -451,6 +483,8 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { replica_promote: false, min_ps_feedback: empty_feedback, wal_rate_limiter: empty_wal_rate_limiter, + num_safekeepers: 0, + safekeeper_status: [0; 32], } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 93bb0d5eb0..8453279c5c 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -159,6 +159,21 @@ pub trait ApiImpl { fn after_election(&self, _wp: &mut WalProposer) { todo!() } + + /* BEGIN_HADRON */ + fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) { + // Do nothing for testing purposes. + } + + fn update_safekeeper_status_for_metrics( + &self, + _wp: &mut WalProposer, + _sk_index: u32, + _status: u8, + ) { + // Do nothing for testing purposes. 
+ } + /* END_HADRON */ } #[derive(Debug)] diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 3c680eab86..bb810ada68 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -49,6 +49,7 @@ #include "neon.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" +#include "neon_utils.h" #include "pagestore_client.h" #include "communicator.h" @@ -673,8 +674,19 @@ lfc_get_state(size_t max_entries) { if (GET_STATE(entry, j) != UNAVAILABLE) { - BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); - n_pages += 1; + /* Validate the buffer tag before including it */ + BufferTag test_tag = entry->key; + test_tag.blockNum += j; + + if (BufferTagIsValid(&test_tag)) + { + BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); + n_pages += 1; + } + else + { + elog(ERROR, "LFC: Skipping invalid buffer tag during cache state capture: blockNum=%u", test_tag.blockNum); + } } } if (++i == n_entries) @@ -683,7 +695,7 @@ lfc_get_state(size_t max_entries) Assert(i == n_entries); fcs->n_pages = n_pages; Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages); - elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages); + elog(LOG, "LFC: save state of %d chunks %d pages (validated)", (int)n_entries, (int)n_pages); } LWLockRelease(lfc_lock); @@ -702,6 +714,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) size_t n_entries; size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size); size_t fcs_size; + uint32_t max_prefetch_pages; dsm_segment *seg; BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS]; @@ -746,6 +759,11 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) n_entries = Min(fcs->n_chunks, lfc_prewarm_limit); Assert(n_entries != 0); + max_prefetch_pages = n_entries << fcs_chunk_size_log; + if (fcs->n_pages > max_prefetch_pages) { + elog(ERROR, "LFC: Number of pages in file cache state (%d) is more than the limit (%d)", fcs->n_pages, max_prefetch_pages); + } + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* Do not prewarm more entries than LFC limit */ @@ -898,6 +916,11 @@ lfc_prewarm_main(Datum main_arg) { tag = fcs->chunks[snd_idx >> fcs_chunk_size_log]; tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1); + + if (!BufferTagIsValid(&tag)) { + elog(ERROR, "LFC: Invalid buffer tag: %u", tag.blockNum); + } + if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum)) { (void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index fada4cba1e..4527084514 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -391,6 +391,12 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) neon_per_backend_counters totals = {0}; metric_t *metrics; + /* BEGIN_HADRON */ + WalproposerShmemState *wp_shmem; + uint32 num_safekeepers; + uint32 num_active_safekeepers; + /* END_HADRON */ + /* We put all the tuples into a tuplestore in one go. */ InitMaterializedSRF(fcinfo, 0); @@ -437,11 +443,32 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) // Not ideal but piggyback our databricks counters into the neon perf counters view // so that we don't need to introduce neon--1.x+1.sql to add a new view. { + // Keeping this code in its own block to work around the C90 "don't mix declarations and code" rule when we define + // the `databricks_metrics` array in the next block. Yes, we are seriously dealing with C90 rules in 2025. + + // Read safekeeper status from wal proposer shared memory first. 
+ // Note that we are taking a mutex when reading from walproposer shared memory so that the total safekeeper count is + // consistent with the active wal acceptors count. Assuming that we don't query this view too often the mutex should + // not be a huge deal. + wp_shmem = GetWalpropShmemState(); + SpinLockAcquire(&wp_shmem->mutex); + num_safekeepers = wp_shmem->num_safekeepers; + num_active_safekeepers = 0; + for (int i = 0; i < num_safekeepers; i++) { + if (wp_shmem->safekeeper_status[i] == 1) { + num_active_safekeepers++; + } + } + SpinLockRelease(&wp_shmem->mutex); + } + { metric_t databricks_metrics[] = { {"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)}, {"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)}, {"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)}, {"ps_corruption_detected", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->ps_corruption_detected)}, + {"num_active_safekeepers", false, 0.0, (double) num_active_safekeepers}, + {"num_configured_safekeepers", false, 0.0, (double) num_safekeepers}, {NULL, false, 0, 0}, }; for (int i = 0; databricks_metrics[i].name != NULL; i++) diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 1fad44bd58..847d380eb3 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -183,3 +183,22 @@ alloc_curl_handle(void) } #endif + +/* + * Check if a BufferTag is valid by verifying all its fields are not invalid. + */ +bool +BufferTagIsValid(const BufferTag *tag) +{ + #if PG_MAJORVERSION_NUM >= 16 + return (tag->spcOid != InvalidOid) && + (tag->relNumber != InvalidRelFileNumber) && + (tag->forkNum != InvalidForkNumber) && + (tag->blockNum != InvalidBlockNumber); + #else + return (tag->rnode.spcNode != InvalidOid) && + (tag->rnode.relNode != InvalidOid) && + (tag->forkNum != InvalidForkNumber) && + (tag->blockNum != InvalidBlockNumber); + #endif +} diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 7480ac28cc..65d280788d 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -2,6 +2,7 @@ #define __NEON_UTILS_H__ #include "lib/stringinfo.h" +#include "storage/buf_internals.h" #ifndef WALPROPOSER_LIB #include @@ -16,6 +17,9 @@ void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); void disable_core_dump(void); +/* Buffer tag validation function */ +bool BufferTagIsValid(const BufferTag *tag); + #ifndef WALPROPOSER_LIB CURL * alloc_curl_handle(void); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c85a6f4b6f..dd42eaf18e 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -154,7 +154,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; - + /* BEGIN_HADRON */ + wp->safekeeper[wp->n_safekeepers].index = wp->n_safekeepers; + /* END_HADRON */ { Safekeeper *sk = &wp->safekeeper[wp->n_safekeepers]; int written = 0; @@ -183,6 +185,10 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3) wp_log(FATAL, "enabling generations requires protocol version 3"); wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); + + 
/* BEGIN_HADRON */ + wp->api.reset_safekeeper_statuses_for_metrics(wp, wp->n_safekeepers); + /* END_HADRON */ /* Fill the greeting package */ wp->greetRequest.pam.tag = 'g'; @@ -355,6 +361,10 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; + /* BEGIN_HADRON */ + sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, sk->index, 0); + /* END_HADRON */ + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); @@ -1530,6 +1540,10 @@ StartStreaming(Safekeeper *sk) sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; + /* BEGIN_HADRON */ + sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, sk->index, 1); + /* END_HADRON */ + /* * Donors can only be in SS_ACTIVE state, so we potentially update the * donor when we switch one to SS_ACTIVE. diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d6cd532bec..ac42c2925d 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -432,6 +432,10 @@ typedef struct WalproposerShmemState /* BEGIN_HADRON */ /* The WAL rate limiter */ WalRateLimiter wal_rate_limiter; + /* Number of safekeepers in the config */ + uint32 num_safekeepers; + /* Per-safekeeper status flags: 0=inactive, 1=active */ + uint8 safekeeper_status[MAX_SAFEKEEPERS]; /* END_HADRON */ } WalproposerShmemState; @@ -483,6 +487,11 @@ typedef struct Safekeeper char const *host; char const *port; + /* BEGIN_HADRON */ + /* index of this safekeeper in the WalProposer array */ + uint32 index; + /* END_HADRON */ + /* * connection string for connecting/reconnecting. * @@ -731,6 +740,23 @@ typedef struct walproposer_api * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); + + /* + * BEGIN_HADRON + * APIs manipulating shared memory state used for Safekeeper quorum health metrics. + */ + + /* + * Reset the safekeeper statuses in shared memory for metric purposes. + */ + void (*reset_safekeeper_statuses_for_metrics) (WalProposer *wp, uint32 num_safekeepers); + + /* + * Update the safekeeper status in shared memory for metric purposes. 
+ */ + void (*update_safekeeper_status_for_metrics) (WalProposer *wp, uint32 sk_index, uint8 status); + + /* END_HADRON */ } walproposer_api; /* diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index da86c5d498..47b5ec523f 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -2261,6 +2261,27 @@ GetNeonCurrentClusterSize(void) } uint64 GetNeonCurrentClusterSize(void); +/* BEGIN_HADRON */ +static void +walprop_pg_reset_safekeeper_statuses_for_metrics(WalProposer *wp, uint32 num_safekeepers) +{ + WalproposerShmemState* shmem = wp->api.get_shmem_state(wp); + SpinLockAcquire(&shmem->mutex); + shmem->num_safekeepers = num_safekeepers; + memset(shmem->safekeeper_status, 0, sizeof(shmem->safekeeper_status)); + SpinLockRelease(&shmem->mutex); +} + +static void +walprop_pg_update_safekeeper_status_for_metrics(WalProposer *wp, uint32 sk_index, uint8 status) +{ + WalproposerShmemState* shmem = wp->api.get_shmem_state(wp); + Assert(sk_index < MAX_SAFEKEEPERS); + SpinLockAcquire(&shmem->mutex); + shmem->safekeeper_status[sk_index] = status; + SpinLockRelease(&shmem->mutex); +} +/* END_HADRON */ static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, @@ -2294,4 +2315,6 @@ static const walproposer_api walprop_pg = { .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, .log_internal = walprop_pg_log_internal, + .reset_safekeeper_statuses_for_metrics = walprop_pg_reset_safekeeper_statuses_for_metrics, + .update_safekeeper_status_for_metrics = walprop_pg_update_safekeeper_status_for_metrics, }; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 29b0ad53f2..583cdc95bf 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -700,7 +700,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { ip_allowlist_check_enabled: !args.is_private_access_proxy, is_vpc_acccess_proxy: args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, + #[cfg(not(feature = "rest_broker"))] accept_jwts: args.is_auth_broker, + #[cfg(feature = "rest_broker")] + accept_jwts: args.is_auth_broker || args.is_rest_broker, console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 511bdc4e42..5b356c8460 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -458,7 +458,7 @@ pub(crate) enum LocalProxyConnError { impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { - HttpConnError::ConnectError(_) => ErrorKind::Compute, + HttpConnError::ConnectError(e) => e.get_error_kind(), HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => match p.as_db_error() { // user provided a wrong database name diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 43232db950..4febc7656e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -612,19 +612,25 @@ pub async fn handle_request( } } + let max_term = statuses + .iter() + .map(|(status, _)| status.acceptor_state.term) + .max() + .unwrap(); + // Find the most advanced safekeeper let (status, i) = statuses .into_iter() .max_by_key(|(status, _)| { ( status.acceptor_state.epoch, + status.flush_lsn, /* BEGIN_HADRON */ // We need to pull from the SK with the highest term. 
// This is because another compute may come online and vote the same highest term again on the other two SKs. // Then, there will be 2 computes running on the same term. status.acceptor_state.term, /* END_HADRON */ - status.flush_lsn, status.commit_lsn, ) }) @@ -634,6 +640,22 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); + // TODO(diko): This is hadron only check to make sure that we pull the timeline + // from the safekeeper with the highest term during timeline restore. + // We could avoid returning the error by calling bump_term after pull_timeline. + // However, this is not a big deal because we retry the pull_timeline requests. + // The check should be removed together with removing custom hadron logic for + // safekeeper restore. + if wait_for_peer_timeline_status && status.acceptor_state.term != max_term { + return Err(ApiError::PreconditionFailed( + format!( + "choosen safekeeper {} has term {}, but the most advanced term is {}", + safekeeper_host, status.acceptor_state.term, max_term + ) + .into(), + )); + } + match pull_timeline( status, safekeeper_host, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e083a49428..25ac8e5bd3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -195,12 +195,14 @@ impl StateSK { to: Configuration, ) -> Result { let result = self.state_mut().membership_switch(to).await?; + let flush_lsn = self.flush_lsn(); + let last_log_term = self.state().acceptor_state.get_last_log_term(flush_lsn); Ok(TimelineMembershipSwitchResponse { previous_conf: result.previous_conf, current_conf: result.current_conf, - last_log_term: self.state().acceptor_state.term, - flush_lsn: self.flush_lsn(), + last_log_term, + flush_lsn, }) } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 619b5f69b8..c61ef9ff5d 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -471,11 +471,17 @@ impl Persistence { &self, input_node_id: NodeId, input_https_port: Option, + input_grpc_addr: Option, + input_grpc_port: Option, ) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; self.update_node( input_node_id, - listen_https_port.eq(input_https_port.map(|x| x as i32)), + ( + listen_https_port.eq(input_https_port.map(|x| x as i32)), + listen_grpc_addr.eq(input_grpc_addr), + listen_grpc_port.eq(input_grpc_port.map(|x| x as i32)), + ), ) .await } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 37380b8fbe..2c7e4ee2de 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7813,7 +7813,7 @@ impl Service { register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.listen_grpc_addr, + register_req.listen_grpc_addr.clone(), register_req.listen_grpc_port, register_req.availability_zone_id.clone(), self.config.use_https_pageserver_api, @@ -7848,6 +7848,8 @@ impl Service { .update_node_on_registration( register_req.node_id, register_req.listen_https_port, + register_req.listen_grpc_addr, + register_req.listen_grpc_port, ) .await? 
} diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index a60ebb85c6..fab1342d5d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -24,12 +24,12 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; +use safekeeper_api::Term; use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; use safekeeper_api::models::{ PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse, }; -use safekeeper_api::{INITIAL_TERM, Term}; use safekeeper_client::mgmt_api; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -1298,13 +1298,7 @@ impl Service { ) .await?; - let mut sync_position = (INITIAL_TERM, Lsn::INVALID); - for res in results.into_iter().flatten() { - let sk_position = (res.last_log_term, res.flush_lsn); - if sync_position < sk_position { - sync_position = sk_position; - } - } + let sync_position = Self::get_sync_position(&results)?; tracing::info!( %generation, @@ -1598,4 +1592,36 @@ impl Service { Ok(()) } + + /// Get membership switch responses from all safekeepers and return the sync position. + /// + /// Sync position is a position equal or greater than the commit position. + /// It is guaranteed that all WAL entries with (last_log_term, flush_lsn) + /// greater than the sync position are not committed (= not on a quorum). + /// + /// Returns error if there is no quorum of successful responses. + fn get_sync_position( + responses: &[mgmt_api::Result], + ) -> Result<(Term, Lsn), ApiError> { + let quorum_size = responses.len() / 2 + 1; + + let mut wal_positions = responses + .iter() + .flatten() + .map(|res| (res.last_log_term, res.flush_lsn)) + .collect::>(); + + // Should be already checked if the responses are from tenant_timeline_set_membership_quorum. + if wal_positions.len() < quorum_size { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful responses to get sync position: {}/{}", + wal_positions.len(), + quorum_size, + ))); + } + + wal_positions.sort(); + + Ok(wal_positions[quorum_size - 1]) + } } diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index d235ac2143..c77a372017 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -78,20 +78,26 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def prewarm_lfc(self, from_endpoint_id: str | None = None): + def prewarm_lfc(self, from_endpoint_id: str | None = None) -> dict[str, str]: """ Prewarm LFC cache from given endpoint and wait till it finishes or errors """ params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict() self.post(self.prewarm_url, params=params).raise_for_status() - self.prewarm_lfc_wait() + return self.prewarm_lfc_wait() - def prewarm_lfc_wait(self): + def cancel_prewarm_lfc(self): + """ + Cancel LFC prewarm if any is ongoing + """ + self.delete(self.prewarm_url).raise_for_status() + + def prewarm_lfc_wait(self) -> dict[str, str]: """ Wait till LFC prewarm returns with error or success. 
If prewarm was not requested before calling this function, it will error """ - statuses = "failed", "completed", "skipped" + statuses = "failed", "completed", "skipped", "cancelled" def prewarmed(): json = self.prewarm_lfc_status() @@ -101,6 +107,7 @@ class EndpointHttpClient(requests.Session): wait_until(prewarmed, timeout=60) res = self.prewarm_lfc_status() assert res["status"] != "failed", res + return res def offload_lfc_status(self) -> dict[str, str]: res = self.get(self.offload_url) @@ -108,29 +115,31 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def offload_lfc(self): + def offload_lfc(self) -> dict[str, str]: """ Offload LFC cache to endpoint storage and wait till offload finishes or errors """ self.post(self.offload_url).raise_for_status() - self.offload_lfc_wait() + return self.offload_lfc_wait() - def offload_lfc_wait(self): + def offload_lfc_wait(self) -> dict[str, str]: """ Wait till LFC offload returns with error or success. If offload was not requested before calling this function, it will error """ + statuses = "failed", "completed", "skipped" def offloaded(): json = self.offload_lfc_status() status, err = json["status"], json.get("error") - assert status in ["failed", "completed"], f"{status}, {err=}" + assert status in statuses, f"{status}, {err=}" wait_until(offloaded, timeout=60) res = self.offload_lfc_status() assert res["status"] != "failed", res + return res - def promote(self, promote_spec: dict[str, Any], disconnect: bool = False): + def promote(self, promote_spec: dict[str, Any], disconnect: bool = False) -> dict[str, str]: url = f"http://localhost:{self.external_port}/promote" if disconnect: try: # send first request to start promote and disconnect diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7776c0aa66..4fad66574b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -262,7 +262,6 @@ class PgProtocol: # pooler does not support statement_timeout # Check if the hostname contains the string 'pooler' hostname = result.get("host", "") - log.info(f"Hostname: {hostname}") options = result.get("options", "") if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" @@ -2316,6 +2315,7 @@ class NeonStorageController(MetricsGetter, LogUtils): timeline_id: TimelineId, new_sk_set: list[int], ): + log.info(f"migrate_safekeepers({tenant_id}, {timeline_id}, {new_sk_set})") response = self.request( "POST", f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate", @@ -5290,16 +5290,32 @@ class EndpointFactory: ) def stop_all(self, fail_on_error=True) -> Self: - exception = None - for ep in self.endpoints: + """ + Stop all the endpoints in parallel. + """ + + # Note: raising an exception from a task in a task group cancels + # all the other tasks. We don't want that, hence the 'stop_one' + # function catches exceptions and puts them on the 'exceptions' + # list for later processing. 
+ exceptions = [] + + async def stop_one(ep): try: - ep.stop() + await asyncio.to_thread(ep.stop) except Exception as e: log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") - exception = e + exceptions.append(e) - if fail_on_error and exception is not None: - raise exception + async def async_stop_all(): + async with asyncio.TaskGroup() as tg: + for ep in self.endpoints: + tg.create_task(stop_one(ep)) + + asyncio.run(async_stop_all()) + + if fail_on_error and exceptions: + raise ExceptionGroup("stopping an endpoint failed", exceptions) return self diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index a780aa2e52..94c18ac548 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -863,7 +863,6 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") -@pytest.mark.skip(reason="Lakebase mode") def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): """ Test that when the pageserver detects corruption during image layer creation, @@ -890,7 +889,9 @@ def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): timeline_id = env.initial_timeline pageserver_http = env.pageserver.http_client() - workload = Workload(env, tenant_id, timeline_id) + workload = Workload( + env, tenant_id, timeline_id, endpoint_opts={"config_lines": ["neon.lakebase_mode=true"]} + ) workload.init() # Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index 2bbe8c3e97..a96f18177c 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,6 +1,6 @@ import random -import threading from enum import StrEnum +from threading import Thread from time import sleep from typing import Any @@ -47,19 +47,23 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) # With autoprewarm, we need to be sure LFC was offloaded after all writes # finish, so we sleep. 
Otherwise we'll have less prewarmed pages than we want sleep(AUTOOFFLOAD_INTERVAL_SECS) - client.offload_lfc_wait() - return + offload_res = client.offload_lfc_wait() + log.info(offload_res) + return offload_res if method == PrewarmMethod.COMPUTE_CTL: status = client.prewarm_lfc_status() assert status["status"] == "not_prewarmed" assert "error" not in status - client.offload_lfc() + offload_res = client.offload_lfc() + log.info(offload_res) assert client.prewarm_lfc_status()["status"] == "not_prewarmed" + parsed = prom_parse(client) desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} assert parsed == desired, f"{parsed=} != {desired=}" - return + + return offload_res raise AssertionError(f"{method} not in PrewarmMethod") @@ -68,21 +72,30 @@ def prewarm_endpoint( method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None ): if method == PrewarmMethod.AUTOPREWARM: - client.prewarm_lfc_wait() + prewarm_res = client.prewarm_lfc_wait() + log.info(prewarm_res) elif method == PrewarmMethod.COMPUTE_CTL: - client.prewarm_lfc() + prewarm_res = client.prewarm_lfc() + log.info(prewarm_res) + return prewarm_res elif method == PrewarmMethod.POSTGRES: cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,)) -def check_prewarmed( +def check_prewarmed_contains( method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int] ): if method == PrewarmMethod.AUTOPREWARM: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + assert prom_parse(client)[PREWARM_LABEL] == 1 elif method == PrewarmMethod.COMPUTE_CTL: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0} assert prom_parse(client) == desired @@ -149,9 +162,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): log.info(f"Used LFC size: {lfc_used_pages}") pg_cur.execute("select * from neon.get_prewarm_info()") total, prewarmed, skipped, _ = pg_cur.fetchall()[0] - log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}") - progress = (prewarmed + skipped) * 100 // total - log.info(f"Prewarm progress: {progress}%") assert lfc_used_pages > 10000 assert total > 0 assert prewarmed > 0 @@ -161,7 +171,54 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} - check_prewarmed(method, client, desired) + check_prewarmed_contains(method, client, desired) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm_cancel(neon_simple_env: NeonEnv): + """ + Test we can cancel LFC prewarm and prewarm successfully after + """ + env = neon_simple_env + n_records = 1000000 + cfg = [ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ] + endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create schema neon; create extension neon with schema neon") + pg_cur.execute("create database lfc") + + lfc_conn 
= endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + client = endpoint.http_client() + method = PrewarmMethod.COMPUTE_CTL + offload_lfc(method, client, pg_cur) + + endpoint.stop() + endpoint.start() + + thread = Thread(target=lambda: prewarm_endpoint(method, client, pg_cur, None)) + thread.start() + # wait 2 seconds to ensure we cancel prewarm SQL query + sleep(2) + client.cancel_prewarm_lfc() + thread.join() + assert client.prewarm_lfc_status()["status"] == "cancelled" + + prewarm_endpoint(method, client, pg_cur, None) + assert client.prewarm_lfc_status()["status"] == "completed" @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") @@ -178,9 +235,8 @@ def test_lfc_prewarm_empty(neon_simple_env: NeonEnv): cur = conn.cursor() cur.execute("create schema neon; create extension neon with schema neon") method = PrewarmMethod.COMPUTE_CTL - offload_lfc(method, client, cur) - prewarm_endpoint(method, client, cur, None) - assert client.prewarm_lfc_status()["status"] == "skipped" + assert offload_lfc(method, client, cur)["status"] == "skipped" + assert prewarm_endpoint(method, client, cur, None)["status"] == "skipped" # autoprewarm isn't needed as we prewarm manually @@ -251,11 +307,11 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet workload_threads = [] for _ in range(n_threads): - t = threading.Thread(target=workload) + t = Thread(target=workload) workload_threads.append(t) t.start() - prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread = Thread(target=prewarm) prewarm_thread.start() def prewarmed(): diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index 2ceeea37a5..97a6ece446 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -286,3 +286,177 @@ def test_sk_generation_aware_tombstones(neon_env_builder: NeonEnvBuilder): assert re.match(r".*Timeline .* deleted.*", exc.value.response.text) # The timeline should remain deleted. expect_deleted(second_sk) + + +def test_safekeeper_migration_stale_timeline(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper migration handles stale timeline correctly by migrating to + a safekeeper with a stale timeline. + 1. Check that we are waiting for the stale timeline to catch up with the commit lsn. + The migration might fail if there is no compute to advance the WAL. + 2. Check that we rely on last_log_term (and not the current term) when waiting for the + sync_position on step 7. + 3. Check that migration succeeds if the compute is running. 
+ """ + neon_env_builder.num_safekeepers = 2 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + env.storage_controller.allowed_errors.append(".*not enough successful .* to reach quorum.*") + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + active_sk = env.get_safekeeper(mconf["sk_set"][0]) + other_sk = [sk for sk in env.safekeepers if sk.id != active_sk.id][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=[active_sk.id]) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Pull the timeline to other_sk, so other_sk now has a "stale" timeline on it. + other_sk.pull_timeline([active_sk], env.initial_tenant, env.initial_timeline) + + # Advance the WAL on active_sk. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # The test is more tricky if we have the same last_log_term but different term/flush_lsn. + # Stop the active_sk during the endpoint shutdown because otherwise compute_ctl runs + # sync_safekeepers and advances last_log_term on active_sk. + active_sk.stop() + ep.stop(mode="immediate") + active_sk.start() + + active_sk_status = active_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + other_sk_status = other_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + + # other_sk should have the same last_log_term, but a stale flush_lsn. + assert active_sk_status.last_log_term == other_sk_status.last_log_term + assert active_sk_status.flush_lsn > other_sk_status.flush_lsn + + commit_lsn = active_sk_status.flush_lsn + + # Bump the term on other_sk to make it higher than active_sk. + # This is to make sure we don't use current term instead of last_log_term in the algorithm. + other_sk.http_client().term_bump( + env.initial_tenant, env.initial_timeline, active_sk_status.term + 100 + ) + + # TODO(diko): now it fails because the timeline on other_sk is stale and there is no compute + # to catch up it with active_sk. It might be fixed in https://databricks.atlassian.net/browse/LKB-946 + # if we delete stale timelines before starting the migration. + # But the rest of the test is still valid: we should not lose committed WAL after the migration. + with pytest.raises( + StorageControllerApiException, match="not enough successful .* to reach quorum" + ): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["new_sk_set"] == [other_sk.id] + assert mconf["sk_set"] == [active_sk.id] + assert mconf["generation"] == 2 + + # Start the endpoint, so it advances the WAL on other_sk. + ep.start(safekeeper_generation=2, safekeepers=[active_sk.id, other_sk.id]) + # Now the migration should succeed. + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + # Check that we didn't lose committed WAL. 
+ assert ( + other_sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline).flush_lsn + >= commit_lsn + ) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] + + +def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder): + """ + Test that we pull the timeline from the most advanced safekeeper during the + migration and do not lose committed WAL. + """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 3, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + sk_set = mconf["sk_set"] + assert len(sk_set) == 3 + + other_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=sk_set) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Stop one sk, so we have a lagging WAL on it. + env.get_safekeeper(sk_set[0]).stop() + # Advance the WAL on the other sks. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # Stop other sks to make sure compute_ctl doesn't advance the last_log_term on them during shutdown. + for sk_id in sk_set[1:]: + env.get_safekeeper(sk_id).stop() + ep.stop(mode="immediate") + for sk_id in sk_set: + env.get_safekeeper(sk_id).start() + + # Bump the term on the lagging sk to make sure we don't use it to choose the most advanced sk. + env.get_safekeeper(sk_set[0]).http_client().term_bump( + env.initial_tenant, env.initial_timeline, 100 + ) + + def get_commit_lsn(sk_set: list[int]): + flush_lsns = [] + last_log_terms = [] + for sk_id in sk_set: + sk = env.get_safekeeper(sk_id) + status = sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline) + flush_lsns.append(status.flush_lsn) + last_log_terms.append(status.last_log_term) + + # In this test we assume that all sks have the same last_log_term. + assert len(set(last_log_terms)) == 1 + + flush_lsns.sort(reverse=True) + commit_lsn = flush_lsns[len(sk_set) // 2] + + log.info(f"sk_set: {sk_set}, flush_lsns: {flush_lsns}, commit_lsn: {commit_lsn}") + return commit_lsn + + commit_lsn_before_migration = get_commit_lsn(sk_set) + + # Make two migrations, so the lagging sk stays in the sk_set, but other sks are replaced. + new_sk_set1 = [sk_set[0], sk_set[1], other_sk] # remove sk_set[2], add other_sk + new_sk_set2 = [sk_set[0], other_sk, sk_set[2]] # remove sk_set[1], add sk_set[2] back + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set1 + ) + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set2 + ) + + commit_lsn_after_migration = get_commit_lsn(new_sk_set2) + + # We should not lose committed WAL. + # If we have choosen the lagging sk to pull the timeline from, this might fail. 
+ assert commit_lsn_before_migration <= commit_lsn_after_migration + + ep.start(safekeeper_generation=5, safekeepers=new_sk_set2) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c691087259..1e8ca216d0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2757,18 +2757,37 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): remote_storage_kind = s3_storage() neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) - # Set a very small disk usage limit (1KB) - neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"] - env = neon_env_builder.init_start() # Create a timeline and endpoint env.create_branch("test_timeline_disk_usage_limit") - endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit") + endpoint = env.endpoints.create_start( + "test_timeline_disk_usage_limit", + config_lines=[ + "neon.lakebase_mode=true", + ], + ) + + # Install the neon extension in the test database. We need it to query perf counter metrics. + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS neon") + # Sanity-check safekeeper connection status in neon_perf_counters in the happy case. + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'" + ) + assert cur.fetchone() == (1,), "Expected 1 active safekeeper" + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'" + ) + assert cur.fetchone() == (1,), "Expected 1 configured safekeeper" # Get the safekeeper sk = env.safekeepers[0] + # Restart the safekeeper with a very small disk usage limit (1KB) + sk.stop().start(["--max-timeline-disk-usage-bytes=1024"]) + # Inject a failpoint to stop WAL backup with sk.http_client() as http_cli: http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")]) @@ -2794,6 +2813,18 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): wait_until(error_logged) log.info("Found expected error message in compute log, resuming.") + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + # Confirm that neon_perf_counters also indicates that there are no active safekeepers + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'" + ) + assert cur.fetchone() == (0,), "Expected 0 active safekeepers" + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'" + ) + assert cur.fetchone() == (1,), "Expected 1 configured safekeeper" + # Sanity check that the hanging insert is indeed still hanging. Otherwise means the circuit breaker we # implemented didn't work as expected. 
time.sleep(2) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c9f9fdd011..2155cb165d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c9f9fdd0113b52c0bd535afdb09d3a543aeee25f +Subproject commit 2155cb165d05f617eb2c8ad7e43367189b627703 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aaaeff2550..2aaab3bb4a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aaaeff2550d5deba58847f112af9b98fa3a58b00 +Subproject commit 2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9b9cb4b3e3..a42351fcd4 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9b9cb4b3e33347aea8f61e606bb6569979516de5 +Subproject commit a42351fcd41ea01edede1daed65f651e838988fc diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index fa1788475e..1e01fcea2a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit fa1788475e3146cc9c7c6a1b74f48fd296898fcd +Subproject commit 1e01fcea2a6b38180021aa83e0051d95286d9096 diff --git a/vendor/revisions.json b/vendor/revisions.json index 7212c9f7c7..c02c748a72 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "fa1788475e3146cc9c7c6a1b74f48fd296898fcd" + "1e01fcea2a6b38180021aa83e0051d95286d9096" ], "v16": [ "16.9", - "9b9cb4b3e33347aea8f61e606bb6569979516de5" + "a42351fcd41ea01edede1daed65f651e838988fc" ], "v15": [ "15.13", - "aaaeff2550d5deba58847f112af9b98fa3a58b00" + "2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85" ], "v14": [ "14.18", - "c9f9fdd0113b52c0bd535afdb09d3a543aeee25f" + "2155cb165d05f617eb2c8ad7e43367189b627703" ] }