From f3a0e4f255631b3c2039265abc465282d9057c4e Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 30 Jul 2025 17:29:16 +0200 Subject: [PATCH 01/15] Improve specificity with which we apply compute specs (#12773) This makes sure we don't confuse user-controlled functions with PG's builtin functions. ## Problem See https://github.com/neondatabase/cloud/issues/31628 --- .../etc/sql_exporter/checkpoints_req.17.sql | 2 +- compute/etc/sql_exporter/checkpoints_req.sql | 2 +- .../etc/sql_exporter/checkpoints_timed.sql | 2 +- ..._backpressure_throttling_seconds_total.sql | 2 +- .../etc/sql_exporter/compute_current_lsn.sql | 4 +-- .../compute_logical_snapshot_files.sql | 4 +-- .../compute_logical_snapshots_bytes.15.sql | 4 +-- .../compute_logical_snapshots_bytes.sql | 6 ++--- .../sql_exporter/compute_max_connections.sql | 2 +- .../compute_pg_oldest_frozen_xid_age.sql | 4 +-- .../compute_pg_oldest_mxid_age.sql | 4 +-- .../etc/sql_exporter/compute_receive_lsn.sql | 2 +- .../compute_subscriptions_count.sql | 2 +- .../etc/sql_exporter/connection_counts.sql | 2 +- compute/etc/sql_exporter/db_total_size.sql | 4 +-- ...e_working_set_size_windows.autoscaling.sql | 2 +- ...c_approximate_working_set_size_windows.sql | 2 +- .../etc/sql_exporter/lfc_cache_size_limit.sql | 2 +- .../sql_exporter/logical_slot_restart_lsn.sql | 4 +-- compute/etc/sql_exporter/max_cluster_size.sql | 2 +- compute/etc/sql_exporter/pg_stats_userdb.sql | 4 +-- .../sql_exporter/replication_delay_bytes.sql | 2 +- .../replication_delay_seconds.sql | 4 +-- compute/etc/sql_exporter/retained_wal.sql | 10 +++---- compute/etc/sql_exporter/wal_is_lost.sql | 2 +- compute_tools/src/bin/compute_ctl.rs | 2 +- compute_tools/src/checker.rs | 4 +-- compute_tools/src/compute.rs | 27 ++++++++++--------- compute_tools/src/compute_promote.rs | 6 ++--- compute_tools/src/installed_extensions.rs | 4 +-- compute_tools/src/migration.rs | 2 +- .../src/migrations/0002-alter_roles.sql | 14 +++++----- ...create_subscription_to_privileged_role.sql | 2 +- ...plication_for_previously_allowed_roles.sql | 6 ++--- ...nchronization_funcs_to_privileged_role.sql | 2 +- ...0001-add_bypass_rls_to_privileged_role.sql | 2 +- .../src/migrations/tests/0002-alter_roles.sql | 12 ++++----- ...create_subscription_to_privileged_role.sql | 4 +-- ...04-grant_pg_monitor_to_privileged_role.sql | 8 +++--- ...nchronization_funcs_to_privileged_role.sql | 6 ++--- ...ation_origin_status_to_privileged_role.sql | 4 +-- ...t_pg_signal_backend_to_privileged_role.sql | 4 +-- compute_tools/src/monitor.rs | 14 +++++----- compute_tools/src/pg_helpers.rs | 4 +-- compute_tools/src/spec_apply.rs | 14 ++++++---- .../src/sql/add_availabilitycheck_tables.sql | 13 ++++----- .../src/sql/anon_ext_fn_reassign.sql | 9 ++++--- .../src/sql/create_privileged_role.sql | 2 +- compute_tools/src/sql/default_grants.sql | 8 +++--- compute_tools/src/sql/drop_subscriptions.sql | 16 +++++++---- .../src/sql/finalize_drop_subscriptions.sql | 10 +++---- .../sql/pre_drop_role_revoke_privileges.sql | 8 +++--- .../src/sql/set_public_schema_owner.sql | 8 +++--- .../src/sql/unset_template_for_drop_dbs.sql | 4 +-- 54 files changed, 156 insertions(+), 143 deletions(-) diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql index a4b946e8e2..28c868ff72 100644 --- a/compute/etc/sql_exporter/checkpoints_req.17.sql +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -1 +1 @@ -SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; +SELECT num_requested AS 
checkpoints_req FROM pg_catalog.pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql index eb8427c883..421448c0de 100644 --- a/compute/etc/sql_exporter/checkpoints_req.sql +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -1 +1 @@ -SELECT checkpoints_req FROM pg_stat_bgwriter; +SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql index c50853134c..bfa9b1b3d6 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.sql +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -1 +1 @@ -SELECT checkpoints_timed FROM pg_stat_bgwriter; +SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql index d97d625d4c..3fe638e489 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql @@ -1 +1 @@ -SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled; +SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled; diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql index be02b8a094..9a042547f0 100644 --- a/compute/etc/sql_exporter/compute_current_lsn.sql +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8 + ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql index f2454235b7..2224c02d8d 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -1,7 +1,7 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + (SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql index 73a9c11405..17cf1228d3 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql @@ -1,7 +1,7 @@ SELECT - (SELECT current_setting('neon.timeline_id')) AS timeline_id, + (SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. 
-- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; + (SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql index 16da899de2..33ca1137dc 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql @@ -1,9 +1,9 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) - FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name + (SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) + FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name ) AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_max_connections.sql b/compute/etc/sql_exporter/compute_max_connections.sql index 99a49483a6..1613c962a2 100644 --- a/compute/etc/sql_exporter/compute_max_connections.sql +++ b/compute/etc/sql_exporter/compute_max_connections.sql @@ -1 +1 @@ -SELECT current_setting('max_connections') as max_connections; +SELECT pg_catalog.current_setting('max_connections') AS max_connections; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql index d2281fdd42..e613939e71 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - age(datfrozenxid) frozen_xid_age -FROM pg_database + pg_catalog.age(datfrozenxid) frozen_xid_age +FROM pg_catalog.pg_database ORDER BY frozen_xid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql index ed57894b3a..7949bacfff 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - mxid_age(datminmxid) min_mxid_age -FROM pg_database + pg_catalog.mxid_age(datminmxid) min_mxid_age +FROM pg_catalog.pg_database ORDER BY min_mxid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql index 318b31ab41..fb96056881 100644 --- a/compute/etc/sql_exporter/compute_receive_lsn.sql +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8 
ELSE 0 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql index 50740cb5df..e380a7acc7 100644 --- a/compute/etc/sql_exporter/compute_subscriptions_count.sql +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -1 +1 @@ -SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; +SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql index 6824480fdb..480c4fb439 100644 --- a/compute/etc/sql_exporter/connection_counts.sql +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -1 +1 @@ -SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; +SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index fe0360ab5c..59205e6ed3 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1,5 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total -FROM pg_database +SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total +FROM pg_catalog.pg_database -- Ignore invalid databases, as we will likely have problems with -- getting their size from the Pageserver. WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql index 35fa42c34c..02cb2b4649 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -3,6 +3,6 @@ -- minutes. 
SELECT - x::text as duration_seconds, + x::pg_catalog.text AS duration_seconds, neon.approximate_working_set_size_seconds(x) AS size FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql index 46c7d1610c..aab93d433a 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -3,6 +3,6 @@ SELECT x AS duration, - neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM ( VALUES ('5m'), ('15m'), ('1h') ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql index 378904c1fe..41c11e0adc 100644 --- a/compute/etc/sql_exporter/lfc_cache_size_limit.sql +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -1 +1 @@ -SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; +SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql index 1b1c038501..8964d0d8ff 100644 --- a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -1,3 +1,3 @@ -SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn -FROM pg_replication_slots +SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn +FROM pg_catalog.pg_replication_slots WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql index 2d2355a9a7..d44fdebe38 100644 --- a/compute/etc/sql_exporter/max_cluster_size.sql +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -1 +1 @@ -SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; +SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 12e6c4ae59..1a1e54a7c6 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,13 +1,13 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. -SELECT pg_database_size(datname) AS db_size, +SELECT pg_catalog.pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, tup_updated AS updated, tup_deleted AS deleted, datname -FROM pg_stat_database +FROM pg_catalog.pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database -- Ignore invalid databases, as we will likely have problems with diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql index 60a6981acd..d3b7aa724b 100644 --- a/compute/etc/sql_exporter/replication_delay_bytes.sql +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -3,4 +3,4 @@ -- replay LSN may have advanced past the receive LSN we are using for the -- calculation. 
-SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; +SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql index a76809ad74..af4dd3fd90 100644 --- a/compute/etc/sql_exporter/replication_delay_seconds.sql +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -1,5 +1,5 @@ SELECT CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp())) END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql index 3e2aadfc28..ccb3504d58 100644 --- a/compute/etc/sql_exporter/retained_wal.sql +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -1,10 +1,10 @@ SELECT slot_name, - pg_wal_lsn_diff( + pg_catalog.pg_wal_lsn_diff( CASE - WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() - ELSE pg_current_wal_lsn() + WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn() + ELSE pg_catalog.pg_current_wal_lsn() END, - restart_lsn)::FLOAT8 AS retained_wal -FROM pg_replication_slots + restart_lsn)::pg_catalog.FLOAT8 AS retained_wal +FROM pg_catalog.pg_replication_slots WHERE active = false; diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql index 5521270851..5a94cc3373 100644 --- a/compute/etc/sql_exporter/wal_is_lost.sql +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -4,4 +4,4 @@ SELECT WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost -FROM pg_replication_slots; +FROM pg_catalog.pg_replication_slots; diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2b4802f309..f383683ef8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -279,7 +279,7 @@ fn main() -> Result<()> { config, )?; - let exit_code = compute_node.run()?; + let exit_code = compute_node.run().context("running compute node")?; scenario.teardown(); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index e4207876ac..2458fe3c11 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> { }); let query = " - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now();"; + SET updated_at = pg_catalog.now();"; match client.simple_query(query).await { Result::Ok(result) => { diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1033670e2b..f553bf3c1e 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -583,7 +583,7 @@ impl ComputeNode { // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. 
- const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none"; + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none"; let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute @@ -1884,7 +1884,7 @@ impl ComputeNode { // It doesn't matter what were the options before, here we just want // to connect and create a new superuser role. - const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0"; zenith_admin_conf.options(ZENITH_OPTIONS); let mut client = @@ -2339,13 +2339,13 @@ impl ComputeNode { let result = client .simple_query( "SELECT - row_to_json(pg_stat_statements) + pg_catalog.row_to_json(pss) FROM - pg_stat_statements + public.pg_stat_statements pss WHERE - userid != 'cloud_admin'::regrole::oid + pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid ORDER BY - (mean_exec_time + mean_plan_time) DESC + (pss.mean_exec_time + pss.mean_plan_time) DESC LIMIT 100", ) .await; @@ -2473,11 +2473,11 @@ LIMIT 100", // check the role grants first - to gracefully handle read-replicas. let select = "SELECT privilege_type - FROM pg_namespace - JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true - JOIN pg_user users ON acl.grantee = users.usesysid - WHERE users.usename = $1 - AND nspname = $2"; + FROM pg_catalog.pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true + JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid + WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name + AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name"; let rows = db_client .query(select, &[role_name, schema_name]) .await @@ -2546,8 +2546,9 @@ LIMIT 100", .await .with_context(|| format!("Failed to execute query: {query}"))?; } else { - let query = - format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + let query = format!( + "CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}" + ); db_client .simple_query(&query) .await diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs index a34368c531..29195b60e9 100644 --- a/compute_tools/src/compute_promote.rs +++ b/compute_tools/src/compute_promote.rs @@ -78,7 +78,7 @@ impl ComputeNode { const RETRIES: i32 = 20; for i in 0..=RETRIES { let row = client - .query_one("SELECT pg_last_wal_replay_lsn()", &[]) + .query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[]) .await .context("getting last replay lsn")?; let lsn: u64 = row.get::(0).into(); @@ -103,7 +103,7 @@ impl ComputeNode { .await .context("setting safekeepers")?; client - .query("SELECT pg_reload_conf()", &[]) + .query("SELECT pg_catalog.pg_reload_conf()", &[]) .await .context("reloading postgres config")?; @@ -113,7 +113,7 @@ impl ComputeNode { }); let row = client - .query_one("SELECT * FROM pg_promote()", &[]) + .query_one("SELECT * FROM pg_catalog.pg_promote()", &[]) .await .context("pg_promote")?; if !row.get::(0) { diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 5f60b711c8..a9ddef58e5 100644 --- a/compute_tools/src/installed_extensions.rs +++ 
b/compute_tools/src/installed_extensions.rs @@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result, PostgresError> { .query( "SELECT datname FROM pg_catalog.pg_database WHERE datallowconn - AND datconnlimit <> - 2 + AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4) LIMIT 500", &[], ) @@ -67,7 +67,7 @@ pub async fn get_installed_extensions( let extensions: Vec<(String, String, i32)> = client .query( - "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", + "SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension", &[], ) .await? diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 88d870df97..c8f911b5a6 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> { self.client .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") .await?; - self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?; self.client .simple_query( "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql index 367356e6eb..6e28e8d32c 100644 --- a/compute_tools/src/migrations/0002-alter_roles.sql +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -15,17 +15,17 @@ DO $$ DECLARE role_name text; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member') + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name); END LOOP; - FOR role_name IN SELECT rolname FROM pg_roles + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE - NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_') + NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql index adf159dc06..d67d6457c6 100644 --- a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql +++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 
'server_version_num') THEN EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}'; END IF; END $$; diff --git a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql index 47129d65b8..7f74d4ee28 100644 --- a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql +++ b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql @@ -5,9 +5,9 @@ DO $$ DECLARE role_name TEXT; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index 84fcb36391..714bdc735a 100644 --- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}'; EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}'; END IF; diff --git a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql index 0c81cef1c4..b5b209ef5e 100644 --- a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql @@ -2,7 +2,7 @@ DO $$ DECLARE bypassrls boolean; BEGIN - SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser'; + SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser'; IF NOT bypassrls THEN RAISE EXCEPTION 'neon_superuser cannot bypass RLS'; END IF; diff --git a/compute_tools/src/migrations/tests/0002-alter_roles.sql b/compute_tools/src/migrations/tests/0002-alter_roles.sql index 433f7b34f7..1755c9088c 100644 --- a/compute_tools/src/migrations/tests/0002-alter_roles.sql +++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql @@ -4,8 +4,8 @@ DECLARE BEGIN FOR role IN SELECT rolname AS name, rolinherit AS inherit - FROM pg_roles - WHERE pg_has_role(rolname, 'neon_superuser', 'member') + FROM pg_catalog.pg_roles + WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') LOOP IF NOT role.inherit THEN RAISE EXCEPTION '% cannot inherit', quote_ident(role.name); @@ -14,12 +14,12 @@ BEGIN FOR role IN SELECT rolname AS name, rolbypassrls AS bypassrls - FROM pg_roles - WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member') - AND NOT starts_with(rolname, 'pg_') + FROM pg_catalog.pg_roles + WHERE NOT 
pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') + AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP IF role.bypassrls THEN - RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name); + RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name); END IF; END LOOP; END $$; diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql index b164d61295..498770f4fa 100644 --- a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,10 +1,10 @@ DO $$ BEGIN - IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN + IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN RETURN; END IF; - IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN + IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription'; END IF; END $$; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql index 3464a2b1cf..ec04cfe199 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql @@ -2,12 +2,12 @@ DO $$ DECLARE monitor record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, admin_option AS admin INTO monitor - FROM pg_auth_members - WHERE roleid = 'pg_monitor'::regrole - AND member = 'neon_superuser'::regrole; + FROM pg_catalog.pg_auth_members + WHERE roleid = 'pg_monitor'::pg_catalog.regrole + AND member = 'neon_superuser'::pg_catalog.regrole; IF monitor IS NULL THEN RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor'; diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index af7f50e95d..f3b28d76c9 100644 --- a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -2,11 +2,11 @@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute')) + SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')) INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') - AND pronamespace = 'pg_catalog'::regnamespace; + AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace; IF NOT can_execute THEN RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; END IF; diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql index e55dcdc3b6..197211300b 100644 --- 
a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql @@ -2,9 +2,9 @@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT has_function_privilege('neon_superuser', oid, 'execute') + SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute') INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname = 'pg_show_replication_origin_status' AND pronamespace = 'pg_catalog'::regnamespace; IF NOT can_execute THEN diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql index e62b742d30..0f772d67bd 100644 --- a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql @@ -2,10 +2,10 @@ DO $$ DECLARE signal_backend record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, admin_option AS admin INTO signal_backend - FROM pg_auth_members + FROM pg_catalog.pg_auth_members WHERE roleid = 'pg_signal_backend'::regrole AND member = 'neon_superuser'::regrole; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index e164f15dba..78ac423a9b 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> { // like `postgres_exporter` use it to query Postgres statistics. // Use explicit 8 bytes type casts to match Rust types. let stats = cli.query_one( - "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time, - coalesce(sum(sessions), 0)::bigint AS total_sessions - FROM pg_stat_database + "SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time, + pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions + FROM pg_catalog.pg_stat_database WHERE datname NOT IN ( 'postgres', 'template0', @@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result> = None; // Get all running client backends except ourself, use RFC3339 DateTime format. let backends = cli.query( - "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change + "SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change FROM pg_stat_activity - WHERE backend_type = 'client backend' - AND pid != pg_backend_pid() - AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? + WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text + AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid() + AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors? 
&[], ); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 09bbe89b41..4e16a75181 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async( .query_raw::( "SELECT datname AS name, - (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, + (SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner, NOT datallowconn AS restrict_conn, - datconnlimit = - 2 AS invalid + datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid FROM pg_catalog.pg_database;", &[], diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 00f34c1f0e..90c0e234d5 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -82,7 +82,7 @@ impl ComputeNode { info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); drop_subscriptions_done = match - client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await { + client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await { Ok(result) => !result.is_empty(), Err(e) => { @@ -1142,7 +1142,9 @@ async fn get_operations<'a>( if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { if libs.contains("pg_stat_statements") { return Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public", + ), comment: Some(String::from("create system extensions")), }))); } @@ -1150,11 +1152,13 @@ async fn get_operations<'a>( Ok(Box::new(empty())) } ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"), comment: Some(String::from("create pgaudit extensions")), }))), ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public", + ), comment: Some(String::from("create pgauditlogtofile extensions")), }))), // Disable pgaudit logging for postgres database. 
@@ -1178,7 +1182,7 @@ async fn get_operations<'a>( }, Operation { query: String::from( - "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'", + "UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false", ), comment: Some(String::from("compat/fix: make neon relocatable")), }, diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql index 7c60690c78..dd27105e16 100644 --- a/compute_tools/src/sql/add_availabilitycheck_tables.sql +++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql @@ -3,16 +3,17 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'health_check' + WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name + AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN - CREATE TABLE health_check ( - id serial primary key, - updated_at timestamptz default now() + CREATE TABLE public.health_check ( + id pg_catalog.int4 primary key generated by default as identity, + updated_at pg_catalog.timestamptz default pg_catalog.now() ); - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now(); + SET updated_at = pg_catalog.now(); END IF; END $$ \ No newline at end of file diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql index 3d7b15c590..714a6db1e9 100644 --- a/compute_tools/src/sql/anon_ext_fn_reassign.sql +++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql @@ -2,10 +2,11 @@ DO $$ DECLARE query varchar; BEGIN - FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};' - FROM pg_proc p - JOIN pg_namespace nsp ON p.pronamespace = nsp.oid - WHERE nsp.nspname = 'anon' LOOP + FOR query IN + SELECT pg_catalog.format('ALTER FUNCTION %I(%s) OWNER TO {db_owner};', p.oid::regproc, pg_catalog.pg_get_function_identity_arguments(p.oid)) + FROM pg_catalog.pg_proc p + WHERE p.pronamespace OPERATOR(pg_catalog.=) 'anon'::regnamespace::oid + LOOP EXECUTE query; END LOOP; END diff --git a/compute_tools/src/sql/create_privileged_role.sql b/compute_tools/src/sql/create_privileged_role.sql index ac2521445f..a682089cce 100644 --- a/compute_tools/src/sql/create_privileged_role.sql +++ b/compute_tools/src/sql/create_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}') + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name) THEN CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data; END IF; diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql index 58ebb0690b..d572332270 100644 --- a/compute_tools/src/sql/default_grants.sql +++ b/compute_tools/src/sql/default_grants.sql @@ -4,14 +4,14 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public' ) AND - current_setting('server_version_num')::int / 10000 >= 15 + pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15 THEN IF EXISTS( SELECT rolname FROM pg_catalog.pg_roles - WHERE 
rolname = 'web_access' + WHERE rolname OPERATOR(pg_catalog.=) 'web_access' ) THEN GRANT CREATE ON SCHEMA public TO web_access; @@ -20,7 +20,7 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public' ) THEN ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index f5d9420130..68b3f8b729 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,11 +2,17 @@ DO ${outer_tag}$ DECLARE subname TEXT; BEGIN - LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; - FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP - EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); - EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); - EXECUTE format('DROP SUBSCRIPTION %I;', subname); + LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE; + FOR subname IN + SELECT pg_subscription.subname + FROM pg_catalog.pg_subscription + WHERE subdbid OPERATOR(pg_catalog.=) ( + SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name + ) + LOOP + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname); + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); + EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; ${outer_tag}$; diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql index 4bb291876f..1a8876ad61 100644 --- a/compute_tools/src/sql/finalize_drop_subscriptions.sql +++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql @@ -3,19 +3,19 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'drop_subscriptions_done' - AND schemaname = 'neon' + WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name + AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name ) THEN CREATE TABLE neon.drop_subscriptions_done - (id serial primary key, timeline_id text); + (id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text); END IF; -- preserve the timeline_id of the last drop_subscriptions run -- to ensure that the cleanup of a timeline is executed only once. 
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch) - INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id')) + INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id')) ON CONFLICT (id) DO UPDATE - SET timeline_id = current_setting('neon.timeline_id'); + SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text; END $$ diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index 734607be02..2ed0f94bad 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -15,15 +15,15 @@ BEGIN WHERE schema_name IN ('public') LOOP FOR grantor IN EXECUTE - format( - 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s', + pg_catalog.format( + 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s', -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` quote_literal({role_name}) ) LOOP - EXECUTE format('SET LOCAL ROLE %I', grantor); + EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor); - revoke_query := format( + revoke_query := pg_catalog.format( 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I', schema, -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index dc502c6d2d..41bd0d4689 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -5,17 +5,17 @@ DO ${outer_tag}$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN SELECT nspowner::regrole::text FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text INTO schema_owner; - IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' + IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text THEN - EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); + EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 36dc648beb..03225d5e64 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -3,10 +3,10 @@ DO ${outer_tag}$ IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname} + WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name ) THEN - EXECUTE format('ALTER DATABASE %I is_template false', {datname}); + EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname}); END IF; END ${outer_tag}$; From eb2741758bbf97616746b16831a72c4531791ee9 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 30 Jul 2025 18:18:35 +0200 Subject: [PATCH 02/15] storcon: actually update gRPC address on reattach (#12784) ## Problem In #12268, we added Pageserver gRPC addresses to the storage controller. However, we didn't actually persist these in the database. 
## Summary of changes Update the database with the new gRPC address on reattach. --- storage_controller/src/persistence.rs | 8 +++++++- storage_controller/src/service.rs | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 619b5f69b8..c61ef9ff5d 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -471,11 +471,17 @@ impl Persistence { &self, input_node_id: NodeId, input_https_port: Option, + input_grpc_addr: Option, + input_grpc_port: Option, ) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; self.update_node( input_node_id, - listen_https_port.eq(input_https_port.map(|x| x as i32)), + ( + listen_https_port.eq(input_https_port.map(|x| x as i32)), + listen_grpc_addr.eq(input_grpc_addr), + listen_grpc_port.eq(input_grpc_port.map(|x| x as i32)), + ), ) .await } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 37380b8fbe..2c7e4ee2de 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7813,7 +7813,7 @@ impl Service { register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.listen_grpc_addr, + register_req.listen_grpc_addr.clone(), register_req.listen_grpc_port, register_req.availability_zone_id.clone(), self.config.use_https_pageserver_api, @@ -7848,6 +7848,8 @@ impl Service { .update_node_on_registration( register_req.node_id, register_req.listen_https_port, + register_req.listen_grpc_addr, + register_req.listen_grpc_port, ) .await? } From e470997627b25ae6494ffe5ae4abbd35efd7acf6 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:10:33 -0400 Subject: [PATCH 03/15] enable tests introduced in hadron commits (#12790) Enables skipped tests introduced in hadron integration commits --- test_runner/regress/test_compaction.py | 5 +++-- test_runner/regress/test_wal_acceptor.py | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index a780aa2e52..94c18ac548 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -863,7 +863,6 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") -@pytest.mark.skip(reason="Lakebase mode") def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): """ Test that when the pageserver detects corruption during image layer creation, @@ -890,7 +889,9 @@ def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): timeline_id = env.initial_timeline pageserver_http = env.pageserver.http_client() - workload = Workload(env, tenant_id, timeline_id) + workload = Workload( + env, tenant_id, timeline_id, endpoint_opts={"config_lines": ["neon.lakebase_mode=true"]} + ) workload.init() # Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 33d308fb5a..1e8ca216d0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2742,7 +2742,6 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde wait_until(unevicted) -@pytest.mark.skip(reason="Lakebase 
mode") def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): """ Test that the timeline disk usage circuit breaker works as expected. We test that: @@ -2762,7 +2761,12 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): # Create a timeline and endpoint env.create_branch("test_timeline_disk_usage_limit") - endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit") + endpoint = env.endpoints.create_start( + "test_timeline_disk_usage_limit", + config_lines=[ + "neon.lakebase_mode=true", + ], + ) # Install the neon extension in the test database. We need it to query perf counter metrics. with closing(endpoint.connect()) as conn: From 81ddd10be6d18da39332bf51c48a99017ce6f9df Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 30 Jul 2025 22:56:22 +0300 Subject: [PATCH 04/15] tests: Don't print Hostname on every test connection (#12782) These lines are a significant fraction of the total log size of the regression tests. And it seems very uninteresting, it's always 'localhost' in local tests. --- test_runner/fixtures/neon_fixtures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b5148cbfdc..11d54f7831 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -262,7 +262,6 @@ class PgProtocol: # pooler does not support statement_timeout # Check if the hostname contains the string 'pooler' hostname = result.get("host", "") - log.info(f"Hostname: {hostname}") options = result.get("options", "") if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" From 4d3b28bd2eecaf38b8700281e17d027b7920fdc3 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 30 Jul 2025 23:34:30 +0200 Subject: [PATCH 05/15] [Hadron] Always run databricks auth hook. 
(#12683) --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c9f9fdd011..2155cb165d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c9f9fdd0113b52c0bd535afdb09d3a543aeee25f +Subproject commit 2155cb165d05f617eb2c8ad7e43367189b627703 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aaaeff2550..2aaab3bb4a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aaaeff2550d5deba58847f112af9b98fa3a58b00 +Subproject commit 2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9b9cb4b3e3..a42351fcd4 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9b9cb4b3e33347aea8f61e606bb6569979516de5 +Subproject commit a42351fcd41ea01edede1daed65f651e838988fc diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index fa1788475e..1e01fcea2a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit fa1788475e3146cc9c7c6a1b74f48fd296898fcd +Subproject commit 1e01fcea2a6b38180021aa83e0051d95286d9096 diff --git a/vendor/revisions.json b/vendor/revisions.json index 7212c9f7c7..c02c748a72 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "fa1788475e3146cc9c7c6a1b74f48fd296898fcd" + "1e01fcea2a6b38180021aa83e0051d95286d9096" ], "v16": [ "16.9", - "9b9cb4b3e33347aea8f61e606bb6569979516de5" + "a42351fcd41ea01edede1daed65f651e838988fc" ], "v15": [ "15.13", - "aaaeff2550d5deba58847f112af9b98fa3a58b00" + "2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85" ], "v14": [ "14.18", - "c9f9fdd0113b52c0bd535afdb09d3a543aeee25f" + "2155cb165d05f617eb2c8ad7e43367189b627703" ] } From 01c39f378e0a81037ead9108ea83feb146310de8 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 30 Jul 2025 23:05:51 +0100 Subject: [PATCH 06/15] prewarm cancellation (#12785) Add DELETE /lfc/prewarm route which handles ongoing prewarm cancellation, update API spec, add prewarm Cancelled state Add offload Cancelled state when LFC is not initialized --- compute_tools/src/compute.rs | 3 + compute_tools/src/compute_prewarm.rs | 141 ++++++++++++++--------- compute_tools/src/http/openapi_spec.yaml | 11 +- compute_tools/src/http/routes/lfc.rs | 5 + compute_tools/src/http/server.rs | 7 +- libs/compute_api/src/responses.rs | 6 + test_runner/fixtures/endpoint/http.py | 27 +++-- test_runner/regress/test_lfc_prewarm.py | 94 ++++++++++++--- 8 files changed, 208 insertions(+), 86 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index f553bf3c1e..5b1cdb9805 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -32,6 +32,7 @@ use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; use tokio::{spawn, sync::watch, task::JoinHandle, time}; +use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; use utils::id::{TenantId, TimelineId}; @@ -192,6 +193,7 @@ pub struct ComputeState { pub startup_span: Option, pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_prewarm_token: CancellationToken, pub lfc_offload_state: LfcOffloadState, /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if @@ -217,6 +219,7 @@ impl ComputeState { 
lfc_offload_state: LfcOffloadState::default(), terminate_flush_lsn: None, promote_state: None, + lfc_prewarm_token: CancellationToken::new(), } } diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 97e62c1c80..82cb28f1ac 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -7,7 +7,8 @@ use http::StatusCode; use reqwest::Client; use std::mem::replace; use std::sync::Arc; -use tokio::{io::AsyncReadExt, spawn}; +use tokio::{io::AsyncReadExt, select, spawn}; +use tokio_util::sync::CancellationToken; use tracing::{error, info}; #[derive(serde::Serialize, Default)] @@ -92,34 +93,35 @@ impl ComputeNode { /// If there is a prewarm request ongoing, return `false`, `true` otherwise. /// Has a failpoint "compute-prewarm" pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { + let token: CancellationToken; { - let state = &mut self.state.lock().unwrap().lfc_prewarm_state; - if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) { + let state = &mut self.state.lock().unwrap(); + token = state.lfc_prewarm_token.clone(); + if let LfcPrewarmState::Prewarming = + replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming) + { return false; } } crate::metrics::LFC_PREWARMS.inc(); - let cloned = self.clone(); + let this = self.clone(); spawn(async move { - let state = match cloned.prewarm_impl(from_endpoint).await { - Ok(true) => LfcPrewarmState::Completed, - Ok(false) => { - info!( - "skipping LFC prewarm because LFC state is not found in endpoint storage" - ); - LfcPrewarmState::Skipped - } + let prewarm_state = match this.prewarm_impl(from_endpoint, token).await { + Ok(state) => state, Err(err) => { crate::metrics::LFC_PREWARM_ERRORS.inc(); error!(%err, "could not prewarm LFC"); - LfcPrewarmState::Failed { - error: format!("{err:#}"), - } + let error = format!("{err:#}"); + LfcPrewarmState::Failed { error } } }; - cloned.state.lock().unwrap().lfc_prewarm_state = state; + let state = &mut this.state.lock().unwrap(); + if let LfcPrewarmState::Cancelled = prewarm_state { + state.lfc_prewarm_token = CancellationToken::new(); + } + state.lfc_prewarm_state = prewarm_state; }); true } @@ -132,47 +134,70 @@ impl ComputeNode { /// Request LFC state from endpoint storage and load corresponding pages into Postgres. /// Returns a result with `false` if the LFC state is not found in endpoint storage. - async fn prewarm_impl(&self, from_endpoint: Option) -> Result { - let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; + async fn prewarm_impl( + &self, + from_endpoint: Option, + token: CancellationToken, + ) -> Result { + let EndpointStoragePair { + url, + token: storage_token, + } = self.endpoint_storage_pair(from_endpoint)?; #[cfg(feature = "testing")] - fail::fail_point!("compute-prewarm", |_| { - bail!("prewarm configured to fail because of a failpoint") - }); + fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint")); info!(%url, "requesting LFC state from endpoint storage"); - let request = Client::new().get(&url).bearer_auth(token); - let res = request.send().await.context("querying endpoint storage")?; - match res.status() { + let request = Client::new().get(&url).bearer_auth(storage_token); + let response = select! 
{ + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + response = request.send() => response + } + .context("querying endpoint storage")?; + + match response.status() { StatusCode::OK => (), - StatusCode::NOT_FOUND => { - return Ok(false); - } + StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped), status => bail!("{status} querying endpoint storage"), } let mut uncompressed = Vec::new(); - let lfc_state = res - .bytes() - .await - .context("getting request body from endpoint storage")?; - ZstdDecoder::new(lfc_state.iter().as_slice()) - .read_to_end(&mut uncompressed) - .await - .context("decoding LFC state")?; + let lfc_state = select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + lfc_state = response.bytes() => lfc_state + } + .context("getting request body from endpoint storage")?; + + let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice()); + select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + read = decoder.read_to_end(&mut uncompressed) => read + } + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}"); - info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres"); - - ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + // Client connection and prewarm info querying are fast and therefore don't need + // cancellation + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await - .context("connecting to postgres")? - .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) - .await - .context("loading LFC state into postgres") - .map(|_| ())?; + .context("connecting to postgres")?; + let pg_token = client.cancel_token(); - Ok(true) + let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed]; + select! 
{ + res = client.query_one("select neon.prewarm_local_cache($1)", ¶ms) => res, + _ = token.cancelled() => { + pg_token.cancel_query(postgres::NoTls).await + .context("cancelling neon.prewarm_local_cache()")?; + return Ok(LfcPrewarmState::Cancelled) + } + } + .context("loading LFC state into postgres") + .map(|_| ())?; + + Ok(LfcPrewarmState::Completed) } /// If offload request is ongoing, return false, true otherwise @@ -200,20 +225,20 @@ impl ComputeNode { async fn offload_lfc_with_state_update(&self) { crate::metrics::LFC_OFFLOADS.inc(); - - let Err(err) = self.offload_lfc_impl().await else { - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; - return; + let state = match self.offload_lfc_impl().await { + Ok(state) => state, + Err(err) => { + crate::metrics::LFC_OFFLOAD_ERRORS.inc(); + error!(%err, "could not offload LFC"); + let error = format!("{err:#}"); + LfcOffloadState::Failed { error } + } }; - crate::metrics::LFC_OFFLOAD_ERRORS.inc(); - error!(%err, "could not offload LFC state to endpoint storage"); - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { - error: format!("{err:#}"), - }; + self.state.lock().unwrap().lfc_offload_state = state; } - async fn offload_lfc_impl(&self) -> Result<()> { + async fn offload_lfc_impl(&self) -> Result { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from Postgres"); @@ -228,7 +253,7 @@ impl ComputeNode { .context("deserializing LFC state")?; let Some(state) = state else { info!(%url, "empty LFC state, not exporting"); - return Ok(()); + return Ok(LfcOffloadState::Skipped); }; let mut compressed = Vec::new(); @@ -242,7 +267,7 @@ impl ComputeNode { let request = Client::new().put(url).bearer_auth(token).body(compressed); match request.send().await { - Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed), Ok(res) => bail!( "Request to endpoint storage failed with status: {}", res.status() @@ -250,4 +275,8 @@ impl ComputeNode { Err(err) => Err(err).context("writing to endpoint storage"), } } + + pub fn cancel_prewarm(self: &Arc) { + self.state.lock().unwrap().lfc_prewarm_token.cancel(); + } } diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index ab729d62b5..27e610a87d 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -139,6 +139,15 @@ paths: application/json: schema: $ref: "#/components/schemas/LfcPrewarmState" + delete: + tags: + - Prewarm + summary: Cancel ongoing LFC prewarm + description: "" + operationId: cancelLfcPrewarm + responses: + 202: + description: Prewarm cancelled /lfc/offload: post: @@ -636,7 +645,7 @@ components: properties: status: description: LFC offload status - enum: [not_offloaded, offloading, completed, failed] + enum: [not_offloaded, offloading, completed, skipped, failed] type: string error: description: LFC offload error, if any diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs index e98bd781a2..7483198723 100644 --- a/compute_tools/src/http/routes/lfc.rs +++ b/compute_tools/src/http/routes/lfc.rs @@ -46,3 +46,8 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response { ) } } + +pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode { + compute.cancel_prewarm(); + StatusCode::ACCEPTED +} diff --git a/compute_tools/src/http/server.rs 
b/compute_tools/src/http/server.rs index 2fd3121f4f..869fdef11d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -99,7 +99,12 @@ impl From<&Server> for Router> { ); let authenticated_router = Router::>::new() - .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route( + "/lfc/prewarm", + get(lfc::prewarm_state) + .post(lfc::prewarm) + .delete(lfc::cancel_prewarm), + ) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/promote", post(promote::promote)) .route("/check_writability", post(check_writability::is_writable)) diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a27301e45e..a918644e4c 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -68,11 +68,15 @@ pub enum LfcPrewarmState { /// We tried to fetch the corresponding LFC state from the endpoint storage, /// but received `Not Found 404`. This should normally happen only during the /// first endpoint start after creation with `autoprewarm: true`. + /// This may also happen if LFC is turned off or not initialized /// /// During the orchestrated prewarm via API, when a caller explicitly /// provides the LFC state key to prewarm from, it's the caller responsibility /// to handle this status as an error state in this case. Skipped, + /// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query + /// has started working before cancellation + Cancelled, } impl Display for LfcPrewarmState { @@ -83,6 +87,7 @@ impl Display for LfcPrewarmState { LfcPrewarmState::Completed => f.write_str("Completed"), LfcPrewarmState::Skipped => f.write_str("Skipped"), LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), + LfcPrewarmState::Cancelled => f.write_str("Cancelled"), } } } @@ -97,6 +102,7 @@ pub enum LfcOffloadState { Failed { error: String, }, + Skipped, } #[derive(Serialize, Debug, Clone, PartialEq)] diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index d235ac2143..c77a372017 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -78,20 +78,26 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def prewarm_lfc(self, from_endpoint_id: str | None = None): + def prewarm_lfc(self, from_endpoint_id: str | None = None) -> dict[str, str]: """ Prewarm LFC cache from given endpoint and wait till it finishes or errors """ params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict() self.post(self.prewarm_url, params=params).raise_for_status() - self.prewarm_lfc_wait() + return self.prewarm_lfc_wait() - def prewarm_lfc_wait(self): + def cancel_prewarm_lfc(self): + """ + Cancel LFC prewarm if any is ongoing + """ + self.delete(self.prewarm_url).raise_for_status() + + def prewarm_lfc_wait(self) -> dict[str, str]: """ Wait till LFC prewarm returns with error or success. 
If prewarm was not requested before calling this function, it will error """ - statuses = "failed", "completed", "skipped" + statuses = "failed", "completed", "skipped", "cancelled" def prewarmed(): json = self.prewarm_lfc_status() @@ -101,6 +107,7 @@ class EndpointHttpClient(requests.Session): wait_until(prewarmed, timeout=60) res = self.prewarm_lfc_status() assert res["status"] != "failed", res + return res def offload_lfc_status(self) -> dict[str, str]: res = self.get(self.offload_url) @@ -108,29 +115,31 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def offload_lfc(self): + def offload_lfc(self) -> dict[str, str]: """ Offload LFC cache to endpoint storage and wait till offload finishes or errors """ self.post(self.offload_url).raise_for_status() - self.offload_lfc_wait() + return self.offload_lfc_wait() - def offload_lfc_wait(self): + def offload_lfc_wait(self) -> dict[str, str]: """ Wait till LFC offload returns with error or success. If offload was not requested before calling this function, it will error """ + statuses = "failed", "completed", "skipped" def offloaded(): json = self.offload_lfc_status() status, err = json["status"], json.get("error") - assert status in ["failed", "completed"], f"{status}, {err=}" + assert status in statuses, f"{status}, {err=}" wait_until(offloaded, timeout=60) res = self.offload_lfc_status() assert res["status"] != "failed", res + return res - def promote(self, promote_spec: dict[str, Any], disconnect: bool = False): + def promote(self, promote_spec: dict[str, Any], disconnect: bool = False) -> dict[str, str]: url = f"http://localhost:{self.external_port}/promote" if disconnect: try: # send first request to start promote and disconnect diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index 2bbe8c3e97..a96f18177c 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,6 +1,6 @@ import random -import threading from enum import StrEnum +from threading import Thread from time import sleep from typing import Any @@ -47,19 +47,23 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) # With autoprewarm, we need to be sure LFC was offloaded after all writes # finish, so we sleep. 
Otherwise we'll have less prewarmed pages than we want sleep(AUTOOFFLOAD_INTERVAL_SECS) - client.offload_lfc_wait() - return + offload_res = client.offload_lfc_wait() + log.info(offload_res) + return offload_res if method == PrewarmMethod.COMPUTE_CTL: status = client.prewarm_lfc_status() assert status["status"] == "not_prewarmed" assert "error" not in status - client.offload_lfc() + offload_res = client.offload_lfc() + log.info(offload_res) assert client.prewarm_lfc_status()["status"] == "not_prewarmed" + parsed = prom_parse(client) desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} assert parsed == desired, f"{parsed=} != {desired=}" - return + + return offload_res raise AssertionError(f"{method} not in PrewarmMethod") @@ -68,21 +72,30 @@ def prewarm_endpoint( method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None ): if method == PrewarmMethod.AUTOPREWARM: - client.prewarm_lfc_wait() + prewarm_res = client.prewarm_lfc_wait() + log.info(prewarm_res) elif method == PrewarmMethod.COMPUTE_CTL: - client.prewarm_lfc() + prewarm_res = client.prewarm_lfc() + log.info(prewarm_res) + return prewarm_res elif method == PrewarmMethod.POSTGRES: cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,)) -def check_prewarmed( +def check_prewarmed_contains( method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int] ): if method == PrewarmMethod.AUTOPREWARM: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + assert prom_parse(client)[PREWARM_LABEL] == 1 elif method == PrewarmMethod.COMPUTE_CTL: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0} assert prom_parse(client) == desired @@ -149,9 +162,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): log.info(f"Used LFC size: {lfc_used_pages}") pg_cur.execute("select * from neon.get_prewarm_info()") total, prewarmed, skipped, _ = pg_cur.fetchall()[0] - log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}") - progress = (prewarmed + skipped) * 100 // total - log.info(f"Prewarm progress: {progress}%") assert lfc_used_pages > 10000 assert total > 0 assert prewarmed > 0 @@ -161,7 +171,54 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} - check_prewarmed(method, client, desired) + check_prewarmed_contains(method, client, desired) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm_cancel(neon_simple_env: NeonEnv): + """ + Test we can cancel LFC prewarm and prewarm successfully after + """ + env = neon_simple_env + n_records = 1000000 + cfg = [ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ] + endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create schema neon; create extension neon with schema neon") + pg_cur.execute("create database lfc") + + lfc_conn 
= endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + client = endpoint.http_client() + method = PrewarmMethod.COMPUTE_CTL + offload_lfc(method, client, pg_cur) + + endpoint.stop() + endpoint.start() + + thread = Thread(target=lambda: prewarm_endpoint(method, client, pg_cur, None)) + thread.start() + # wait 2 seconds to ensure we cancel prewarm SQL query + sleep(2) + client.cancel_prewarm_lfc() + thread.join() + assert client.prewarm_lfc_status()["status"] == "cancelled" + + prewarm_endpoint(method, client, pg_cur, None) + assert client.prewarm_lfc_status()["status"] == "completed" @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") @@ -178,9 +235,8 @@ def test_lfc_prewarm_empty(neon_simple_env: NeonEnv): cur = conn.cursor() cur.execute("create schema neon; create extension neon with schema neon") method = PrewarmMethod.COMPUTE_CTL - offload_lfc(method, client, cur) - prewarm_endpoint(method, client, cur, None) - assert client.prewarm_lfc_status()["status"] == "skipped" + assert offload_lfc(method, client, cur)["status"] == "skipped" + assert prewarm_endpoint(method, client, cur, None)["status"] == "skipped" # autoprewarm isn't needed as we prewarm manually @@ -251,11 +307,11 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet workload_threads = [] for _ in range(n_threads): - t = threading.Thread(target=workload) + t = Thread(target=workload) workload_threads.append(t) t.start() - prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread = Thread(target=prewarm) prewarm_thread.start() def prewarmed(): From 975b95f4cd9db3d25587f265f0587ba4fa123ce5 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:34:47 +0400 Subject: [PATCH 07/15] Introduce deletion API improvement RFC (#12484) ## Problem The deletion logic had become difficult to understand and maintain. ## Summary of changes - Added an RFC detailing proposed improvements to all deletion-related APIs. --------- Co-authored-by: Aleksandr Sarantsev --- ...025-07-07-node-deletion-api-improvement.md | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 docs/rfcs/2025-07-07-node-deletion-api-improvement.md diff --git a/docs/rfcs/2025-07-07-node-deletion-api-improvement.md b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md new file mode 100644 index 0000000000..47dadaee35 --- /dev/null +++ b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md @@ -0,0 +1,246 @@ +# Node deletion API improvement + +Created on 2025-07-07 +Implemented on _TBD_ + +## Summary + +This RFC describes improvements to the storage controller API for gracefully deleting pageserver +nodes. + +## Motivation + +The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333) +has several limitations: + +- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and +we cannot reach via SSH to stop the pageserver). This issue has been resolved by tombstone +mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036) +- Process of node deletion is not graceful, i.e. 
it just imitates a node failure + +In this context, "graceful" node deletion means that users do not experience any disruption or +negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers +can handle the workload and all requirements are met). To achieve this, the system must perform +live migration of all tenant shards from the node being deleted while the node is still running +and continue processing all incoming requests. The node is removed only after all tenant shards +have been safely migrated. + +Although live migrations can be achieved with the drain functionality, it leads to incorrect shard +placement, such as not matching availability zones. This results in unnecessary work to optimize +the placement that was just recently performed. + +If we delete a node before its tenant shards are fully moved, the new node won't have all the +needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at +first. If there are many tenant shards, this slowdown affects a huge amount of users. + +Graceful node deletion is more complicated and can introduce new issues. It takes longer because +live migration of each tenant shard can last several minutes. Using non-blocking accessors may +also cause deletion to wait if other processes are holding inner state lock. It also gets trickier +because we need to handle other requests, like drain and fill, at the same time. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- storage controller +- pageserver (indirectly) + +## Proposed implementation + +### Tombstones + +To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced +as part of the node stored information. Each node has a separate `NodeLifecycle` field with two +possible states: `Active` and `Deleted`. When node deletion completes, the database row is not +deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted` +lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach +and register functionality must be aware of tombstones. Additionally, new debug handlers are +available for listing and deleting tombstones via the `/debug/v1/tombstone` path. + +### Gracefulness + +The problem of making node deletion graceful is complex and involves several challenges: + +- **Cancellable**: The operation must be cancellable to allow administrators to abort the process +if needed, e.g. if run by mistake. +- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node +deletion process. We need clear policies for handling concurrent operations: what happens when a +drain/fill request arrives while deletion is in progress, and what happens when a delete request +arrives while drain/fill is in progress. +- **Persistent**: If the storage controller restarts during this long-running operation, we must +preserve progress and automatically resume the deletion process after the storage controller +restarts. +- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled +for deletion, as this would move shards to irrelevant locations. The drain process expects the +node to return, so it only moves shards to backup locations, not to their preferred AZs. It also +leaves secondary locations unmoved. This could result in unnecessary load on the storage +controller and inefficient resource utilization. 
+- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when +time constraints or emergency situations require it, bypassing the normal graceful migration +process. + +See below for a detailed breakdown of the proposed changes and mechanisms. + +#### Node lifecycle + +New `NodeLifecycle` enum and a matching database field with these values: +- `Active`: The normal state. All operations are allowed. +- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or +will happen later, but the node will eventually be removed. All operations are allowed. +- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought +back. The only action left is to remove its record from the database. Any attempt to register a +node in this state will fail. + +This state persists across storage controller restarts. + +**State transition** +``` + +--------------------+ + +---| Active |<---------------------+ + | +--------------------+ | + | ^ | + | start_node_delete | cancel_node_delete | + v | | + +----------------------------------+ | + | ScheduledForDeletion | | + +----------------------------------+ | + | | + | node_register | + | | + | delete_node (at the finish) | + | | + v | + +---------+ tombstone_delete +----------+ + | Deleted |-------------------------------->| no row | + +---------+ +----------+ +``` + +#### NodeSchedulingPolicy::Deleting + +A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is +running for the node right now. Only one node can have the `Deleting` policy at a time. + +The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage +controller restart, any node previously marked as `Deleting` will have its scheduling policy reset +to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is +actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state. + +`NodeSchedulingPolicy` transition details: +1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`. +2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the +policy to its previous value. The policy is persisted in storcon DB. +3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since +`NodeLifecycle::Deleted` prevents any further access to this field. + +The deletion process cannot be initiated for nodes currently undergoing deployment-related +operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered +once the node transitions to either the `Active` or `Pause` state. + +#### OperationTracker + +A replacement for `Option ongoing_operation`, the `OperationTracker` is a +dedicated service state object responsible for managing all long-running node operations (drain, +fill, delete) with robust concurrency control. + +Key responsibilities: +- Orchestrates the execution of operations +- Supports cancellation of currently running operations +- Enforces operation constraints, e.g. allowing only single drain/fill operation at a time +- Persists deletion state, enabling recovery of pending deletions across restarts +- Ensures thread safety across concurrent requests + +#### Attached tenant shard processing + +When deleting a node, handle each attached tenant shard as follows: + +1. Pick the best node to become the new attached (the candidate). +2. 
If the candidate already has this shard as a secondary: + - Create a new secondary for the shard on another suitable node. + Otherwise: + - Create a secondary for the shard on the candidate node. +3. Wait until all secondaries are ready and pre-warmed. +4. Promote the candidate's secondary to attached. +5. Remove the secondary from the node being deleted. + +This process safely moves all attached shards before deleting the node. + +#### Secondary tenant shard processing + +When deleting a node, handle each secondary tenant shard as follows: + +1. Choose the best node to become the new secondary. +2. Create a secondary for the shard on that node. +3. Wait until the new secondary is ready. +4. Remove the secondary from the node being deleted. + +This ensures all secondary shards are safely moved before deleting the node. + +### Reliability, failure modes and corner cases + +In case of a storage controller failure and following restart, the system behavior depends on the +`NodeLifecycle` state: + +- If `NodeLifecycle` is `Active`: No action is taken for this node. +- If `NodeLifecycle` is `Deleted`: The node will not be re-added. +- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for +this node. + +In case of a pageserver node failure during deletion, the behavior depends on the `force` flag: +- If `force` is set: The node deletion will proceed regardless of the node's availability. +- If `force` is not set: The deletion will be retried a limited number of times. If the node +remains unavailable, the deletion process will pause and automatically resume when the node +becomes healthy again. + +### Operations concurrency + +The following sections describe the behavior when different types of requests arrive at the storage +controller and how they interact with ongoing operations. + +#### Delete request + +Handler: `PUT /control/v1/node/:node_id/delete` + +1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`: + - Return `200 OK`: there is already an ongoing deletion request for this node +2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion` +3. Persist current scheduling policy +4. If there is no active operation (drain/fill/delete): + - Run deletion process for this node + +#### Cancel delete request + +Handler: `DELETE /control/v1/node/:node_id/delete` + +1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`: + - Return `404 Not Found`: there is no current deletion request for this node +2. If the active operation is deleting this node, cancel it +3. Update & persist lifecycle to `NodeLifecycle::Active` +4. Restore the last scheduling policy from persistence + +#### Drain/fill request + +1. If there are already ongoing drain/fill processes: + - Return `409 Conflict`: queueing of drain/fill processes is not supported +2. If there is an ongoing delete process: + - Cancel it and wait until it is cancelled +3. Run the drain/fill process +4. After the drain/fill process is cancelled or finished: + - Try to find another candidate to delete and run the deletion process for that node + +#### Drain/fill cancel request + +1. If the active operation is not the related process: + - Return `400 Bad Request`: cancellation request is incorrect, operations are not the same +2. Cancel the active operation +3. 
Try to find another candidate to delete and run the deletion process for that node + +## Definition of Done + +- [x] Fix flaky node scenario and introduce related debug handlers +- [ ] Node deletion intent is persistent - a node will eventually be deleted after a deletion +request regardless of draining/filling requests and restarts +- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to +recommended locations +- [ ] Deploying does not break due to long deletions - drain/fill operations override the deletion +process and deletion resumes after drain/fill completes +- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a +pageserver node does not respond) +- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli From edd60730c89433dd8b6990bdaf9fdad34213c411 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:29:25 +0400 Subject: [PATCH 08/15] safekeeper: use last_log_term in mconf switch + choose most advanced sk in pull timeline (#12778) ## Problem I discovered two bugs related to safekeeper migration, which together might lead to data loss during the migration. The second bug is from a hadron patch and might lead to data loss during safekeeper restore in hadron as well. 1. `switch_membership` returns the current `term` instead of `last_log_term`. The returned value is used to choose the `sync_position` in the algorithm, so we might choose the wrong one and break the correctness guarantees. 2. The current `term` is used to choose the most advanced SK in `pull_timeline` with higher priority than `flush_lsn`. This is incorrect because the most advanced safekeeper is the one with the highest `(last_log_term, flush_lsn)` pair.
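For illustration, a minimal sketch of the intended ordering, using a hypothetical `Status` struct rather than the real safekeeper types: the most advanced node is the one that maximizes `(last_log_term, flush_lsn)` compared lexicographically, so a bumped current `term` on a lagging node cannot win.

```rust
/// Hypothetical, simplified view of a safekeeper's timeline status.
/// (Illustration only; not the actual safekeeper API types.)
#[derive(Debug)]
struct Status {
    term: u64,          // current term; may be bumped without any new WAL
    last_log_term: u64, // term of the last WAL record actually written
    flush_lsn: u64,     // how far WAL has been flushed
}

/// Pick the "most advanced" safekeeper by (last_log_term, flush_lsn),
/// not by the current term, which can be bumped on a lagging node.
fn most_advanced(statuses: &[Status]) -> Option<&Status> {
    statuses
        .iter()
        .max_by_key(|s| (s.last_log_term, s.flush_lsn))
}

fn main() {
    let statuses = vec![
        Status { term: 105, last_log_term: 3, flush_lsn: 0x1000 }, // lagging, but term was bumped
        Status { term: 3, last_log_term: 3, flush_lsn: 0x2000 },   // actually most advanced
    ];
    let best = most_advanced(&statuses).unwrap();
    assert_eq!(best.flush_lsn, 0x2000);
    println!("pull timeline from {best:?}");
}
```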
The compute might bump term on the least advanced sk, making it the best choice to pull from, and thus making committed log entries "uncommitted" after `pull_timeline` Part of https://databricks.atlassian.net/browse/LKB-1017 ## Summary of changes - Return `last_log_term` in `switch_membership` - Use `(last_log_term, flush_lsn)` as a primary key for choosing the most advanced sk in `pull_timeline` and deny pulling if the `max_term` is higher than on the most advanced sk (hadron only) - Write tests for both cases - Retry `sync_safekeepers` in `compute_ctl` - Take into the account the quorum size when calculating `sync_position` --- compute_tools/src/compute.rs | 40 +++- safekeeper/src/pull_timeline.rs | 24 ++- safekeeper/src/timeline.rs | 6 +- .../src/service/safekeeper_service.rs | 42 ++++- test_runner/fixtures/neon_fixtures.py | 1 + .../regress/test_safekeeper_migration.py | 174 ++++++++++++++++++ 6 files changed, 275 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5b1cdb9805..1df837e1e6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -35,6 +35,9 @@ use tokio::{spawn, sync::watch, task::JoinHandle, time}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration, +}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; @@ -1557,6 +1560,41 @@ impl ComputeNode { Ok(lsn) } + fn sync_safekeepers_with_retries(&self, storage_auth_token: Option) -> Result { + let max_retries = 5; + let mut attempts = 0; + loop { + let result = self.sync_safekeepers(storage_auth_token.clone()); + match &result { + Ok(_) => { + if attempts > 0 { + tracing::info!("sync_safekeepers succeeded after {attempts} retries"); + } + return result; + } + Err(e) if attempts < max_retries => { + tracing::info!( + "sync_safekeepers failed, will retry (attempt {attempts}): {e:#}" + ); + } + Err(err) => { + tracing::warn!( + "sync_safekeepers still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + let backoff = exponential_backoff_duration( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + std::thread::sleep(backoff); + attempts += 1; + } + } + /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. #[instrument(skip_all)] @@ -1592,7 +1630,7 @@ impl ComputeNode { lsn } else { info!("starting safekeepers syncing"); - self.sync_safekeepers(pspec.storage_auth_token.clone()) + self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")? }; info!("safekeepers synced at LSN {}", lsn); diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 43232db950..4febc7656e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -612,19 +612,25 @@ pub async fn handle_request( } } + let max_term = statuses + .iter() + .map(|(status, _)| status.acceptor_state.term) + .max() + .unwrap(); + // Find the most advanced safekeeper let (status, i) = statuses .into_iter() .max_by_key(|(status, _)| { ( status.acceptor_state.epoch, + status.flush_lsn, /* BEGIN_HADRON */ // We need to pull from the SK with the highest term. 
// This is because another compute may come online and vote the same highest term again on the other two SKs. // Then, there will be 2 computes running on the same term. status.acceptor_state.term, /* END_HADRON */ - status.flush_lsn, status.commit_lsn, ) }) @@ -634,6 +640,22 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); + // TODO(diko): This is hadron only check to make sure that we pull the timeline + // from the safekeeper with the highest term during timeline restore. + // We could avoid returning the error by calling bump_term after pull_timeline. + // However, this is not a big deal because we retry the pull_timeline requests. + // The check should be removed together with removing custom hadron logic for + // safekeeper restore. + if wait_for_peer_timeline_status && status.acceptor_state.term != max_term { + return Err(ApiError::PreconditionFailed( + format!( + "choosen safekeeper {} has term {}, but the most advanced term is {}", + safekeeper_host, status.acceptor_state.term, max_term + ) + .into(), + )); + } + match pull_timeline( status, safekeeper_host, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e083a49428..25ac8e5bd3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -195,12 +195,14 @@ impl StateSK { to: Configuration, ) -> Result { let result = self.state_mut().membership_switch(to).await?; + let flush_lsn = self.flush_lsn(); + let last_log_term = self.state().acceptor_state.get_last_log_term(flush_lsn); Ok(TimelineMembershipSwitchResponse { previous_conf: result.previous_conf, current_conf: result.current_conf, - last_log_term: self.state().acceptor_state.term, - flush_lsn: self.flush_lsn(), + last_log_term, + flush_lsn, }) } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index a60ebb85c6..fab1342d5d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -24,12 +24,12 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; +use safekeeper_api::Term; use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; use safekeeper_api::models::{ PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse, }; -use safekeeper_api::{INITIAL_TERM, Term}; use safekeeper_client::mgmt_api; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -1298,13 +1298,7 @@ impl Service { ) .await?; - let mut sync_position = (INITIAL_TERM, Lsn::INVALID); - for res in results.into_iter().flatten() { - let sk_position = (res.last_log_term, res.flush_lsn); - if sync_position < sk_position { - sync_position = sk_position; - } - } + let sync_position = Self::get_sync_position(&results)?; tracing::info!( %generation, @@ -1598,4 +1592,36 @@ impl Service { Ok(()) } + + /// Get membership switch responses from all safekeepers and return the sync position. + /// + /// Sync position is a position equal or greater than the commit position. + /// It is guaranteed that all WAL entries with (last_log_term, flush_lsn) + /// greater than the sync position are not committed (= not on a quorum). + /// + /// Returns error if there is no quorum of successful responses. 
+ fn get_sync_position( + responses: &[mgmt_api::Result], + ) -> Result<(Term, Lsn), ApiError> { + let quorum_size = responses.len() / 2 + 1; + + let mut wal_positions = responses + .iter() + .flatten() + .map(|res| (res.last_log_term, res.flush_lsn)) + .collect::>(); + + // Should be already checked if the responses are from tenant_timeline_set_membership_quorum. + if wal_positions.len() < quorum_size { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful responses to get sync position: {}/{}", + wal_positions.len(), + quorum_size, + ))); + } + + wal_positions.sort(); + + Ok(wal_positions[quorum_size - 1]) + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 11d54f7831..c3dfc78218 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2313,6 +2313,7 @@ class NeonStorageController(MetricsGetter, LogUtils): timeline_id: TimelineId, new_sk_set: list[int], ): + log.info(f"migrate_safekeepers({tenant_id}, {timeline_id}, {new_sk_set})") response = self.request( "POST", f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate", diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index 2ceeea37a5..97a6ece446 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -286,3 +286,177 @@ def test_sk_generation_aware_tombstones(neon_env_builder: NeonEnvBuilder): assert re.match(r".*Timeline .* deleted.*", exc.value.response.text) # The timeline should remain deleted. expect_deleted(second_sk) + + +def test_safekeeper_migration_stale_timeline(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper migration handles stale timeline correctly by migrating to + a safekeeper with a stale timeline. + 1. Check that we are waiting for the stale timeline to catch up with the commit lsn. + The migration might fail if there is no compute to advance the WAL. + 2. Check that we rely on last_log_term (and not the current term) when waiting for the + sync_position on step 7. + 3. Check that migration succeeds if the compute is running. + """ + neon_env_builder.num_safekeepers = 2 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + env.storage_controller.allowed_errors.append(".*not enough successful .* to reach quorum.*") + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + active_sk = env.get_safekeeper(mconf["sk_set"][0]) + other_sk = [sk for sk in env.safekeepers if sk.id != active_sk.id][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=[active_sk.id]) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Pull the timeline to other_sk, so other_sk now has a "stale" timeline on it. + other_sk.pull_timeline([active_sk], env.initial_tenant, env.initial_timeline) + + # Advance the WAL on active_sk. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # The test is more tricky if we have the same last_log_term but different term/flush_lsn. + # Stop the active_sk during the endpoint shutdown because otherwise compute_ctl runs + # sync_safekeepers and advances last_log_term on active_sk. 
+ active_sk.stop() + ep.stop(mode="immediate") + active_sk.start() + + active_sk_status = active_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + other_sk_status = other_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + + # other_sk should have the same last_log_term, but a stale flush_lsn. + assert active_sk_status.last_log_term == other_sk_status.last_log_term + assert active_sk_status.flush_lsn > other_sk_status.flush_lsn + + commit_lsn = active_sk_status.flush_lsn + + # Bump the term on other_sk to make it higher than active_sk. + # This is to make sure we don't use current term instead of last_log_term in the algorithm. + other_sk.http_client().term_bump( + env.initial_tenant, env.initial_timeline, active_sk_status.term + 100 + ) + + # TODO(diko): now it fails because the timeline on other_sk is stale and there is no compute + # to catch up it with active_sk. It might be fixed in https://databricks.atlassian.net/browse/LKB-946 + # if we delete stale timelines before starting the migration. + # But the rest of the test is still valid: we should not lose committed WAL after the migration. + with pytest.raises( + StorageControllerApiException, match="not enough successful .* to reach quorum" + ): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["new_sk_set"] == [other_sk.id] + assert mconf["sk_set"] == [active_sk.id] + assert mconf["generation"] == 2 + + # Start the endpoint, so it advances the WAL on other_sk. + ep.start(safekeeper_generation=2, safekeepers=[active_sk.id, other_sk.id]) + # Now the migration should succeed. + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + # Check that we didn't lose committed WAL. + assert ( + other_sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline).flush_lsn + >= commit_lsn + ) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] + + +def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder): + """ + Test that we pull the timeline from the most advanced safekeeper during the + migration and do not lose committed WAL. + """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 3, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + sk_set = mconf["sk_set"] + assert len(sk_set) == 3 + + other_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=sk_set) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Stop one sk, so we have a lagging WAL on it. + env.get_safekeeper(sk_set[0]).stop() + # Advance the WAL on the other sks. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # Stop other sks to make sure compute_ctl doesn't advance the last_log_term on them during shutdown. + for sk_id in sk_set[1:]: + env.get_safekeeper(sk_id).stop() + ep.stop(mode="immediate") + for sk_id in sk_set: + env.get_safekeeper(sk_id).start() + + # Bump the term on the lagging sk to make sure we don't use it to choose the most advanced sk. 
+ env.get_safekeeper(sk_set[0]).http_client().term_bump( + env.initial_tenant, env.initial_timeline, 100 + ) + + def get_commit_lsn(sk_set: list[int]): + flush_lsns = [] + last_log_terms = [] + for sk_id in sk_set: + sk = env.get_safekeeper(sk_id) + status = sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline) + flush_lsns.append(status.flush_lsn) + last_log_terms.append(status.last_log_term) + + # In this test we assume that all sks have the same last_log_term. + assert len(set(last_log_terms)) == 1 + + flush_lsns.sort(reverse=True) + commit_lsn = flush_lsns[len(sk_set) // 2] + + log.info(f"sk_set: {sk_set}, flush_lsns: {flush_lsns}, commit_lsn: {commit_lsn}") + return commit_lsn + + commit_lsn_before_migration = get_commit_lsn(sk_set) + + # Make two migrations, so the lagging sk stays in the sk_set, but other sks are replaced. + new_sk_set1 = [sk_set[0], sk_set[1], other_sk] # remove sk_set[2], add other_sk + new_sk_set2 = [sk_set[0], other_sk, sk_set[2]] # remove sk_set[1], add sk_set[2] back + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set1 + ) + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set2 + ) + + commit_lsn_after_migration = get_commit_lsn(new_sk_set2) + + # We should not lose committed WAL. + # If we have choosen the lagging sk to pull the timeline from, this might fail. + assert commit_lsn_before_migration <= commit_lsn_after_migration + + ep.start(safekeeper_generation=5, safekeepers=new_sk_set2) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] From f3ee6e818de91cde533199d56426dda4b1d16da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?= Date: Thu, 31 Jul 2025 11:53:48 +0200 Subject: [PATCH 09/15] [proxy] Correctly classify ConnectErrors (#12793) As is, e.g. quota errors on wake compute are logged as "compute" errors. --- proxy/src/serverless/backend.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 511bdc4e42..5b356c8460 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -458,7 +458,7 @@ pub(crate) enum LocalProxyConnError { impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { - HttpConnError::ConnectError(_) => ErrorKind::Compute, + HttpConnError::ConnectError(e) => e.get_error_kind(), HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => match p.as_db_error() { // user provided a wrong database name From 8fe75961206877811c337f448ee9f0076c68598d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 31 Jul 2025 12:11:30 +0200 Subject: [PATCH 10/15] chore(compute_tools): Delete unused anon_ext_fn_reassign.sql (#12787) It's an anon v1 failed launch artifact, I suppose. 
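The proxy fix in PATCH 09 above works by delegating error classification to the wrapped error instead of hard-coding `ErrorKind::Compute` at the wrapper level. A minimal sketch of that pattern, with simplified hypothetical types rather than the actual proxy definitions:

```rust
/// Coarse error categories used for logging/metrics (illustrative only).
#[derive(Debug, Clone, Copy, PartialEq)]
enum ErrorKind {
    Compute,
    Quota,
}

trait ReportableError {
    fn get_error_kind(&self) -> ErrorKind;
}

/// Inner error that already knows its own classification,
/// e.g. a quota failure surfaced while waking a compute.
enum ConnectError {
    QuotaExceeded,
    ComputeUnreachable,
}

impl ReportableError for ConnectError {
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            ConnectError::QuotaExceeded => ErrorKind::Quota,
            ConnectError::ComputeUnreachable => ErrorKind::Compute,
        }
    }
}

/// Outer error wrapper. The point of the fix: delegate to the inner error's
/// classification rather than mapping every ConnectError to ErrorKind::Compute.
enum HttpConnError {
    ConnectError(ConnectError),
    ConnectionClosedAbruptly,
}

impl ReportableError for HttpConnError {
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            HttpConnError::ConnectError(e) => e.get_error_kind(), // delegate
            HttpConnError::ConnectionClosedAbruptly => ErrorKind::Compute,
        }
    }
}

fn main() {
    let err = HttpConnError::ConnectError(ConnectError::QuotaExceeded);
    // With delegation in place, a quota failure is no longer misreported as a compute error.
    assert_eq!(err.get_error_kind(), ErrorKind::Quota);
}
```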
--- compute_tools/src/sql/anon_ext_fn_reassign.sql | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 compute_tools/src/sql/anon_ext_fn_reassign.sql diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql deleted file mode 100644 index 714a6db1e9..0000000000 --- a/compute_tools/src/sql/anon_ext_fn_reassign.sql +++ /dev/null @@ -1,13 +0,0 @@ -DO $$ -DECLARE - query varchar; -BEGIN - FOR query IN - SELECT pg_catalog.format('ALTER FUNCTION %I(%s) OWNER TO {db_owner};', p.oid::regproc, pg_catalog.pg_get_function_identity_arguments(p.oid)) - FROM pg_catalog.pg_proc p - WHERE p.pronamespace OPERATOR(pg_catalog.=) 'anon'::regnamespace::oid - LOOP - EXECUTE query; - END LOOP; -END -$$; From f8fc0bf3c0552d7af5e257be11c38cd207e52929 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 31 Jul 2025 12:25:33 +0200 Subject: [PATCH 11/15] neon_local: use doc comments for help texts (#12270) Clap automatically uses doc comments as help/about texts. Doc comments are strictly better, since they're also used e.g. for IDE documentation, and are better formatted. This patch updates all `neon_local` commands to use doc comments (courtesy of GPT-o3). --- control_plane/src/bin/neon_local.rs | 520 ++++++++++++---------------- 1 file changed, 220 insertions(+), 300 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 95500b0b18..2b81c3957c 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -71,8 +71,9 @@ const DEFAULT_PG_VERSION_NUM: &str = "17"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; +/// Neon CLI. #[derive(clap::Parser)] -#[command(version = GIT_VERSION, about, name = "Neon CLI")] +#[command(version = GIT_VERSION, name = "Neon CLI")] struct Cli { #[command(subcommand)] command: NeonLocalCmd, @@ -107,30 +108,31 @@ enum NeonLocalCmd { Stop(StopCmdArgs), } +/// Initialize a new Neon repository, preparing configs for services to start with. #[derive(clap::Args)] -#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")] struct InitCmdArgs { - #[clap(long, help("How many pageservers to create (default 1)"))] + /// How many pageservers to create (default 1). + #[clap(long)] num_pageservers: Option, #[clap(long)] config: Option, - #[clap(long, help("Force initialization even if the repository is not empty"))] + /// Force initialization even if the repository is not empty. + #[clap(long, default_value = "must-not-exist")] #[arg(value_parser)] - #[clap(default_value = "must-not-exist")] force: InitForceMode, } +/// Start pageserver and safekeepers. #[derive(clap::Args)] -#[clap(about = "Start pageserver and safekeepers")] struct StartCmdArgs { #[clap(long = "start-timeout", default_value = "10s")] timeout: humantime::Duration, } +/// Stop pageserver and safekeepers. #[derive(clap::Args)] -#[clap(about = "Stop pageserver and safekeepers")] struct StopCmdArgs { #[arg(value_enum)] #[clap(long, default_value_t = StopMode::Fast)] @@ -143,8 +145,8 @@ enum StopMode { Immediate, } +/// Manage tenants. #[derive(clap::Subcommand)] -#[clap(about = "Manage tenants")] enum TenantCmd { List, Create(TenantCreateCmdArgs), @@ -155,38 +157,36 @@ enum TenantCmd { #[derive(clap::Args)] struct TenantCreateCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. 
+ #[clap(long = "tenant-id")] tenant_id: Option, - #[clap( - long, - help = "Use a specific timeline id when creating a tenant and its initial timeline" - )] + /// Use a specific timeline id when creating a tenant and its initial timeline. + #[clap(long)] timeline_id: Option, #[clap(short = 'c')] config: Vec, + /// Postgres version to use for the initial timeline. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version to use for the initial timeline")] + #[clap(long)] pg_version: PgMajorVersion, - #[clap( - long, - help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified" - )] + /// Use this tenant in future CLI commands where tenant_id is needed, but not specified. + #[clap(long)] set_default: bool, - #[clap(long, help = "Number of shards in the new tenant")] + /// Number of shards in the new tenant. + #[clap(long)] #[arg(default_value_t = 0)] shard_count: u8, - #[clap(long, help = "Sharding stripe size in pages")] + /// Sharding stripe size in pages. + #[clap(long)] shard_stripe_size: Option, - #[clap(long, help = "Placement policy shards in this tenant")] + /// Placement policy shards in this tenant. + #[clap(long)] #[arg(value_parser = parse_placement_policy)] placement_policy: Option, } @@ -195,44 +195,35 @@ fn parse_placement_policy(s: &str) -> anyhow::Result { Ok(serde_json::from_str::(s)?) } +/// Set a particular tenant as default in future CLI commands where tenant_id is needed, but not +/// specified. #[derive(clap::Args)] -#[clap( - about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified" -)] struct TenantSetDefaultCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: TenantId, } #[derive(clap::Args)] struct TenantConfigCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, #[clap(short = 'c')] config: Vec, } +/// Import a tenant that is present in remote storage, and create branches for its timelines. #[derive(clap::Args)] -#[clap( - about = "Import a tenant that is present in remote storage, and create branches for its timelines" -)] struct TenantImportCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: TenantId, } +/// Manage timelines. #[derive(clap::Subcommand)] -#[clap(about = "Manage timelines")] enum TimelineCmd { List(TimelineListCmdArgs), Branch(TimelineBranchCmdArgs), @@ -240,98 +231,87 @@ enum TimelineCmd { Import(TimelineImportCmdArgs), } +/// List all timelines available to this pageserver. #[derive(clap::Args)] -#[clap(about = "List all timelines available to this pageserver")] struct TimelineListCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_shard_id: Option, } +/// Create a new timeline, branching off from another timeline. 
#[derive(clap::Args)] -#[clap(about = "Create a new timeline, branching off from another timeline")] struct TimelineBranchCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(long, help = "New timeline's ID")] + /// New timeline's ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: Option, - - #[clap(long, help = "Human-readable alias for the new timeline")] + /// Human-readable alias for the new timeline. + #[clap(long)] branch_name: String, - - #[clap( - long, - help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name." - )] + /// Use last Lsn of another timeline (and its data) as base when creating the new timeline. The + /// timeline gets resolved by its branch name. + #[clap(long)] ancestor_branch_name: Option, - - #[clap( - long, - help = "When using another timeline as base, use a specific Lsn in it instead of the latest one" - )] + /// When using another timeline as base, use a specific Lsn in it instead of the latest one. + #[clap(long)] ancestor_start_lsn: Option, } +/// Create a new blank timeline. #[derive(clap::Args)] -#[clap(about = "Create a new blank timeline")] struct TimelineCreateCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(long, help = "New timeline's ID")] + /// New timeline's ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: Option, - - #[clap(long, help = "Human-readable alias for the new timeline")] + /// Human-readable alias for the new timeline. + #[clap(long)] branch_name: String, + /// Postgres version. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version")] + #[clap(long)] pg_version: PgMajorVersion, } +/// Import a timeline from a basebackup directory. #[derive(clap::Args)] -#[clap(about = "Import timeline from a basebackup directory")] struct TimelineImportCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(long, help = "New timeline's ID")] + /// New timeline's ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: TimelineId, - - #[clap(long, help = "Human-readable alias for the new timeline")] + /// Human-readable alias for the new timeline. + #[clap(long)] branch_name: String, - - #[clap(long, help = "Basebackup tarfile to import")] + /// Basebackup tarfile to import. + #[clap(long)] base_tarfile: PathBuf, - - #[clap(long, help = "Lsn the basebackup starts at")] + /// LSN the basebackup starts at. + #[clap(long)] base_lsn: Lsn, - - #[clap(long, help = "Wal to add after base")] + /// WAL to add after base. + #[clap(long)] wal_tarfile: Option, - - #[clap(long, help = "Lsn the basebackup ends at")] + /// LSN the basebackup ends at. + #[clap(long)] end_lsn: Option, + /// Postgres version of the basebackup being imported. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version of the backup being imported")] + #[clap(long)] pg_version: PgMajorVersion, } +/// Manage pageservers. 
#[derive(clap::Subcommand)] -#[clap(about = "Manage pageservers")] enum PageserverCmd { Status(PageserverStatusCmdArgs), Start(PageserverStartCmdArgs), @@ -339,223 +319,202 @@ enum PageserverCmd { Restart(PageserverRestartCmdArgs), } +/// Show status of a local pageserver. #[derive(clap::Args)] -#[clap(about = "Show status of a local pageserver")] struct PageserverStatusCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, } +/// Start local pageserver. #[derive(clap::Args)] -#[clap(about = "Start local pageserver")] struct PageserverStartCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, - - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop local pageserver. #[derive(clap::Args)] -#[clap(about = "Stop local pageserver")] struct PageserverStopCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, - - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + /// If 'immediate', don't flush repository data at shutdown + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] stop_mode: StopMode, } +/// Restart local pageserver. #[derive(clap::Args)] -#[clap(about = "Restart local pageserver")] struct PageserverRestartCmdArgs { - #[clap(long = "id", help = "pageserver id")] + /// Pageserver ID. + #[clap(long = "id")] pageserver_id: Option, - - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Manage storage controller. #[derive(clap::Subcommand)] -#[clap(about = "Manage storage controller")] enum StorageControllerCmd { Start(StorageControllerStartCmdArgs), Stop(StorageControllerStopCmdArgs), } +/// Start storage controller. #[derive(clap::Args)] -#[clap(about = "Start storage controller")] struct StorageControllerStartCmdArgs { - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, - - #[clap( - long, - help = "Identifier used to distinguish storage controller instances" - )] + /// Identifier used to distinguish storage controller instances. + #[clap(long)] #[arg(default_value_t = 1)] instance_id: u8, - - #[clap( - long, - help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)" - )] + /// Base port for the storage controller instance identified by instance-id (defaults to + /// pageserver cplane api). + #[clap(long)] base_port: Option, - #[clap( - long, - help = "Whether the storage controller should handle pageserver-reported local disk loss events." - )] + /// Whether the storage controller should handle pageserver-reported local disk loss events. + #[clap(long)] handle_ps_local_disk_loss: Option, } +/// Stop storage controller. 
#[derive(clap::Args)] -#[clap(about = "Stop storage controller")] struct StorageControllerStopCmdArgs { - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + /// If 'immediate', don't flush repository data at shutdown + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] stop_mode: StopMode, - - #[clap( - long, - help = "Identifier used to distinguish storage controller instances" - )] + /// Identifier used to distinguish storage controller instances. + #[clap(long)] #[arg(default_value_t = 1)] instance_id: u8, } +/// Manage storage broker. #[derive(clap::Subcommand)] -#[clap(about = "Manage storage broker")] enum StorageBrokerCmd { Start(StorageBrokerStartCmdArgs), Stop(StorageBrokerStopCmdArgs), } +/// Start broker. #[derive(clap::Args)] -#[clap(about = "Start broker")] struct StorageBrokerStartCmdArgs { - #[clap(short = 't', long, help = "timeout until we fail the command")] - #[arg(default_value = "10s")] + /// Timeout until we fail the command. + #[clap(short = 't', long, default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop broker. #[derive(clap::Args)] -#[clap(about = "stop broker")] struct StorageBrokerStopCmdArgs { - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + /// If 'immediate', don't flush repository data on shutdown. + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] stop_mode: StopMode, } +/// Manage safekeepers. #[derive(clap::Subcommand)] -#[clap(about = "Manage safekeepers")] enum SafekeeperCmd { Start(SafekeeperStartCmdArgs), Stop(SafekeeperStopCmdArgs), Restart(SafekeeperRestartCmdArgs), } +/// Manage object storage. #[derive(clap::Subcommand)] -#[clap(about = "Manage object storage")] enum EndpointStorageCmd { Start(EndpointStorageStartCmd), Stop(EndpointStorageStopCmd), } +/// Start object storage. #[derive(clap::Args)] -#[clap(about = "Start object storage")] struct EndpointStorageStartCmd { - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop object storage. #[derive(clap::Args)] -#[clap(about = "Stop object storage")] struct EndpointStorageStopCmd { + /// If 'immediate', don't flush repository data on shutdown. + #[clap(short = 'm')] #[arg(value_enum, default_value = "fast")] - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] stop_mode: StopMode, } +/// Start local safekeeper. #[derive(clap::Args)] -#[clap(about = "Start local safekeeper")] struct SafekeeperStartCmdArgs { - #[clap(help = "safekeeper id")] + /// Safekeeper ID. #[arg(default_value_t = NodeId(1))] id: NodeId, - #[clap( - short = 'e', - long = "safekeeper-extra-opt", - help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" - )] + /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo. + #[clap(short = 'e', long = "safekeeper-extra-opt")] extra_opt: Vec, - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Stop local safekeeper. #[derive(clap::Args)] -#[clap(about = "Stop local safekeeper")] struct SafekeeperStopCmdArgs { - #[clap(help = "safekeeper id")] + /// Safekeeper ID. 
#[arg(default_value_t = NodeId(1))] id: NodeId, + /// If 'immediate', don't flush repository data on shutdown. #[arg(value_enum, default_value = "fast")] - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + #[clap(short = 'm')] stop_mode: StopMode, } +/// Restart local safekeeper. #[derive(clap::Args)] -#[clap(about = "Restart local safekeeper")] struct SafekeeperRestartCmdArgs { - #[clap(help = "safekeeper id")] + /// Safekeeper ID. #[arg(default_value_t = NodeId(1))] id: NodeId, + /// If 'immediate', don't flush repository data on shutdown. #[arg(value_enum, default_value = "fast")] - #[clap( - short = 'm', - help = "If 'immediate', don't flush repository data at shutdown" - )] + #[clap(short = 'm')] stop_mode: StopMode, - #[clap( - short = 'e', - long = "safekeeper-extra-opt", - help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" - )] + /// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo. + #[clap(short = 'e', long = "safekeeper-extra-opt")] extra_opt: Vec, - #[clap(short = 't', long, help = "timeout until we fail the command")] + /// Timeout until we fail the command. + #[clap(short = 't', long)] #[arg(default_value = "10s")] start_timeout: humantime::Duration, } +/// Manage Postgres instances. #[derive(clap::Subcommand)] -#[clap(about = "Manage Postgres instances")] enum EndpointCmd { List(EndpointListCmdArgs), Create(EndpointCreateCmdArgs), @@ -567,33 +526,27 @@ enum EndpointCmd { GenerateJwt(EndpointGenerateJwtCmdArgs), } +/// List endpoints. #[derive(clap::Args)] -#[clap(about = "List endpoints")] struct EndpointListCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_shard_id: Option, } +/// Create a compute endpoint. #[derive(clap::Args)] -#[clap(about = "Create a compute endpoint")] struct EndpointCreateCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: Option, - #[clap(long, help = "Name of the branch the endpoint will run on")] + /// Name of the branch the endpoint will run on. + #[clap(long)] branch_name: Option, - #[clap( - long, - help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used" - )] + /// Specify LSN on the timeline to start from. By default, end of the timeline would be used. + #[clap(long)] lsn: Option, #[clap(long)] pg_port: Option, @@ -604,16 +557,13 @@ struct EndpointCreateCmdArgs { #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap( - long, - help = "Don't do basebackup, create endpoint directory with only config files", - action = clap::ArgAction::Set, - default_value_t = false - )] + /// Don't do basebackup, create endpoint directory with only config files. + #[clap(long, action = clap::ArgAction::Set, default_value_t = false)] config_only: bool, + /// Postgres version. #[arg(default_value = DEFAULT_PG_VERSION_NUM)] - #[clap(long, help = "Postgres version")] + #[clap(long)] pg_version: PgMajorVersion, /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings. 
@@ -624,170 +574,140 @@ struct EndpointCreateCmdArgs { #[clap(long)] grpc: bool, - #[clap( - long, - help = "If set, the node will be a hot replica on the specified timeline", - action = clap::ArgAction::Set, - default_value_t = false - )] + /// If set, the node will be a hot replica on the specified timeline. + #[clap(long, action = clap::ArgAction::Set, default_value_t = false)] hot_standby: bool, - - #[clap(long, help = "If set, will set up the catalog for neon_superuser")] + /// If set, will set up the catalog for neon_superuser. + #[clap(long)] update_catalog: bool, - - #[clap( - long, - help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." - )] + /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but + /// useful for tests. + #[clap(long)] allow_multiple: bool, - /// Only allow changing it on creation - #[clap(long, help = "Name of the privileged role for the endpoint")] + /// Name of the privileged role for the endpoint. + // Only allow changing it on creation. + #[clap(long)] privileged_role_name: Option, } +/// Start Postgres. If the endpoint doesn't exist yet, it is created. #[derive(clap::Args)] -#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")] struct EndpointStartCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, + /// Pageserver ID. #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - - #[clap( - long, - help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations." - )] + /// Safekeepers membership generation to prefix neon.safekeepers with. + #[clap(long)] safekeepers_generation: Option, - #[clap( - long, - help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override." - )] + /// List of safekeepers endpoint will talk to. + #[clap(long)] safekeepers: Option, - - #[clap( - long, - help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", - alias = "remote-ext-config" - )] + /// Configure the remote extensions storage proxy gateway URL to request for extensions. + #[clap(long, alias = "remote-ext-config")] remote_ext_base_url: Option, - - #[clap( - long, - help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`" - )] + /// If set, will create test user `user` and `neondb` database. Requires `update-catalog = true` + #[clap(long)] create_test_user: bool, - - #[clap( - long, - help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." - )] + /// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but + /// useful for tests. + #[clap(long)] allow_multiple: bool, - - #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] + /// Timeout until we fail the command. 
+ #[clap(short = 't', long, value_parser= humantime::parse_duration)] #[arg(default_value = "90s")] start_timeout: Duration, - #[clap( - long, - help = "Download LFC cache from endpoint storage on endpoint startup", - default_value = "false" - )] + /// Download LFC cache from endpoint storage on endpoint startup + #[clap(long, default_value = "false")] autoprewarm: bool, - #[clap(long, help = "Upload LFC cache to endpoint storage periodically")] + /// Upload LFC cache to endpoint storage periodically + #[clap(long)] offload_lfc_interval_seconds: Option, - #[clap( - long, - help = "Run in development mode, skipping VM-specific operations like process termination", - action = clap::ArgAction::SetTrue - )] + /// Run in development mode, skipping VM-specific operations like process termination + #[clap(long, action = clap::ArgAction::SetTrue)] dev: bool, } +/// Reconfigure an endpoint. #[derive(clap::Args)] -#[clap(about = "Reconfigure an endpoint")] struct EndpointReconfigureCmdArgs { - #[clap( - long = "tenant-id", - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant id. Represented as a hexadecimal string 32 symbols length + #[clap(long = "tenant-id")] tenant_id: Option, - - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, + /// Pageserver ID. #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap(long)] safekeepers: Option, } +/// Refresh the endpoint's configuration by forcing it reload it's spec #[derive(clap::Args)] -#[clap(about = "Refresh the endpoint's configuration by forcing it reload it's spec")] struct EndpointRefreshConfigurationArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint id endpoint_id: String, } +/// Stop an endpoint. #[derive(clap::Args)] -#[clap(about = "Stop an endpoint")] struct EndpointStopCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, - - #[clap( - long, - help = "Also delete data directory (now optional, should be default in future)" - )] + /// Also delete data directory (now optional, should be default in future). + #[clap(long)] destroy: bool, - #[clap(long, help = "Postgres shutdown mode")] + /// Postgres shutdown mode, passed to `pg_ctl -m `. + #[clap(long)] #[clap(default_value = "fast")] mode: EndpointTerminateMode, } +/// Update the pageservers in the spec file of the compute endpoint #[derive(clap::Args)] -#[clap(about = "Update the pageservers in the spec file of the compute endpoint")] struct EndpointUpdatePageserversCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint id endpoint_id: String, - #[clap(short = 'p', long, help = "Specified pageserver id")] + /// Specified pageserver id + #[clap(short = 'p', long)] pageserver_id: Option, } +/// Generate a JWT for an endpoint. #[derive(clap::Args)] -#[clap(about = "Generate a JWT for an endpoint")] struct EndpointGenerateJwtCmdArgs { - #[clap(help = "Postgres endpoint id")] + /// Postgres endpoint ID. endpoint_id: String, - - #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)] + /// Scope to generate the JWT with. + #[clap(short = 's', long, value_parser = ComputeClaimsScope::from_str)] scope: Option, } +/// Manage neon_local branch name mappings. #[derive(clap::Subcommand)] -#[clap(about = "Manage neon_local branch name mappings")] enum MappingsCmd { Map(MappingsMapCmdArgs), } +/// Create new mapping which cannot exist already. 
#[derive(clap::Args)] -#[clap(about = "Create new mapping which cannot exist already")] struct MappingsMapCmdArgs { - #[clap( - long, - help = "Tenant id. Represented as a hexadecimal string 32 symbols length" - )] + /// Tenant ID, as a 32-byte hexadecimal string. + #[clap(long)] tenant_id: TenantId, - #[clap( - long, - help = "Timeline id. Represented as a hexadecimal string 32 symbols length" - )] + /// Timeline ID, as a 32-byte hexadecimal string. + #[clap(long)] timeline_id: TimelineId, - #[clap(long, help = "Branch name to give to the timeline")] + /// Branch name to give to the timeline. + #[clap(long)] branch_name: String, } From b4a63e0a34a1c3f0de95cfec8ac935047ce45317 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 31 Jul 2025 14:46:57 +0300 Subject: [PATCH 12/15] Fix how `neon.stripe_size` option is set in postgresql.conf file (#12776) Commit 1dce2a9e74 changed how the `neon.pageserver_connstring` setting is formed, but it messed up setting the `neon.stripe_size` setting so that it was set twice. That got mixed up during development of the patch, as commit 7fef4435c1 landed first and was merged incorrectly. --- compute_tools/src/config.rs | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e7dde5c5f5..56602ab620 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -65,14 +65,19 @@ pub fn write_postgres_conf( writeln!(file, "{conf}")?; } - // Stripe size GUC should be defined prior to connection string - if let Some(stripe_size) = spec.shard_stripe_size { - writeln!(file, "neon.stripe_size={stripe_size}")?; - } // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; writeln!(file)?; if let Some(conninfo) = &spec.pageserver_connection_info { + // Stripe size GUC should be defined prior to connection string + if let Some(stripe_size) = conninfo.stripe_size { + writeln!( + file, + "# from compute spec's pageserver_connection_info.stripe_size field" + )?; + writeln!(file, "neon.stripe_size={stripe_size}")?; + } + let mut libpq_urls: Option> = Some(Vec::new()); let num_shards = if conninfo.shard_count.0 == 0 { 1 // unsharded, treat it as a single shard @@ -110,7 +115,7 @@ pub fn write_postgres_conf( if let Some(libpq_urls) = libpq_urls { writeln!( file, - "# derived from compute spec's pageserver_conninfo field" + "# derived from compute spec's pageserver_connection_info field" )?; writeln!( file, @@ -120,24 +125,16 @@ pub fn write_postgres_conf( } else { writeln!(file, "# no neon.pageserver_connstring")?; } - - if let Some(stripe_size) = conninfo.stripe_size { - writeln!( - file, - "# from compute spec's pageserver_conninfo.stripe_size field" - )?; - writeln!(file, "neon.stripe_size={stripe_size}")?; - } } else { - if let Some(s) = &spec.pageserver_connstring { - writeln!(file, "# from compute spec's pageserver_connstring field")?; - writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; - } - + // Stripe size GUC should be defined prior to connection string if let Some(stripe_size) = spec.shard_stripe_size { writeln!(file, "# from compute spec's shard_stripe_size field")?; writeln!(file, "neon.stripe_size={stripe_size}")?; } + if let Some(s) = &spec.pageserver_connstring { + writeln!(file, "# from compute spec's pageserver_connstring field")?; + writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; + } } if !spec.safekeeper_connstrings.is_empty() { From 
df4e37b7cc1ba9fb9b21988a3ecd5b0590f33d53 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 31 Jul 2025 12:51:19 +0100 Subject: [PATCH 13/15] Report timespans for promotion and prewarm (#12730) - Return sub-actions time spans for prewarm, prewarm offload, and promotion in http handlers. - Set `synchronous_standby_names=walproposer` for promoted endpoints. Otherwise, walproposer on promoted standby ignores reply from safekeeper and is stuck on lsn COMMIT eternally. --- compute_tools/src/compute.rs | 2 +- compute_tools/src/compute_prewarm.rs | 107 ++++++------ compute_tools/src/compute_promote.rs | 161 ++++++++++--------- compute_tools/src/http/openapi_spec.yaml | 31 +++- compute_tools/src/http/routes/lfc.rs | 5 +- compute_tools/src/http/routes/promote.rs | 13 +- libs/compute_api/src/responses.rs | 38 +++-- test_runner/regress/test_replica_promotes.py | 1 + 8 files changed, 209 insertions(+), 149 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1df837e1e6..c0f0289e06 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2780,7 +2780,7 @@ LIMIT 100", // 4. We start again and try to prewarm with the state from 2. instead of the previous complete state if matches!( prewarm_state, - LfcPrewarmState::Completed + LfcPrewarmState::Completed { .. } | LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Skipped ) { diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 82cb28f1ac..c7051d35d0 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -7,19 +7,11 @@ use http::StatusCode; use reqwest::Client; use std::mem::replace; use std::sync::Arc; +use std::time::Instant; use tokio::{io::AsyncReadExt, select, spawn}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; -#[derive(serde::Serialize, Default)] -pub struct LfcPrewarmStateWithProgress { - #[serde(flatten)] - base: LfcPrewarmState, - total: i32, - prewarmed: i32, - skipped: i32, -} - /// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks struct EndpointStoragePair { url: String, @@ -28,7 +20,7 @@ struct EndpointStoragePair { const KEY: &str = "lfc_state"; impl EndpointStoragePair { - /// endpoint_id is set to None while prewarming from other endpoint, see replica promotion + /// endpoint_id is set to None while prewarming from other endpoint, see compute_promote.rs /// If not None, takes precedence over pspec.spec.endpoint_id fn from_spec_and_endpoint( pspec: &crate::compute::ParsedSpec, @@ -54,36 +46,8 @@ impl EndpointStoragePair { } impl ComputeNode { - // If prewarm failed, we want to get overall number of segments as well as done ones. - // However, this function should be reliable even if querying postgres failed. 
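The new `*_time_ms` fields reported by this patch all come from the same simple pattern: take an `Instant` before a phase, convert the elapsed time to whole milliseconds afterwards, and reset the clock for the next phase. A stripped-down sketch of that pattern (placeholder phases only, not the real prewarm steps):

```rust
use std::time::{Duration, Instant};

fn run_phases() -> (u32, u32) {
    let mut now = Instant::now();
    std::thread::sleep(Duration::from_millis(5)); // placeholder for "download LFC state"
    let download_time_ms = now.elapsed().as_millis() as u32;

    now = Instant::now();
    std::thread::sleep(Duration::from_millis(5)); // placeholder for "decompress and load"
    let load_time_ms = now.elapsed().as_millis() as u32;

    (download_time_ms, load_time_ms)
}

fn main() {
    let (d, l) = run_phases();
    println!("download={d}ms load={l}ms");
}
```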
- pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress { - info!("requesting LFC prewarm state from postgres"); - let mut state = LfcPrewarmStateWithProgress::default(); - { - state.base = self.state.lock().unwrap().lfc_prewarm_state.clone(); - } - - let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await { - Ok(client) => client, - Err(err) => { - error!(%err, "connecting to postgres"); - return state; - } - }; - let row = match client - .query_one("select * from neon.get_prewarm_info()", &[]) - .await - { - Ok(row) => row, - Err(err) => { - error!(%err, "querying LFC prewarm status"); - return state; - } - }; - state.total = row.try_get(0).unwrap_or_default(); - state.prewarmed = row.try_get(1).unwrap_or_default(); - state.skipped = row.try_get(2).unwrap_or_default(); - state + pub async fn lfc_prewarm_state(&self) -> LfcPrewarmState { + self.state.lock().unwrap().lfc_prewarm_state.clone() } pub fn lfc_offload_state(&self) -> LfcOffloadState { @@ -133,7 +97,6 @@ impl ComputeNode { } /// Request LFC state from endpoint storage and load corresponding pages into Postgres. - /// Returns a result with `false` if the LFC state is not found in endpoint storage. async fn prewarm_impl( &self, from_endpoint: Option, @@ -148,6 +111,7 @@ impl ComputeNode { fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint")); info!(%url, "requesting LFC state from endpoint storage"); + let mut now = Instant::now(); let request = Client::new().get(&url).bearer_auth(storage_token); let response = select! { _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), @@ -160,6 +124,8 @@ impl ComputeNode { StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped), status => bail!("{status} querying endpoint storage"), } + let state_download_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); let mut uncompressed = Vec::new(); let lfc_state = select! 
{ @@ -174,6 +140,8 @@ impl ComputeNode { read = decoder.read_to_end(&mut uncompressed) => read } .context("decoding LFC state")?; + let uncompress_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); let uncompressed_len = uncompressed.len(); info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}"); @@ -196,15 +164,34 @@ impl ComputeNode { } .context("loading LFC state into postgres") .map(|_| ())?; + let prewarm_time_ms = now.elapsed().as_millis() as u32; - Ok(LfcPrewarmState::Completed) + let row = client + .query_one("select * from neon.get_prewarm_info()", &[]) + .await + .context("querying prewarm info")?; + let total = row.try_get(0).unwrap_or_default(); + let prewarmed = row.try_get(1).unwrap_or_default(); + let skipped = row.try_get(2).unwrap_or_default(); + + Ok(LfcPrewarmState::Completed { + total, + prewarmed, + skipped, + state_download_time_ms, + uncompress_time_ms, + prewarm_time_ms, + }) } /// If offload request is ongoing, return false, true otherwise pub fn offload_lfc(self: &Arc) -> bool { { let state = &mut self.state.lock().unwrap().lfc_offload_state; - if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading { + if matches!( + replace(state, LfcOffloadState::Offloading), + LfcOffloadState::Offloading + ) { return false; } } @@ -216,7 +203,10 @@ impl ComputeNode { pub async fn offload_lfc_async(self: &Arc) { { let state = &mut self.state.lock().unwrap().lfc_offload_state; - if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading { + if matches!( + replace(state, LfcOffloadState::Offloading), + LfcOffloadState::Offloading + ) { return; } } @@ -234,7 +224,6 @@ impl ComputeNode { LfcOffloadState::Failed { error } } }; - self.state.lock().unwrap().lfc_offload_state = state; } @@ -242,6 +231,7 @@ impl ComputeNode { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from Postgres"); + let mut now = Instant::now(); let row = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")? 
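The guard above switches from `==` to `matches!` because the offload and prewarm state variants now carry timing fields and the enum no longer derives `PartialEq`. A self-contained sketch of the swap-and-check idiom (hypothetical `State` type, not the real `LfcOffloadState`):

```rust
use std::mem::replace;
use std::sync::Mutex;

#[derive(Default)]
#[allow(dead_code)] // `Done` is only here to show a data-carrying variant
enum State {
    #[default]
    Idle,
    Running,
    Done { elapsed_ms: u32 }, // data-carrying variant: deriving PartialEq is no longer free
}

/// Returns true if the caller won the race and should start the work.
fn try_start(slot: &Mutex<State>) -> bool {
    let mut state = slot.lock().unwrap();
    // Swap `Running` in; if the old value was already `Running`, another task owns the job.
    !matches!(replace(&mut *state, State::Running), State::Running)
}

fn main() {
    let slot = Mutex::new(State::Idle);
    assert!(try_start(&slot)); // first caller starts the job
    assert!(!try_start(&slot)); // a concurrent caller backs off
}
```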
@@ -255,25 +245,36 @@ impl ComputeNode { info!(%url, "empty LFC state, not exporting"); return Ok(LfcOffloadState::Skipped); }; + let state_query_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); let mut compressed = Vec::new(); ZstdEncoder::new(state) .read_to_end(&mut compressed) .await .context("compressing LFC state")?; + let compress_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); let compressed_len = compressed.len(); - info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); + info!(%url, "downloaded LFC state, compressed size {compressed_len}"); let request = Client::new().put(url).bearer_auth(token).body(compressed); - match request.send().await { - Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed), - Ok(res) => bail!( - "Request to endpoint storage failed with status: {}", - res.status() - ), - Err(err) => Err(err).context("writing to endpoint storage"), + let response = request + .send() + .await + .context("writing to endpoint storage")?; + let state_upload_time_ms = now.elapsed().as_millis() as u32; + let status = response.status(); + if status != StatusCode::OK { + bail!("request to endpoint storage failed: {status}"); } + + Ok(LfcOffloadState::Completed { + compress_time_ms, + state_query_time_ms, + state_upload_time_ms, + }) } pub fn cancel_prewarm(self: &Arc) { diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs index 29195b60e9..15b5fcfb46 100644 --- a/compute_tools/src/compute_promote.rs +++ b/compute_tools/src/compute_promote.rs @@ -1,32 +1,24 @@ use crate::compute::ComputeNode; -use anyhow::{Context, Result, bail}; +use anyhow::{Context, bail}; use compute_api::responses::{LfcPrewarmState, PromoteConfig, PromoteState}; -use compute_api::spec::ComputeMode; -use itertools::Itertools; -use std::collections::HashMap; -use std::{sync::Arc, time::Duration}; -use tokio::time::sleep; +use std::time::Instant; use tracing::info; -use utils::lsn::Lsn; impl ComputeNode { - /// Returns only when promote fails or succeeds. If a network error occurs - /// and http client disconnects, this does not stop promotion, and subsequent - /// calls block until promote finishes. + /// Returns only when promote fails or succeeds. If http client calling this function + /// disconnects, this does not stop promotion, and subsequent calls block until promote finishes. 
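The comment above describes single-flight behaviour: the first `/promote` call spawns the actual work, and later or reconnecting callers await the same shared outcome. A simplified sketch of that idea (illustrative types only; it uses `watch::Receiver::wait_for` rather than the exact `changed()`-based code in the patch):

```rust
use std::sync::{Arc, Mutex};
use tokio::sync::watch;

#[derive(Clone, Debug)]
enum Outcome {
    Completed,
    Failed(String),
}

#[derive(Default)]
struct SingleFlight {
    slot: Mutex<Option<watch::Receiver<Option<Outcome>>>>,
}

async fn promote_once(sf: Arc<SingleFlight>) -> Outcome {
    let mut rx = {
        let mut slot = sf.slot.lock().unwrap();
        // Only the first caller installs a receiver and spawns the work.
        slot.get_or_insert_with(|| {
            let (tx, rx) = watch::channel(None);
            tokio::spawn(async move {
                // ... the real promotion work would run here ...
                let _ = tx.send(Some(Outcome::Completed));
            });
            rx
        })
        .clone()
    };
    match rx.wait_for(|v| v.is_some()).await {
        Ok(v) => (*v).clone().expect("worker published a result"),
        Err(_) => Outcome::Failed("promotion task dropped its sender".into()),
    }
}

#[tokio::main]
async fn main() {
    let sf = Arc::new(SingleFlight::default());
    // Both calls resolve to the same single promotion attempt.
    let (a, b) = tokio::join!(promote_once(sf.clone()), promote_once(sf));
    println!("{a:?} {b:?}");
}
```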
/// Called by control plane on secondary after primary endpoint is terminated /// Has a failpoint "compute-promotion" - pub async fn promote(self: &Arc, cfg: PromoteConfig) -> PromoteState { - let cloned = self.clone(); - let promote_fn = async move || { - let Err(err) = cloned.promote_impl(cfg).await else { - return PromoteState::Completed; - }; - tracing::error!(%err, "promoting"); - PromoteState::Failed { - error: format!("{err:#}"), + pub async fn promote(self: &std::sync::Arc, cfg: PromoteConfig) -> PromoteState { + let this = self.clone(); + let promote_fn = async move || match this.promote_impl(cfg).await { + Ok(state) => state, + Err(err) => { + tracing::error!(%err, "promoting replica"); + let error = format!("{err:#}"); + PromoteState::Failed { error } } }; - let start_promotion = || { let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted); tokio::spawn(async move { tx.send(promote_fn().await) }); @@ -34,36 +26,31 @@ impl ComputeNode { }; let mut task; - // self.state is unlocked after block ends so we lock it in promote_impl - // and task.changed() is reached + // promote_impl locks self.state so we need to unlock it before calling task.changed() { - task = self - .state - .lock() - .unwrap() - .promote_state - .get_or_insert_with(start_promotion) - .clone() + let promote_state = &mut self.state.lock().unwrap().promote_state; + task = promote_state.get_or_insert_with(start_promotion).clone() + } + if task.changed().await.is_err() { + let error = "promote sender dropped".to_string(); + return PromoteState::Failed { error }; } - task.changed().await.expect("promote sender dropped"); task.borrow().clone() } - async fn promote_impl(&self, mut cfg: PromoteConfig) -> Result<()> { + async fn promote_impl(&self, cfg: PromoteConfig) -> anyhow::Result { { let state = self.state.lock().unwrap(); let mode = &state.pspec.as_ref().unwrap().spec.mode; - if *mode != ComputeMode::Replica { - bail!("{} is not replica", mode.to_type_str()); + if *mode != compute_api::spec::ComputeMode::Replica { + bail!("compute mode \"{}\" is not replica", mode.to_type_str()); } - - // we don't need to query Postgres so not self.lfc_prewarm_state() match &state.lfc_prewarm_state { - LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => { - bail!("prewarm not requested or pending") + status @ (LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming) => { + bail!("compute {status}") } LfcPrewarmState::Failed { error } => { - tracing::warn!(%error, "replica prewarm failed") + tracing::warn!(%error, "compute prewarm failed") } _ => {} } @@ -72,9 +59,10 @@ impl ComputeNode { let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")?; + let mut now = Instant::now(); let primary_lsn = cfg.wal_flush_lsn; - let mut last_wal_replay_lsn: Lsn = Lsn::INVALID; + let mut standby_lsn = utils::lsn::Lsn::INVALID; const RETRIES: i32 = 20; for i in 0..=RETRIES { let row = client @@ -82,16 +70,18 @@ impl ComputeNode { .await .context("getting last replay lsn")?; let lsn: u64 = row.get::(0).into(); - last_wal_replay_lsn = lsn.into(); - if last_wal_replay_lsn >= primary_lsn { + standby_lsn = lsn.into(); + if standby_lsn >= primary_lsn { break; } - info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}"); - sleep(Duration::from_secs(1)).await; + info!(%standby_lsn, %primary_lsn, "catching up, try {i}"); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; } - if last_wal_replay_lsn < primary_lsn { + if standby_lsn < 
primary_lsn { bail!("didn't catch up with primary in {RETRIES} retries"); } + let lsn_wait_time_ms = now.elapsed().as_millis() as u32; + now = Instant::now(); // using $1 doesn't work with ALTER SYSTEM SET let safekeepers_sql = format!( @@ -102,27 +92,33 @@ impl ComputeNode { .query(&safekeepers_sql, &[]) .await .context("setting safekeepers")?; + client + .query( + "ALTER SYSTEM SET synchronous_standby_names=walproposer", + &[], + ) + .await + .context("setting synchronous_standby_names")?; client .query("SELECT pg_catalog.pg_reload_conf()", &[]) .await .context("reloading postgres config")?; #[cfg(feature = "testing")] - fail::fail_point!("compute-promotion", |_| { - bail!("promotion configured to fail because of a failpoint") - }); + fail::fail_point!("compute-promotion", |_| bail!( + "compute-promotion failpoint" + )); let row = client .query_one("SELECT * FROM pg_catalog.pg_promote()", &[]) .await .context("pg_promote")?; if !row.get::(0) { - bail!("pg_promote() returned false"); + bail!("pg_promote() failed"); } + let pg_promote_time_ms = now.elapsed().as_millis() as u32; + let now = Instant::now(); - let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) - .await - .context("connecting to postgres")?; let row = client .query_one("SHOW transaction_read_only", &[]) .await @@ -131,36 +127,47 @@ impl ComputeNode { bail!("replica in read only mode after promotion"); } + // Already checked validity in http handler + #[allow(unused_mut)] + let mut new_pspec = crate::compute::ParsedSpec::try_from(cfg.spec).expect("invalid spec"); { let mut state = self.state.lock().unwrap(); - let spec = &mut state.pspec.as_mut().unwrap().spec; - spec.mode = ComputeMode::Primary; - let new_conf = cfg.spec.cluster.postgresql_conf.as_mut().unwrap(); - let existing_conf = spec.cluster.postgresql_conf.as_ref().unwrap(); - Self::merge_spec(new_conf, existing_conf); + + // Local setup has different ports for pg process (port=) for primary and secondary. + // Primary is stopped so we need secondary's "port" value + #[cfg(feature = "testing")] + { + let old_spec = &state.pspec.as_ref().unwrap().spec; + let Some(old_conf) = old_spec.cluster.postgresql_conf.as_ref() else { + bail!("pspec.spec.cluster.postgresql_conf missing for endpoint"); + }; + let set: std::collections::HashMap<&str, &str> = old_conf + .split_terminator('\n') + .map(|e| e.split_once("=").expect("invalid item")) + .collect(); + + let Some(new_conf) = new_pspec.spec.cluster.postgresql_conf.as_mut() else { + bail!("pspec.spec.cluster.postgresql_conf missing for supplied config"); + }; + new_conf.push_str(&format!("port={}\n", set["port"])); + } + + tracing::debug!("applied spec: {:#?}", new_pspec.spec); + if self.params.lakebase_mode { + ComputeNode::set_spec(&self.params, &mut state, new_pspec); + } else { + state.pspec = Some(new_pspec); + } } + info!("applied new spec, reconfiguring as primary"); - self.reconfigure() - } + self.reconfigure()?; + let reconfigure_time_ms = now.elapsed().as_millis() as u32; - /// Merge old and new Postgres conf specs to apply on secondary. 
- /// Change new spec's port and safekeepers since they are supplied - /// differenly - fn merge_spec(new_conf: &mut String, existing_conf: &str) { - let mut new_conf_set: HashMap<&str, &str> = new_conf - .split_terminator('\n') - .map(|e| e.split_once("=").expect("invalid item")) - .collect(); - new_conf_set.remove("neon.safekeepers"); - - let existing_conf_set: HashMap<&str, &str> = existing_conf - .split_terminator('\n') - .map(|e| e.split_once("=").expect("invalid item")) - .collect(); - new_conf_set.insert("port", existing_conf_set["port"]); - *new_conf = new_conf_set - .iter() - .map(|(k, v)| format!("{k}={v}")) - .join("\n"); + Ok(PromoteState::Completed { + lsn_wait_time_ms, + pg_promote_time_ms, + reconfigure_time_ms, + }) } } diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 27e610a87d..82e61acfdc 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -617,9 +617,6 @@ components: type: object required: - status - - total - - prewarmed - - skipped properties: status: description: LFC prewarm status @@ -637,6 +634,15 @@ components: skipped: description: Pages processed but not prewarmed type: integer + state_download_time_ms: + description: Time it takes to download LFC state to compute + type: integer + uncompress_time_ms: + description: Time it takes to uncompress LFC state + type: integer + prewarm_time_ms: + description: Time it takes to prewarm LFC state in Postgres + type: integer LfcOffloadState: type: object @@ -650,6 +656,16 @@ components: error: description: LFC offload error, if any type: string + state_query_time_ms: + description: Time it takes to get LFC state from Postgres + type: integer + compress_time_ms: + description: Time it takes to compress LFC state + type: integer + state_upload_time_ms: + description: Time it takes to upload LFC state to endpoint storage + type: integer + PromoteState: type: object @@ -663,6 +679,15 @@ components: error: description: Promote error, if any type: string + lsn_wait_time_ms: + description: Time it takes for secondary to catch up with primary WAL flush LSN + type: integer + pg_promote_time_ms: + description: Time it takes to call pg_promote on secondary + type: integer + reconfigure_time_ms: + description: Time it takes to reconfigure promoted secondary + type: integer SetRoleGrantsRequest: type: object diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs index 7483198723..6ad216778e 100644 --- a/compute_tools/src/http/routes/lfc.rs +++ b/compute_tools/src/http/routes/lfc.rs @@ -1,12 +1,11 @@ -use crate::compute_prewarm::LfcPrewarmStateWithProgress; use crate::http::JsonResponse; use axum::response::{IntoResponse, Response}; use axum::{Json, http::StatusCode}; use axum_extra::extract::OptionalQuery; -use compute_api::responses::LfcOffloadState; +use compute_api::responses::{LfcOffloadState, LfcPrewarmState}; type Compute = axum::extract::State>; -pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { +pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { Json(compute.lfc_prewarm_state().await) } diff --git a/compute_tools/src/http/routes/promote.rs b/compute_tools/src/http/routes/promote.rs index 7ca3464b63..865de0da91 100644 --- a/compute_tools/src/http/routes/promote.rs +++ b/compute_tools/src/http/routes/promote.rs @@ -1,11 +1,22 @@ use crate::http::JsonResponse; use axum::extract::Json; +use compute_api::responses::PromoteConfig; use http::StatusCode; 
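The OpenAPI fields above mirror what serde produces for the new data-carrying variants: an internally tagged object with a snake_case `status` plus per-phase timings. A trimmed-down sketch of that serialization (only two variants copied here, and the numbers are placeholders):

```rust
use serde::Serialize;

#[derive(Serialize)]
#[serde(tag = "status", rename_all = "snake_case")]
enum LfcPrewarmState {
    Completed {
        total: i32,
        prewarmed: i32,
        skipped: i32,
        state_download_time_ms: u32,
        uncompress_time_ms: u32,
        prewarm_time_ms: u32,
    },
    Failed {
        error: String,
    },
}

fn main() {
    let state = LfcPrewarmState::Completed {
        total: 128,
        prewarmed: 120,
        skipped: 8,
        state_download_time_ms: 45,
        uncompress_time_ms: 12,
        prewarm_time_ms: 310,
    };
    // Prints e.g. {"status":"completed","total":128,"prewarmed":120,"skipped":8,...}
    println!("{}", serde_json::to_string(&state).unwrap());
    let _ = LfcPrewarmState::Failed { error: String::new() };
}
```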
pub(in crate::http) async fn promote( compute: axum::extract::State>, - Json(cfg): Json, + Json(cfg): Json, ) -> axum::response::Response { + // Return early at the cost of extra parsing spec + let pspec = match crate::compute::ParsedSpec::try_from(cfg.spec) { + Ok(p) => p, + Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), + }; + + let cfg = PromoteConfig { + spec: pspec.spec, + wal_flush_lsn: cfg.wal_flush_lsn, + }; let state = compute.promote(cfg).await; if let compute_api::responses::PromoteState::Failed { error: _ } = state { return JsonResponse::create_response(StatusCode::INTERNAL_SERVER_ERROR, state); diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a918644e4c..a61d418dd1 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -1,10 +1,9 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. -use std::fmt::Display; - use chrono::{DateTime, Utc}; use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; +use std::fmt::Display; use crate::privilege::Privilege; use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}; @@ -49,7 +48,7 @@ pub struct ExtensionInstallResponse { /// Status of the LFC prewarm process. The same state machine is reused for /// both autoprewarm (prewarm after compute/Postgres start using the previously /// stored LFC state) and explicit prewarming via API. -#[derive(Serialize, Default, Debug, Clone, PartialEq)] +#[derive(Serialize, Default, Debug, Clone)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcPrewarmState { /// Default value when compute boots up. @@ -59,7 +58,14 @@ pub enum LfcPrewarmState { Prewarming, /// We found requested LFC state in the endpoint storage and /// completed prewarming successfully. - Completed, + Completed { + total: i32, + prewarmed: i32, + skipped: i32, + state_download_time_ms: u32, + uncompress_time_ms: u32, + prewarm_time_ms: u32, + }, /// Unexpected error happened during prewarming. Note, `Not Found 404` /// response from the endpoint storage is explicitly excluded here /// because it can normally happen on the first compute start, @@ -84,7 +90,7 @@ impl Display for LfcPrewarmState { match self { LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"), LfcPrewarmState::Prewarming => f.write_str("Prewarming"), - LfcPrewarmState::Completed => f.write_str("Completed"), + LfcPrewarmState::Completed { .. 
} => f.write_str("Completed"), LfcPrewarmState::Skipped => f.write_str("Skipped"), LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), LfcPrewarmState::Cancelled => f.write_str("Cancelled"), @@ -92,26 +98,36 @@ impl Display for LfcPrewarmState { } } -#[derive(Serialize, Default, Debug, Clone, PartialEq)] +#[derive(Serialize, Default, Debug, Clone)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcOffloadState { #[default] NotOffloaded, Offloading, - Completed, + Completed { + state_query_time_ms: u32, + compress_time_ms: u32, + state_upload_time_ms: u32, + }, Failed { error: String, }, + /// LFC state was empty so it wasn't offloaded Skipped, } -#[derive(Serialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Debug, Clone)] #[serde(tag = "status", rename_all = "snake_case")] -/// Response of /promote pub enum PromoteState { NotPromoted, - Completed, - Failed { error: String }, + Completed { + lsn_wait_time_ms: u32, + pg_promote_time_ms: u32, + reconfigure_time_ms: u32, + }, + Failed { + error: String, + }, } #[derive(Deserialize, Default, Debug)] diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py index 9415d6886c..26165b40cd 100644 --- a/test_runner/regress/test_replica_promotes.py +++ b/test_runner/regress/test_replica_promotes.py @@ -145,6 +145,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod): stop_and_check_lsn(secondary, None) if method == PromoteMethod.COMPUTE_CTL: + log.info("Restarting primary to check new config") secondary.stop() # In production, compute ultimately receives new compute spec from cplane. secondary.respec(mode="Primary") From 312a74f11fdf5a8689837834be4c5ce0cc95cebf Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:40:32 +0400 Subject: [PATCH 14/15] storcon: implement safekeeper_migrate_abort handler (#12705) ## Problem Right now if we commit a joint configuration to DB, there is no way back. The only way to get the clean mconf is to continue the migration. The RFC also described an abort mechanism, which allows to abort current migration and revert mconf change. It might be needed if the migration is stuck and cannot have any progress, e.g. if the sk we are migrating to went down during the migration. This PR implements this abort algorithm. - Closes: https://databricks.atlassian.net/browse/LKB-899 - Closes: https://github.com/neondatabase/neon/issues/12549 ## Summary of changes - Implement `safekeeper_migrate_abort` handler with the algorithm described in RFC - Add `timeline-safekeeper-migrate-abort` subcommand to `storcon_cli` - Add test for the migration abort algorithm. --- control_plane/storcon_cli/src/main.rs | 18 +++ storage_controller/src/http.rs | 28 ++++ .../src/service/safekeeper_service.rs | 123 +++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 13 ++ .../regress/test_safekeeper_migration.py | 88 +++++++++++++ 5 files changed, 266 insertions(+), 4 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index a4d1030488..635b1858ec 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -303,6 +303,13 @@ enum Command { #[arg(long, required = true, value_delimiter = ',')] new_sk_set: Vec, }, + /// Abort ongoing safekeeper migration. 
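For reference, the abort operation is exposed both through this `timeline-safekeeper-migrate-abort` subcommand and as a plain `POST` with an empty body on the storage controller HTTP API (the handler is added later in this patch). A client-side sketch of the HTTP form, with a hypothetical helper name and base URL (the real CLI goes through `storcon_client.dispatch` instead):

```rust
use reqwest::Client;

/// Hypothetical helper; the base URL, IDs, and token come from deployment config.
async fn abort_safekeeper_migration(
    storcon_base: &str,
    tenant_id: &str,
    timeline_id: &str,
    token: &str,
) -> anyhow::Result<()> {
    let url = format!(
        "{storcon_base}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort"
    );
    let resp = Client::new().post(url).bearer_auth(token).send().await?;
    anyhow::ensure!(resp.status().is_success(), "abort failed: {}", resp.status());
    Ok(())
}
```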
+ TimelineSafekeeperMigrateAbort { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + timeline_id: TimelineId, + }, } #[derive(Parser)] @@ -1396,6 +1403,17 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TimelineSafekeeperMigrateAbort { + tenant_id, + timeline_id, + } => { + let path = + format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort"); + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index ff73719adb..b40da4fd65 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -644,6 +644,7 @@ async fn handle_tenant_timeline_safekeeper_migrate( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + // TODO(diko): it's not PS operation, there should be a different permission scope. check_permissions(&req, Scope::PageServerApi)?; maybe_rate_limit(&req, tenant_id).await; @@ -665,6 +666,23 @@ async fn handle_tenant_timeline_safekeeper_migrate( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_safekeeper_migrate_abort( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + // TODO(diko): it's not PS operation, there should be a different permission scope. + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + service + .tenant_timeline_safekeeper_migrate_abort(tenant_id, timeline_id) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_lsn_lease( service: Arc, req: Request, @@ -2611,6 +2629,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_safekeeper_migrate_abort, + RequestName("v1_tenant_timeline_safekeeper_migrate_abort"), + ) + }, + ) // LSN lease passthrough to all shards .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease", diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index fab1342d5d..689d341b6a 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1230,10 +1230,7 @@ impl Service { } // It it is the same new_sk_set, we can continue the migration (retry). } else { - let prev_finished = timeline.cplane_notified_generation == timeline.generation - && timeline.sk_set_notified_generation == timeline.generation; - - if !prev_finished { + if !is_migration_finished(&timeline) { // The previous migration is committed, but the finish step failed. // Safekeepers/cplane might not know about the last membership configuration. // Retry the finish step to ensure smooth migration. @@ -1545,6 +1542,8 @@ impl Service { timeline_id: TimelineId, timeline: &TimelinePersistence, ) -> Result<(), ApiError> { + tracing::info!(generation=?timeline.generation, sk_set=?timeline.sk_set, new_sk_set=?timeline.new_sk_set, "retrying finish safekeeper migration"); + if timeline.new_sk_set.is_some() { // Logical error, should never happen. return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -1624,4 +1623,120 @@ impl Service { Ok(wal_positions[quorum_size - 1]) } + + /// Abort ongoing safekeeper migration. 
+ pub(crate) async fn tenant_timeline_safekeeper_migrate_abort( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result<(), ApiError> { + // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks. + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineSafekeeperMigrate, + ) + .await; + + // Fetch current timeline configuration from the configuration storage. + let timeline = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + + let Some(timeline) = timeline else { + return Err(ApiError::NotFound( + anyhow::anyhow!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table" + ) + .into(), + )); + }; + + let mut generation = SafekeeperGeneration::new(timeline.generation as u32); + + let Some(new_sk_set) = &timeline.new_sk_set else { + // No new_sk_set -> no active migration that we can abort. + tracing::info!("timeline has no active migration"); + + if !is_migration_finished(&timeline) { + // The last migration is committed, but the finish step failed. + // Safekeepers/cplane might not know about the last membership configuration. + // Retry the finish step to make the timeline state clean. + self.finish_safekeeper_migration_retry(tenant_id, timeline_id, &timeline) + .await?; + } + return Ok(()); + }; + + tracing::info!(sk_set=?timeline.sk_set, ?new_sk_set, ?generation, "aborting timeline migration"); + + let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; + let new_safekeepers = self.get_safekeepers(new_sk_set)?; + + let cur_sk_member_set = + Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?; + + // Increment current generation and remove new_sk_set from the timeline to abort the migration. + generation = generation.next(); + + let mconf = membership::Configuration { + generation, + members: cur_sk_member_set, + new_members: None, + }; + + // Exclude safekeepers which were added during the current migration. + let cur_ids: HashSet = cur_safekeepers.iter().map(|sk| sk.get_id()).collect(); + let exclude_safekeepers = new_safekeepers + .into_iter() + .filter(|sk| !cur_ids.contains(&sk.get_id())) + .collect::>(); + + let exclude_requests = exclude_safekeepers + .iter() + .map(|sk| TimelinePendingOpPersistence { + sk_id: sk.skp.id, + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: generation.into_inner() as i32, + op_kind: SafekeeperTimelineOpKind::Exclude, + }) + .collect::>(); + + let cur_sk_set = cur_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>(); + + // Persist new mconf and exclude requests. + self.persistence + .update_timeline_membership( + tenant_id, + timeline_id, + generation, + &cur_sk_set, + None, + &exclude_requests, + ) + .await?; + + // At this point we have already commited the abort, but still need to notify + // cplane/safekeepers with the new mconf. That's what finish_safekeeper_migration does. 
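To make the persisted transition concrete, here is a simplified model of what abort does to the membership record: keep the old `sk_set`, drop `new_sk_set`, bump the generation by one, and schedule an exclude for every safekeeper that only existed in the aborted target set. Illustrative types only, not the real `TimelinePersistence`:

```rust
#[derive(Debug, Clone, PartialEq)]
struct Mconf {
    generation: u32,
    sk_set: Vec<u64>,
    new_sk_set: Option<Vec<u64>>,
}

/// Returns the clean configuration plus the safekeepers that need an `Exclude` pending op.
fn abort(m: &Mconf) -> (Mconf, Vec<u64>) {
    let new_members = m.new_sk_set.clone().unwrap_or_default();
    let exclude: Vec<u64> = new_members
        .into_iter()
        .filter(|id| !m.sk_set.contains(id))
        .collect();
    (
        Mconf {
            generation: m.generation + 1,
            sk_set: m.sk_set.clone(),
            new_sk_set: None,
        },
        exclude,
    )
}

fn main() {
    // A joint configuration mid-migration from safekeeper 1 to safekeeper 2.
    let joint = Mconf { generation: 2, sk_set: vec![1], new_sk_set: Some(vec![2]) };
    let (clean, exclude) = abort(&joint);
    assert_eq!(clean, Mconf { generation: 3, sk_set: vec![1], new_sk_set: None });
    assert_eq!(exclude, vec![2]);
}
```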
+ self.finish_safekeeper_migration( + tenant_id, + timeline_id, + &cur_safekeepers, + &mconf, + &exclude_safekeepers, + ) + .await?; + + Ok(()) + } +} + +fn is_migration_finished(timeline: &TimelinePersistence) -> bool { + timeline.cplane_notified_generation == timeline.generation + && timeline.sk_set_notified_generation == timeline.generation } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c3dfc78218..41213d374a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2323,6 +2323,19 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"migrate_safekeepers success: {response.json()}") + def abort_safekeeper_migration( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort", + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"abort_safekeeper_migration success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index 97a6ece446..ba067b97de 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -460,3 +460,91 @@ def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder): ep.start(safekeeper_generation=5, safekeepers=new_sk_set2) assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] + + +def test_abort_safekeeper_migration(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper migration can be aborted. + 1. Insert failpoints and ensure the abort successfully reverts the timeline state. + 2. Check that endpoint is operational after the abort. + """ + neon_env_builder.num_safekeepers = 2 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert len(mconf["sk_set"]) == 1 + cur_sk = mconf["sk_set"][0] + cur_gen = 1 + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"]) + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (1)") + + another_sk = [sk.id for sk in env.safekeepers if sk.id != cur_sk][0] + + failpoints = [ + "sk-migration-after-step-3", + "sk-migration-after-step-4", + "sk-migration-after-step-5", + "sk-migration-after-step-7", + ] + + for fp in failpoints: + env.storage_controller.configure_failpoints((fp, "return(1)")) + + with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [another_sk] + ) + cur_gen += 1 + + env.storage_controller.configure_failpoints((fp, "off")) + + # We should have a joint mconf after the failure. 
+ mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == cur_gen + assert mconf["sk_set"] == [cur_sk] + assert mconf["new_sk_set"] == [another_sk] + + env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline) + cur_gen += 1 + + # Abort should revert the timeline to the previous sk_set and increment the generation. + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == cur_gen + assert mconf["sk_set"] == [cur_sk] + assert mconf["new_sk_set"] is None + + assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{cur_gen}:") + ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})") + + # After step-8 the final mconf is committed and the migration is not abortable anymore. + # So the abort should not abort anything. + env.storage_controller.configure_failpoints(("sk-migration-after-step-8", "return(1)")) + + with pytest.raises(StorageControllerApiException, match="failpoint sk-migration-after-step-8"): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [another_sk] + ) + cur_gen += 2 + + env.storage_controller.configure_failpoints((fp, "off")) + + env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline) + + # The migration is fully committed, no abort should have been performed. + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == cur_gen + assert mconf["sk_set"] == [another_sk] + assert mconf["new_sk_set"] is None + + ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})") + ep.clear_buffers() + assert ep.safe_psql("SELECT * FROM t") == [(i + 1,) for i in range(cur_gen) if i % 2 == 0] From d96cea191708fbfbbb3ee4599bc9e4eb391f67c1 Mon Sep 17 00:00:00 2001 From: Ruslan Talpa Date: Thu, 31 Jul 2025 16:05:09 +0300 Subject: [PATCH 15/15] [proxy] handle options request in rest broker (cors headers) (#12744) ## Problem rest broker needs to respond with the correct cors headers for the api to be usable from other domains ## Summary of changes added a code path in rest broker to handle the OPTIONS requests --------- Co-authored-by: Ruslan Talpa --- proxy/src/serverless/rest.rs | 224 +++++++++++++++++++++++++---------- 1 file changed, 159 insertions(+), 65 deletions(-) diff --git a/proxy/src/serverless/rest.rs b/proxy/src/serverless/rest.rs index 0c3d2c958d..9f98e87272 100644 --- a/proxy/src/serverless/rest.rs +++ b/proxy/src/serverless/rest.rs @@ -5,12 +5,17 @@ use std::sync::Arc; use bytes::Bytes; use http::Method; -use http::header::{AUTHORIZATION, CONTENT_TYPE, HOST}; +use http::header::{ + ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, + ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_REQUEST_HEADERS, ALLOW, + AUTHORIZATION, CONTENT_TYPE, HOST, ORIGIN, +}; use http_body_util::combinators::BoxBody; -use http_body_util::{BodyExt, Full}; +use http_body_util::{BodyExt, Empty, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; -use hyper::http::{HeaderName, HeaderValue}; +use hyper::http::response::Builder; +use hyper::http::{HeaderMap, HeaderName, HeaderValue}; use hyper::{Request, Response, StatusCode}; use indexmap::IndexMap; use moka::sync::Cache; @@ -67,6 +72,15 @@ use crate::util::deserialize_json_string; static EMPTY_JSON_SCHEMA: &str = r#"{"schemas":[]}"#; const INTROSPECTION_SQL: &str = 
POSTGRESQL_INTROSPECTION_SQL; +const HEADER_VALUE_ALLOW_ALL_ORIGINS: HeaderValue = HeaderValue::from_static("*"); +// CORS headers values +const ACCESS_CONTROL_ALLOW_METHODS_VALUE: HeaderValue = + HeaderValue::from_static("GET, POST, PATCH, PUT, DELETE, OPTIONS"); +const ACCESS_CONTROL_MAX_AGE_VALUE: HeaderValue = HeaderValue::from_static("86400"); +const ACCESS_CONTROL_EXPOSE_HEADERS_VALUE: HeaderValue = HeaderValue::from_static( + "Content-Encoding, Content-Location, Content-Range, Content-Type, Date, Location, Server, Transfer-Encoding, Range-Unit", +); +const ACCESS_CONTROL_ALLOW_HEADERS_VALUE: HeaderValue = HeaderValue::from_static("Authorization"); // A wrapper around the DbSchema that allows for self-referencing #[self_referencing] @@ -137,6 +151,8 @@ pub struct ApiConfig { pub role_claim_key: String, #[serde(default, deserialize_with = "deserialize_comma_separated_option")] pub db_extra_search_path: Option>, + #[serde(default, deserialize_with = "deserialize_comma_separated_option")] + pub server_cors_allowed_origins: Option>, } // The DbSchemaCache is a cache of the ApiConfig and DbSchemaOwned for each endpoint @@ -165,7 +181,13 @@ impl DbSchemaCache { } } - pub async fn get_cached_or_remote( + pub fn get_cached( + &self, + endpoint_id: &EndpointCacheKey, + ) -> Option> { + count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id)) + } + pub async fn get_remote( &self, endpoint_id: &EndpointCacheKey, auth_header: &HeaderValue, @@ -174,47 +196,42 @@ impl DbSchemaCache { ctx: &RequestContext, config: &'static ProxyConfig, ) -> Result, RestError> { - let cache_result = count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id)); - match cache_result { - Some(v) => Ok(v), - None => { - info!("db_schema cache miss for endpoint: {:?}", endpoint_id); - let remote_value = self - .get_remote(auth_header, connection_string, client, ctx, config) - .await; - let (api_config, schema_owned) = match remote_value { - Ok((api_config, schema_owned)) => (api_config, schema_owned), - Err(e @ RestError::SchemaTooLarge) => { - // for the case where the schema is too large, we cache an empty dummy value - // all the other requests will fail without triggering the introspection query - let schema_owned = serde_json::from_str::(EMPTY_JSON_SCHEMA) - .map_err(|e| JsonDeserialize { source: e })?; + info!("db_schema cache miss for endpoint: {:?}", endpoint_id); + let remote_value = self + .internal_get_remote(auth_header, connection_string, client, ctx, config) + .await; + let (api_config, schema_owned) = match remote_value { + Ok((api_config, schema_owned)) => (api_config, schema_owned), + Err(e @ RestError::SchemaTooLarge) => { + // for the case where the schema is too large, we cache an empty dummy value + // all the other requests will fail without triggering the introspection query + let schema_owned = serde_json::from_str::(EMPTY_JSON_SCHEMA) + .map_err(|e| JsonDeserialize { source: e })?; - let api_config = ApiConfig { - db_schemas: vec![], - db_anon_role: None, - db_max_rows: None, - db_allowed_select_functions: vec![], - role_claim_key: String::new(), - db_extra_search_path: None, - }; - let value = Arc::new((api_config, schema_owned)); - count_cache_insert(CacheKind::Schema); - self.0.insert(endpoint_id.clone(), value); - return Err(e); - } - Err(e) => { - return Err(e); - } + let api_config = ApiConfig { + db_schemas: vec![], + db_anon_role: None, + db_max_rows: None, + db_allowed_select_functions: vec![], + role_claim_key: String::new(), + db_extra_search_path: None, + 
server_cors_allowed_origins: None, }; let value = Arc::new((api_config, schema_owned)); count_cache_insert(CacheKind::Schema); - self.0.insert(endpoint_id.clone(), value.clone()); - Ok(value) + self.0.insert(endpoint_id.clone(), value); + return Err(e); } - } + Err(e) => { + return Err(e); + } + }; + let value = Arc::new((api_config, schema_owned)); + count_cache_insert(CacheKind::Schema); + self.0.insert(endpoint_id.clone(), value.clone()); + Ok(value) } - pub async fn get_remote( + async fn internal_get_remote( &self, auth_header: &HeaderValue, connection_string: &str, @@ -531,7 +548,7 @@ pub(crate) async fn handle( ) -> Result>, ApiError> { let result = handle_inner(cancel, config, &ctx, request, backend).await; - let mut response = match result { + let response = match result { Ok(r) => { ctx.set_success(); @@ -640,9 +657,6 @@ pub(crate) async fn handle( } }; - response - .headers_mut() - .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -722,6 +736,37 @@ async fn handle_inner( } } +fn apply_common_cors_headers( + response: &mut Builder, + request_headers: &HeaderMap, + allowed_origins: Option<&Vec>, +) { + let request_origin = request_headers + .get(ORIGIN) + .map(|v| v.to_str().unwrap_or("")); + + let response_allow_origin = match (request_origin, allowed_origins) { + (Some(or), Some(allowed_origins)) => { + if allowed_origins.iter().any(|o| o == or) { + Some(HeaderValue::from_str(or).unwrap_or(HEADER_VALUE_ALLOW_ALL_ORIGINS)) + } else { + None + } + } + (Some(_), None) => Some(HEADER_VALUE_ALLOW_ALL_ORIGINS), + _ => None, + }; + if let Some(h) = response.headers_mut() { + h.insert( + ACCESS_CONTROL_EXPOSE_HEADERS, + ACCESS_CONTROL_EXPOSE_HEADERS_VALUE, + ); + if let Some(origin) = response_allow_origin { + h.insert(ACCESS_CONTROL_ALLOW_ORIGIN, origin); + } + } +} + #[allow(clippy::too_many_arguments)] async fn handle_rest_inner( config: &'static ProxyConfig, @@ -733,12 +778,6 @@ async fn handle_rest_inner( jwt: String, backend: Arc, ) -> Result>, RestError> { - // validate the jwt token - let jwt_parsed = backend - .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) - .await - .map_err(HttpConnError::from)?; - let db_schema_cache = config .rest_config @@ -754,28 +793,83 @@ async fn handle_rest_inner( message: "Failed to get endpoint cache key".to_string(), }))?; - let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; - let (parts, originial_body) = request.into_parts(); + // try and get the cached entry for this endpoint + // it contains the api config and the introspected db schema + let cached_entry = db_schema_cache.get_cached(&endpoint_cache_key); + + let allowed_origins = cached_entry + .as_ref() + .and_then(|arc| arc.0.server_cors_allowed_origins.as_ref()); + + let mut response = Response::builder(); + apply_common_cors_headers(&mut response, &parts.headers, allowed_origins); + + // handle the OPTIONS request + if parts.method == Method::OPTIONS { + let allowed_headers = parts + .headers + .get(ACCESS_CONTROL_REQUEST_HEADERS) + .and_then(|a| a.to_str().ok()) + .filter(|v| !v.is_empty()) + .map_or_else( + || "Authorization".to_string(), + |v| format!("{v}, Authorization"), + ); + return response + .status(StatusCode::OK) + .header( + ACCESS_CONTROL_ALLOW_METHODS, + ACCESS_CONTROL_ALLOW_METHODS_VALUE, + ) + .header(ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_MAX_AGE_VALUE) + .header( + ACCESS_CONTROL_ALLOW_HEADERS, + HeaderValue::from_str(&allowed_headers) + .unwrap_or(ACCESS_CONTROL_ALLOW_HEADERS_VALUE), + ) + 
.header(ALLOW, ACCESS_CONTROL_ALLOW_METHODS_VALUE) + .body(Empty::new().map_err(|x| match x {}).boxed()) + .map_err(|e| { + RestError::SubzeroCore(InternalError { + message: e.to_string(), + }) + }); + } + + // validate the jwt token + let jwt_parsed = backend + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) + .await + .map_err(HttpConnError::from)?; + let auth_header = parts .headers .get(AUTHORIZATION) .ok_or(RestError::SubzeroCore(InternalError { message: "Authorization header is required".to_string(), }))?; + let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; - let entry = db_schema_cache - .get_cached_or_remote( - &endpoint_cache_key, - auth_header, - connection_string, - &mut client, - ctx, - config, - ) - .await?; + let entry = match cached_entry { + Some(e) => e, + None => { + // if not cached, get the remote entry (will run the introspection query) + db_schema_cache + .get_remote( + &endpoint_cache_key, + auth_header, + connection_string, + &mut client, + ctx, + config, + ) + .await? + } + }; let (api_config, db_schema_owned) = entry.as_ref(); + let db_schema = db_schema_owned.borrow_schema(); let db_schemas = &api_config.db_schemas; // list of schemas available for the api @@ -999,8 +1093,8 @@ async fn handle_rest_inner( let _metrics = client.metrics(ctx); // FIXME: is everything in the context set correctly? // send the request to the local proxy - let response = make_raw_local_proxy_request(&mut client, headers, req_body).await?; - let (parts, body) = response.into_parts(); + let proxy_response = make_raw_local_proxy_request(&mut client, headers, req_body).await?; + let (response_parts, body) = proxy_response.into_parts(); let max_response = config.http_config.max_response_size_bytes; let bytes = read_body_with_limit(body, max_response) @@ -1009,7 +1103,7 @@ async fn handle_rest_inner( // if the response status is greater than 399, then it is an error // FIXME: check if there are other error codes or shapes of the response - if parts.status.as_u16() > 399 { + if response_parts.status.as_u16() > 399 { // turn this postgres error from the json into PostgresError let postgres_error = serde_json::from_slice(&bytes) .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; @@ -1175,7 +1269,7 @@ async fn handle_rest_inner( .boxed(); // build the response - let mut response = Response::builder() + response = response .status(StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)) .header(CONTENT_TYPE, http_content_type);