From 79485e7c3a138c724efc2b8edc82962581a48b53 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 20 Jun 2025 11:35:11 -0400
Subject: [PATCH 1/3] feat(pageserver): enable gc-compaction by default
 everywhere (#12105)

Enable it across tests and set it as the default. This marks the first
milestone of https://github.com/neondatabase/neon/issues/9114.

We have already enabled it in all AWS regions and plan to enable it in
all Azure regions next week. Will merge after we roll out in all
regions.

---------

Signed-off-by: Alex Chi Z
---
 libs/pageserver_api/src/config.rs                | 2 +-
 test_runner/regress/test_attach_tenant_config.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 2d7a06a72f..1ecc17e04b 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -816,7 +816,7 @@ pub mod tenant_conf_defaults {
     // By default ingest enough WAL for two new L0 layers before checking if new image
     // image layers should be created.
     pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = true;
     pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
     pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
     pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index dc44fc77db..7788faceb4 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -184,7 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "timeline_offloading": False,
         "rel_size_v2_enabled": True,
         "relsize_snapshot_cache_capacity": 10000,
-        "gc_compaction_enabled": True,
+        "gc_compaction_enabled": False,
         "gc_compaction_verification": False,
         "gc_compaction_initial_threshold_kb": 1024000,
         "gc_compaction_ratio_percent": 200,

From b2954d16ff12899e1e85d3c772988da4454450f0 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Fri, 20 Jun 2025 20:03:17 +0400
Subject: [PATCH 2/3] storcon, neon_local: add timeline_safekeeper_count
 (#12303)

## Problem

We need to specify the number of safekeepers for neon_local without the
`testing` feature. We also need this option to test different
configurations of the safekeeper migration code.

We cannot set it in `neon_fixtures.py` or in the default `neon_local`
config yet, because that would fail the compatibility tests. I'll open a
separate PR that removes `cfg!("testing")` completely and specifies this
option in the config once it reaches the release branch.

- Part of https://github.com/neondatabase/neon/issues/12298

## Summary of changes

- Add `timeline_safekeeper_count` config option to storcon and neon_local
---
 control_plane/src/local_env.rs                |  3 +++
 control_plane/src/storage_controller.rs       |  4 ++++
 storage_controller/src/main.rs                | 12 ++++++++++
 storage_controller/src/service.rs             |  4 ++++
 .../src/service/safekeeper_service.rs         | 24 +++++++++----------
 5 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 1b231151ce..387fc297f0 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -209,6 +209,8 @@ pub struct NeonStorageControllerConf {
     pub use_https_safekeeper_api: bool,
 
     pub use_local_compute_notifications: bool,
+
+    pub timeline_safekeeper_count: Option,
 }
 
 impl NeonStorageControllerConf {
@@ -239,6 +241,7 @@ impl Default for NeonStorageControllerConf {
             timelines_onto_safekeepers: true,
             use_https_safekeeper_api: false,
             use_local_compute_notifications: true,
+            timeline_safekeeper_count: None,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 755d67a7ad..95f7533057 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -628,6 +628,10 @@ impl StorageController {
             args.push("--timelines-onto-safekeepers".to_string());
         }
 
+        if let Some(sk_cnt) = self.config.timeline_safekeeper_count {
+            args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
+        }
+
         println!("Starting storage controller");
 
         background_process::start_process(
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 2eea2f9d10..fc0ba9f28c 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -207,6 +207,12 @@ struct Cli {
     /// the compute notification directly (instead of via control plane).
     #[arg(long, default_value = "false")]
     use_local_compute_notifications: bool,
+
+    /// Number of safekeepers to choose for a timeline when creating it.
+    /// Safekeepers will be chosen from different availability zones.
+    /// This option exists primarily for testing purposes.
+    #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))]
+    timeline_safekeeper_count: i64,
 }
 
 enum StrictMode {
@@ -371,6 +377,11 @@ async fn async_main() -> anyhow::Result<()> {
         StrictMode::Strict if args.use_local_compute_notifications => {
             anyhow::bail!("`--use-local-compute-notifications` is only permitted in `--dev` mode");
         }
+        StrictMode::Strict if args.timeline_safekeeper_count < 3 => {
+            anyhow::bail!(
+                "Running with less than 3 safekeepers per timeline is only permitted in `--dev` mode"
+            );
+        }
         StrictMode::Strict => {
             tracing::info!("Starting in strict mode: configuration is OK.")
         }
@@ -433,6 +444,7 @@ async fn async_main() -> anyhow::Result<()> {
         ssl_ca_certs,
         timelines_onto_safekeepers: args.timelines_onto_safekeepers,
         use_local_compute_notifications: args.use_local_compute_notifications,
+        timeline_safekeeper_count: args.timeline_safekeeper_count,
     };
 
     // Validate that we can connect to the database
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 14c81ccf59..6ec3963c48 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -466,6 +466,10 @@ pub struct Config {
     pub timelines_onto_safekeepers: bool,
 
     pub use_local_compute_notifications: bool,
+
+    /// Number of safekeepers to choose for a timeline when creating it.
+    /// Safekeepers will be chosen from different availability zones.
+    pub timeline_safekeeper_count: i64,
 }
 
 impl From for ApiError {
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 61b9ec6b6d..193a1833a7 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -1,3 +1,4 @@
+use std::cmp::max;
 use std::collections::HashSet;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -608,7 +609,8 @@ impl Service {
         Ok(())
     }
 
-    /// Choose safekeepers for the new timeline: 3 in different azs.
+    /// Choose safekeepers for the new timeline in different azs.
+    /// 3 are chosen by default, but the count can be overridden via config (for testing).
     pub(crate) async fn safekeepers_for_new_timeline(
         &self,
     ) -> Result, ApiError> {
@@ -651,18 +653,14 @@ impl Service {
             )
         });
         // Number of safekeepers in different AZs we are looking for
-        let wanted_count = match all_safekeepers.len() {
-            0 => {
-                return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                    "couldn't find any active safekeeper for new timeline",
-                )));
-            }
-            // Have laxer requirements on testig mode as we don't want to
-            // spin up three safekeepers for every single test
-            #[cfg(feature = "testing")]
-            1 | 2 => all_safekeepers.len(),
-            _ => 3,
-        };
+        let mut wanted_count = self.config.timeline_safekeeper_count as usize;
+        // TODO(diko): remove this when the `timeline_safekeeper_count` option is in the release
+        // branch and is specified in tests/neon_local config.
+        if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count {
+            // In testing mode, we can have fewer safekeepers than the config says
+            wanted_count = max(all_safekeepers.len(), 1);
+        }
+
         let mut sks = Vec::new();
         let mut azs = HashSet::new();
         for (_sk_util, sk_info, az_id) in all_safekeepers.iter() {

From c8b2ac93cf88a3d2c68970caf9290677b7d6cb92 Mon Sep 17 00:00:00 2001
From: Tristan Partin
Date: Fri, 20 Jun 2025 13:46:30 -0500
Subject: [PATCH 3/3] Allow the control plane to override any Postgres
 connection options (#12262)

The previous behavior was for the compute to override control plane
options if there was a conflict. We want to change the behavior so that
the control plane's options always take precedence. For instance, if we
need a new option passed to the compute as soon as possible, we can
initially roll it out via the control plane and migrate it to
EXTRA_OPTIONS within the compute later.

Signed-off-by: Tristan Partin
---
 compute_tools/src/compute.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 7a7f2dfedc..684d841897 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -408,7 +408,9 @@ impl ComputeNode {
         // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
         const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
         let options = match conn_conf.get_options() {
-            Some(options) => format!("{} {}", options, EXTRA_OPTIONS),
+            // Allow the control plane to override any options set by the
+            // compute
+            Some(options) => format!("{} {}", EXTRA_OPTIONS, options),
             None => EXTRA_OPTIONS.to_string(),
         };
         conn_conf.options(&options);
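
The override in PATCH 3/3 relies on how Postgres handles the `options` connection parameter: the `-c` settings are applied left to right, so when the same GUC appears twice the later occurrence wins. Below is a minimal, hypothetical sketch of the new ordering; the helper name and option values are made up for illustration, and only the `format!` argument order mirrors the change in `compute.rs`.

```rust
// Sketch: merge built-in options with control-plane-supplied ones so that the
// control plane wins on conflict (assumes the illustrative names below).
fn merge_options(extra_options: &str, cplane_options: Option<&str>) -> String {
    match cplane_options {
        // Control-plane options go last: when the same GUC appears twice in the
        // resulting backend command line, the later `-c` setting takes effect.
        Some(options) => format!("{} {}", extra_options, options),
        None => extra_options.to_string(),
    }
}

fn main() {
    // Hypothetical values; the real EXTRA_OPTIONS lives in compute_tools/src/compute.rs.
    const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c statement_timeout=0";
    let merged = merge_options(EXTRA_OPTIONS, Some("-c statement_timeout=60000"));
    assert_eq!(
        merged,
        "-c role=cloud_admin -c statement_timeout=0 -c statement_timeout=60000"
    );
    // Effective statement_timeout is 60000, i.e. the control plane's value.
    // With the old `{options} {EXTRA_OPTIONS}` ordering, the compute's
    // statement_timeout=0 would have been applied last and won instead.
    println!("{merged}");
}
```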