diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 1b231151ce..387fc297f0 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -209,6 +209,8 @@ pub struct NeonStorageControllerConf { pub use_https_safekeeper_api: bool, pub use_local_compute_notifications: bool, + + pub timeline_safekeeper_count: Option, } impl NeonStorageControllerConf { @@ -239,6 +241,7 @@ impl Default for NeonStorageControllerConf { timelines_onto_safekeepers: true, use_https_safekeeper_api: false, use_local_compute_notifications: true, + timeline_safekeeper_count: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 755d67a7ad..95f7533057 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -628,6 +628,10 @@ impl StorageController { args.push("--timelines-onto-safekeepers".to_string()); } + if let Some(sk_cnt) = self.config.timeline_safekeeper_count { + args.push(format!("--timeline-safekeeper-count={sk_cnt}")); + } + println!("Starting storage controller"); background_process::start_process( diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2eea2f9d10..fc0ba9f28c 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -207,6 +207,12 @@ struct Cli { /// the compute notification directly (instead of via control plane). #[arg(long, default_value = "false")] use_local_compute_notifications: bool, + + /// Number of safekeepers to choose for a timeline when creating it. + /// Safekeepers will be choosen from different availability zones. + /// This option exists primarily for testing purposes. + #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] + timeline_safekeeper_count: i64, } enum StrictMode { @@ -371,6 +377,11 @@ async fn async_main() -> anyhow::Result<()> { StrictMode::Strict if args.use_local_compute_notifications => { anyhow::bail!("`--use-local-compute-notifications` is only permitted in `--dev` mode"); } + StrictMode::Strict if args.timeline_safekeeper_count < 3 => { + anyhow::bail!( + "Running with less than 3 safekeepers per timeline is only permitted in `--dev` mode" + ); + } StrictMode::Strict => { tracing::info!("Starting in strict mode: configuration is OK.") } @@ -433,6 +444,7 @@ async fn async_main() -> anyhow::Result<()> { ssl_ca_certs, timelines_onto_safekeepers: args.timelines_onto_safekeepers, use_local_compute_notifications: args.use_local_compute_notifications, + timeline_safekeeper_count: args.timeline_safekeeper_count, }; // Validate that we can connect to the database diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 14c81ccf59..6ec3963c48 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -466,6 +466,10 @@ pub struct Config { pub timelines_onto_safekeepers: bool, pub use_local_compute_notifications: bool, + + /// Number of safekeepers to choose for a timeline when creating it. + /// Safekeepers will be choosen from different availability zones. + pub timeline_safekeeper_count: i64, } impl From for ApiError { diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 61b9ec6b6d..193a1833a7 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,3 +1,4 @@ +use std::cmp::max; use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; @@ -608,7 +609,8 @@ impl Service { Ok(()) } - /// Choose safekeepers for the new timeline: 3 in different azs. + /// Choose safekeepers for the new timeline in different azs. + /// 3 are choosen by default, but may be configured via config (for testing). pub(crate) async fn safekeepers_for_new_timeline( &self, ) -> Result, ApiError> { @@ -651,18 +653,14 @@ impl Service { ) }); // Number of safekeepers in different AZs we are looking for - let wanted_count = match all_safekeepers.len() { - 0 => { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find any active safekeeper for new timeline", - ))); - } - // Have laxer requirements on testig mode as we don't want to - // spin up three safekeepers for every single test - #[cfg(feature = "testing")] - 1 | 2 => all_safekeepers.len(), - _ => 3, - }; + let mut wanted_count = self.config.timeline_safekeeper_count as usize; + // TODO(diko): remove this when `timeline_safekeeper_count` option is in the release + // branch and is specified in tests/neon_local config. + if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count { + // In testing mode, we can have less safekeepers than the config says + wanted_count = max(all_safekeepers.len(), 1); + } + let mut sks = Vec::new(); let mut azs = HashSet::new(); for (_sk_util, sk_info, az_id) in all_safekeepers.iter() {