From c501a106124db54a76b3339b23ba255c677fce65 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 26 Jul 2024 16:54:12 +0100 Subject: [PATCH] storcon: gate starting-up as candidate behind a flag --- storage_controller/src/main.rs | 4 ++++ storage_controller/src/service.rs | 17 +++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index adbf5c6496..a954d9bab2 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -81,6 +81,9 @@ struct Cli { #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, + #[arg(long, default_value = "false")] + start_as_candidate: bool, + /// `neon_local` sets this to the path of the neon_local repo dir. /// Only relevant for testing. // TODO: make `cfg(feature = "testing")` @@ -273,6 +276,7 @@ async fn async_main() -> anyhow::Result<()> { .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, + start_as_candidate: args.start_as_candidate, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ea515f67da..050fbd537b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -223,6 +223,7 @@ impl ServiceState { tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, + initial_leadership_status: LeadershipStatus, ) -> Self { let status = &crate::metrics::METRICS_REGISTRY .metrics_group @@ -230,15 +231,13 @@ impl ServiceState { status.set( LeadershipStatusGroup { - status: LeadershipStatus::Leader, + status: initial_leadership_status, }, 1, ); Self { - // TODO: Starting up as Leader is a transient state. Once we enable rolling - // upgrades on the k8s side, we should start up as Candidate. - leadership_status: LeadershipStatus::Leader, + leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), scheduler, @@ -323,6 +322,8 @@ pub struct Config { // TODO: make this cfg(feature = "testing") pub neon_local_repo_dir: Option, + + pub start_as_candidate: bool, } impl From for ApiError { @@ -1255,12 +1256,20 @@ impl Service { config.max_warming_up_interval, cancel.clone(), ); + + let initial_leadership_status = if config.start_as_candidate { + LeadershipStatus::Candidate + } else { + LeadershipStatus::Leader + }; + let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, delayed_reconcile_rx, + initial_leadership_status, ))), config: config.clone(), persistence,