From 5fad4a4ceeb990523336aefb3aa1fe6e0fac7eae Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 24 Feb 2025 10:30:21 -0500 Subject: [PATCH] feat(storcon): chaos injection of force exit (#10934) ## Problem close https://github.com/neondatabase/cloud/issues/24485 ## Summary of changes This patch adds a new chaos injection mode for the storcon. The chaos injector reads the crontab and exits immediately at the configured time. --------- Signed-off-by: Alex Chi Z --- Cargo.lock | 16 +++- Cargo.toml | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/main.rs | 10 ++- .../src/service/chaos_injector.rs | 82 ++++++++++++++++--- 5 files changed, 93 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 038727f1a8..47552174d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1546,6 +1546,17 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "cron" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740" +dependencies = [ + "chrono", + "once_cell", + "winnow", +] + [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -6446,6 +6457,7 @@ dependencies = [ "chrono", "clap", "control_plane", + "cron", "diesel", "diesel-async", "diesel_migrations", @@ -8138,9 +8150,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.13" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 21310ce6ec..e6ca3c982c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,6 +77,7 @@ byteorder = "1.4" bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" +cron = "0.15" chrono = { version = 
"0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } clashmap = { version = "1.0", features = ["raw-api"] } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 08c80bc141..8e82996db1 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -18,6 +18,7 @@ anyhow.workspace = true bytes.workspace = true chrono.workspace = true clap.workspace = true +cron.workspace = true fail.workspace = true futures.workspace = true hex.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 18922b9e05..4152e40a76 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -115,10 +115,14 @@ struct Cli { #[arg(long)] neon_local_repo_dir: Option, - /// Chaos testing + /// Chaos testing: exercise tenant migrations #[arg(long)] chaos_interval: Option, + /// Chaos testing: exercise an immediate exit + #[arg(long)] + chaos_exit_crontab: Option, + // Maximum acceptable lag for the secondary location while draining // a pageserver #[arg(long)] @@ -382,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> { let service = service.clone(); let cancel = CancellationToken::new(); let cancel_bg = cancel.clone(); + let chaos_exit_crontab = args.chaos_exit_crontab; ( tokio::task::spawn( async move { - let mut chaos_injector = ChaosInjector::new(service, interval.into()); + let mut chaos_injector = + ChaosInjector::new(service, interval.into(), chaos_exit_crontab); chaos_injector.run(cancel_bg).await } .instrument(tracing::info_span!("chaos_injector")), diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index aa0ee0df5a..25a0fab5ca 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -16,29 +16,80 @@ use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: 
Arc<Service>, interval: Duration, + chaos_exit_crontab: Option<cron::Schedule>, +} + +fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result<tokio::time::Sleep> { + use chrono::Utc; + let next = cron.upcoming(Utc).next().unwrap(); + let duration = (next - Utc::now()).to_std()?; + Ok(tokio::time::sleep(duration)) +} + +async fn maybe_sleep(sleep: Option<tokio::time::Sleep>) -> Option<()> { + if let Some(sleep) = sleep { + sleep.await; + Some(()) + } else { + None + } } impl ChaosInjector { - pub fn new(service: Arc<Service>, interval: Duration) -> Self { - Self { service, interval } + pub fn new( + service: Arc<Service>, + interval: Duration, + chaos_exit_crontab: Option<cron::Schedule>, + ) -> Self { + Self { + service, + interval, + chaos_exit_crontab, + } } pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - - loop { - tokio::select! { - _ = interval.tick() => {} - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; + let cron_interval = { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } } + } else { + None } - - self.inject_chaos().await; - - tracing::info!("Chaos iteration..."); + }; + enum ChaosEvent { + ShuffleTenant, + ForceKill, } + let chaos_type = tokio::select!
{ + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } + } + + tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be @@ -95,6 +146,11 @@ impl ChaosInjector { ); } + async fn force_kill(&mut self) { + tracing::warn!("Injecting chaos: force kill"); + std::process::exit(1); + } + async fn inject_chaos(&mut self) { // Pick some shards to interfere with let batch_size = 128;