feat(storcon): chaos injection of force exit (#10934)

## Problem

close https://github.com/neondatabase/cloud/issues/24485

## Summary of changes

This patch adds a new chaos injection mode for the storcon. The chaos
injector reads the crontab and exits immediately at the configured time.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-02-24 10:30:21 -05:00
committed by GitHub
parent fdde58120c
commit 5fad4a4cee
5 changed files with 93 additions and 17 deletions

16
Cargo.lock generated
View File

@@ -1546,6 +1546,17 @@ dependencies = [
"itertools 0.10.5", "itertools 0.10.5",
] ]
[[package]]
name = "cron"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740"
dependencies = [
"chrono",
"once_cell",
"winnow",
]
[[package]] [[package]]
name = "crossbeam-channel" name = "crossbeam-channel"
version = "0.5.8" version = "0.5.8"
@@ -6446,6 +6457,7 @@ dependencies = [
"chrono", "chrono",
"clap", "clap",
"control_plane", "control_plane",
"cron",
"diesel", "diesel",
"diesel-async", "diesel-async",
"diesel_migrations", "diesel_migrations",
@@ -8138,9 +8150,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]] [[package]]
name = "winnow" name = "winnow"
version = "0.6.13" version = "0.6.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]

View File

@@ -77,6 +77,7 @@ byteorder = "1.4"
bytes = "1.9" bytes = "1.9"
camino = "1.1.6" camino = "1.1.6"
cfg-if = "1.0.0" cfg-if = "1.0.0"
cron = "0.15"
chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive", "env"] } clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] } clashmap = { version = "1.0", features = ["raw-api"] }

View File

@@ -18,6 +18,7 @@ anyhow.workspace = true
bytes.workspace = true bytes.workspace = true
chrono.workspace = true chrono.workspace = true
clap.workspace = true clap.workspace = true
cron.workspace = true
fail.workspace = true fail.workspace = true
futures.workspace = true futures.workspace = true
hex.workspace = true hex.workspace = true

View File

@@ -115,10 +115,14 @@ struct Cli {
#[arg(long)] #[arg(long)]
neon_local_repo_dir: Option<PathBuf>, neon_local_repo_dir: Option<PathBuf>,
/// Chaos testing /// Chaos testing: exercise tenant migrations
#[arg(long)] #[arg(long)]
chaos_interval: Option<humantime::Duration>, chaos_interval: Option<humantime::Duration>,
/// Chaos testing: exercise an immediate exit
#[arg(long)]
chaos_exit_crontab: Option<cron::Schedule>,
// Maximum acceptable lag for the secondary location while draining // Maximum acceptable lag for the secondary location while draining
// a pageserver // a pageserver
#[arg(long)] #[arg(long)]
@@ -382,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> {
let service = service.clone(); let service = service.clone();
let cancel = CancellationToken::new(); let cancel = CancellationToken::new();
let cancel_bg = cancel.clone(); let cancel_bg = cancel.clone();
let chaos_exit_crontab = args.chaos_exit_crontab;
( (
tokio::task::spawn( tokio::task::spawn(
async move { async move {
let mut chaos_injector = ChaosInjector::new(service, interval.into()); let mut chaos_injector =
ChaosInjector::new(service, interval.into(), chaos_exit_crontab);
chaos_injector.run(cancel_bg).await chaos_injector.run(cancel_bg).await
} }
.instrument(tracing::info_span!("chaos_injector")), .instrument(tracing::info_span!("chaos_injector")),

View File

@@ -16,29 +16,80 @@ use super::{Node, Scheduler, Service, TenantShard};
pub struct ChaosInjector { pub struct ChaosInjector {
service: Arc<Service>, service: Arc<Service>,
interval: Duration, interval: Duration,
chaos_exit_crontab: Option<cron::Schedule>,
}
fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result<tokio::time::Sleep> {
use chrono::Utc;
let next = cron.upcoming(Utc).next().unwrap();
let duration = (next - Utc::now()).to_std()?;
Ok(tokio::time::sleep(duration))
}
async fn maybe_sleep(sleep: Option<tokio::time::Sleep>) -> Option<()> {
if let Some(sleep) = sleep {
sleep.await;
Some(())
} else {
None
}
} }
impl ChaosInjector { impl ChaosInjector {
pub fn new(service: Arc<Service>, interval: Duration) -> Self { pub fn new(
Self { service, interval } service: Arc<Service>,
interval: Duration,
chaos_exit_crontab: Option<cron::Schedule>,
) -> Self {
Self {
service,
interval,
chaos_exit_crontab,
}
} }
pub async fn run(&mut self, cancel: CancellationToken) { pub async fn run(&mut self, cancel: CancellationToken) {
let mut interval = tokio::time::interval(self.interval); let mut interval = tokio::time::interval(self.interval);
let cron_interval = {
loop { if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab {
tokio::select! { match cron_to_next_duration(chaos_exit_crontab) {
_ = interval.tick() => {} Ok(interval_exit) => Some(interval_exit),
_ = cancel.cancelled() => { Err(e) => {
tracing::info!("Shutting down"); tracing::error!("Error processing the cron schedule: {e}");
return; None
}
} }
} else {
None
} }
};
self.inject_chaos().await; enum ChaosEvent {
ShuffleTenant,
tracing::info!("Chaos iteration..."); ForceKill,
} }
let chaos_type = tokio::select! {
_ = interval.tick() => {
ChaosEvent::ShuffleTenant
}
Some(_) = maybe_sleep(cron_interval) => {
ChaosEvent::ForceKill
}
_ = cancel.cancelled() => {
tracing::info!("Shutting down");
return;
}
};
match chaos_type {
ChaosEvent::ShuffleTenant => {
self.inject_chaos().await;
}
ChaosEvent::ForceKill => {
self.force_kill().await;
}
}
tracing::info!("Chaos iteration...");
} }
/// If a shard has a secondary and attached location, then re-assign the secondary to be /// If a shard has a secondary and attached location, then re-assign the secondary to be
@@ -95,6 +146,11 @@ impl ChaosInjector {
); );
} }
async fn force_kill(&mut self) {
tracing::warn!("Injecting chaos: force kill");
std::process::exit(1);
}
async fn inject_chaos(&mut self) { async fn inject_chaos(&mut self) {
// Pick some shards to interfere with // Pick some shards to interfere with
let batch_size = 128; let batch_size = 128;