mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
feat(storcon): chaos injection of force exit (#10934)
## Problem close https://github.com/neondatabase/cloud/issues/24485 ## Summary of changes This patch adds a new chaos injection mode for the storcon. The chaos injector reads the crontab and exits immediately at the configured time. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
16
Cargo.lock
generated
16
Cargo.lock
generated
@@ -1546,6 +1546,17 @@ dependencies = [
|
|||||||
"itertools 0.10.5",
|
"itertools 0.10.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cron"
|
||||||
|
version = "0.15.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740"
|
||||||
|
dependencies = [
|
||||||
|
"chrono",
|
||||||
|
"once_cell",
|
||||||
|
"winnow",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-channel"
|
name = "crossbeam-channel"
|
||||||
version = "0.5.8"
|
version = "0.5.8"
|
||||||
@@ -6446,6 +6457,7 @@ dependencies = [
|
|||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"control_plane",
|
"control_plane",
|
||||||
|
"cron",
|
||||||
"diesel",
|
"diesel",
|
||||||
"diesel-async",
|
"diesel-async",
|
||||||
"diesel_migrations",
|
"diesel_migrations",
|
||||||
@@ -8138,9 +8150,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winnow"
|
name = "winnow"
|
||||||
version = "0.6.13"
|
version = "0.6.26"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
|
checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -77,6 +77,7 @@ byteorder = "1.4"
|
|||||||
bytes = "1.9"
|
bytes = "1.9"
|
||||||
camino = "1.1.6"
|
camino = "1.1.6"
|
||||||
cfg-if = "1.0.0"
|
cfg-if = "1.0.0"
|
||||||
|
cron = "0.15"
|
||||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||||
clap = { version = "4.0", features = ["derive", "env"] }
|
clap = { version = "4.0", features = ["derive", "env"] }
|
||||||
clashmap = { version = "1.0", features = ["raw-api"] }
|
clashmap = { version = "1.0", features = ["raw-api"] }
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ anyhow.workspace = true
|
|||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
chrono.workspace = true
|
chrono.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
|
cron.workspace = true
|
||||||
fail.workspace = true
|
fail.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
hex.workspace = true
|
hex.workspace = true
|
||||||
|
|||||||
@@ -115,10 +115,14 @@ struct Cli {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
neon_local_repo_dir: Option<PathBuf>,
|
neon_local_repo_dir: Option<PathBuf>,
|
||||||
|
|
||||||
/// Chaos testing
|
/// Chaos testing: exercise tenant migrations
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
chaos_interval: Option<humantime::Duration>,
|
chaos_interval: Option<humantime::Duration>,
|
||||||
|
|
||||||
|
/// Chaos testing: exercise an immediate exit
|
||||||
|
#[arg(long)]
|
||||||
|
chaos_exit_crontab: Option<cron::Schedule>,
|
||||||
|
|
||||||
// Maximum acceptable lag for the secondary location while draining
|
// Maximum acceptable lag for the secondary location while draining
|
||||||
// a pageserver
|
// a pageserver
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
@@ -382,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> {
|
|||||||
let service = service.clone();
|
let service = service.clone();
|
||||||
let cancel = CancellationToken::new();
|
let cancel = CancellationToken::new();
|
||||||
let cancel_bg = cancel.clone();
|
let cancel_bg = cancel.clone();
|
||||||
|
let chaos_exit_crontab = args.chaos_exit_crontab;
|
||||||
(
|
(
|
||||||
tokio::task::spawn(
|
tokio::task::spawn(
|
||||||
async move {
|
async move {
|
||||||
let mut chaos_injector = ChaosInjector::new(service, interval.into());
|
let mut chaos_injector =
|
||||||
|
ChaosInjector::new(service, interval.into(), chaos_exit_crontab);
|
||||||
chaos_injector.run(cancel_bg).await
|
chaos_injector.run(cancel_bg).await
|
||||||
}
|
}
|
||||||
.instrument(tracing::info_span!("chaos_injector")),
|
.instrument(tracing::info_span!("chaos_injector")),
|
||||||
|
|||||||
@@ -16,29 +16,80 @@ use super::{Node, Scheduler, Service, TenantShard};
|
|||||||
pub struct ChaosInjector {
|
pub struct ChaosInjector {
|
||||||
service: Arc<Service>,
|
service: Arc<Service>,
|
||||||
interval: Duration,
|
interval: Duration,
|
||||||
|
chaos_exit_crontab: Option<cron::Schedule>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result<tokio::time::Sleep> {
|
||||||
|
use chrono::Utc;
|
||||||
|
let next = cron.upcoming(Utc).next().unwrap();
|
||||||
|
let duration = (next - Utc::now()).to_std()?;
|
||||||
|
Ok(tokio::time::sleep(duration))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn maybe_sleep(sleep: Option<tokio::time::Sleep>) -> Option<()> {
|
||||||
|
if let Some(sleep) = sleep {
|
||||||
|
sleep.await;
|
||||||
|
Some(())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ChaosInjector {
|
impl ChaosInjector {
|
||||||
pub fn new(service: Arc<Service>, interval: Duration) -> Self {
|
pub fn new(
|
||||||
Self { service, interval }
|
service: Arc<Service>,
|
||||||
|
interval: Duration,
|
||||||
|
chaos_exit_crontab: Option<cron::Schedule>,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
service,
|
||||||
|
interval,
|
||||||
|
chaos_exit_crontab,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn run(&mut self, cancel: CancellationToken) {
|
pub async fn run(&mut self, cancel: CancellationToken) {
|
||||||
let mut interval = tokio::time::interval(self.interval);
|
let mut interval = tokio::time::interval(self.interval);
|
||||||
|
let cron_interval = {
|
||||||
loop {
|
if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab {
|
||||||
tokio::select! {
|
match cron_to_next_duration(chaos_exit_crontab) {
|
||||||
_ = interval.tick() => {}
|
Ok(interval_exit) => Some(interval_exit),
|
||||||
_ = cancel.cancelled() => {
|
Err(e) => {
|
||||||
tracing::info!("Shutting down");
|
tracing::error!("Error processing the cron schedule: {e}");
|
||||||
return;
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
}
|
}
|
||||||
|
};
|
||||||
self.inject_chaos().await;
|
enum ChaosEvent {
|
||||||
|
ShuffleTenant,
|
||||||
tracing::info!("Chaos iteration...");
|
ForceKill,
|
||||||
}
|
}
|
||||||
|
let chaos_type = tokio::select! {
|
||||||
|
_ = interval.tick() => {
|
||||||
|
ChaosEvent::ShuffleTenant
|
||||||
|
}
|
||||||
|
Some(_) = maybe_sleep(cron_interval) => {
|
||||||
|
ChaosEvent::ForceKill
|
||||||
|
}
|
||||||
|
_ = cancel.cancelled() => {
|
||||||
|
tracing::info!("Shutting down");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match chaos_type {
|
||||||
|
ChaosEvent::ShuffleTenant => {
|
||||||
|
self.inject_chaos().await;
|
||||||
|
}
|
||||||
|
ChaosEvent::ForceKill => {
|
||||||
|
self.force_kill().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!("Chaos iteration...");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// If a shard has a secondary and attached location, then re-assign the secondary to be
|
/// If a shard has a secondary and attached location, then re-assign the secondary to be
|
||||||
@@ -95,6 +146,11 @@ impl ChaosInjector {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn force_kill(&mut self) {
|
||||||
|
tracing::warn!("Injecting chaos: force kill");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
async fn inject_chaos(&mut self) {
|
async fn inject_chaos(&mut self) {
|
||||||
// Pick some shards to interfere with
|
// Pick some shards to interfere with
|
||||||
let batch_size = 128;
|
let batch_size = 128;
|
||||||
|
|||||||
Reference in New Issue
Block a user