diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index dc7e9aed7f..22815955c1 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -226,6 +226,7 @@ pub struct ConfigToml {
     pub synthetic_size_calculation_interval: Duration,
     pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
     pub test_remote_failures: u64,
+    pub test_remote_failures_probability: u64,
     pub ondemand_download_behavior_treat_error_as_warn: bool,
     #[serde(with = "humantime_serde")]
     pub background_task_maximum_delay: Duration,
@@ -758,6 +759,7 @@ impl Default for ConfigToml {
             disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
 
             test_remote_failures: (0),
+            test_remote_failures_probability: (100),
 
             ondemand_download_behavior_treat_error_as_warn: (false),
 
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 69316fd493..0ae13552b8 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -43,6 +43,7 @@ itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }
 
 byteorder = "1.4"
+rand = "0.8.5"
 
 [dev-dependencies]
 camino-tempfile.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index ed416b2811..5885c3e791 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
         })
     }
 
-    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
-        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    /* BEGIN_HADRON */
+    pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(
+            s,
+            fail_first,
+            fail_probability,
+        )))
     }
+    /* END_HADRON */
 
     /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
     pub async fn upload_storage_object(
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index f9856a5856..30d116f57c 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operation to fail. For
 //! testing purposes.
+use rand::Rng;
+use std::cmp;
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::num::NonZeroU32;
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {
 
     // Tracks how many failed attempts of each operation has been made.
     attempts: Mutex<HashMap<RemoteOp, u64>>,
+
+    /* BEGIN_HADRON */
+    // The probability that each eligible attempt fails, in the range [0, 100].
+    // Defaults to 100, which means every eligible attempt fails.
+    attempt_failure_probability: u64,
+    /* END_HADRON */
 }
 
 /// Used to identify retries of different unique operation.
@@ -40,7 +48,11 @@ enum RemoteOp {
 }
 
 impl UnreliableWrapper {
-    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
+    pub fn new(
+        inner: crate::GenericRemoteStorage,
+        attempts_to_fail: u64,
+        attempt_failure_probability: u64,
+    ) -> Self {
         assert!(attempts_to_fail > 0);
         let inner = match inner {
             GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
@@ -51,9 +63,11 @@ impl UnreliableWrapper {
                 panic!("Can't wrap unreliable wrapper unreliably")
             }
         };
+        let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
         UnreliableWrapper {
             inner,
             attempts_to_fail,
+            attempt_failure_probability: actual_attempt_failure_probability,
             attempts: Mutex::new(HashMap::new()),
         }
     }
@@ -66,6 +80,7 @@ impl UnreliableWrapper {
     ///
     fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
         let mut attempts = self.attempts.lock().unwrap();
+        let mut rng = rand::thread_rng();
 
         match attempts.entry(op) {
             Entry::Occupied(mut e) => {
@@ -75,15 +90,19 @@ impl UnreliableWrapper {
                     *p
                 };
 
-                if attempts_before_this >= self.attempts_to_fail {
-                    // let it succeed
-                    e.remove();
-                    Ok(attempts_before_this)
-                } else {
+                /* BEGIN_HADRON */
+                // While this operation still has forced failures left, fail it with the configured probability.
+                if (attempts_before_this < self.attempts_to_fail)
+                    && (rng.gen_range(0..100) < self.attempt_failure_probability)
+                {
                     let error =
                         anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                     Err(error)
+                } else {
+                    e.remove();
+                    Ok(attempts_before_this)
                 }
+                /* END_HADRON */
             }
             Entry::Vacant(e) => {
                 let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs
index 2a85f54a01..cc1cbf8009 100644
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -44,3 +44,62 @@ where
         }
     }
 }
+
+/* BEGIN_HADRON */
+pub enum DeploymentMode {
+    Dev,
+    Staging,
+    Prod,
+}
+
+pub fn get_deployment_mode() -> Option<DeploymentMode> {
+    match std::env::var("DEPLOYMENT_MODE") {
+        Ok(env) => match env.as_str() {
+            "development" => Some(DeploymentMode::Dev),
+            "staging" => Some(DeploymentMode::Staging),
+            "production" => Some(DeploymentMode::Prod),
+            _ => {
+                tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            tracing::error!("DEPLOYMENT_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_dev_or_staging() -> bool {
+    matches!(
+        get_deployment_mode(),
+        Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
+    )
+}
+
+pub enum TestingMode {
+    Chaos,
+    Stress,
+}
+
+pub fn get_test_mode() -> Option<TestingMode> {
+    match std::env::var("HADRON_TEST_MODE") {
+        Ok(env) => match env.as_str() {
+            "chaos" => Some(TestingMode::Chaos),
+            "stress" => Some(TestingMode::Stress),
+            _ => {
+                tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            tracing::error!("HADRON_TEST_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_chaos_testing() -> bool {
+    matches!(get_test_mode(), Some(TestingMode::Chaos))
+}
+/* END_HADRON */
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 327384fd82..78aba25d2e 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -889,8 +889,11 @@ async fn create_remote_storage_client(
             "Simulating remote failures for first {} attempts of each op",
             conf.test_remote_failures
         );
-        remote_storage =
-            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
+        remote_storage = GenericRemoteStorage::unreliable_wrapper(
+            remote_storage,
+            conf.test_remote_failures,
+            conf.test_remote_failures_probability,
+        );
     }
 
     Ok(remote_storage)
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 99d7e0ca3a..15ec31b0a6 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -147,7 +147,11 @@ pub struct PageServerConf {
 
     pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
 
+    // Fail the first N attempts of each remote storage operation (0 disables failure injection).
     pub test_remote_failures: u64,
+    // The probability, in percent, that an eligible attempt fails. Only takes effect when test_remote_failures > 0.
+    // Use 100 for 100% failure, 0 for no failure.
+    pub test_remote_failures_probability: u64,
 
     pub ondemand_download_behavior_treat_error_as_warn: bool,
 
@@ -392,6 +396,7 @@ impl PageServerConf {
             synthetic_size_calculation_interval,
             disk_usage_based_eviction,
             test_remote_failures,
+            test_remote_failures_probability,
             ondemand_download_behavior_treat_error_as_warn,
             background_task_maximum_delay,
             control_plane_api,
@@ -461,6 +466,7 @@ impl PageServerConf {
             synthetic_size_calculation_interval,
             disk_usage_based_eviction,
             test_remote_failures,
+            test_remote_failures_probability,
             ondemand_download_behavior_treat_error_as_warn,
             background_task_maximum_delay,
             control_plane_api: control_plane_api
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index b55cc14532..4d8df19476 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -267,7 +267,7 @@ async fn worker_inner(
 ) -> anyhow::Result<()> {
     #[cfg(any(test, feature = "testing"))]
     let storage = if config.test_remote_failures > 0 {
-        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
+        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100)
     } else {
         storage
     };
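
Reviewer note (not part of the patch): the snippet below is a minimal, standalone sketch of the probability gate added in simulate_failures.rs, assuming the rand 0.8 API that this change pulls in. `should_fail` is a hypothetical helper that mirrors the clamp in UnreliableWrapper::new and the check in attempt(); it is not code from the repository.

use rand::Rng;
use std::cmp;

// Clamp the configured probability to [0, 100], then fail only while the
// operation still has forced failures left, and then only `probability`
// percent of the time.
fn should_fail(attempts_before_this: u64, attempts_to_fail: u64, probability: u64) -> bool {
    let p = cmp::min(probability, 100);
    let mut rng = rand::thread_rng();
    attempts_before_this < attempts_to_fail && rng.gen_range(0..100) < p
}

fn main() {
    // With probability 50, roughly half of the first 10 attempts should fail;
    // with the default of 100, all of them would.
    let failures = (0u64..10).filter(|&i| should_fail(i, 10, 50)).count();
    println!("{failures}/10 simulated attempts failed");
}

With the default test_remote_failures_probability of 100, behavior is unchanged: the first test_remote_failures attempts of each operation always fail. Lowering the value lets chaos or stress tests inject failures intermittently rather than deterministically.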