libs: make remote storage failure injection probabilistic (#12526)

Change the unreliable storage wrapper to fail by probability when there
are more failure attempts left.

Co-authored-by: Yecheng Yang <carlton.yang@databricks.com>
This commit is contained in:
Vlad Lazar
2025-07-09 18:41:34 +01:00
committed by GitHub
parent 4bbabc092a
commit fe0ddb7169
8 changed files with 107 additions and 11 deletions

View File

@@ -226,6 +226,7 @@ pub struct ConfigToml {
pub synthetic_size_calculation_interval: Duration, pub synthetic_size_calculation_interval: Duration,
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
pub test_remote_failures: u64, pub test_remote_failures: u64,
pub test_remote_failures_probability: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool, pub ondemand_download_behavior_treat_error_as_warn: bool,
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub background_task_maximum_delay: Duration, pub background_task_maximum_delay: Duration,
@@ -758,6 +759,7 @@ impl Default for ConfigToml {
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(), disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
test_remote_failures: (0), test_remote_failures: (0),
test_remote_failures_probability: (100),
ondemand_download_behavior_treat_error_as_warn: (false), ondemand_download_behavior_treat_error_as_warn: (false),

View File

@@ -43,6 +43,7 @@ itertools.workspace = true
sync_wrapper = { workspace = true, features = ["futures"] } sync_wrapper = { workspace = true, features = ["futures"] }
byteorder = "1.4" byteorder = "1.4"
rand = "0.8.5"
[dev-dependencies] [dev-dependencies]
camino-tempfile.workspace = true camino-tempfile.workspace = true

View File

@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
}) })
} }
pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self { /* BEGIN_HADRON */
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
Self::Unreliable(Arc::new(UnreliableWrapper::new(
s,
fail_first,
fail_probability,
)))
} }
/* END_HADRON */
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata. /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
pub async fn upload_storage_object( pub async fn upload_storage_object(

View File

@@ -1,6 +1,8 @@
//! This module provides a wrapper around a real RemoteStorage implementation that //! This module provides a wrapper around a real RemoteStorage implementation that
//! causes the first N attempts at each upload or download operation to fail. For //! causes the first N attempts at each upload or download operation to fail. For
//! testing purposes. //! testing purposes.
use rand::Rng;
use std::cmp;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::hash_map::Entry; use std::collections::hash_map::Entry;
use std::num::NonZeroU32; use std::num::NonZeroU32;
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {
// Tracks how many failed attempts of each operation has been made. // Tracks how many failed attempts of each operation has been made.
attempts: Mutex<HashMap<RemoteOp, u64>>, attempts: Mutex<HashMap<RemoteOp, u64>>,
/* BEGIN_HADRON */
// This is the probability of failure for each operation, in the range [0, 100].
// The probability defaults to 100, which means that all operations will fail.
attempt_failure_probability: u64,
/* END_HADRON */
} }
/// Used to identify retries of different unique operation. /// Used to identify retries of different unique operation.
@@ -40,7 +48,11 @@ enum RemoteOp {
} }
impl UnreliableWrapper { impl UnreliableWrapper {
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { pub fn new(
inner: crate::GenericRemoteStorage,
attempts_to_fail: u64,
attempt_failure_probability: u64,
) -> Self {
assert!(attempts_to_fail > 0); assert!(attempts_to_fail > 0);
let inner = match inner { let inner = match inner {
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s), GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
@@ -51,9 +63,11 @@ impl UnreliableWrapper {
panic!("Can't wrap unreliable wrapper unreliably") panic!("Can't wrap unreliable wrapper unreliably")
} }
}; };
let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
UnreliableWrapper { UnreliableWrapper {
inner, inner,
attempts_to_fail, attempts_to_fail,
attempt_failure_probability: actual_attempt_failure_probability,
attempts: Mutex::new(HashMap::new()), attempts: Mutex::new(HashMap::new()),
} }
} }
@@ -66,6 +80,7 @@ impl UnreliableWrapper {
/// ///
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> { fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
let mut attempts = self.attempts.lock().unwrap(); let mut attempts = self.attempts.lock().unwrap();
let mut rng = rand::thread_rng();
match attempts.entry(op) { match attempts.entry(op) {
Entry::Occupied(mut e) => { Entry::Occupied(mut e) => {
@@ -75,15 +90,19 @@ impl UnreliableWrapper {
*p *p
}; };
if attempts_before_this >= self.attempts_to_fail { /* BEGIN_HADRON */
// let it succeed // If there are more attempts to fail, fail the request by probability.
e.remove(); if (attempts_before_this < self.attempts_to_fail)
Ok(attempts_before_this) && (rng.gen_range(0..=100) < self.attempt_failure_probability)
} else { {
let error = let error =
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
Err(error) Err(error)
} else {
e.remove();
Ok(attempts_before_this)
} }
/* END_HADRON */
} }
Entry::Vacant(e) => { Entry::Vacant(e) => {
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());

View File

@@ -44,3 +44,62 @@ where
} }
} }
} }
/* BEGIN_HADRON */
/// Deployment environment of the running process, as selected by the
/// `DEPLOYMENT_MODE` environment variable (see [`get_deployment_mode`]).
///
/// The common value traits are derived so that the mode can be logged with
/// `{:?}`, compared in conditions and tests, and passed around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeploymentMode {
    /// Selected by `DEPLOYMENT_MODE=development`.
    Dev,
    /// Selected by `DEPLOYMENT_MODE=staging`.
    Staging,
    /// Selected by `DEPLOYMENT_MODE=production`.
    Prod,
}
pub fn get_deployment_mode() -> Option<DeploymentMode> {
match std::env::var("DEPLOYMENT_MODE") {
Ok(env) => match env.as_str() {
"development" => Some(DeploymentMode::Dev),
"staging" => Some(DeploymentMode::Staging),
"production" => Some(DeploymentMode::Prod),
_ => {
tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
None
}
},
Err(_) => {
tracing::error!("DEPLOYMENT_MODE not set");
None
}
}
}
/// Returns `true` when [`get_deployment_mode`] reports `Dev` or `Staging`.
///
/// Returns `false` for `Prod` and when no deployment mode could be determined.
pub fn is_dev_or_staging() -> bool {
    match get_deployment_mode() {
        Some(DeploymentMode::Dev | DeploymentMode::Staging) => true,
        Some(DeploymentMode::Prod) | None => false,
    }
}
/// Kind of test run the process takes part in, as selected by the
/// `HADRON_TEST_MODE` environment variable (see [`get_test_mode`]).
///
/// The common value traits are derived so that the mode can be logged with
/// `{:?}`, compared in conditions and tests, and passed around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TestingMode {
    /// Selected by `HADRON_TEST_MODE=chaos`.
    Chaos,
    /// Selected by `HADRON_TEST_MODE=stress`.
    Stress,
}
pub fn get_test_mode() -> Option<TestingMode> {
match std::env::var("HADRON_TEST_MODE") {
Ok(env) => match env.as_str() {
"chaos" => Some(TestingMode::Chaos),
"stress" => Some(TestingMode::Stress),
_ => {
tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
None
}
},
Err(_) => {
tracing::error!("HADRON_TEST_MODE not set");
None
}
}
}
/// Returns `true` only when [`get_test_mode`] reports `Chaos`.
///
/// Returns `false` for `Stress` and when no test mode could be determined.
pub fn is_chaos_testing() -> bool {
    match get_test_mode() {
        Some(TestingMode::Chaos) => true,
        Some(TestingMode::Stress) | None => false,
    }
}
/* END_HADRON */

View File

@@ -889,8 +889,11 @@ async fn create_remote_storage_client(
"Simulating remote failures for first {} attempts of each op", "Simulating remote failures for first {} attempts of each op",
conf.test_remote_failures conf.test_remote_failures
); );
remote_storage = remote_storage = GenericRemoteStorage::unreliable_wrapper(
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); remote_storage,
conf.test_remote_failures,
conf.test_remote_failures_probability,
);
} }
Ok(remote_storage) Ok(remote_storage)

View File

@@ -147,7 +147,11 @@ pub struct PageServerConf {
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
// The number of allowed failures in remote storage operations.
pub test_remote_failures: u64, pub test_remote_failures: u64,
// The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
// Use 100 for 100% failure, 0 for no failure.
pub test_remote_failures_probability: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool, pub ondemand_download_behavior_treat_error_as_warn: bool,
@@ -392,6 +396,7 @@ impl PageServerConf {
synthetic_size_calculation_interval, synthetic_size_calculation_interval,
disk_usage_based_eviction, disk_usage_based_eviction,
test_remote_failures, test_remote_failures,
test_remote_failures_probability,
ondemand_download_behavior_treat_error_as_warn, ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay, background_task_maximum_delay,
control_plane_api, control_plane_api,
@@ -461,6 +466,7 @@ impl PageServerConf {
synthetic_size_calculation_interval, synthetic_size_calculation_interval,
disk_usage_based_eviction, disk_usage_based_eviction,
test_remote_failures, test_remote_failures,
test_remote_failures_probability,
ondemand_download_behavior_treat_error_as_warn, ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay, background_task_maximum_delay,
control_plane_api: control_plane_api control_plane_api: control_plane_api

View File

@@ -267,7 +267,7 @@ async fn worker_inner(
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
#[cfg(any(test, feature = "testing"))] #[cfg(any(test, feature = "testing"))]
let storage = if config.test_remote_failures > 0 { let storage = if config.test_remote_failures > 0 {
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100)
} else { } else {
storage storage
}; };