libs: make remote storage failure injection probabilistic (#12526)

Change the unreliable storage wrapper to fail by probability when there
are more failure attempts left.

Co-authored-by: Yecheng Yang <carlton.yang@databricks.com>
This commit is contained in:
Vlad Lazar
2025-07-09 18:41:34 +01:00
committed by GitHub
parent 4bbabc092a
commit fe0ddb7169
8 changed files with 107 additions and 11 deletions

View File

@@ -226,6 +226,7 @@ pub struct ConfigToml {
pub synthetic_size_calculation_interval: Duration,
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
pub test_remote_failures: u64,
pub test_remote_failures_probability: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool,
#[serde(with = "humantime_serde")]
pub background_task_maximum_delay: Duration,
@@ -758,6 +759,7 @@ impl Default for ConfigToml {
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
test_remote_failures: (0),
test_remote_failures_probability: (100),
ondemand_download_behavior_treat_error_as_warn: (false),

View File

@@ -43,6 +43,7 @@ itertools.workspace = true
sync_wrapper = { workspace = true, features = ["futures"] }
byteorder = "1.4"
rand = "0.8.5"
[dev-dependencies]
camino-tempfile.workspace = true

View File

@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
})
}
pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
/* BEGIN_HADRON */
pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
Self::Unreliable(Arc::new(UnreliableWrapper::new(
s,
fail_first,
fail_probability,
)))
}
/* END_HADRON */
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
pub async fn upload_storage_object(

View File

@@ -1,6 +1,8 @@
//! This module provides a wrapper around a real RemoteStorage implementation that
//! causes the first N attempts at each upload or download operatio to fail. For
//! testing purposes.
use rand::Rng;
use std::cmp;
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::num::NonZeroU32;
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {
// Tracks how many failed attempts of each operation has been made.
attempts: Mutex<HashMap<RemoteOp, u64>>,
/* BEGIN_HADRON */
// This the probability of failure for each operation, ranged from [0, 100].
// The probability is default to 100, which means that all operations will fail.
attempt_failure_probability: u64,
/* END_HADRON */
}
/// Used to identify retries of different unique operation.
@@ -40,7 +48,11 @@ enum RemoteOp {
}
impl UnreliableWrapper {
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
pub fn new(
inner: crate::GenericRemoteStorage,
attempts_to_fail: u64,
attempt_failure_probability: u64,
) -> Self {
assert!(attempts_to_fail > 0);
let inner = match inner {
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
@@ -51,9 +63,11 @@ impl UnreliableWrapper {
panic!("Can't wrap unreliable wrapper unreliably")
}
};
let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
UnreliableWrapper {
inner,
attempts_to_fail,
attempt_failure_probability: actual_attempt_failure_probability,
attempts: Mutex::new(HashMap::new()),
}
}
@@ -66,6 +80,7 @@ impl UnreliableWrapper {
///
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
let mut attempts = self.attempts.lock().unwrap();
let mut rng = rand::thread_rng();
match attempts.entry(op) {
Entry::Occupied(mut e) => {
@@ -75,15 +90,19 @@ impl UnreliableWrapper {
*p
};
if attempts_before_this >= self.attempts_to_fail {
// let it succeed
e.remove();
Ok(attempts_before_this)
} else {
/* BEGIN_HADRON */
// If there are more attempts to fail, fail the request by probability.
if (attempts_before_this < self.attempts_to_fail)
&& (rng.gen_range(0..=100) < self.attempt_failure_probability)
{
let error =
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
Err(error)
} else {
e.remove();
Ok(attempts_before_this)
}
/* END_HADRON */
}
Entry::Vacant(e) => {
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());

View File

@@ -44,3 +44,62 @@ where
}
}
}
/* BEGIN_HADRON */
pub enum DeploymentMode {
Dev,
Staging,
Prod,
}
pub fn get_deployment_mode() -> Option<DeploymentMode> {
match std::env::var("DEPLOYMENT_MODE") {
Ok(env) => match env.as_str() {
"development" => Some(DeploymentMode::Dev),
"staging" => Some(DeploymentMode::Staging),
"production" => Some(DeploymentMode::Prod),
_ => {
tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
None
}
},
Err(_) => {
tracing::error!("DEPLOYMENT_MODE not set");
None
}
}
}
pub fn is_dev_or_staging() -> bool {
matches!(
get_deployment_mode(),
Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
)
}
pub enum TestingMode {
Chaos,
Stress,
}
pub fn get_test_mode() -> Option<TestingMode> {
match std::env::var("HADRON_TEST_MODE") {
Ok(env) => match env.as_str() {
"chaos" => Some(TestingMode::Chaos),
"stress" => Some(TestingMode::Stress),
_ => {
tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
None
}
},
Err(_) => {
tracing::error!("HADRON_TEST_MODE not set");
None
}
}
}
pub fn is_chaos_testing() -> bool {
matches!(get_test_mode(), Some(TestingMode::Chaos))
}
/* END_HADRON */

View File

@@ -889,8 +889,11 @@ async fn create_remote_storage_client(
"Simulating remote failures for first {} attempts of each op",
conf.test_remote_failures
);
remote_storage =
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
remote_storage = GenericRemoteStorage::unreliable_wrapper(
remote_storage,
conf.test_remote_failures,
conf.test_remote_failures_probability,
);
}
Ok(remote_storage)

View File

@@ -147,7 +147,11 @@ pub struct PageServerConf {
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
// The number of allowed failures in remote storage operations.
pub test_remote_failures: u64,
// The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
// Use 100 for 100% failure, 0 for no failure.
pub test_remote_failures_probability: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool,
@@ -392,6 +396,7 @@ impl PageServerConf {
synthetic_size_calculation_interval,
disk_usage_based_eviction,
test_remote_failures,
test_remote_failures_probability,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
control_plane_api,
@@ -461,6 +466,7 @@ impl PageServerConf {
synthetic_size_calculation_interval,
disk_usage_based_eviction,
test_remote_failures,
test_remote_failures_probability,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
control_plane_api: control_plane_api

View File

@@ -267,7 +267,7 @@ async fn worker_inner(
) -> anyhow::Result<()> {
#[cfg(any(test, feature = "testing"))]
let storage = if config.test_remote_failures > 0 {
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100)
} else {
storage
};