mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
libs: make remote storage failure injection probabilistic (#12526)
Change the unreliable storage wrapper to fail with a configurable probability while failure attempts remain. Co-authored-by: Yecheng Yang <carlton.yang@databricks.com>
This commit is contained in:
@@ -226,6 +226,7 @@ pub struct ConfigToml {
|
|||||||
pub synthetic_size_calculation_interval: Duration,
|
pub synthetic_size_calculation_interval: Duration,
|
||||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||||
pub test_remote_failures: u64,
|
pub test_remote_failures: u64,
|
||||||
|
pub test_remote_failures_probability: u64,
|
||||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
pub background_task_maximum_delay: Duration,
|
pub background_task_maximum_delay: Duration,
|
||||||
@@ -758,6 +759,7 @@ impl Default for ConfigToml {
|
|||||||
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
|
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
|
||||||
|
|
||||||
test_remote_failures: (0),
|
test_remote_failures: (0),
|
||||||
|
test_remote_failures_probability: (100),
|
||||||
|
|
||||||
ondemand_download_behavior_treat_error_as_warn: (false),
|
ondemand_download_behavior_treat_error_as_warn: (false),
|
||||||
|
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ itertools.workspace = true
|
|||||||
sync_wrapper = { workspace = true, features = ["futures"] }
|
sync_wrapper = { workspace = true, features = ["futures"] }
|
||||||
|
|
||||||
byteorder = "1.4"
|
byteorder = "1.4"
|
||||||
|
rand = "0.8.5"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
camino-tempfile.workspace = true
|
camino-tempfile.workspace = true
|
||||||
|
|||||||
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
|
/* BEGIN_HADRON */
|
||||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
|
pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
|
||||||
|
Self::Unreliable(Arc::new(UnreliableWrapper::new(
|
||||||
|
s,
|
||||||
|
fail_first,
|
||||||
|
fail_probability,
|
||||||
|
)))
|
||||||
}
|
}
|
||||||
|
/* END_HADRON */
|
||||||
|
|
||||||
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
|
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
|
||||||
pub async fn upload_storage_object(
|
pub async fn upload_storage_object(
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
//! This module provides a wrapper around a real RemoteStorage implementation that
|
//! This module provides a wrapper around a real RemoteStorage implementation that
|
||||||
//! causes the first N attempts at each upload or download operation to fail. For
|
//! causes the first N attempts at each upload or download operation to fail. For
|
||||||
//! testing purposes.
|
//! testing purposes.
|
||||||
|
use rand::Rng;
|
||||||
|
use std::cmp;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
use std::num::NonZeroU32;
|
use std::num::NonZeroU32;
|
||||||
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {
|
|||||||
|
|
||||||
// Tracks how many failed attempts of each operation has been made.
|
// Tracks how many failed attempts of each operation has been made.
|
||||||
attempts: Mutex<HashMap<RemoteOp, u64>>,
|
attempts: Mutex<HashMap<RemoteOp, u64>>,
|
||||||
|
|
||||||
|
/* BEGIN_HADRON */
|
||||||
|
// This is the probability of failure for each operation, in the range [0, 100].
|
||||||
|
// The probability defaults to 100, which means that all operations will fail.
|
||||||
|
attempt_failure_probability: u64,
|
||||||
|
/* END_HADRON */
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Used to identify retries of different unique operation.
|
/// Used to identify retries of different unique operation.
|
||||||
@@ -40,7 +48,11 @@ enum RemoteOp {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl UnreliableWrapper {
|
impl UnreliableWrapper {
|
||||||
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
|
pub fn new(
|
||||||
|
inner: crate::GenericRemoteStorage,
|
||||||
|
attempts_to_fail: u64,
|
||||||
|
attempt_failure_probability: u64,
|
||||||
|
) -> Self {
|
||||||
assert!(attempts_to_fail > 0);
|
assert!(attempts_to_fail > 0);
|
||||||
let inner = match inner {
|
let inner = match inner {
|
||||||
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
||||||
@@ -51,9 +63,11 @@ impl UnreliableWrapper {
|
|||||||
panic!("Can't wrap unreliable wrapper unreliably")
|
panic!("Can't wrap unreliable wrapper unreliably")
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
|
||||||
UnreliableWrapper {
|
UnreliableWrapper {
|
||||||
inner,
|
inner,
|
||||||
attempts_to_fail,
|
attempts_to_fail,
|
||||||
|
attempt_failure_probability: actual_attempt_failure_probability,
|
||||||
attempts: Mutex::new(HashMap::new()),
|
attempts: Mutex::new(HashMap::new()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -66,6 +80,7 @@ impl UnreliableWrapper {
|
|||||||
///
|
///
|
||||||
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
|
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
|
||||||
let mut attempts = self.attempts.lock().unwrap();
|
let mut attempts = self.attempts.lock().unwrap();
|
||||||
|
let mut rng = rand::thread_rng();
|
||||||
|
|
||||||
match attempts.entry(op) {
|
match attempts.entry(op) {
|
||||||
Entry::Occupied(mut e) => {
|
Entry::Occupied(mut e) => {
|
||||||
@@ -75,15 +90,19 @@ impl UnreliableWrapper {
|
|||||||
*p
|
*p
|
||||||
};
|
};
|
||||||
|
|
||||||
if attempts_before_this >= self.attempts_to_fail {
|
/* BEGIN_HADRON */
|
||||||
// let it succeed
|
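// While failure attempts remain, fail the request with the configured probability. NOTE(review): `gen_range(0..=100)` draws from 101 values, so a probability of 100 fails only 100/101 of the time; `0..100` would make it exact.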
|
||||||
e.remove();
|
if (attempts_before_this < self.attempts_to_fail)
|
||||||
Ok(attempts_before_this)
|
&& (rng.gen_range(0..=100) < self.attempt_failure_probability)
|
||||||
} else {
|
{
|
||||||
let error =
|
let error =
|
||||||
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||||
Err(error)
|
Err(error)
|
||||||
|
} else {
|
||||||
|
e.remove();
|
||||||
|
Ok(attempts_before_this)
|
||||||
}
|
}
|
||||||
|
/* END_HADRON */
|
||||||
}
|
}
|
||||||
Entry::Vacant(e) => {
|
Entry::Vacant(e) => {
|
||||||
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||||
|
|||||||
@@ -44,3 +44,62 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* BEGIN_HADRON */
|
||||||
|
/// Deployment environment, as named by the `DEPLOYMENT_MODE` environment variable.
///
/// The standard derives make the value printable in logs, cheap to copy, and
/// directly comparable, without changing how existing `match`/`matches!` users
/// consume it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DeploymentMode {
    /// Selected by `DEPLOYMENT_MODE=development`.
    Dev,
    /// Selected by `DEPLOYMENT_MODE=staging`.
    Staging,
    /// Selected by `DEPLOYMENT_MODE=production`.
    Prod,
}
|
||||||
|
|
||||||
|
pub fn get_deployment_mode() -> Option<DeploymentMode> {
|
||||||
|
match std::env::var("DEPLOYMENT_MODE") {
|
||||||
|
Ok(env) => match env.as_str() {
|
||||||
|
"development" => Some(DeploymentMode::Dev),
|
||||||
|
"staging" => Some(DeploymentMode::Staging),
|
||||||
|
"production" => Some(DeploymentMode::Prod),
|
||||||
|
_ => {
|
||||||
|
tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(_) => {
|
||||||
|
tracing::error!("DEPLOYMENT_MODE not set");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_dev_or_staging() -> bool {
|
||||||
|
matches!(
|
||||||
|
get_deployment_mode(),
|
||||||
|
Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test mode, as named by the `HADRON_TEST_MODE` environment variable.
///
/// The standard derives make the value printable in logs, cheap to copy, and
/// directly comparable, without changing how existing `match`/`matches!` users
/// consume it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TestingMode {
    /// Selected by `HADRON_TEST_MODE=chaos`.
    Chaos,
    /// Selected by `HADRON_TEST_MODE=stress`.
    Stress,
}
|
||||||
|
|
||||||
|
pub fn get_test_mode() -> Option<TestingMode> {
|
||||||
|
match std::env::var("HADRON_TEST_MODE") {
|
||||||
|
Ok(env) => match env.as_str() {
|
||||||
|
"chaos" => Some(TestingMode::Chaos),
|
||||||
|
"stress" => Some(TestingMode::Stress),
|
||||||
|
_ => {
|
||||||
|
tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(_) => {
|
||||||
|
tracing::error!("HADRON_TEST_MODE not set");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_chaos_testing() -> bool {
|
||||||
|
matches!(get_test_mode(), Some(TestingMode::Chaos))
|
||||||
|
}
|
||||||
|
/* END_HADRON */
|
||||||
|
|||||||
@@ -889,8 +889,11 @@ async fn create_remote_storage_client(
|
|||||||
"Simulating remote failures for first {} attempts of each op",
|
"Simulating remote failures for first {} attempts of each op",
|
||||||
conf.test_remote_failures
|
conf.test_remote_failures
|
||||||
);
|
);
|
||||||
remote_storage =
|
remote_storage = GenericRemoteStorage::unreliable_wrapper(
|
||||||
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
|
remote_storage,
|
||||||
|
conf.test_remote_failures,
|
||||||
|
conf.test_remote_failures_probability,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(remote_storage)
|
Ok(remote_storage)
|
||||||
|
|||||||
@@ -147,7 +147,11 @@ pub struct PageServerConf {
|
|||||||
|
|
||||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||||
|
|
||||||
|
// The number of allowed failures in remote storage operations.
|
||||||
pub test_remote_failures: u64,
|
pub test_remote_failures: u64,
|
||||||
|
// The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
|
||||||
|
// Use 100 for 100% failure, 0 for no failure.
|
||||||
|
pub test_remote_failures_probability: u64,
|
||||||
|
|
||||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||||
|
|
||||||
@@ -392,6 +396,7 @@ impl PageServerConf {
|
|||||||
synthetic_size_calculation_interval,
|
synthetic_size_calculation_interval,
|
||||||
disk_usage_based_eviction,
|
disk_usage_based_eviction,
|
||||||
test_remote_failures,
|
test_remote_failures,
|
||||||
|
test_remote_failures_probability,
|
||||||
ondemand_download_behavior_treat_error_as_warn,
|
ondemand_download_behavior_treat_error_as_warn,
|
||||||
background_task_maximum_delay,
|
background_task_maximum_delay,
|
||||||
control_plane_api,
|
control_plane_api,
|
||||||
@@ -461,6 +466,7 @@ impl PageServerConf {
|
|||||||
synthetic_size_calculation_interval,
|
synthetic_size_calculation_interval,
|
||||||
disk_usage_based_eviction,
|
disk_usage_based_eviction,
|
||||||
test_remote_failures,
|
test_remote_failures,
|
||||||
|
test_remote_failures_probability,
|
||||||
ondemand_download_behavior_treat_error_as_warn,
|
ondemand_download_behavior_treat_error_as_warn,
|
||||||
background_task_maximum_delay,
|
background_task_maximum_delay,
|
||||||
control_plane_api: control_plane_api
|
control_plane_api: control_plane_api
|
||||||
|
|||||||
@@ -267,7 +267,7 @@ async fn worker_inner(
|
|||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
#[cfg(any(test, feature = "testing"))]
|
#[cfg(any(test, feature = "testing"))]
|
||||||
let storage = if config.test_remote_failures > 0 {
|
let storage = if config.test_remote_failures > 0 {
|
||||||
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
|
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100)
|
||||||
} else {
|
} else {
|
||||||
storage
|
storage
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user