From 4626d89edaa004a3fb86f241169da07d333e6fad Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 14 Aug 2023 17:16:49 +0300 Subject: [PATCH] Harden retries on tenant/timeline deletion path. (#4973) Originated from test failure where we got SlowDown error from s3. The patch generalizes `download_retry` to not be download specific. Resulting `retry` function is moved to utils crate. `download_retries` is now a thin wrapper around this `retry` function. To ensure that all needed retries are in place test code now uses `test_remote_failures=1` setting. Ref https://neondb.slack.com/archives/C059ZC138NR/p1691743624353009 --- libs/utils/src/backoff.rs | 188 ++++++++++++++++++ libs/utils/src/lib.rs | 2 + pageserver/src/lib.rs | 53 ----- pageserver/src/tenant/delete.rs | 75 ++++--- .../src/tenant/remote_timeline_client.rs | 72 +++++-- .../tenant/remote_timeline_client/download.rs | 55 +---- .../walreceiver/connection_manager.rs | 4 +- test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/regress/test_remote_storage.py | 6 +- test_runner/regress/test_tenant_delete.py | 18 +- 10 files changed, 322 insertions(+), 153 deletions(-) create mode 100644 libs/utils/src/backoff.rs diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs new file mode 100644 index 0000000000..4507efe647 --- /dev/null +++ b/libs/utils/src/backoff.rs @@ -0,0 +1,188 @@ +use std::fmt::{Debug, Display}; + +use futures::Future; + +pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; + +pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { + let backoff_duration_seconds = + exponential_backoff_duration_seconds(n, base_increment, max_seconds); + if backoff_duration_seconds > 0.0 { + tracing::info!( + "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", + ); + tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; + } +} + +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { + if n == 0 { + 0.0 + } else { + (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) + } +} + +/// retries passed operation until one of the following conditions are met: +/// Encountered error is considered as permanent (non-retryable) +/// Retries have been exhausted. +/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors +/// When attempts cross `warn_threshold` function starts to emit log warnings. +/// `description` argument is added to log messages. Its value should identify the `op` is doing +pub async fn retry( + mut op: O, + is_permanent: impl Fn(&E) -> bool, + warn_threshold: u32, + max_retries: u32, + description: &str, +) -> Result +where + // Not std::error::Error because anyhow::Error doesnt implement it. + // For context see https://github.com/dtolnay/anyhow/issues/63 + E: Display + Debug, + O: FnMut() -> F, + F: Future>, +{ + let mut attempts = 0; + loop { + let result = op().await; + match result { + Ok(_) => { + if attempts > 0 { + tracing::info!("{description} succeeded after {attempts} retries"); + } + return result; + } + + // These are "permanent" errors that should not be retried. + Err(ref e) if is_permanent(e) => { + return result; + } + // Assume that any other failure might be transient, and the operation might + // succeed if we just keep trying. + Err(err) if attempts < warn_threshold => { + tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(err) if attempts < max_retries => { + tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(ref err) => { + // Operation failed `max_attempts` times. Time to give up. + tracing::warn!( + "{description} still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + exponential_backoff( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempts += 1; + } +} + +#[cfg(test)] +mod tests { + use std::io; + + use tokio::sync::Mutex; + + use super::*; + + #[test] + fn backoff_defaults_produce_growing_backoff_sequence() { + let mut current_backoff_value = None; + + for i in 0..10_000 { + let new_backoff_value = exponential_backoff_duration_seconds( + i, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + + if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { + assert!( + old_backoff_value <= new_backoff_value, + "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" + ) + } + } + + assert_eq!( + current_backoff_value.expect("Should have produced backoff values to compare"), + DEFAULT_MAX_BACKOFF_SECONDS, + "Given big enough of retries, backoff should reach its allowed max value" + ); + } + + #[tokio::test(start_paused = true)] + async fn retry_always_error() { + let count = Mutex::new(0); + let err_result = retry( + || async { + *count.lock().await += 1; + Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other)) + }, + |_e| false, + 1, + 1, + "work", + ) + .await; + + assert!(err_result.is_err()); + + assert_eq!(*count.lock().await, 2); + } + + #[tokio::test(start_paused = true)] + async fn retry_ok_after_err() { + let count = Mutex::new(0); + retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| false, + 2, + 2, + "work", + ) + .await + .unwrap(); + } + + #[tokio::test(start_paused = true)] + async fn dont_retry_permanent_errors() { + let count = Mutex::new(0); + let _ = retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| true, + 2, + 2, + "work", + ) + .await + .unwrap_err(); + + assert_eq!(*count.lock().await, 1); + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b591cc611a..5c8e01f41a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -1,6 +1,8 @@ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. +pub mod backoff; + /// `Lsn` type implements common tasks on Log Sequence Numbers pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index b918dff193..4ff8ffcc4f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { std::process::exit(exit_code); } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; - -async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { - let backoff_duration_seconds = - exponential_backoff_duration_seconds(n, base_increment, max_seconds); - if backoff_duration_seconds > 0.0 { - info!( - "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", - ); - tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; - } -} - -pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { - if n == 0 { - 0.0 - } else { - (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) - } -} - /// The name of the metadata file pageserver creates per timeline. /// Full path: `tenants//timelines//metadata`. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -238,37 +216,6 @@ async fn timed( } } -#[cfg(test)] -mod backoff_defaults_tests { - use super::*; - - #[test] - fn backoff_defaults_produce_growing_backoff_sequence() { - let mut current_backoff_value = None; - - for i in 0..10_000 { - let new_backoff_value = exponential_backoff_duration_seconds( - i, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ); - - if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { - assert!( - old_backoff_value <= new_backoff_value, - "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" - ) - } - } - - assert_eq!( - current_backoff_value.expect("Should have produced backoff values to compare"), - DEFAULT_MAX_BACKOFF_SECONDS, - "Given big enough of retries, backoff should reach its allowed max value" - ); - } -} - #[cfg(test)] mod timed_tests { use super::timed; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index bdeb117c79..38fc31f69c 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -10,7 +10,7 @@ use tokio::sync::OwnedMutexGuard; use tracing::{error, info, instrument, warn, Instrument, Span}; use utils::{ - completion, crashsafe, fs_ext, + backoff, completion, crashsafe, fs_ext, id::{TenantId, TimelineId}, }; @@ -23,12 +23,13 @@ use crate::{ use super::{ mgr::{GetTenantError, TenantsMap}, + remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, span, timeline::delete::DeleteTimelineFlow, tree_sort_timelines, DeleteTimelineError, Tenant, }; -const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3; +const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3; #[derive(Debug, thiserror::Error)] pub enum DeleteTenantError { @@ -71,10 +72,19 @@ async fn create_remote_delete_mark( let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?; let data: &[u8] = &[]; - remote_storage - .upload(data, 0, &remote_mark_path, None) - .await - .context("mark upload")?; + backoff::retry( + || async { + remote_storage + .upload(data, 0, &remote_mark_path, None) + .await + }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "mark_upload", + ) + .await + .context("mark_upload")?; Ok(()) } @@ -154,9 +164,16 @@ async fn remove_tenant_remote_delete_mark( tenant_id: &TenantId, ) -> Result<(), DeleteTenantError> { if let Some(remote_storage) = remote_storage { - remote_storage - .delete(&remote_tenant_delete_mark_path(conf, tenant_id)?) - .await?; + let path = remote_tenant_delete_mark_path(conf, tenant_id)?; + backoff::retry( + || async { remote_storage.delete(&path).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + ) + .await + .context("remove_tenant_remote_delete_mark")?; } Ok(()) } @@ -337,32 +354,28 @@ impl DeleteTenantFlow { return Ok(acquire(tenant)); } + let remote_storage = match remote_storage { + Some(remote_storage) => remote_storage, + None => return Ok(None), + }; + // If remote storage is there we rely on it - if let Some(remote_storage) = remote_storage { - let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?; + let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?; - let attempt = 1; - loop { - match remote_storage.download(&remote_mark_path).await { - Ok(_) => return Ok(acquire(tenant)), - Err(e) => { - if matches!(e, DownloadError::NotFound) { - return Ok(None); - } - if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS { - return Err(anyhow::anyhow!(e))?; - } + let result = backoff::retry( + || async { remote_storage.download(&remote_mark_path).await }, + |e| matches!(e, DownloadError::NotFound), + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + "fetch_tenant_deletion_mark", + ) + .await; - warn!( - "failed to fetch tenant deletion mark at {} attempt {}", - &remote_mark_path, attempt - ) - } - } - } + match result { + Ok(_) => Ok(acquire(tenant)), + Err(DownloadError::NotFound) => Ok(None), + Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?, } - - Ok(None) } pub(crate) async fn resume( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8d002a8570..1d04d74839 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc}; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; use scopeguard::ScopeGuard; +use utils::backoff::{ + self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use std::collections::{HashMap, VecDeque}; use std::path::Path; @@ -241,7 +244,6 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, - {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}, }; use utils::id::{TenantId, TimelineId}; @@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress; // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_DOWNLOAD_RETRIES times, we give up -const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; -const FAILED_DOWNLOAD_RETRIES: u32 = 10; +pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10; // Similarly log failed uploads and deletions at WARN level, after this many // retries. Uploads and deletions are retried forever, though. -const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), @@ -752,12 +754,24 @@ impl RemoteTimelineClient { pausable_failpoint!("persist_deleted_index_part"); - upload::upload_index_part( - self.conf, - &self.storage_impl, - &self.tenant_id, - &self.timeline_id, - &index_part_with_deleted_at, + backoff::retry( + || async { + upload::upload_index_part( + self.conf, + &self.storage_impl, + &self.tenant_id, + &self.timeline_id, + &index_part_with_deleted_at, + ) + .await + }, + |_e| false, + 1, + // have just a couple of attempts + // when executed as part of timeline deletion this happens in context of api call + // when executed as part of tenant deletion this happens in the background + 2, + "persist_index_part_with_deleted_flag", ) .await?; @@ -834,10 +848,19 @@ impl RemoteTimelineClient { let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); let timeline_storage_path = self.conf.remote_path(&timeline_path)?; - let remaining = self - .storage_impl - .list_prefixes(Some(&timeline_storage_path)) - .await?; + let remaining = backoff::retry( + || async { + self.storage_impl + .list_prefixes(Some(&timeline_storage_path)) + .await + }, + |_e| false, + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "list_prefixes", + ) + .await + .context("list prefixes")?; let remaining: Vec = remaining .into_iter() @@ -852,7 +875,15 @@ impl RemoteTimelineClient { .collect(); if !remaining.is_empty() { - self.storage_impl.delete_objects(&remaining).await?; + backoff::retry( + || async { self.storage_impl.delete_objects(&remaining).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "delete_objects", + ) + .await + .context("delete_objects")?; } fail::fail_point!("timeline-delete-before-index-delete", |_| { @@ -864,7 +895,16 @@ impl RemoteTimelineClient { let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); debug!("deleting index part"); - self.storage_impl.delete(&index_file_path).await?; + + backoff::retry( + || async { self.storage_impl.delete(&index_file_path).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "delete_index", + ) + .await + .context("delete_index")?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 64f4a0a113..0a6fd03887 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -11,19 +11,17 @@ use std::time::Duration; use anyhow::{anyhow, Context}; use tokio::fs; use tokio::io::AsyncWriteExt; - -use tracing::{info, warn}; +use utils::backoff; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; -use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; +use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES}; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { fs::File::open(path).await?.sync_all().await @@ -268,7 +266,6 @@ pub(super) async fn download_index_part( Ok(index_part) } -/// /// Helper function to handle retries for a download operation. /// /// Remote operations can fail due to rate limits (IAM, S3), spurious network @@ -276,47 +273,17 @@ pub(super) async fn download_index_part( /// with backoff. /// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry(mut op: O, description: &str) -> Result +async fn download_retry(op: O, description: &str) -> Result where O: FnMut() -> F, F: Future>, { - let mut attempts = 0; - loop { - let result = op().await; - match result { - Ok(_) => { - if attempts > 0 { - info!("{description} succeeded after {attempts} retries"); - } - return result; - } - - // These are "permanent" errors that should not be retried. - Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => { - return result; - } - // Assume that any other failure might be transient, and the operation might - // succeed if we just keep trying. - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => { - info!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => { - warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(ref err)) => { - // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. - warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); - return result; - } - } - // sleep and retry - exponential_backoff( - attempts, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ) - .await; - attempts += 1; - } + backoff::retry( + op, + |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + description, + ) + .await } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index e96aa41da0..d52b29f175 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -31,8 +31,10 @@ use storage_broker::Streaming; use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; +use utils::backoff::{ + exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index f5859550ac..6032ff5b68 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -284,4 +284,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = { def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: - return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6 + return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 8 diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index bfe9046eca..d642a6d190 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -173,9 +173,7 @@ def test_remote_storage_backup_and_restore( # # The initiated attach operation should survive the restart, and continue from where it was. env.pageserver.stop() - layer_download_failed_regex = ( - r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" - ) + layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+" assert not env.pageserver.log_contains( layer_download_failed_regex ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" @@ -208,7 +206,7 @@ def test_remote_storage_backup_and_restore( == f"{data}|{checkpoint_number}" ) - log.info("ensure that we neede to retry downloads due to test_remote_failures=1") + log.info("ensure that we needed to retry downloads due to test_remote_failures=1") assert env.pageserver.log_contains(layer_download_failed_regex) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 0618905331..f0f6cc743c 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -32,6 +32,8 @@ def test_tenant_delete_smoke( remote_storage_kind: RemoteStorageKind, pg_bin: PgBin, ): + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, test_name="test_tenant_delete_smoke", @@ -123,25 +125,35 @@ def combinations(): for remote_storage_kind in remotes: for delete_failpoint in FAILPOINTS: - if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in ( + if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in ( "timeline-delete-before-index-delete", ): # the above failpoint are not relevant for config without remote storage continue - result.append((remote_storage_kind, delete_failpoint)) + # Simulate failures for only one type of remote storage + # to avoid log pollution and make tests run faster + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + simulate_failures = True + else: + simulate_failures = False + result.append((remote_storage_kind, delete_failpoint, simulate_failures)) return result -@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations()) +@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) @pytest.mark.parametrize("check", list(Check)) def test_delete_tenant_exercise_crash_safety_failpoints( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, failpoint: str, + simulate_failures: bool, check: Check, pg_bin: PgBin, ): + if simulate_failures: + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + neon_env_builder.enable_remote_storage( remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints" )