mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
Harden retries on tenant/timeline deletion path. (#4973)
Originated from test failure where we got SlowDown error from s3. The patch generalizes `download_retry` to not be download specific. Resulting `retry` function is moved to utils crate. `download_retries` is now a thin wrapper around this `retry` function. To ensure that all needed retries are in place test code now uses `test_remote_failures=1` setting. Ref https://neondb.slack.com/archives/C059ZC138NR/p1691743624353009
This commit is contained in:
188
libs/utils/src/backoff.rs
Normal file
188
libs/utils/src/backoff.rs
Normal file
@@ -0,0 +1,188 @@
|
||||
use std::fmt::{Debug, Display};
|
||||
|
||||
use futures::Future;
|
||||
|
||||
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
let backoff_duration_seconds =
|
||||
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
if backoff_duration_seconds > 0.0 {
|
||||
tracing::info!(
|
||||
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
} else {
|
||||
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
|
||||
}
|
||||
}
|
||||
|
||||
/// retries passed operation until one of the following conditions are met:
|
||||
/// Encountered error is considered as permanent (non-retryable)
|
||||
/// Retries have been exhausted.
|
||||
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
|
||||
/// When attempts cross `warn_threshold` function starts to emit log warnings.
|
||||
/// `description` argument is added to log messages. Its value should identify the `op` is doing
|
||||
pub async fn retry<T, O, F, E>(
|
||||
mut op: O,
|
||||
is_permanent: impl Fn(&E) -> bool,
|
||||
warn_threshold: u32,
|
||||
max_retries: u32,
|
||||
description: &str,
|
||||
) -> Result<T, E>
|
||||
where
|
||||
// Not std::error::Error because anyhow::Error doesnt implement it.
|
||||
// For context see https://github.com/dtolnay/anyhow/issues/63
|
||||
E: Display + Debug,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
tracing::info!("{description} succeeded after {attempts} retries");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// These are "permanent" errors that should not be retried.
|
||||
Err(ref e) if is_permanent(e) => {
|
||||
return result;
|
||||
}
|
||||
// Assume that any other failure might be transient, and the operation might
|
||||
// succeed if we just keep trying.
|
||||
Err(err) if attempts < warn_threshold => {
|
||||
tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(err) if attempts < max_retries => {
|
||||
tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(ref err) => {
|
||||
// Operation failed `max_attempts` times. Time to give up.
|
||||
tracing::warn!(
|
||||
"{description} still failed after {attempts} retries, giving up: {err:?}"
|
||||
);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// sleep and retry
|
||||
exponential_backoff(
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io;
|
||||
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn backoff_defaults_produce_growing_backoff_sequence() {
|
||||
let mut current_backoff_value = None;
|
||||
|
||||
for i in 0..10_000 {
|
||||
let new_backoff_value = exponential_backoff_duration_seconds(
|
||||
i,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
);
|
||||
|
||||
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
|
||||
assert!(
|
||||
old_backoff_value <= new_backoff_value,
|
||||
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
current_backoff_value.expect("Should have produced backoff values to compare"),
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
"Given big enough of retries, backoff should reach its allowed max value"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn retry_always_error() {
|
||||
let count = Mutex::new(0);
|
||||
let err_result = retry(
|
||||
|| async {
|
||||
*count.lock().await += 1;
|
||||
Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
|
||||
},
|
||||
|_e| false,
|
||||
1,
|
||||
1,
|
||||
"work",
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(err_result.is_err());
|
||||
|
||||
assert_eq!(*count.lock().await, 2);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn retry_ok_after_err() {
|
||||
let count = Mutex::new(0);
|
||||
retry(
|
||||
|| async {
|
||||
let mut locked = count.lock().await;
|
||||
if *locked > 1 {
|
||||
Ok(())
|
||||
} else {
|
||||
*locked += 1;
|
||||
Err(io::Error::from(io::ErrorKind::Other))
|
||||
}
|
||||
},
|
||||
|_e| false,
|
||||
2,
|
||||
2,
|
||||
"work",
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn dont_retry_permanent_errors() {
|
||||
let count = Mutex::new(0);
|
||||
let _ = retry(
|
||||
|| async {
|
||||
let mut locked = count.lock().await;
|
||||
if *locked > 1 {
|
||||
Ok(())
|
||||
} else {
|
||||
*locked += 1;
|
||||
Err(io::Error::from(io::ErrorKind::Other))
|
||||
}
|
||||
},
|
||||
|_e| true,
|
||||
2,
|
||||
2,
|
||||
"work",
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert_eq!(*count.lock().await, 1);
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,8 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
pub mod backoff;
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
|
||||
@@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
let backoff_duration_seconds =
|
||||
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
if backoff_duration_seconds > 0.0 {
|
||||
info!(
|
||||
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
} else {
|
||||
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
|
||||
}
|
||||
}
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
@@ -238,37 +216,6 @@ async fn timed<Fut: std::future::Future>(
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn backoff_defaults_produce_growing_backoff_sequence() {
|
||||
let mut current_backoff_value = None;
|
||||
|
||||
for i in 0..10_000 {
|
||||
let new_backoff_value = exponential_backoff_duration_seconds(
|
||||
i,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
);
|
||||
|
||||
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
|
||||
assert!(
|
||||
old_backoff_value <= new_backoff_value,
|
||||
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
current_backoff_value.expect("Should have produced backoff values to compare"),
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
"Given big enough of retries, backoff should reach its allowed max value"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod timed_tests {
|
||||
use super::timed;
|
||||
|
||||
@@ -10,7 +10,7 @@ use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
completion, crashsafe, fs_ext,
|
||||
backoff, completion, crashsafe, fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
@@ -23,12 +23,13 @@ use crate::{
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
};
|
||||
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3;
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTenantError {
|
||||
@@ -71,10 +72,19 @@ async fn create_remote_delete_mark(
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||
|
||||
let data: &[u8] = &[];
|
||||
remote_storage
|
||||
.upload(data, 0, &remote_mark_path, None)
|
||||
.await
|
||||
.context("mark upload")?;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
remote_storage
|
||||
.upload(data, 0, &remote_mark_path, None)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
)
|
||||
.await
|
||||
.context("mark_upload")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -154,9 +164,16 @@ async fn remove_tenant_remote_delete_mark(
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
remote_storage
|
||||
.delete(&remote_tenant_delete_mark_path(conf, tenant_id)?)
|
||||
.await?;
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
)
|
||||
.await
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -337,32 +354,28 @@ impl DeleteTenantFlow {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
Some(remote_storage) => remote_storage,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
// If remote storage is there we rely on it
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
|
||||
|
||||
let attempt = 1;
|
||||
loop {
|
||||
match remote_storage.download(&remote_mark_path).await {
|
||||
Ok(_) => return Ok(acquire(tenant)),
|
||||
Err(e) => {
|
||||
if matches!(e, DownloadError::NotFound) {
|
||||
return Ok(None);
|
||||
}
|
||||
if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS {
|
||||
return Err(anyhow::anyhow!(e))?;
|
||||
}
|
||||
let result = backoff::retry(
|
||||
|| async { remote_storage.download(&remote_mark_path).await },
|
||||
|e| matches!(e, DownloadError::NotFound),
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
"fetch_tenant_deletion_mark",
|
||||
)
|
||||
.await;
|
||||
|
||||
warn!(
|
||||
"failed to fetch tenant deletion mark at {} attempt {}",
|
||||
&remote_mark_path, attempt
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
match result {
|
||||
Ok(_) => Ok(acquire(tenant)),
|
||||
Err(DownloadError::NotFound) => Ok(None),
|
||||
Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?,
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
pub(crate) async fn resume(
|
||||
|
||||
@@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use scopeguard::ScopeGuard;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::path::Path;
|
||||
@@ -241,7 +244,6 @@ use crate::{
|
||||
tenant::upload_queue::{
|
||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||
},
|
||||
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
|
||||
};
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress;
|
||||
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||
// level instead, as repeated failures can mean a more serious problem. If it
|
||||
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
|
||||
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
|
||||
pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
|
||||
|
||||
// Similarly log failed uploads and deletions at WARN level, after this many
|
||||
// retries. Uploads and deletions are retried forever, though.
|
||||
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
@@ -752,12 +754,24 @@ impl RemoteTimelineClient {
|
||||
|
||||
pausable_failpoint!("persist_deleted_index_part");
|
||||
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&index_part_with_deleted_at,
|
||||
backoff::retry(
|
||||
|| async {
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&index_part_with_deleted_at,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
1,
|
||||
// have just a couple of attempts
|
||||
// when executed as part of timeline deletion this happens in context of api call
|
||||
// when executed as part of tenant deletion this happens in the background
|
||||
2,
|
||||
"persist_index_part_with_deleted_flag",
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -834,10 +848,19 @@ impl RemoteTimelineClient {
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
||||
|
||||
let remaining = self
|
||||
.storage_impl
|
||||
.list_prefixes(Some(&timeline_storage_path))
|
||||
.await?;
|
||||
let remaining = backoff::retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list_prefixes(Some(&timeline_storage_path))
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"list_prefixes",
|
||||
)
|
||||
.await
|
||||
.context("list prefixes")?;
|
||||
|
||||
let remaining: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
@@ -852,7 +875,15 @@ impl RemoteTimelineClient {
|
||||
.collect();
|
||||
|
||||
if !remaining.is_empty() {
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete_objects(&remaining).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_objects",
|
||||
)
|
||||
.await
|
||||
.context("delete_objects")?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
@@ -864,7 +895,16 @@ impl RemoteTimelineClient {
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete(&index_file_path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_index",
|
||||
)
|
||||
.await
|
||||
.context("delete_index")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
|
||||
@@ -11,19 +11,17 @@ use std::time::Duration;
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
|
||||
use tracing::{info, warn};
|
||||
use utils::backoff;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
|
||||
use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
@@ -268,7 +266,6 @@ pub(super) async fn download_index_part(
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
///
|
||||
/// Helper function to handle retries for a download operation.
|
||||
///
|
||||
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
||||
@@ -276,47 +273,17 @@ pub(super) async fn download_index_part(
|
||||
/// with backoff.
|
||||
///
|
||||
/// (See similar logic for uploads in `perform_upload_task`)
|
||||
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
|
||||
async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, DownloadError>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
info!("{description} succeeded after {attempts} retries");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// These are "permanent" errors that should not be retried.
|
||||
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
|
||||
return result;
|
||||
}
|
||||
// Assume that any other failure might be transient, and the operation might
|
||||
// succeed if we just keep trying.
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
|
||||
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
|
||||
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(ref err)) => {
|
||||
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
|
||||
warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// sleep and retry
|
||||
exponential_backoff(
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
}
|
||||
backoff::retry(
|
||||
op,
|
||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
description,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -31,8 +31,10 @@ use storage_broker::Streaming;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use utils::backoff::{
|
||||
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
|
||||
@@ -284,4 +284,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {
|
||||
|
||||
|
||||
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
|
||||
return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6
|
||||
return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 8
|
||||
|
||||
@@ -173,9 +173,7 @@ def test_remote_storage_backup_and_restore(
|
||||
#
|
||||
# The initiated attach operation should survive the restart, and continue from where it was.
|
||||
env.pageserver.stop()
|
||||
layer_download_failed_regex = (
|
||||
r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure"
|
||||
)
|
||||
layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+"
|
||||
assert not env.pageserver.log_contains(
|
||||
layer_download_failed_regex
|
||||
), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
|
||||
@@ -208,7 +206,7 @@ def test_remote_storage_backup_and_restore(
|
||||
== f"{data}|{checkpoint_number}"
|
||||
)
|
||||
|
||||
log.info("ensure that we neede to retry downloads due to test_remote_failures=1")
|
||||
log.info("ensure that we needed to retry downloads due to test_remote_failures=1")
|
||||
assert env.pageserver.log_contains(layer_download_failed_regex)
|
||||
|
||||
|
||||
|
||||
@@ -32,6 +32,8 @@ def test_tenant_delete_smoke(
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenant_delete_smoke",
|
||||
@@ -123,25 +125,35 @@ def combinations():
|
||||
|
||||
for remote_storage_kind in remotes:
|
||||
for delete_failpoint in FAILPOINTS:
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
|
||||
if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
|
||||
"timeline-delete-before-index-delete",
|
||||
):
|
||||
# the above failpoint are not relevant for config without remote storage
|
||||
continue
|
||||
|
||||
result.append((remote_storage_kind, delete_failpoint))
|
||||
# Simulate failures for only one type of remote storage
|
||||
# to avoid log pollution and make tests run faster
|
||||
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
|
||||
simulate_failures = True
|
||||
else:
|
||||
simulate_failures = False
|
||||
result.append((remote_storage_kind, delete_failpoint, simulate_failures))
|
||||
return result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
|
||||
@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
|
||||
@pytest.mark.parametrize("check", list(Check))
|
||||
def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
failpoint: str,
|
||||
simulate_failures: bool,
|
||||
check: Check,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
if simulate_failures:
|
||||
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user