Harden retries on tenant/timeline deletion path. (#4973)

Originated from test failure where we got SlowDown error from s3.
The patch generalizes `download_retry` to not be download specific.
Resulting `retry` function is moved to utils crate. `download_retries`
is now a thin wrapper around this `retry` function.

To ensure that all needed retries are in place test code now uses
`test_remote_failures=1` setting.

Ref https://neondb.slack.com/archives/C059ZC138NR/p1691743624353009
This commit is contained in:
Dmitry Rodionov
2023-08-14 17:16:49 +03:00
committed by GitHub
parent 49c57c0b13
commit 4626d89eda
10 changed files with 322 additions and 153 deletions

188
libs/utils/src/backoff.rs Normal file
View File

@@ -0,0 +1,188 @@
use std::fmt::{Debug, Display};
use futures::Future;
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
let backoff_duration_seconds =
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
if backoff_duration_seconds > 0.0 {
tracing::info!(
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
);
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
}
}
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
if n == 0 {
0.0
} else {
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
}
}
/// retries passed operation until one of the following conditions are met:
/// Encountered error is considered as permanent (non-retryable)
/// Retries have been exhausted.
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
/// When attempts cross `warn_threshold` function starts to emit log warnings.
/// `description` argument is added to log messages. Its value should identify the `op` is doing
pub async fn retry<T, O, F, E>(
mut op: O,
is_permanent: impl Fn(&E) -> bool,
warn_threshold: u32,
max_retries: u32,
description: &str,
) -> Result<T, E>
where
// Not std::error::Error because anyhow::Error doesnt implement it.
// For context see https://github.com/dtolnay/anyhow/issues/63
E: Display + Debug,
O: FnMut() -> F,
F: Future<Output = Result<T, E>>,
{
let mut attempts = 0;
loop {
let result = op().await;
match result {
Ok(_) => {
if attempts > 0 {
tracing::info!("{description} succeeded after {attempts} retries");
}
return result;
}
// These are "permanent" errors that should not be retried.
Err(ref e) if is_permanent(e) => {
return result;
}
// Assume that any other failure might be transient, and the operation might
// succeed if we just keep trying.
Err(err) if attempts < warn_threshold => {
tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(err) if attempts < max_retries => {
tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(ref err) => {
// Operation failed `max_attempts` times. Time to give up.
tracing::warn!(
"{description} still failed after {attempts} retries, giving up: {err:?}"
);
return result;
}
}
// sleep and retry
exponential_backoff(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
)
.await;
attempts += 1;
}
}
#[cfg(test)]
mod tests {
use std::io;
use tokio::sync::Mutex;
use super::*;
#[test]
fn backoff_defaults_produce_growing_backoff_sequence() {
let mut current_backoff_value = None;
for i in 0..10_000 {
let new_backoff_value = exponential_backoff_duration_seconds(
i,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
assert!(
old_backoff_value <= new_backoff_value,
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
)
}
}
assert_eq!(
current_backoff_value.expect("Should have produced backoff values to compare"),
DEFAULT_MAX_BACKOFF_SECONDS,
"Given big enough of retries, backoff should reach its allowed max value"
);
}
#[tokio::test(start_paused = true)]
async fn retry_always_error() {
let count = Mutex::new(0);
let err_result = retry(
|| async {
*count.lock().await += 1;
Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
},
|_e| false,
1,
1,
"work",
)
.await;
assert!(err_result.is_err());
assert_eq!(*count.lock().await, 2);
}
#[tokio::test(start_paused = true)]
async fn retry_ok_after_err() {
let count = Mutex::new(0);
retry(
|| async {
let mut locked = count.lock().await;
if *locked > 1 {
Ok(())
} else {
*locked += 1;
Err(io::Error::from(io::ErrorKind::Other))
}
},
|_e| false,
2,
2,
"work",
)
.await
.unwrap();
}
#[tokio::test(start_paused = true)]
async fn dont_retry_permanent_errors() {
let count = Mutex::new(0);
let _ = retry(
|| async {
let mut locked = count.lock().await;
if *locked > 1 {
Ok(())
} else {
*locked += 1;
Err(io::Error::from(io::ErrorKind::Other))
}
},
|_e| true,
2,
2,
"work",
)
.await
.unwrap_err();
assert_eq!(*count.lock().await, 1);
}
}

View File

@@ -1,6 +1,8 @@
//! `utils` is intended to be a place to put code that is shared
//! between other crates in this repository.
pub mod backoff;
/// `Lsn` type implements common tasks on Log Sequence Numbers
pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive

View File

@@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
std::process::exit(exit_code);
}
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
let backoff_duration_seconds =
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
if backoff_duration_seconds > 0.0 {
info!(
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
);
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
}
}
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
if n == 0 {
0.0
} else {
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
}
}
/// The name of the metadata file pageserver creates per timeline.
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
pub const METADATA_FILE_NAME: &str = "metadata";
@@ -238,37 +216,6 @@ async fn timed<Fut: std::future::Future>(
}
}
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;
#[test]
fn backoff_defaults_produce_growing_backoff_sequence() {
let mut current_backoff_value = None;
for i in 0..10_000 {
let new_backoff_value = exponential_backoff_duration_seconds(
i,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
assert!(
old_backoff_value <= new_backoff_value,
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
)
}
}
assert_eq!(
current_backoff_value.expect("Should have produced backoff values to compare"),
DEFAULT_MAX_BACKOFF_SECONDS,
"Given big enough of retries, backoff should reach its allowed max value"
);
}
}
#[cfg(test)]
mod timed_tests {
use super::timed;

View File

@@ -10,7 +10,7 @@ use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, instrument, warn, Instrument, Span};
use utils::{
completion, crashsafe, fs_ext,
backoff, completion, crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
@@ -23,12 +23,13 @@ use crate::{
use super::{
mgr::{GetTenantError, TenantsMap},
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
span,
timeline::delete::DeleteTimelineFlow,
tree_sort_timelines, DeleteTimelineError, Tenant,
};
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3;
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
#[derive(Debug, thiserror::Error)]
pub enum DeleteTenantError {
@@ -71,10 +72,19 @@ async fn create_remote_delete_mark(
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
let data: &[u8] = &[];
remote_storage
.upload(data, 0, &remote_mark_path, None)
.await
.context("mark upload")?;
backoff::retry(
|| async {
remote_storage
.upload(data, 0, &remote_mark_path, None)
.await
},
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
)
.await
.context("mark_upload")?;
Ok(())
}
@@ -154,9 +164,16 @@ async fn remove_tenant_remote_delete_mark(
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
if let Some(remote_storage) = remote_storage {
remote_storage
.delete(&remote_tenant_delete_mark_path(conf, tenant_id)?)
.await?;
let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
backoff::retry(
|| async { remote_storage.delete(&path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
)
.await
.context("remove_tenant_remote_delete_mark")?;
}
Ok(())
}
@@ -337,32 +354,28 @@ impl DeleteTenantFlow {
return Ok(acquire(tenant));
}
let remote_storage = match remote_storage {
Some(remote_storage) => remote_storage,
None => return Ok(None),
};
// If remote storage is there we rely on it
if let Some(remote_storage) = remote_storage {
let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
let attempt = 1;
loop {
match remote_storage.download(&remote_mark_path).await {
Ok(_) => return Ok(acquire(tenant)),
Err(e) => {
if matches!(e, DownloadError::NotFound) {
return Ok(None);
}
if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS {
return Err(anyhow::anyhow!(e))?;
}
let result = backoff::retry(
|| async { remote_storage.download(&remote_mark_path).await },
|e| matches!(e, DownloadError::NotFound),
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
"fetch_tenant_deletion_mark",
)
.await;
warn!(
"failed to fetch tenant deletion mark at {} attempt {}",
&remote_mark_path, attempt
)
}
}
}
match result {
Ok(_) => Ok(acquire(tenant)),
Err(DownloadError::NotFound) => Ok(None),
Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?,
}
Ok(None)
}
pub(crate) async fn resume(

View File

@@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use std::collections::{HashMap, VecDeque};
use std::path::Path;
@@ -241,7 +244,6 @@ use crate::{
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
};
use utils::id::{TenantId, TimelineId};
@@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress;
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
// level instead, as repeated failures can mean a more serious problem. If it
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
// Similarly log failed uploads and deletions at WARN level, after this many
// retries. Uploads and deletions are retried forever, though.
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
@@ -752,12 +754,24 @@ impl RemoteTimelineClient {
pausable_failpoint!("persist_deleted_index_part");
upload::upload_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
&index_part_with_deleted_at,
backoff::retry(
|| async {
upload::upload_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
&index_part_with_deleted_at,
)
.await
},
|_e| false,
1,
// have just a couple of attempts
// when executed as part of timeline deletion this happens in context of api call
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
)
.await?;
@@ -834,10 +848,19 @@ impl RemoteTimelineClient {
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
let remaining = self
.storage_impl
.list_prefixes(Some(&timeline_storage_path))
.await?;
let remaining = backoff::retry(
|| async {
self.storage_impl
.list_prefixes(Some(&timeline_storage_path))
.await
},
|_e| false,
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"list_prefixes",
)
.await
.context("list prefixes")?;
let remaining: Vec<RemotePath> = remaining
.into_iter()
@@ -852,7 +875,15 @@ impl RemoteTimelineClient {
.collect();
if !remaining.is_empty() {
self.storage_impl.delete_objects(&remaining).await?;
backoff::retry(
|| async { self.storage_impl.delete_objects(&remaining).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
)
.await
.context("delete_objects")?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -864,7 +895,16 @@ impl RemoteTimelineClient {
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("deleting index part");
self.storage_impl.delete(&index_file_path).await?;
backoff::retry(
|| async { self.storage_impl.delete(&index_file_path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
)
.await
.context("delete_index")?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(

View File

@@ -11,19 +11,17 @@ use std::time::Duration;
use anyhow::{anyhow, Context};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tracing::{info, warn};
use utils::backoff;
use crate::config::PageServerConf;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, LayerFileMetadata};
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await
@@ -268,7 +266,6 @@ pub(super) async fn download_index_part(
Ok(index_part)
}
///
/// Helper function to handle retries for a download operation.
///
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
@@ -276,47 +273,17 @@ pub(super) async fn download_index_part(
/// with backoff.
///
/// (See similar logic for uploads in `perform_upload_task`)
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
{
let mut attempts = 0;
loop {
let result = op().await;
match result {
Ok(_) => {
if attempts > 0 {
info!("{description} succeeded after {attempts} retries");
}
return result;
}
// These are "permanent" errors that should not be retried.
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
return result;
}
// Assume that any other failure might be transient, and the operation might
// succeed if we just keep trying.
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(ref err)) => {
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
return result;
}
}
// sleep and retry
exponential_backoff(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
)
.await;
attempts += 1;
}
backoff::retry(
op,
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
description,
)
.await
}

View File

@@ -31,8 +31,10 @@ use storage_broker::Streaming;
use tokio::select;
use tracing::*;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
use postgres_connection::{parse_host_port, PgConnectionConfig};
use utils::backoff::{
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,

View File

@@ -284,4 +284,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6
return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 8

View File

@@ -173,9 +173,7 @@ def test_remote_storage_backup_and_restore(
#
# The initiated attach operation should survive the restart, and continue from where it was.
env.pageserver.stop()
layer_download_failed_regex = (
r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure"
)
layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+"
assert not env.pageserver.log_contains(
layer_download_failed_regex
), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
@@ -208,7 +206,7 @@ def test_remote_storage_backup_and_restore(
== f"{data}|{checkpoint_number}"
)
log.info("ensure that we neede to retry downloads due to test_remote_failures=1")
log.info("ensure that we needed to retry downloads due to test_remote_failures=1")
assert env.pageserver.log_contains(layer_download_failed_regex)

View File

@@ -32,6 +32,8 @@ def test_tenant_delete_smoke(
remote_storage_kind: RemoteStorageKind,
pg_bin: PgBin,
):
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_tenant_delete_smoke",
@@ -123,25 +125,35 @@ def combinations():
for remote_storage_kind in remotes:
for delete_failpoint in FAILPOINTS:
if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
"timeline-delete-before-index-delete",
):
# the above failpoint are not relevant for config without remote storage
continue
result.append((remote_storage_kind, delete_failpoint))
# Simulate failures for only one type of remote storage
# to avoid log pollution and make tests run faster
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
simulate_failures = True
else:
simulate_failures = False
result.append((remote_storage_kind, delete_failpoint, simulate_failures))
return result
@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
@pytest.mark.parametrize("check", list(Check))
def test_delete_tenant_exercise_crash_safety_failpoints(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
failpoint: str,
simulate_failures: bool,
check: Check,
pg_bin: PgBin,
):
if simulate_failures:
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
)