Make the retry backoff parameters configurable and use network-specific defaults for downloads.

* libs/utils/src/backoff.rs: add DEFAULT_NETWORK_{BASE,MAX}_BACKOFF_SECONDS; the retry loop
  becomes retry_with_options(.., base_increment, max_seconds) and retry() stays as a wrapper
  that passes DEFAULT_BASE_BACKOFF_SECONDS / DEFAULT_MAX_BACKOFF_SECONDS.
* pageserver/.../download.rs: both retry helpers call retry_with_options with the network
  constants.
* pageserver/.../layer.rs: the literal 1.5 / 60.0 backoff arguments become the named constants.

diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -6,6 +6,11 @@ use tokio_util::sync::CancellationToken;
 pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
 pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
 
+/// `base_increment` for retries of network-bound operations (e.g. remote layer downloads).
+pub const DEFAULT_NETWORK_BASE_BACKOFF_SECONDS: f64 = 1.5;
+/// `max_seconds` cap for retries of network-bound operations.
+pub const DEFAULT_NETWORK_MAX_BACKOFF_SECONDS: f64 = 60.0;
+
 pub async fn exponential_backoff(
     n: u32,
     base_increment: f64,
@@ -37,6 +42,36 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
     }
 }
 
+/// Retries `op` using [`DEFAULT_BASE_BACKOFF_SECONDS`] and [`DEFAULT_MAX_BACKOFF_SECONDS`]
+/// as the backoff parameters.
+///
+/// See [`retry_with_options`] for the retry, logging and cancellation semantics.
+pub async fn retry<T, O, F, E>(
+    op: O,
+    is_permanent: impl Fn(&E) -> bool,
+    warn_threshold: u32,
+    max_retries: u32,
+    description: &str,
+    cancel: &CancellationToken,
+) -> Option<Result<T, E>>
+where
+    E: Display + Debug + 'static,
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, E>>,
+{
+    retry_with_options(
+        op,
+        is_permanent,
+        warn_threshold,
+        max_retries,
+        description,
+        cancel,
+        DEFAULT_BASE_BACKOFF_SECONDS,
+        DEFAULT_MAX_BACKOFF_SECONDS,
+    )
+    .await
+}
+
 /// Retries passed operation until one of the following conditions are met:
 /// - encountered error is considered as permanent (non-retryable)
 /// - retries have been exhausted
@@ -51,13 +86,19 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
 /// for any other error type. Final failed attempt is logged with `{:?}`.
 ///
 /// Returns `None` if cancellation was noticed during backoff or the terminal result.
-pub async fn retry<T, O, F, E>(
+///
+/// `base_increment` and `max_seconds` are passed to [`exponential_backoff`] between attempts.
+// Eight parameters exceed clippy's default `too_many_arguments` threshold of seven.
+#[allow(clippy::too_many_arguments)]
+pub async fn retry_with_options<T, O, F, E>(
     mut op: O,
     is_permanent: impl Fn(&E) -> bool,
     warn_threshold: u32,
     max_retries: u32,
     description: &str,
     cancel: &CancellationToken,
+    base_increment: f64,
+    max_seconds: f64,
 ) -> Option<Result<T, E>>
 where
     // Not std::error::Error because anyhow::Error doesnt implement it.
@@ -104,8 +145,8 @@ where
         // sleep and retry
         exponential_backoff(
             attempts,
-            DEFAULT_BASE_BACKOFF_SECONDS,
-            DEFAULT_MAX_BACKOFF_SECONDS,
+            base_increment,
+            max_seconds,
             cancel,
         )
         .await;
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index efcd20d1bf..a17472753c 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -693,13 +693,15 @@ where
     O: FnMut() -> F,
     F: Future<Output = Result<T, DownloadError>>,
 {
-    backoff::retry(
+    backoff::retry_with_options(
         op,
         DownloadError::is_permanent,
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         description,
         cancel,
+        backoff::DEFAULT_NETWORK_BASE_BACKOFF_SECONDS,
+        backoff::DEFAULT_NETWORK_MAX_BACKOFF_SECONDS,
     )
     .await
     .ok_or_else(|| DownloadError::Cancelled)
@@ -715,13 +717,15 @@ where
     O: FnMut() -> F,
     F: Future<Output = Result<T, DownloadError>>,
 {
-    backoff::retry(
+    backoff::retry_with_options(
         op,
         DownloadError::is_permanent,
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         u32::MAX,
         description,
         cancel,
+        backoff::DEFAULT_NETWORK_BASE_BACKOFF_SECONDS,
+        backoff::DEFAULT_NETWORK_MAX_BACKOFF_SECONDS,
     )
     .await
     .ok_or_else(|| DownloadError::Cancelled)
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1204,8 +1204,8 @@ impl LayerInner {
 
                 let backoff = utils::backoff::exponential_backoff_duration_seconds(
                     consecutive_failures.min(u32::MAX as usize) as u32,
-                    1.5,
-                    60.0,
+                    utils::backoff::DEFAULT_NETWORK_BASE_BACKOFF_SECONDS,
+                    utils::backoff::DEFAULT_NETWORK_MAX_BACKOFF_SECONDS,
                 );
 
                 let backoff = std::time::Duration::from_secs_f64(backoff);