From c66e859bccd3598b380c30255ac09a5cadb4594a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 24 Aug 2023 22:23:29 +0300 Subject: [PATCH] try to apply backoff *after* download might not work as we could get cancelled, but doing it right before seems wrong as well. We already retry the download. --- pageserver/src/tenant/storage_layer/layer.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a54608aa2b..6f2bca890f 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -318,6 +318,7 @@ struct LayerInner { /// Allow subscribing to when the layer actually gets evicted. status: tokio::sync::broadcast::Sender, + consecutive_failures: AtomicUsize, } impl std::fmt::Display for LayerInner { @@ -423,6 +424,7 @@ impl LayerInner { }, version: AtomicUsize::new(0), status: tokio::sync::broadcast::channel(1).0, + consecutive_failures: AtomicUsize::new(0), } } @@ -647,12 +649,24 @@ impl LayerInner { // this is really a bug in needs_download or remote timeline client panic!("post-condition failed: needs_download returned {reason:?}"); } + + self.consecutive_failures.store(0, Ordering::Relaxed); } Ok(Err(e)) => { - tracing::error!("layer file download failed: {e:#}"); + let consecutive_failures = + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + let backoff = std::time::Duration::from_secs_f64(backoff); + + // unless we get cancelled, we will hold off the semaphore init + tokio::time::sleep(backoff).await; + return Err(DownloadError::DownloadFailed); - // FIXME: we need backoff here so never spiral to download loop, maybe, - // because remote timeline client already retries } Err(_gone) => { return Err(DownloadError::DownloadCancelled);