From d6f6e9a87bcb210757caa68365af072332a8d74a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 13 Dec 2023 22:29:15 +0000 Subject: [PATCH] fix: layer backoff --- pageserver/src/tenant/storage_layer/layer.rs | 34 +++++++++----------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 4e0240a66b..8acfb600a6 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -861,6 +861,21 @@ impl LayerInner { Ok(()) } Err(e) => { + let consecutive_failures = + this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, + }; + Err(e) } }; @@ -908,24 +923,7 @@ impl LayerInner { Ok(permit) } - Ok((Err(e), _permit)) => { - // FIXME: this should be with the spawned task and be cancellation sensitive - // - // while we should not need this, this backoff has turned out to be useful with - // a bug of unexpectedly deleted remote layer file (#5787). - let consecutive_failures = - self.consecutive_failures.fetch_add(1, Ordering::Relaxed); - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::time::sleep(backoff).await; - Err(DownloadError::DownloadFailed) - } + Ok((Err(_), _permit)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } }