From 0fc3708de248afd3a1e43798df7c4c551a6a0b45 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Oct 2023 15:25:08 +0100 Subject: [PATCH] pageserver: use a backoff::retry in Deleter (#5534) ## Problem The `Deleter` currently doesn't use a backoff::retry because it doesn't need to: it is already inside a loop when doing the deletion, so can just let the loop go around. However, this is a problem for logging, because we log on errors, which includes things like 503/429 cases that would usually be swallowed by a backoff::retry in most places we use the RemoteStorage interface. The underlying problem is that RemoteStorage doesn't have a proper error type, and an anyhow::Error can't easily be interrogated for its original S3 SdkError because downcast_ref requires a concrete type, but SdkError is parametrized on response type. ## Summary of changes Wrap remote deletions in Deleter in a backoff::retry to avoid logging warnings on transient 429/503 conditions, and for symmetry with how RemoteStorage is used in other places. --- pageserver/src/deletion_queue/deleter.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 5c6e7dc9d7..81cb016d49 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -13,6 +13,7 @@ use std::time::Duration; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use utils::backoff; use crate::metrics; @@ -63,7 +64,19 @@ impl Deleter { Err(anyhow::anyhow!("failpoint hit")) }); - self.remote_storage.delete_objects(&self.accumulator).await + // A backoff::retry is used here for two reasons: + // - To provide a backoff rather than busy-polling the API on errors + // - To absorb transient 429/503 conditions without hitting our error + // logging path for issues deleting objects. + backoff::retry( + || async { self.remote_storage.delete_objects(&self.accumulator).await }, + |_| false, + 3, + 10, + "executing deletion batch", + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")), + ) + .await } /// Block until everything in accumulator has been executed @@ -88,7 +101,10 @@ impl Deleter { self.accumulator.clear(); } Err(e) => { - warn!("DeleteObjects request failed: {e:#}, will retry"); + if self.cancel.is_cancelled() { + return Err(DeletionQueueError::ShuttingDown); + } + warn!("DeleteObjects request failed: {e:#}, will continue trying"); metrics::DELETION_QUEUE .remote_errors .with_label_values(&["execute"])