storage_scrubber: retry on index deletion failures (#9204)

## Problem

In automated tests running on AWS S3, we frequently see the scrubber
fail because it cannot delete an index. Example test failures:

`location_conf_churn`:

https://neon-github-public-dev.s3.amazonaws.com/reports/main/11076221056/index.html#/testresult/f89b1916b6a693e2

`scrubber_physical_gc`:

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9178/11074269153/index.html#/testresult/9885ed5aa0fe38b6

## Summary of changes

Wrap index deletion in a `backoff::retry` call so that failed deletions are retried.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
This commit is contained in:
John Spray
2024-10-01 10:34:39 +01:00
committed by GitHub
parent d6c6b0a509
commit 40b10b878a

View File

@@ -4,7 +4,7 @@ use std::time::Duration;
use crate::checks::{list_timeline_blobs, BlobDataParseResult};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
@@ -18,6 +18,7 @@ use serde::Serialize;
use storage_controller_client::control_api;
use tokio_util::sync::CancellationToken;
use tracing::{info_span, Instrument};
use utils::backoff;
use utils::generation::Generation;
use utils::id::{TenantId, TenantTimelineId};
@@ -326,15 +327,25 @@ async fn maybe_delete_index(
}
// All validations passed: erase the object
match remote_client
.delete(&obj.key, &CancellationToken::new())
.await
let cancel = CancellationToken::new();
match backoff::retry(
|| remote_client.delete(&obj.key, &cancel),
|_| false,
3,
MAX_RETRIES as u32,
"maybe_delete_index",
&cancel,
)
.await
{
Ok(_) => {
None => {
unreachable!("Using a dummy cancellation token");
}
Some(Ok(_)) => {
tracing::info!("Successfully deleted index");
summary.indices_deleted += 1;
}
Err(e) => {
Some(Err(e)) => {
tracing::warn!("Failed to delete index: {e}");
summary.remote_storage_errors += 1;
}