pageserver: retry wrapper on manifest upload (#10524)

## Problem

On remote storage errors (e.g. I/O timeout) uploading tenant manifest,
all of compaction could fail. This is a problem IRL because we shouldn't
abort compaction on a single IO error, and in tests because it generates
spurious failures.

Related:
https://github.com/orgs/neondatabase/projects/51/views/2?sliceBy%5Bvalue%5D=jcsp&pane=issue&itemId=93692919&issue=neondatabase%7Cneon%7C10389

## Summary of changes

- Use `backoff::retry` when uploading tenant manifest
This commit is contained in:
John Spray
2025-01-27 21:02:25 +00:00
committed by GitHub
parent 5477d7db93
commit d73f4a6470

View File

@@ -37,6 +37,8 @@ use remote_timeline_client::manifest::{
OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION,
};
use remote_timeline_client::UploadQueueNotReadyError;
use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
@@ -5308,27 +5310,37 @@ impl Tenant {
return Ok(());
}
upload_tenant_manifest(
&self.remote_storage,
&self.tenant_shard_id,
self.generation,
&manifest,
// Remote storage does no retries internally, so wrap it
match backoff::retry(
|| async {
upload_tenant_manifest(
&self.remote_storage,
&self.tenant_shard_id,
self.generation,
&manifest,
&self.cancel,
)
.await
},
|_e| self.cancel.is_cancelled(),
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"uploading tenant manifest",
&self.cancel,
)
.await
.map_err(|e| {
if self.cancel.is_cancelled() {
TenantManifestError::Cancelled
} else {
TenantManifestError::RemoteStorage(e)
{
None => Err(TenantManifestError::Cancelled),
Some(Err(_)) if self.cancel.is_cancelled() => Err(TenantManifestError::Cancelled),
Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)),
Some(Ok(_)) => {
// Store the successfully uploaded manifest, so that future callers can avoid
// re-uploading the same thing.
*guard = Some(manifest);
Ok(())
}
})?;
// Store the successfully uploaded manifest, so that future callers can avoid
// re-uploading the same thing.
*guard = Some(manifest);
Ok(())
}
}
}