mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 14:00:38 +00:00
pageserver: retry wrapper on manifest upload (#10524)
## Problem On remote storage errors (e.g. I/O timeout) uploading tenant manifest, all of compaction could fail. This is a problem IRL because we shouldn't abort compaction on a single IO error, and in tests because it generates spurious failures. Related: https://github.com/orgs/neondatabase/projects/51/views/2?sliceBy%5Bvalue%5D=jcsp&pane=issue&itemId=93692919&issue=neondatabase%7Cneon%7C10389 ## Summary of changes - Use `backoff::retry` when uploading tenant manifest
This commit is contained in:
@@ -37,6 +37,8 @@ use remote_timeline_client::manifest::{
|
||||
OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION,
|
||||
};
|
||||
use remote_timeline_client::UploadQueueNotReadyError;
|
||||
use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
|
||||
use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
@@ -5308,27 +5310,37 @@ impl Tenant {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&manifest,
|
||||
// Remote storage does no retries internally, so wrap it
|
||||
match backoff::retry(
|
||||
|| async {
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_e| self.cancel.is_cancelled(),
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"uploading tenant manifest",
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if self.cancel.is_cancelled() {
|
||||
TenantManifestError::Cancelled
|
||||
} else {
|
||||
TenantManifestError::RemoteStorage(e)
|
||||
{
|
||||
None => Err(TenantManifestError::Cancelled),
|
||||
Some(Err(_)) if self.cancel.is_cancelled() => Err(TenantManifestError::Cancelled),
|
||||
Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)),
|
||||
Some(Ok(_)) => {
|
||||
// Store the successfully uploaded manifest, so that future callers can avoid
|
||||
// re-uploading the same thing.
|
||||
*guard = Some(manifest);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
})?;
|
||||
|
||||
// Store the successfully uploaded manifest, so that future callers can avoid
|
||||
// re-uploading the same thing.
|
||||
*guard = Some(manifest);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user