mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 12:02:55 +00:00
fix(remote_storage): continue on Azure+AWS retryable error (#11903)
## Problem We implemented the retry logic in AWS S3 but not in Azure. Therefore, if there is an error during Azure listing, we will return an Err to the caller, and the stream will end without fetching more tenants. Part of https://github.com/neondatabase/neon/issues/11159 Without this fix, tenant listing will stop once we hit an error (could be network errors -- which happen more frequently on Azure). If we happen to stop at a point where we have only listed part of the shards, we will hit the "missed shards" error or even remove layers being used. This bug (for Azure listing) was introduced as part of https://github.com/neondatabase/neon/pull/9840 There is also a bug that stops the stream for AWS when there's a timeout -- this is fixed along with this patch. ## Summary of changes Retry the request on error. In the future, we should make such streams return something like `Result<Result<T>>` where the outer result is the error that ends the stream and the inner one is the error that should be retried by the caller. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -330,11 +330,18 @@ impl AzureBlobStorage {
|
||||
if let Err(DownloadError::Timeout) = &next_item {
|
||||
timeout_try_cnt += 1;
|
||||
if timeout_try_cnt <= 5 {
|
||||
continue;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
let next_item = next_item?;
|
||||
let next_item = match next_item {
|
||||
Ok(next_item) => next_item,
|
||||
Err(e) => {
|
||||
// The error is potentially retryable, so we must rewind the loop after yielding.
|
||||
yield Err(e);
|
||||
continue 'outer;
|
||||
},
|
||||
};
|
||||
|
||||
// Log a warning if we saw two timeouts in a row before a successful request
|
||||
if timeout_try_cnt > 2 {
|
||||
|
||||
@@ -657,7 +657,14 @@ impl RemoteStorage for S3Bucket {
|
||||
res = request => Ok(res),
|
||||
_ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}?;
|
||||
};
|
||||
|
||||
if let Err(DownloadError::Timeout) = &response {
|
||||
yield Err(DownloadError::Timeout);
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
let response = response?; // always yield cancellation errors and stop the stream
|
||||
|
||||
let response = response
|
||||
.context("Failed to list S3 prefixes")
|
||||
|
||||
Reference in New Issue
Block a user