fix(safekeeper): retry if open segment fail (#12757)

## Problem

Fix LKB-2632.

The safekeeper wal read path does not seem to retry at all. This would
cause client read errors on the customer side.

## Summary of changes

- Retry on `safekeeper::wal_backup::read_object`.
- Note that this only retries on S3 HTTP connection errors. Subsequent
reads could fail, and that needs more refactors to make the retry
mechanism work across the path.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-07-29 10:20:43 -04:00
committed by GitHub
parent 5e3cb2ab07
commit dbde37c53a

View File

@@ -12,7 +12,7 @@ use futures::stream::{self, FuturesOrdered};
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
use remote_storage::{
DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata,
DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata,
};
use safekeeper_api::models::PeerInfo;
use tokio::fs::File;
@@ -607,6 +607,9 @@ pub(crate) async fn copy_partial_segment(
storage.copy_object(source, destination, &cancel).await
}
const WAL_READ_WARN_THRESHOLD: u32 = 2;
const WAL_READ_MAX_RETRIES: u32 = 3;
pub async fn read_object(
storage: &GenericRemoteStorage,
file_path: &RemotePath,
@@ -620,12 +623,23 @@ pub async fn read_object(
byte_start: std::ops::Bound::Included(offset),
..Default::default()
};
let download = storage
.download(file_path, &opts, &cancel)
.await
.with_context(|| {
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
})?;
// This retry only solves the connect errors: subsequent reads can still fail as this function returns
// a stream.
let download = backoff::retry(
|| async { storage.download(file_path, &opts, &cancel).await },
DownloadError::is_permanent,
WAL_READ_WARN_THRESHOLD,
WAL_READ_MAX_RETRIES,
"download WAL segment",
&cancel,
)
.await
.ok_or_else(|| DownloadError::Cancelled)
.and_then(|x| x)
.with_context(|| {
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
})?;
let reader = tokio_util::io::StreamReader::new(download.download_stream);