Add s3 storage to test_s3_wal_replay (#10809)

## Problem

The test is flaky: WAL in remote storage appears to be corrupted. One of
hypotheses so far is that corruption is the result of local fs
implementation being non atomic, and safekeepers may concurrently PUT
the same segment. That's dubious though because by looking at local_fs
impl I'd expect then early EOF on segment read rather then observed
zeros in test failures, but other directions seem even less probable.

## Summary of changes

Let's add s3 backend as well and see if it is also flaky. Also add some
more logging around segments uploads.

ref https://github.com/neondatabase/neon/issues/10761
This commit is contained in:
Arseny Sher
2025-02-13 21:05:15 +03:00
committed by GitHub
parent 0cf9157adc
commit 98e18e9a54
2 changed files with 20 additions and 6 deletions

View File

@@ -310,9 +310,12 @@ impl WalBackupTask {
retry_attempt = 0;
}
Err(e) => {
// We might have managed to upload some segment even though
// some later in the range failed, so log backup_lsn
// separately.
error!(
"failed while offloading range {}-{}: {:?}",
backup_lsn, commit_lsn, e
"failed while offloading range {}-{}, backup_lsn {}: {:?}",
backup_lsn, commit_lsn, backup_lsn, e
);
retry_attempt = retry_attempt.saturating_add(1);
@@ -338,6 +341,13 @@ async fn backup_lsn_range(
let start_lsn = *backup_lsn;
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
info!(
"offloading segnos {:?} of range [{}-{})",
segments.iter().map(|&s| s.seg_no).collect::<Vec<_>>(),
start_lsn,
end_lsn,
);
// Pool of concurrent upload tasks. We use `FuturesOrdered` to
// preserve order of uploads, and update `backup_lsn` only after
// all previous uploads are finished.
@@ -374,10 +384,10 @@ async fn backup_lsn_range(
}
info!(
"offloaded segnos {:?} up to {}, previous backup_lsn {}",
"offloaded segnos {:?} of range [{}-{})",
segments.iter().map(|&s| s.seg_no).collect::<Vec<_>>(),
end_lsn,
start_lsn,
end_lsn,
);
Ok(())
}

View File

@@ -566,10 +566,14 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix)
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
# This test is flaky, probably because PUTs of local fs storage are not atomic.
# Let's keep both remote storage kinds for a while to see if this is the case.
# https://github.com/neondatabase/neon/issues/10761
@pytest.mark.parametrize("remote_storage_kind", [s3_storage(), RemoteStorageKind.LOCAL_FS])
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant