From 98e18e9a543f6ab073d3e3a100fe795f6fc08576 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 13 Feb 2025 21:05:15 +0300 Subject: [PATCH] Add s3 storage to test_s3_wal_replay (#10809) ## Problem The test is flaky: WAL in remote storage appears to be corrupted. One of hypotheses so far is that corruption is the result of local fs implementation being non atomic, and safekeepers may concurrently PUT the same segment. That's dubious though because by looking at local_fs impl I'd expect then early EOF on segment read rather then observed zeros in test failures, but other directions seem even less probable. ## Summary of changes Let's add s3 backend as well and see if it is also flaky. Also add some more logging around segments uploads. ref https://github.com/neondatabase/neon/issues/10761 --- safekeeper/src/wal_backup.rs | 18 ++++++++++++++---- test_runner/regress/test_wal_acceptor.py | 8 ++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 8517fa0344..2f6b91cf47 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -310,9 +310,12 @@ impl WalBackupTask { retry_attempt = 0; } Err(e) => { + // We might have managed to upload some segment even though + // some later in the range failed, so log backup_lsn + // separately. error!( - "failed while offloading range {}-{}: {:?}", - backup_lsn, commit_lsn, e + "failed while offloading range {}-{}, backup_lsn {}: {:?}", + backup_lsn, commit_lsn, backup_lsn, e ); retry_attempt = retry_attempt.saturating_add(1); @@ -338,6 +341,13 @@ async fn backup_lsn_range( let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + info!( + "offloading segnos {:?} of range [{}-{})", + segments.iter().map(|&s| s.seg_no).collect::>(), + start_lsn, + end_lsn, + ); + // Pool of concurrent upload tasks. We use `FuturesOrdered` to // preserve order of uploads, and update `backup_lsn` only after // all previous uploads are finished. @@ -374,10 +384,10 @@ async fn backup_lsn_range( } info!( - "offloaded segnos {:?} up to {}, previous backup_lsn {}", + "offloaded segnos {:?} of range [{}-{})", segments.iter().map(|&s| s.seg_no).collect::>(), - end_lsn, start_lsn, + end_lsn, ); Ok(()) } diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2b6a267bdf..21b2ad479c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -566,10 +566,14 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): +# This test is flaky, probably because PUTs of local fs storage are not atomic. +# Let's keep both remote storage kinds for a while to see if this is the case. +# https://github.com/neondatabase/neon/issues/10761 +@pytest.mark.parametrize("remote_storage_kind", [s3_storage(), RemoteStorageKind.LOCAL_FS]) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id = env.initial_tenant