From d263b1804e70d3adf482e740fb9ed20e3fbcbe09 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 18 Jul 2024 13:46:00 +0100 Subject: [PATCH] Fix partial upload bug with invalid remote state (#8383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have an issue that some partial uploaded segments can be actually missing in remote storage. I found this issue when was looking at the logs in staging, and it can be triggered by failed uploads: 1. Code tries to upload `SEG_TERM_LSN_LSN_sk5.partial`, but receives error from S3 2. The failed attempt is saved to `segments` vec 3. After some time, the code tries to upload `SEG_TERM_LSN_LSN_sk5.partial` again 4. This time the upload is successful and code calls `gc()` to delete previous uploads 5. Since new object and old object share the same name, uploaded data gets deleted from remote storage This commit fixes the issue by patching `gc()` not to delete objects with the same name as currently uploaded. --------- Co-authored-by: Arpad Müller --- safekeeper/src/timeline_eviction.rs | 5 +---- safekeeper/src/wal_backup_partial.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 0b8d58ee8a..7947d83eb4 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -199,10 +199,7 @@ async fn redownload_partial_segment( file.flush().await?; let final_path = local_segment_path(mgr, partial); - info!( - "downloaded {} bytes, renaming to {}", - final_path, final_path, - ); + info!("downloaded {actual_len} bytes, renaming to {final_path}"); if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 825851c97c..b1efa9749f 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -289,6 +289,18 @@ impl PartialBackup { }) .collect(); + if new_segments.len() == 1 { + // we have an uploaded segment, it must not be deleted from remote storage + segments_to_delete.retain(|name| name != &new_segments[0].name); + } else { + // there should always be zero or one uploaded segment + assert!( + new_segments.is_empty(), + "too many uploaded segments: {:?}", + new_segments + ); + } + info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() {