Fix partial upload bug with invalid remote state (#8383)

We have an issue where some partially uploaded segments can actually be
missing from remote storage. I found it while looking at the logs in
staging; it can be triggered by failed uploads:
1. The code tries to upload `SEG_TERM_LSN_LSN_sk5.partial`, but receives
an error from S3
2. The failed attempt is saved to the `segments` vec
3. After some time, the code tries to upload
`SEG_TERM_LSN_LSN_sk5.partial` again
4. This time the upload succeeds, and the code calls `gc()` to delete
previous uploads
5. Since the new object and the old object share the same name, the
freshly uploaded data gets deleted from remote storage

This commit fixes the issue by patching `gc()` so that it never deletes
objects whose name matches the currently uploaded segment, as sketched
below.
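
For illustration only, here is a minimal, self-contained sketch of the
failure and of the guard. `SegmentState`, `UploadStatus`, and this toy
`gc` are simplified stand-ins, not the safekeeper types; the actual
change is in the diff below.

```rust
#[derive(Debug, Clone, PartialEq)]
enum UploadStatus {
    Failed,
    Uploaded,
}

#[derive(Debug, Clone)]
struct SegmentState {
    name: String, // e.g. "SEG_TERM_LSN_LSN_sk5.partial"
    status: UploadStatus,
}

/// Simplified gc: delete every previously tracked object except the one
/// that is currently uploaded. Without the `retain` guard, a retried
/// upload that reuses the name of an earlier failed attempt would have
/// its object deleted as well.
fn gc(segments: &[SegmentState], remote: &mut Vec<String>) {
    let uploaded: Vec<&SegmentState> = segments
        .iter()
        .filter(|s| s.status == UploadStatus::Uploaded)
        .collect();
    // There should always be zero or one uploaded segment.
    assert!(uploaded.len() <= 1, "too many uploaded segments: {uploaded:?}");

    let mut to_delete: Vec<String> = segments.iter().map(|s| s.name.clone()).collect();
    if let Some(current) = uploaded.first() {
        // The fix: never delete the object we have just uploaded.
        to_delete.retain(|name| name != &current.name);
    }
    remote.retain(|obj| !to_delete.contains(obj));
}

fn main() {
    // Steps 1-2: the first upload fails, but the attempt is remembered.
    let mut segments = vec![SegmentState {
        name: "SEG_TERM_LSN_LSN_sk5.partial".into(),
        status: UploadStatus::Failed,
    }];
    // Steps 3-4: the retry succeeds, so the object now exists remotely.
    let mut remote = vec!["SEG_TERM_LSN_LSN_sk5.partial".to_string()];
    segments.push(SegmentState {
        name: "SEG_TERM_LSN_LSN_sk5.partial".into(),
        status: UploadStatus::Uploaded,
    });

    // Step 5: gc of the old attempts must not remove the object that the
    // successful retry just wrote under the same name.
    gc(&segments, &mut remote);
    assert_eq!(remote, vec!["SEG_TERM_LSN_LSN_sk5.partial".to_string()]);
    println!("remote after gc: {remote:?}");
}
```

Without the `retain` guard the final assertion fails: gc deletes the
object under the shared name even though the retry just wrote it.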

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
Author: Arthur Petukhovsky
Date: 2024-07-18 13:46:00 +01:00
Committed by: GitHub
Parent: b461755326
Commit: d263b1804e
2 changed files with 13 additions and 4 deletions


@@ -199,10 +199,7 @@ async fn redownload_partial_segment(
     file.flush().await?;
     let final_path = local_segment_path(mgr, partial);
-    info!(
-        "downloaded {} bytes, renaming to {}",
-        final_path, final_path,
-    );
+    info!("downloaded {actual_len} bytes, renaming to {final_path}");
 
     if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await {
         // Probably rename succeeded, but fsync of it failed. Remove
         // the file then to avoid using it.
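
For context on the hunk above: the single-line replacement relies on
Rust's inline format-argument capture (stable since Rust 1.58), so the
positional argument list can be dropped. A minimal sketch of the same
idiom, using `println!` instead of the logging macro:

```rust
fn main() {
    // Hypothetical values standing in for the real download result.
    let actual_len = 12_345u64;
    let final_path = "/tmp/SEG_TERM_LSN_LSN_sk5.partial";
    // Named locals are captured directly inside the format string.
    println!("downloaded {actual_len} bytes, renaming to {final_path}");
}
```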


@@ -289,6 +289,18 @@ impl PartialBackup {
             })
             .collect();
 
+        if new_segments.len() == 1 {
+            // we have an uploaded segment, it must not be deleted from remote storage
+            segments_to_delete.retain(|name| name != &new_segments[0].name);
+        } else {
+            // there should always be zero or one uploaded segment
+            assert!(
+                new_segments.is_empty(),
+                "too many uploaded segments: {:?}",
+                new_segments
+            );
+        }
+
         info!("deleting objects: {:?}", segments_to_delete);
         let mut objects_to_delete = vec![];
         for seg in segments_to_delete.iter() {