From 9e5a41a3423782b1ab5f097e04583f38b78d9ba9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 15 May 2025 15:02:16 +0800 Subject: [PATCH] fix(scrubber): `remote_storage` error causes layers to be deleted as orphans (#11924) ## Problem Closes https://github.com/neondatabase/neon/issues/11159. We occasionally see layer files that are still in use being wrongly deleted, along with errors in staging. This patch fixes it. Example errors: ``` Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n 0: dispatch failure\n 1: timeout\n 2: error trying to connect: HTTP connect timeout occurred after 3.1s\n ``` This error should not be reported: the file may well exist, but because the head request itself failed we cannot tell whether it does. ## Summary of changes Only generate the "layer not present in remote storage" error when `head_object` returns `DownloadError::NotFound`; any other `head_object` failure is now logged as a warning instead. Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 43 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 40f3523a7e..865f0908f9 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{ }; use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if let Err(e) = response { - // Object is not present. 
- let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); + match response { + Ok(_) => {} + Err(DownloadError::NotFound) => { + // Object is not present. + let is_l0 = + LayerMap::is_l0(layer.key_range(), layer.is_delta()); - let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", - layer, - metadata.generation.get_suffix(), - metadata.shard, - is_l0, - e, - ); + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); - if is_l0 || ignore_error { - result.warnings.push(msg); - } else { - result.errors.push(msg); + if is_l0 || ignore_error { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } + Err(e) => { + tracing::warn!( + "cannot check if the layer {}{} is present in remote storage (error: {})", + layer, + metadata.generation.get_suffix(), + e, + ); } } }