fix(scrubber): remote_storage error causes layers to be deleted as orphans (#11924)

## Problem

Closes https://github.com/neondatabase/neon/issues/11159. We occasionally
see layer files that are still in use being wrongly deleted, along with
errors in staging. This patch fixes that.

Example errors:

```
Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n    0: dispatch failure\n    1: timeout\n    2: error trying to connect: HTTP connect timeout occurred after 3.1s\n
```

This error should not be reported: the file may well exist, but we
cannot tell whether it does because the head request itself failed.

## Summary of changes

Only generate "layer not present in remote storage" errors when
`head_object` fails with `DownloadError::NotFound`. Any other
`head_object` error is logged as a warning instead.

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-05-15 15:02:16 +08:00
committed by GitHub
parent 48b870bc07
commit 9e5a41a342

View File

@@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{
}; };
use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::LayerName;
use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardIndex;
use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::{info, warn}; use tracing::{info, warn};
use utils::generation::Generation; use utils::generation::Generation;
@@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors(
.head_object(&path, &CancellationToken::new()) .head_object(&path, &CancellationToken::new())
.await; .await;
if let Err(e) = response { match response {
// Object is not present. Ok(_) => {}
let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); Err(DownloadError::NotFound) => {
// Object is not present.
let is_l0 =
LayerMap::is_l0(layer.key_range(), layer.is_delta());
let msg = format!( let msg = format!(
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
layer, layer,
metadata.generation.get_suffix(), metadata.generation.get_suffix(),
metadata.shard, metadata.shard,
is_l0, is_l0,
e, );
);
if is_l0 || ignore_error { if is_l0 || ignore_error {
result.warnings.push(msg); result.warnings.push(msg);
} else { } else {
result.errors.push(msg); result.errors.push(msg);
}
}
Err(e) => {
tracing::warn!(
"cannot check if the layer {}{} is present in remote storage (error: {})",
layer,
metadata.generation.get_suffix(),
e,
);
} }
} }
} }