fix(scrubber): remote_storage error causes layers to be deleted as orphans (#11924)

## Problem

Closes https://github.com/neondatabase/neon/issues/11159. We occasionally
see layer files that are still in use being wrongly deleted, as well as
spurious errors in staging. This patch fixes that.

Example errors:

```
Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n    0: dispatch failure\n    1: timeout\n    2: error trying to connect: HTTP connect timeout occurred after 3.1s\n
```

This error should not be reported: the file may well exist, but we
cannot tell whether it does because the HEAD request itself failed.

## Summary of changes

Only generate "layer not present in remote storage" errors when
`head_object` returns `DownloadError::NotFound`. Any other `head_object`
failure is logged as a warning instead.

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-05-15 15:02:16 +08:00
committed by GitHub
parent 48b870bc07
commit 9e5a41a342

View File

@@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{
};
use pageserver::tenant::storage_layer::LayerName;
use pageserver_api::shard::ShardIndex;
use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath};
use tokio_util::sync::CancellationToken;
use tracing::{info, warn};
use utils::generation::Generation;
@@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors(
.head_object(&path, &CancellationToken::new())
.await;
if let Err(e) = response {
// Object is not present.
let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
match response {
Ok(_) => {}
Err(DownloadError::NotFound) => {
// Object is not present.
let is_l0 =
LayerMap::is_l0(layer.key_range(), layer.is_delta());
let msg = format!(
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}",
layer,
metadata.generation.get_suffix(),
metadata.shard,
is_l0,
e,
);
let msg = format!(
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
layer,
metadata.generation.get_suffix(),
metadata.shard,
is_l0,
);
if is_l0 || ignore_error {
result.warnings.push(msg);
} else {
result.errors.push(msg);
if is_l0 || ignore_error {
result.warnings.push(msg);
} else {
result.errors.push(msg);
}
}
Err(e) => {
tracing::warn!(
"cannot check if the layer {}{} is present in remote storage (error: {})",
layer,
metadata.generation.get_suffix(),
e,
);
}
}
}