mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
fix(scrubber): remote_storage error causes layers to be deleted as orphans (#11924)
## Problem close https://github.com/neondatabase/neon/issues/11159 ; we get occasional wrong deletions of layer files being used and errors in staging. This patch fixed it. Example errors: ``` Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n 0: dispatch failure\n 1: timeout\n 2: error trying to connect: HTTP connect timeout occurred after 3.1s\n ``` This error should not be fired because the file could exist, but we cannot know if it exists due to head request failure. ## Summary of changes Only generate cannot find layer errors when the head_object return type is `NotFound`. Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{
|
||||
};
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info, warn};
|
||||
use utils::generation::Generation;
|
||||
@@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
.head_object(&path, &CancellationToken::new())
|
||||
.await;
|
||||
|
||||
if let Err(e) = response {
|
||||
// Object is not present.
|
||||
let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
|
||||
match response {
|
||||
Ok(_) => {}
|
||||
Err(DownloadError::NotFound) => {
|
||||
// Object is not present.
|
||||
let is_l0 =
|
||||
LayerMap::is_l0(layer.key_range(), layer.is_delta());
|
||||
|
||||
let msg = format!(
|
||||
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}",
|
||||
layer,
|
||||
metadata.generation.get_suffix(),
|
||||
metadata.shard,
|
||||
is_l0,
|
||||
e,
|
||||
);
|
||||
let msg = format!(
|
||||
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
|
||||
layer,
|
||||
metadata.generation.get_suffix(),
|
||||
metadata.shard,
|
||||
is_l0,
|
||||
);
|
||||
|
||||
if is_l0 || ignore_error {
|
||||
result.warnings.push(msg);
|
||||
} else {
|
||||
result.errors.push(msg);
|
||||
if is_l0 || ignore_error {
|
||||
result.warnings.push(msg);
|
||||
} else {
|
||||
result.errors.push(msg);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"cannot check if the layer {}{} is present in remote storage (error: {})",
|
||||
layer,
|
||||
metadata.generation.get_suffix(),
|
||||
e,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user