fix(storage-scrubber): ignore errors if index_part is not consistent (#10304)

## Problem

Consider a pageserver performing the following sequence of operations:

* upload X files
* update index_part to add X and remove Y
* delete Y files

When the storage scrubber takes its initial timeline snapshot before
"update index_part" (i.e., the old state, which contains Y but not X) and
then downloads the index_part file after it has been updated, it will
report all Y files as missing.
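
To make the race concrete, here is a minimal, self-contained sketch of the staleness signal the scrubber can rely on; the function name `listing_is_stale` and the timestamps are illustrative, not the scrubber's real API:

```rust
use std::time::{Duration, SystemTime};

/// The object listing reports a last-modified time for the index_part key,
/// and the subsequent download reports the object's current last-modified
/// time. If the downloaded index is newer than what the listing saw, the
/// layer listing is stale with respect to that index, and any "missing
/// layer" findings derived from it are unreliable.
fn listing_is_stale(seen_in_listing: SystemTime, downloaded: SystemTime) -> bool {
    seen_in_listing < downloaded
}

fn main() {
    let listed_at = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000);
    // The pageserver rewrote index_part a few seconds after the listing.
    let downloaded_at = listed_at + Duration::from_secs(5);
    assert!(listing_is_stale(listed_at, downloaded_at));
}
```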

## Summary of changes

Do not report a layer file as missing if the index_part observed in the
listing and the index_part that was actually downloaded are not the same
(i.e., they have different last_modified times).
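
A rough sketch of the resulting behavior (assumed shape, not the exact scrubber code; `TimelineFindings` and `report_missing_layer` are made-up names): when the listing is known to be stale, a missing-layer finding is routed to warnings rather than errors, the same way L0 layers already are.

```rust
/// Illustrative stand-in for the scrubber's per-timeline check result.
#[derive(Default)]
struct TimelineFindings {
    errors: Vec<String>,
    warnings: Vec<String>,
}

/// Route a missing-layer finding: L0 layers, and any finding made while the
/// layer listing is stale relative to the downloaded index_part, are
/// downgraded to warnings instead of hard errors.
fn report_missing_layer(out: &mut TimelineFindings, msg: String, is_l0: bool, ignore_error: bool) {
    if is_l0 || ignore_error {
        out.warnings.push(msg);
    } else {
        out.errors.push(msg);
    }
}

fn main() {
    let mut out = TimelineFindings::default();
    // The listing predates the index_part rewrite, so only warn.
    report_missing_layer(&mut out, "layer not found in listing".to_string(), false, true);
    assert!(out.errors.is_empty() && out.warnings.len() == 1);
}
```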

Signed-off-by: Alex Chi Z <chi@neon.tech>
Author: Alex Chi Z.
Date: 2025-01-07 18:24:17 -05:00
Commit: 5c76e2a983 (parent: 237dae71a1)
5 changed files with 36 additions and 13 deletions

View File

@@ -1,4 +1,5 @@
use std::collections::{HashMap, HashSet};
use std::time::SystemTime;
use itertools::Itertools;
use pageserver::tenant::checks::check_valid_layermap;
@@ -88,9 +89,14 @@ pub(crate) async fn branch_cleanup_and_check_errors(
match s3_data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation: _index_part_generation,
s3_layers: _s3_layers,
index_part_generation: _,
s3_layers: _,
index_part_last_modified_time,
index_part_snapshot_time,
} => {
// Ignore missing-file errors if the index_part we downloaded differs from the one observed when listing the layer files.
let ignore_error = index_part_snapshot_time < index_part_last_modified_time
&& !cfg!(debug_assertions);
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) {
result
.errors
@@ -171,7 +177,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
is_l0,
);
if is_l0 {
if is_l0 || ignore_error {
result.warnings.push(msg);
} else {
result.errors.push(msg);
@@ -308,6 +314,8 @@ pub(crate) enum BlobDataParseResult {
Parsed {
index_part: Box<IndexPart>,
index_part_generation: Generation,
index_part_last_modified_time: SystemTime,
index_part_snapshot_time: SystemTime,
s3_layers: HashSet<(LayerName, Generation)>,
},
/// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index)
@@ -484,9 +492,9 @@ async fn list_timeline_blobs_impl(
}
if let Some(index_part_object_key) = index_part_object.as_ref() {
let index_part_bytes =
let (index_part_bytes, index_part_last_modified_time) =
match download_object_with_retries(remote_client, &index_part_object_key.key).await {
Ok(index_part_bytes) => index_part_bytes,
Ok(data) => data,
Err(e) => {
// It is possible that the branch gets deleted in-between we list the objects
// and we download the index part file.
@@ -500,7 +508,7 @@ async fn list_timeline_blobs_impl(
));
}
};
let index_part_snapshot_time = index_part_object_key.last_modified;
match serde_json::from_slice(&index_part_bytes) {
Ok(index_part) => {
return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
@@ -508,6 +516,8 @@ async fn list_timeline_blobs_impl(
index_part: Box::new(index_part),
index_part_generation,
s3_layers,
index_part_last_modified_time,
index_part_snapshot_time,
},
unused_index_keys: index_part_keys,
unknown_keys,
@@ -625,7 +635,7 @@ pub(crate) async fn list_tenant_manifests(
let manifest_bytes =
match download_object_with_retries(remote_client, &latest_listing_object.key).await {
Ok(bytes) => bytes,
Ok((bytes, _)) => bytes,
Err(e) => {
// It is possible that the tenant gets deleted in-between we list the objects
// and we download the manifest file.

View File

@@ -13,7 +13,7 @@ pub mod tenant_snapshot;
use std::env;
use std::fmt::Display;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, SystemTime};
use anyhow::Context;
use aws_config::retry::{RetryConfigBuilder, RetryMode};
@@ -509,10 +509,11 @@ async fn list_objects_with_retries(
panic!("MAX_RETRIES is not allowed to be 0");
}
/// Returns content, last modified time
async fn download_object_with_retries(
remote_client: &GenericRemoteStorage,
key: &RemotePath,
) -> anyhow::Result<Vec<u8>> {
) -> anyhow::Result<(Vec<u8>, SystemTime)> {
let cancel = CancellationToken::new();
for trial in 0..MAX_RETRIES {
let mut buf = Vec::new();
@@ -535,7 +536,7 @@ async fn download_object_with_retries(
{
Ok(bytes_read) => {
tracing::debug!("Downloaded {bytes_read} bytes for object {key}");
return Ok(buf);
return Ok((buf, download.last_modified));
}
Err(e) => {
error!("Failed to stream object body for key {key}: {e}");

View File

@@ -450,6 +450,8 @@ async fn gc_ancestor(
index_part: _,
index_part_generation: _,
s3_layers,
index_part_last_modified_time: _,
index_part_snapshot_time: _,
} => s3_layers,
BlobDataParseResult::Relic => {
// Post-deletion tenant location: don't try and GC it.
@@ -586,7 +588,9 @@ async fn gc_timeline(
BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers: _s3_layers,
s3_layers: _,
index_part_last_modified_time: _,
index_part_snapshot_time: _,
} => (index_part, *index_part_generation, data.unused_index_keys),
BlobDataParseResult::Relic => {
// Post-deletion tenant location: don't try and GC it.

View File

@@ -47,6 +47,8 @@ impl MetadataSummary {
index_part,
index_part_generation: _,
s3_layers: _,
index_part_last_modified_time: _,
index_part_snapshot_time: _,
} = &data.blob_data
{
*self
@@ -195,7 +197,9 @@ pub async fn scan_pageserver_metadata(
if let BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers: _s3_layers,
s3_layers: _,
index_part_last_modified_time: _,
index_part_snapshot_time: _,
} = &data.blob_data
{
if index_part.deleted_at.is_some() {
@@ -318,9 +322,11 @@ pub async fn scan_pageserver_metadata(
match &data.blob_data {
BlobDataParseResult::Parsed {
index_part: _index_part,
index_part: _,
index_part_generation: _index_part_generation,
s3_layers,
index_part_last_modified_time: _,
index_part_snapshot_time: _,
} => {
tenant_objects.push(ttid, s3_layers.clone());
}

View File

@@ -268,6 +268,8 @@ impl SnapshotDownloader {
index_part,
index_part_generation,
s3_layers: _,
index_part_last_modified_time: _,
index_part_snapshot_time: _,
} => {
self.download_timeline(
ttid,