From dfb0a6fdaf8eecbfacf5b1d9a5dc62ec26bd64c9 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 12 Dec 2023 16:53:08 +0000
Subject: [PATCH] scrubber: handle initdb files, fix an issue with prefixes
 (#6079)

- The code for calculating the prefix in the bucket expected a trailing
  slash (as the tests supply), but that is an awkward expectation to
  impose in the field: make the code more flexible by trimming the
  trailing character only if it really is a slash.
- initdb archives were detected by the scrubber as malformed layer
  files. Teach it to recognize and ignore them.
---
 s3_scrubber/src/checks.rs        | 22 +++++++++++++++++++++-
 s3_scrubber/src/lib.rs           |  4 +++-
 s3_scrubber/src/main.rs          | 13 ++++++++++++-
 s3_scrubber/src/scan_metadata.rs |  4 ++++
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 510a128663..a15a908212 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -142,7 +142,9 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                 .collect();
 
             if !orphan_layers.is_empty() {
-                result.errors.push(format!(
+                // An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report
+                // these as a hint that there is something worth cleaning up here.
+                result.warnings.push(format!(
                     "index_part.json does not contain layers from S3: {:?}",
                     orphan_layers
                         .iter()
@@ -170,6 +172,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                 ));
             }
         }
+        BlobDataParseResult::Relic => {}
         BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
             parse_errors
                 .into_iter()
@@ -215,6 +218,8 @@ pub(crate) enum BlobDataParseResult {
         index_part_generation: Generation,
         s3_layers: HashSet<(LayerFileName, Generation)>,
     },
+    /// The remains of a deleted Timeline (i.e. an initdb archive only)
+    Relic,
     Incorrect(Vec<String>),
 }
 
@@ -245,6 +250,7 @@ pub(crate) async fn list_timeline_blobs(
     timeline_dir_target.delimiter = String::new();
 
     let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
+    let mut initdb_archive: bool = false;
 
     let stream = stream_listing(s3_client, &timeline_dir_target);
     pin_mut!(stream);
@@ -258,6 +264,10 @@
                 tracing::info!("Index key {key}");
                 index_parts.push(obj)
             }
+            Some("initdb.tar.zst") => {
+                tracing::info!("initdb archive {key}");
+                initdb_archive = true;
+            }
             Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                 Ok((new_layer, gen)) => {
                     tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
@@ -279,6 +289,16 @@
        }
     }
 
+    if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive {
+        tracing::info!(
+            "Timeline is empty apart from initdb archive: expected post-deletion state."
+        );
+        return Ok(S3TimelineBlobData {
+            blob_data: BlobDataParseResult::Relic,
+            keys_to_remove: Vec::new(),
+        });
+    }
+
     // Choose the index_part with the highest generation
     let (index_part_object, index_part_generation) = match index_parts
         .iter()
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index e5465952fb..6607db21e6 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -86,7 +86,9 @@ impl S3Target {
         if new_self.prefix_in_bucket.is_empty() {
             new_self.prefix_in_bucket = format!("/{}/", new_segment);
         } else {
-            let _ = new_self.prefix_in_bucket.pop();
+            if new_self.prefix_in_bucket.ends_with('/') {
+                new_self.prefix_in_bucket.pop();
+            }
             new_self.prefix_in_bucket =
                 [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
         }
diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs
index 1f0ceebdaf..ef020edc2a 100644
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -57,7 +57,7 @@ async fn main() -> anyhow::Result<()> {
     ));
 
     match cli.command {
-        Command::ScanMetadata { json } => match scan_metadata(bucket_config).await {
+        Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
             Err(e) => {
                 tracing::error!("Failed: {e}");
                 Err(e)
@@ -70,6 +70,17 @@
                 }
                 if summary.is_fatal() {
                     Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                } else if summary.is_empty() {
+                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                    // scrubber they were probably expecting to scan something; seeing no timelines
+                    // at all most likely indicates a configuration issue such as a bad prefix.
+                    Err(anyhow::anyhow!(
+                        "No timelines found in bucket {} prefix {}",
+                        bucket_config.bucket,
+                        bucket_config
+                            .prefix_in_bucket
+                            .unwrap_or("".to_string())
+                    ))
                 } else {
                     Ok(())
                 }
diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs
index ad82db1e76..228f8d6763 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -174,6 +174,10 @@ Timeline layer count: {6}
     pub fn is_fatal(&self) -> bool {
         !self.with_errors.is_empty()
     }
+
+    pub fn is_empty(&self) -> bool {
+        self.count == 0
+    }
 }
 
 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
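
Not part of the patch above, but to make the prefix fix concrete: a minimal,
self-contained Rust sketch of the joining rule that the lib.rs hunk now
implements. `join_prefix` is a hypothetical stand-in for the real `S3Target`
method, the delimiter is assumed to be "/" (the real code uses the target's
configured delimiter), and the example prefixes are illustrative only:

    /// Append a path segment to a bucket prefix, popping the trailing
    /// character only when it really is a slash. The old code popped
    /// unconditionally, so a prefix supplied without a trailing slash
    /// lost its final character.
    fn join_prefix(prefix: &str, new_segment: &str) -> String {
        if prefix.is_empty() {
            return format!("/{}/", new_segment);
        }
        let mut prefix = prefix.to_string();
        if prefix.ends_with('/') {
            prefix.pop();
        }
        // Joining with a trailing empty element yields "prefix/segment/".
        [prefix.as_str(), new_segment, ""].join("/")
    }

    fn main() {
        // With or without a trailing slash, the result is now the same.
        assert_eq!(join_prefix("pageserver/v1/", "tenants"), "pageserver/v1/tenants/");
        assert_eq!(join_prefix("pageserver/v1", "tenants"), "pageserver/v1/tenants/");
    }

Under the old behavior the second call would have produced
"pageserver/v/tenants/", silently dropping the last character of the prefix.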
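
Similarly illustrative (simplified counts and bools in place of the scrubber's
real types): the checks.rs change boils down to one extra classification when
listing a timeline's objects, sketched below.

    /// Outcome of scanning one timeline prefix, mirroring the patched
    /// list_timeline_blobs: a prefix holding nothing but initdb.tar.zst
    /// is the expected remnant of a deleted timeline, not a corrupt one.
    #[derive(Debug, PartialEq)]
    enum Outcome {
        Relic,
        NeedsValidation,
    }

    fn classify(index_parts: usize, layers: usize, initdb_archive: bool) -> Outcome {
        if index_parts == 0 && layers == 0 && initdb_archive {
            Outcome::Relic
        } else {
            Outcome::NeedsValidation
        }
    }

    fn main() {
        assert_eq!(classify(0, 0, true), Outcome::Relic); // post-deletion state
        assert_eq!(classify(1, 5, true), Outcome::NeedsValidation); // live timeline
    }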