diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 14788515dd..35ec69fd50 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -172,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors(
             }
         }
         BlobDataParseResult::Relic => {}
-        BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
-            parse_errors
+        BlobDataParseResult::Incorrect {
+            errors,
+            s3_layers: _,
+        } => result.errors.extend(
+            errors
                 .into_iter()
                 .map(|error| format!("parse error: {error}")),
         ),
@@ -300,7 +303,10 @@ pub(crate) enum BlobDataParseResult {
     },
     /// The remains of a deleted Timeline (i.e. an initdb archive only)
     Relic,
-    Incorrect(Vec<String>),
+    Incorrect {
+        errors: Vec<String>,
+        s3_layers: HashSet<(LayerName, Generation)>,
+    },
 }
 
 pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
@@ -443,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
     }
 
     Ok(S3TimelineBlobData {
-        blob_data: BlobDataParseResult::Incorrect(errors),
+        blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
         unused_index_keys: index_part_keys,
         unknown_keys,
     })
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index a111c31844..cbc836755a 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -208,21 +208,21 @@ async fn main() -> anyhow::Result<()> {
             }
 
             if summary.is_fatal() {
-                Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                tracing::error!("Fatal scrub errors detected");
             } else if summary.is_empty() {
                 // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
                 // scrubber they were likely expecting to scan something, and if we see no timelines
                 // at all then it's likely due to some configuration issues like a bad prefix
-                Err(anyhow::anyhow!(
+                tracing::error!(
                     "No timelines found in bucket {} prefix {}",
                     bucket_config.bucket,
                     bucket_config
                         .prefix_in_bucket
                         .unwrap_or("".to_string())
-                ))
-            } else {
-                Ok(())
+                );
             }
+
+            Ok(())
         }
     }
 }
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 69896caa82..ff230feae3 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -389,10 +389,13 @@ async fn gc_ancestor(
                // Post-deletion tenant location: don't try and GC it.
                continue;
            }
-            BlobDataParseResult::Incorrect(reasons) => {
+            BlobDataParseResult::Incorrect {
+                errors,
+                s3_layers: _, // TODO(yuchen): could still check references to these s3 layers?
+            } => {
                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
                tracing::warn!(
-                    "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
+                    "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}"
                );
                continue;
            }
@@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc(
            // Post-deletion tenant location: don't try and GC it.
            return Ok(summary);
        }
-        BlobDataParseResult::Incorrect(reasons) => {
+        BlobDataParseResult::Incorrect {
+            errors,
+            s3_layers: _,
+        } => {
            // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
-            tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
+            tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
            return Ok(summary);
        }
    };
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index dc410bde41..b9630056e1 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -290,13 +290,21 @@ pub async fn scan_metadata(
            }
        }
 
-        if let BlobDataParseResult::Parsed {
-            index_part: _index_part,
-            index_part_generation: _index_part_generation,
-            s3_layers,
-        } = &data.blob_data
-        {
-            tenant_objects.push(ttid, s3_layers.clone());
+        match &data.blob_data {
+            BlobDataParseResult::Parsed {
+                index_part: _index_part,
+                index_part_generation: _index_part_generation,
+                s3_layers,
+            } => {
+                tenant_objects.push(ttid, s3_layers.clone());
+            }
+            BlobDataParseResult::Relic => (),
+            BlobDataParseResult::Incorrect {
+                errors: _,
+                s3_layers,
+            } => {
+                tenant_objects.push(ttid, s3_layers.clone());
+            }
        }
        tenant_timeline_results.push((ttid, data));
    }
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index 5a75f8d40e..1866e6ec80 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -269,7 +269,7 @@ impl SnapshotDownloader {
                    .context("Downloading timeline")?;
            }
            BlobDataParseResult::Relic => {}
-            BlobDataParseResult::Incorrect(_) => {
+            BlobDataParseResult::Incorrect { .. } => {
                tracing::error!("Bad metadata in timeline {ttid}");
            }
        };
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 7289472de2..c6f4404784 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -978,7 +978,10 @@ class NeonEnvBuilder:
                and self.enable_scrub_on_exit
            ):
                try:
-                    self.env.storage_scrubber.scan_metadata()
+                    healthy, _ = self.env.storage_scrubber.scan_metadata()
+                    if not healthy:
+                        e = Exception("Remote storage metadata corrupted")
+                        cleanup_error = e
                except Exception as e:
                    log.error(f"Error during remote storage scrub: {e}")
                    cleanup_error = e
@@ -4411,14 +4414,19 @@ class StorageScrubber:
        assert stdout is not None
        return stdout
 
-    def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
+    def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]:
+        """
+        Returns the health status and the metadata summary.
+        """
        args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
        if post_to_storage_controller:
            args.append("--post")
        stdout = self.scrubber_cli(args, timeout=30)
 
        try:
-            return json.loads(stdout)
+            summary = json.loads(stdout)
+            healthy = not summary["with_errors"] and not summary["with_warnings"]
+            return healthy, summary
        except:
            log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
            log.error(stdout)
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 137b0e931d..afa5f6873c 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -496,11 +496,10 @@ def test_historic_storage_formats(
    # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
    #
    # Do this _before_ importing to the pageserver, as that import may start writing immediately
-    metadata_summary = env.storage_scrubber.scan_metadata()
+    healthy, metadata_summary = env.storage_scrubber.scan_metadata()
+    assert healthy
    assert metadata_summary["tenant_count"] >= 1
    assert metadata_summary["timeline_count"] >= 1
-    assert not metadata_summary["with_errors"]
-    assert not metadata_summary["with_warnings"]
 
    env.neon_cli.import_tenant(dataset.tenant_id)
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 8941ddd281..73af7950f1 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
    # Having written a mixture of generation-aware and legacy index_part.json,
    # ensure the scrubber handles the situation as expected.
-    metadata_summary = env.storage_scrubber.scan_metadata()
+    healthy, metadata_summary = env.storage_scrubber.scan_metadata()
    assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
    assert metadata_summary["timeline_count"] == 1
    assert metadata_summary["timeline_shard_count"] == 1
-    assert not metadata_summary["with_errors"]
-    assert not metadata_summary["with_warnings"]
+    assert healthy
 
 
 def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 4b0af24480..8746b88a75 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -593,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    # Scrub the remote storage
    # ========================
    # This confirms that the scrubber isn't upset by the presence of the heatmap
-    env.storage_scrubber.scan_metadata()
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
 
    # Detach secondary and delete tenant
    # ===================================
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 7f30b2d7a7..1011a6fd22 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -124,7 +124,8 @@ def test_sharding_smoke(
 
    # Check the scrubber isn't confused by sharded content, then disable
    # it during teardown because we'll have deleted by then
-    env.storage_scrubber.scan_metadata()
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
 
    env.storage_controller.pageserver_api().tenant_delete(tenant_id)
    assert_prefix_empty(
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index e3f627b6a6..388f6a9e92 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -516,9 +516,8 @@ def test_scrubber_scan_pageserver_metadata(
    assert len(index.layer_metadata) > 0
    it = iter(index.layer_metadata.items())
 
-    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
-    assert not scan_summary["with_warnings"]
-    assert not scan_summary["with_errors"]
+    healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    assert healthy
 
    assert env.storage_controller.metadata_health_is_healthy()
 
@@ -532,16 +531,18 @@ def test_scrubber_scan_pageserver_metadata(
    log.info(f"delete response: {delete_response}")
 
    # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
-    scan_summary = env.storage_scrubber.scan_metadata()
+    _, scan_summary = env.storage_scrubber.scan_metadata()
    log.info(f"{pprint.pformat(scan_summary)}")
    assert len(scan_summary["with_warnings"]) > 0
 
    assert env.storage_controller.metadata_health_is_healthy()
 
    # Now post to storage controller, expect seeing one unhealthy health record
-    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
    log.info(f"{pprint.pformat(scan_summary)}")
    assert len(scan_summary["with_warnings"]) > 0
 
    unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
    assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
+
+    neon_env_builder.disable_scrub_on_exit()
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index c343b349cf..c01b3a2e89 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -341,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
    env.stop()
 
-    result = env.storage_scrubber.scan_metadata()
-    assert result["with_warnings"] == []
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
 
    env.start()
    ps_http = env.pageserver.http_client()
    ps_http.tenant_delete(tenant_id)
    env.stop()
 
-    env.storage_scrubber.scan_metadata()
-    assert result["with_warnings"] == []
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
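
Reviewer note: a minimal sketch of the new calling convention for tests, not code from this
patch. It assumes only what the diff above establishes: `StorageScrubber.scan_metadata`
returns a `(healthy, summary)` tuple, `healthy` is derived from the summary's `with_errors`
and `with_warnings` keys, and the scrubber CLI now logs fatal conditions instead of exiting
non-zero (so callers must check the flag themselves).

    # Hypothetical test body; `env` is a NeonEnv as provided by the fixtures above.
    healthy, summary = env.storage_scrubber.scan_metadata()

    # `healthy` is True only when the summary has no errors and no warnings.
    assert healthy, f"scrub issues: {summary['with_errors']} / {summary['with_warnings']}"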