From f8a727ac4713dac129ca710294c6ba98cb8cced0 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 30 Jul 2024 16:46:14 -0400 Subject: [PATCH] use explicit checks in test Signed-off-by: Yuchen Liang --- test_runner/fixtures/neon_fixtures.py | 14 +++++++++++--- test_runner/regress/test_pageserver_generations.py | 5 ++--- test_runner/regress/test_pageserver_secondary.py | 3 ++- test_runner/regress/test_sharding.py | 3 ++- test_runner/regress/test_storage_scrubber.py | 11 ++++++----- test_runner/regress/test_tenant_delete.py | 8 ++++---- 6 files changed, 27 insertions(+), 17 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c33dec784..14d1773f20 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -978,7 +978,10 @@ class NeonEnvBuilder: and self.enable_scrub_on_exit ): try: - self.env.storage_scrubber.scan_metadata() + healthy, _ = self.env.storage_scrubber.scan_metadata() + if not healthy: + e = Exception("Remote storage metadata corrupted") + cleanup_error = e except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -4401,14 +4404,19 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self, post_to_storage_controller: bool = False) -> Any: + def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + """ + Returns the health status and the metadata summary. + """ args = ["scan-metadata", "--node-kind", "pageserver", "--json"] if post_to_storage_controller: args.append("--post") stdout = self.scrubber_cli(args, timeout=30) try: - return json.loads(stdout) + summary = json.loads(stdout) + healthy = not summary["with_errors"] and not summary["with_warnings"] + return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") log.error(stdout) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 8941ddd281..73af7950f1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] + assert healthy def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 53f69b5b26..573fe46c59 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -563,7 +563,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 7f30b2d7a7..1011a6fd22 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -124,7 +124,8 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index fadf438788..6d36654ca3 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -440,9 +440,8 @@ def test_scrubber_scan_pageserver_metadata( assert len(index.layer_metadata) > 0 it = iter(index.layer_metadata.items()) - scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) - assert not scan_summary["with_warnings"] - assert not scan_summary["with_errors"] + healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + assert healthy assert env.storage_controller.metadata_health_is_healthy() @@ -456,16 +455,18 @@ def test_scrubber_scan_pageserver_metadata( log.info(f"delete response: {delete_response}") # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. - scan_summary = env.storage_scrubber.scan_metadata() + _, scan_summary = env.storage_scrubber.scan_metadata() log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 assert env.storage_controller.metadata_health_is_healthy() # Now post to storage controller, expect seeing one unhealthy health record - scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) + + neon_env_builder.disable_scrub_on_exit() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c343b349cf..c01b3a2e89 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -341,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) env.stop() - env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy