From 24ea9f9f600d588bbae8812ce2c8e6570fad67f0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 25 Jul 2024 14:19:38 +0100 Subject: [PATCH] tests: always scrub on test exit when using S3Storage (#8437) ## Problem Currently, tests may have a scrub during teardown if they ask for it, but most tests don't request it. To detect "unknown unknowns", let's run it at the end of every test where possible. This is similar to asserting that there are no errors in the log at the end of tests. ## Summary of changes - Remove explicit `enable_scrub_on_exit` - Always scrub if remote storage is an S3Storage. --- test_runner/fixtures/neon_fixtures.py | 33 ++++++++----------- .../regress/test_pageserver_restart.py | 2 -- .../regress/test_pageserver_secondary.py | 6 ++++ test_runner/regress/test_pg_regress.py | 3 -- test_runner/regress/test_sharding.py | 3 -- test_runner/regress/test_tenant_delete.py | 3 ++ test_runner/regress/test_timeline_delete.py | 6 ++++ 7 files changed, 29 insertions(+), 27 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 76ab46b01a..d6718fca39 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -523,7 +523,7 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.scrub_on_exit = False + self.enable_scrub_on_exit = True self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] @@ -852,6 +852,13 @@ class NeonEnvBuilder: ) ident_state_dir.rmdir() # should be empty since we moved `upper` out + def disable_scrub_on_exit(self): + """ + Some tests intentionally leave the remote storage contents empty or corrupt, + so it doesn't make sense to do the usual scrub at the end of the test. + """ + self.enable_scrub_on_exit = False + def overlay_cleanup_teardown(self): """ Unmount the overlayfs mounts created by `self.overlay_mount()`. @@ -877,23 +884,6 @@ class NeonEnvBuilder: # assert all overlayfs mounts in our test directory are gone assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) - def enable_scrub_on_exit(self): - """ - Call this if you would like the fixture to automatically run - storage_scrubber at the end of the test, as a bidirectional test - that the scrubber is working properly, and that the code within - the test didn't produce any invalid remote state. - """ - - if not isinstance(self.pageserver_remote_storage, S3Storage): - # The scrubber can't talk to e.g. LocalFS -- it needs - # an HTTP endpoint (mock is fine) to connect to. 
- raise RuntimeError( - "Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint" - ) - - self.scrub_on_exit = True - def enable_pageserver_remote_storage( self, remote_storage_kind: RemoteStorageKind, @@ -995,7 +985,12 @@ class NeonEnvBuilder: ) cleanup_error = None - if self.scrub_on_exit: + # If we are running with S3Storage (required by the scrubber), check that whatever the test + # did does not generate any corruption + if ( + isinstance(self.env.pageserver_remote_storage, S3Storage) + and self.enable_scrub_on_exit + ): try: self.env.storage_scrubber.scan_metadata() except Exception as e: diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index dccc1264e3..68a45f957c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -13,7 +13,6 @@ from fixtures.utils import wait_until # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() # We inject a delay of 15 seconds for tenant activation below. # Hence, bump the max delay here to not skip over the activation. @@ -161,7 +160,6 @@ def test_pageserver_chaos( pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index f43141c2d8..53f69b5b26 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -390,6 +390,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # (reproduce https://github.com/neondatabase/neon/issues/6802) pageserver_b.http_client().tenant_delete(tenant_id) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): """ @@ -589,6 +592,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) workload.stop() + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index d5b5ac3f75..6f7ea0092a 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -138,7 +138,6 @@ def test_pg_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count, @@ -202,7 +201,6 @@ def test_isolation( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) @@ -265,7 +263,6 @@ def test_sql_regress( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = 
neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9c45af7c1b..bc43bc77fa 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -47,7 +47,6 @@ def test_sharding_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() neon_env_builder.preserve_database_files = True @@ -128,7 +127,6 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then env.storage_scrubber.scan_metadata() - neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( @@ -373,7 +371,6 @@ def test_sharding_split_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() neon_env_builder.preserve_database_files = True diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 6d20b3d0de..c343b349cf 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -315,6 +315,9 @@ def test_tenant_delete_races_timeline_creation( # Zero tenants remain (we deleted the default tenant) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index da37f469b3..6d96dda391 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -485,6 +485,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), ) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + @pytest.mark.parametrize( "stuck_failpoint", @@ -703,6 +706,9 @@ def test_timeline_delete_works_for_remote_smoke( # Assume it is mock server inconsistency and check twice. wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_delete_orphaned_objects( neon_env_builder: NeonEnvBuilder,
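
---

Not part of the patch: a minimal sketch of how a test opts out of the now-default teardown scrub after this change. The builder methods (`enable_pageserver_remote_storage`, `init_start`, `disable_scrub_on_exit`) and `s3_storage()` are the fixtures touched by the diff above; the test name, body, and the `s3_storage` import path are assumptions for illustration.

```python
# Sketch only, assuming the fixture API shown in the diff above.
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import s3_storage  # assumed import path


def test_deletes_only_tenant(neon_env_builder: NeonEnvBuilder):  # hypothetical test
    # With S3-compatible remote storage, the fixture now runs the storage
    # scrubber on teardown automatically; no enable_scrub_on_exit() call
    # is needed any more.
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start()

    # ... test body that intentionally deletes its only tenant ...
    env.pageserver.http_client().tenant_delete(env.initial_tenant)

    # The scrubber fails when it finds no tenants, so such a test must
    # explicitly opt out of the automatic teardown scrub.
    neon_env_builder.disable_scrub_on_exit()
```

Tests backed by `LocalFsStorage` are unaffected: the teardown check only scrubs when the remote storage is an `S3Storage`, which the scrubber requires.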