tests: always scrub on test exit when using S3Storage (#8437)

## Problem

Currently, tests may have a scrub during teardown if they ask for it,
but most tests don't request it. To detect "unknown unknowns", let's run
it at the end of every test where possible. This is similar to asserting
that there are no errors in the log at the end of tests.

## Summary of changes

- Remove explicit `enable_scrub_on_exit`
- Always scrub if remote storage is an S3Storage.
This commit is contained in:
John Spray
2024-07-25 14:19:38 +01:00
committed by GitHub
parent 9c5ad21341
commit 24ea9f9f60
7 changed files with 29 additions and 27 deletions

View File

@@ -523,7 +523,7 @@ class NeonEnvBuilder:
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
self.scrub_on_exit = False
self.enable_scrub_on_exit = True
self.test_output_dir = test_output_dir
self.test_overlay_dir = test_overlay_dir
self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
@@ -852,6 +852,13 @@ class NeonEnvBuilder:
)
ident_state_dir.rmdir() # should be empty since we moved `upper` out
def disable_scrub_on_exit(self):
"""
Some tests intentionally leave the remote storage contents empty or corrupt,
so it doesn't make sense to do the usual scrub at the end of the test.
"""
self.enable_scrub_on_exit = False
def overlay_cleanup_teardown(self):
"""
Unmount the overlayfs mounts created by `self.overlay_mount()`.
@@ -877,23 +884,6 @@ class NeonEnvBuilder:
# assert all overlayfs mounts in our test directory are gone
assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir))
def enable_scrub_on_exit(self):
"""
Call this if you would like the fixture to automatically run
storage_scrubber at the end of the test, as a bidirectional test
that the scrubber is working properly, and that the code within
the test didn't produce any invalid remote state.
"""
if not isinstance(self.pageserver_remote_storage, S3Storage):
# The scrubber can't talk to e.g. LocalFS -- it needs
# an HTTP endpoint (mock is fine) to connect to.
raise RuntimeError(
"Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint"
)
self.scrub_on_exit = True
def enable_pageserver_remote_storage(
self,
remote_storage_kind: RemoteStorageKind,
@@ -995,7 +985,12 @@ class NeonEnvBuilder:
)
cleanup_error = None
if self.scrub_on_exit:
# If we are running with S3Storage (required by the scrubber), check that whatever the test
# did does not generate any corruption
if (
isinstance(self.env.pageserver_remote_storage, S3Storage)
and self.enable_scrub_on_exit
):
try:
self.env.storage_scrubber.scan_metadata()
except Exception as e:

View File

@@ -13,7 +13,6 @@ from fixtures.utils import wait_until
# running.
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
# We inject a delay of 15 seconds for tenant activation below.
# Hence, bump the max delay here to not skip over the activation.
@@ -161,7 +160,6 @@ def test_pageserver_chaos(
pytest.skip("times out in debug builds")
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count

View File

@@ -390,6 +390,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
# (reproduce https://github.com/neondatabase/neon/issues/6802)
pageserver_b.http_client().tenant_delete(tenant_id)
# We deleted our only tenant, and the scrubber fails if it detects nothing
neon_env_builder.disable_scrub_on_exit()
def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
"""
@@ -589,6 +592,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
)
workload.stop()
# We deleted our only tenant, and the scrubber fails if it detects nothing
neon_env_builder.disable_scrub_on_exit()
def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
"""

View File

@@ -138,7 +138,6 @@ def test_pg_regress(
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF,
initial_tenant_shard_count=shard_count,
@@ -202,7 +201,6 @@ def test_isolation(
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
)
@@ -265,7 +263,6 @@ def test_sql_regress(
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
)

View File

@@ -47,7 +47,6 @@ def test_sharding_smoke(
# Use S3-compatible remote storage so that we can scrub: this test validates
# that the scrubber doesn't barf when it sees a sharded tenant.
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
neon_env_builder.preserve_database_files = True
@@ -128,7 +127,6 @@ def test_sharding_smoke(
# Check the scrubber isn't confused by sharded content, then disable
# it during teardown because we'll have deleted by then
env.storage_scrubber.scan_metadata()
neon_env_builder.scrub_on_exit = False
env.storage_controller.pageserver_api().tenant_delete(tenant_id)
assert_prefix_empty(
@@ -373,7 +371,6 @@ def test_sharding_split_smoke(
# Use S3-compatible remote storage so that we can scrub: this test validates
# that the scrubber doesn't barf when it sees a sharded tenant.
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
neon_env_builder.preserve_database_files = True

View File

@@ -315,6 +315,9 @@ def test_tenant_delete_races_timeline_creation(
# Zero tenants remain (we deleted the default tenant)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
# We deleted our only tenant, and the scrubber fails if it detects nothing
neon_env_builder.disable_scrub_on_exit()
def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
"""

View File

@@ -485,6 +485,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage),
)
# We deleted our only tenant, and the scrubber fails if it detects nothing
neon_env_builder.disable_scrub_on_exit()
@pytest.mark.parametrize(
"stuck_failpoint",
@@ -703,6 +706,9 @@ def test_timeline_delete_works_for_remote_smoke(
# Assume it is mock server inconsistency and check twice.
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage))
# We deleted our only tenant, and the scrubber fails if it detects nothing
neon_env_builder.disable_scrub_on_exit()
def test_delete_orphaned_objects(
neon_env_builder: NeonEnvBuilder,