From a1f37cba1c790e5b89958fb7df13cde39429add8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Mon, 12 Feb 2024 19:15:21 +0100
Subject: [PATCH] Add test that runs the S3 scrubber (#6641)

In #6079 it was found that there is no test that executes the scrubber.
We now add such a test, which does the following things:

* create a tenant, write some data
* run the scrubber
* remove the tenant
* run the scrubber again

Each time, we run the scrubber's scan-metadata command. Before #6079 we
would have errored; now we don't.

Fixes #6080
---
 test_runner/fixtures/neon_fixtures.py        |  8 ++--
 .../regress/test_pageserver_generations.py   |  4 +-
 .../regress/test_pageserver_secondary.py     |  2 +-
 test_runner/regress/test_tenant_delete.py    | 40 ++++++++++++++++++-
 4 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index faa8effe10..26f2b999a6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -899,7 +899,7 @@ class NeonEnvBuilder:
 
         if self.scrub_on_exit:
             try:
-                S3Scrubber(self.test_output_dir, self).scan_metadata()
+                S3Scrubber(self).scan_metadata()
             except Exception as e:
                 log.error(f"Error during remote storage scrub: {e}")
                 cleanup_error = e
@@ -3659,9 +3659,9 @@ class SafekeeperHttpClient(requests.Session):
 
 
 class S3Scrubber:
-    def __init__(self, log_dir: Path, env: NeonEnvBuilder):
+    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
         self.env = env
-        self.log_dir = log_dir
+        self.log_dir = log_dir or env.test_output_dir
 
     def scrubber_cli(self, args: list[str], timeout) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -3682,7 +3682,7 @@ class S3Scrubber:
         args = base_args + args
 
         (output_path, stdout, status_code) = subprocess_capture(
-            self.log_dir,
+            self.env.test_output_dir,
             args,
             echo_stderr=True,
             echo_stdout=True,
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 725ed63d1c..de9f3b6945 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -265,9 +265,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
     # Having written a mixture of generation-aware and legacy index_part.json,
     # ensure the scrubber handles the situation as expected.
-    metadata_summary = S3Scrubber(
-        neon_env_builder.test_output_dir, neon_env_builder
-    ).scan_metadata()
+    metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
     assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
     assert metadata_summary["timeline_count"] == 1
     assert metadata_summary["timeline_shard_count"] == 1
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 293152dd62..aec989252c 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -498,7 +498,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     # Scrub the remote storage
     # ========================
     # This confirms that the scrubber isn't upset by the presence of the heatmap
-    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
+    S3Scrubber(neon_env_builder).scan_metadata()
 
     # Detach secondary and delete tenant
     # ===================================
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index b4e5a550f3..e928ea8bb1 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -9,6 +9,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
+    S3Scrubber,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
@@ -19,12 +20,13 @@ from fixtures.pageserver.utils import (
     assert_prefix_not_empty,
     poll_for_remote_storage_iterations,
     tenant_delete_wait_completed,
+    wait_for_upload,
     wait_tenant_status_404,
     wait_until_tenant_active,
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
-from fixtures.types import TenantId, TimelineId
+from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout
 
@@ -669,3 +671,39 @@ def test_tenant_delete_races_timeline_creation(
 
     # Zero tenants remain (we deleted the default tenant)
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
+
+
+def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+    """
+    Validate that creating and then deleting the tenant both survive the scrubber,
+    i.e. that one can run the scrubber at each stage without problems.
+    """
+
+    remote_storage_kind = RemoteStorageKind.MOCK_S3
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    scrubber = S3Scrubber(neon_env_builder)
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    ps_http = env.pageserver.http_client()
+    # create a tenant separate from the main tenant so that we have one remaining
+    # after we delete it, as the scrubber treats empty buckets as an error.
+    (tenant_id, timeline_id) = env.neon_cli.create_tenant()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+    env.stop()
+
+    result = scrubber.scan_metadata()
+    assert result["with_warnings"] == []
+
+    env.start()
+    ps_http = env.pageserver.http_client()
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    env.stop()
+
+    result = scrubber.scan_metadata()
+    assert result["with_warnings"] == []
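
For test authors picking up the new fixture API: below is a minimal sketch of
the S3Scrubber calling convention after this patch. It uses only names that
appear in the diff above; the helper function `check_bucket` and its setup are
illustrative assumptions, not part of the patch.

# A minimal sketch, assuming a NeonEnvBuilder whose pageserver remote
# storage is already enabled (MOCK_S3 or real S3), as in the tests above.
# `check_bucket` is a hypothetical helper name, not part of this patch.
from fixtures.neon_fixtures import NeonEnvBuilder, S3Scrubber


def check_bucket(neon_env_builder: NeonEnvBuilder) -> None:
    # log_dir is now optional and defaults to the builder's test_output_dir,
    # so tests only need to pass the builder itself.
    scrubber = S3Scrubber(neon_env_builder)

    # scan-metadata walks the bucket and returns a summary; the tests in
    # this patch read keys such as "tenant_count" and "with_warnings".
    summary = scrubber.scan_metadata()

    # The scrubber treats an empty bucket as an error, so at least one
    # tenant must still exist in remote storage when this runs.
    assert summary["tenant_count"] >= 1
    assert summary["with_warnings"] == []

This is also why test_tenant_delete_scrubber creates a second tenant before
deleting one: the bucket must not end up empty between the two scans.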