Add test that runs the S3 scrubber (#6641)

In #6079 it was found that there was no test that executes the scrubber.
We now add such a test, which does the following:

* create a tenant, write some data
* run the scrubber
* remove the tenant
* run the scrubber again

Each time, the scrubber runs the scan-metadata command. Before #6079 this
would have errored; now it doesn't.
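
In test terms the flow is roughly the following (a condensed sketch using the
fixtures touched by this commit; the full version is test_tenant_delete_scrubber
in the last file of the diff, and the function name here is made up):

    from fixtures.neon_fixtures import NeonEnvBuilder, S3Scrubber
    from fixtures.pageserver.utils import (
        poll_for_remote_storage_iterations,
        tenant_delete_wait_completed,
    )
    from fixtures.remote_storage import RemoteStorageKind


    def test_scrubber_flow_sketch(neon_env_builder: NeonEnvBuilder):
        # Sketch only: the real test also writes data with pg_bench and waits
        # for the upload to remote storage before scrubbing.
        neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
        scrubber = S3Scrubber(neon_env_builder)
        env = neon_env_builder.init_start()
        (tenant_id, _timeline_id) = env.neon_cli.create_tenant()
        env.stop()
        scrubber.scan_metadata()  # first run: tenant present
        env.start()
        iterations = poll_for_remote_storage_iterations(RemoteStorageKind.MOCK_S3)
        tenant_delete_wait_completed(env.pageserver.http_client(), tenant_id, iterations)
        env.stop()
        scrubber.scan_metadata()  # second run: the tenant is gone, the main tenant remains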

Fixes #6080
Author: Arpad Müller
Date: 2024-02-12 19:15:21 +01:00 (committed by GitHub)
Parent: 8b8ff88e4b
Commit: a1f37cba1c
4 changed files with 45 additions and 9 deletions


@@ -899,7 +899,7 @@ class NeonEnvBuilder:
         if self.scrub_on_exit:
             try:
-                S3Scrubber(self.test_output_dir, self).scan_metadata()
+                S3Scrubber(self).scan_metadata()
             except Exception as e:
                 log.error(f"Error during remote storage scrub: {e}")
                 cleanup_error = e
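
For reference, the hunk above is the teardown path that tests hit when they opt
in to scrubbing. A minimal sketch of opting in (assuming scrub_on_exit can simply
be set on the builder, which the hunk suggests but this commit does not show):

    from fixtures.neon_fixtures import NeonEnvBuilder
    from fixtures.remote_storage import RemoteStorageKind


    def test_with_scrub_on_exit(neon_env_builder: NeonEnvBuilder):
        # Enable remote storage and ask the builder to run scan-metadata
        # automatically during teardown via the flag checked above.
        neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
        neon_env_builder.scrub_on_exit = True
        env = neon_env_builder.init_start()
        # ... exercise the pageserver; the scrub runs when the fixture tears down.
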
@@ -3659,9 +3659,9 @@ class SafekeeperHttpClient(requests.Session):
 class S3Scrubber:
-    def __init__(self, log_dir: Path, env: NeonEnvBuilder):
+    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
         self.env = env
-        self.log_dir = log_dir
+        self.log_dir = log_dir or env.test_output_dir
 
     def scrubber_cli(self, args: list[str], timeout) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -3682,7 +3682,7 @@ class S3Scrubber:
         args = base_args + args
 
         (output_path, stdout, status_code) = subprocess_capture(
-            self.log_dir,
+            self.env.test_output_dir,
             args,
             echo_stderr=True,
             echo_stdout=True,
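
With the new signature the log directory is optional and falls back to the
builder's test_output_dir, so callers can construct the scrubber from the
builder alone; passing an explicit directory remains possible. A sketch (the
second form is hypothetical usage, not something this commit adds, and the
directory would have to exist):

    # Default: scrubber output is captured under the test's output directory.
    scrubber = S3Scrubber(neon_env_builder)
    summary = scrubber.scan_metadata()

    # Hypothetical explicit override, e.g. to keep logs of repeated runs apart.
    scrubber = S3Scrubber(neon_env_builder, log_dir=neon_env_builder.test_output_dir / "scrub-2")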


@@ -265,9 +265,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     # Having written a mixture of generation-aware and legacy index_part.json,
     # ensure the scrubber handles the situation as expected.
-    metadata_summary = S3Scrubber(
-        neon_env_builder.test_output_dir, neon_env_builder
-    ).scan_metadata()
+    metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
     assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
     assert metadata_summary["timeline_count"] == 1
     assert metadata_summary["timeline_shard_count"] == 1


@@ -498,7 +498,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     # Scrub the remote storage
     # ========================
     # This confirms that the scrubber isn't upset by the presence of the heatmap
-    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
+    S3Scrubber(neon_env_builder).scan_metadata()
 
     # Detach secondary and delete tenant
     # ===================================


@@ -9,6 +9,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
+    S3Scrubber,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
@@ -19,12 +20,13 @@ from fixtures.pageserver.utils import (
     assert_prefix_not_empty,
     poll_for_remote_storage_iterations,
     tenant_delete_wait_completed,
+    wait_for_upload,
     wait_tenant_status_404,
     wait_until_tenant_active,
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
-from fixtures.types import TenantId, TimelineId
+from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout
@@ -669,3 +671,39 @@ def test_tenant_delete_races_timeline_creation(
     # Zero tenants remain (we deleted the default tenant)
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
+
+
+def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+    """
+    Validate that creating and then deleting the tenant both survive the scrubber,
+    and that one can run the scrubber without problems.
+    """
+
+    remote_storage_kind = RemoteStorageKind.MOCK_S3
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    scrubber = S3Scrubber(neon_env_builder)
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    ps_http = env.pageserver.http_client()
+    # Create a tenant separate from the main tenant so that one tenant remains
+    # after we delete it, as the scrubber treats empty buckets as an error.
+    (tenant_id, timeline_id) = env.neon_cli.create_tenant()
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+    env.stop()
+
+    result = scrubber.scan_metadata()
+    assert result["with_warnings"] == []
+
+    env.start()
+    ps_http = env.pageserver.http_client()
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    env.stop()
+
+    result = scrubber.scan_metadata()
+    assert result["with_warnings"] == []
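
One pattern worth noting in the new test: the pageserver is stopped before each
scan_metadata() call so the scrubber sees a quiescent bucket (and restarted in
between to perform the deletion). If more tests adopt this, it could be factored
into a small helper along these lines (hypothetical helper, not part of this
commit):

    def scan_metadata_while_stopped(env, scrubber: S3Scrubber) -> dict:
        # Quiesce the pageserver so no uploads race with the scrubber,
        # run scan-metadata, then bring the pageserver back up.
        env.stop()
        result = scrubber.scan_metadata()
        env.start()
        return result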