Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-04 20:12:54 +00:00)
Add test that runs the S3 scrubber (#6641)
In #6079 it was found that there was no test that executes the scrubber. We now add such a test, which does the following things:

* create a tenant, write some data
* run the scrubber
* remove the tenant
* run the scrubber again

Each time, the scrubber runs the scan-metadata command. Before #6079 this would have errored; now it doesn't.

Fixes #6080
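For orientation, here is a minimal sketch of how a regression test might drive the scrubber through the new fixture API. It assumes neon's test_runner environment; the test name and the exact assertions are illustrative and not part of this commit, while the fixture calls (enable_pageserver_remote_storage, init_start, S3Scrubber(...).scan_metadata()) and the tenant_count / with_warnings result keys are the ones that appear in the diff below.

from fixtures.neon_fixtures import NeonEnvBuilder, S3Scrubber
from fixtures.remote_storage import RemoteStorageKind


def test_scrubber_scan_metadata_sketch(neon_env_builder: NeonEnvBuilder):
    # The scrubber scans remote storage, so the env needs an S3-compatible backend.
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
    neon_env_builder.init_start()

    # After this commit, S3Scrubber takes the env builder directly; its log
    # directory defaults to env.test_output_dir.
    metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
    assert metadata_summary["tenant_count"] >= 1
    assert metadata_summary["with_warnings"] == []

The same scan also runs automatically during NeonEnvBuilder teardown when scrub_on_exit is set, which is the path touched by the first hunk below.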
@@ -899,7 +899,7 @@ class NeonEnvBuilder:
         if self.scrub_on_exit:
             try:
-                S3Scrubber(self.test_output_dir, self).scan_metadata()
+                S3Scrubber(self).scan_metadata()
             except Exception as e:
                 log.error(f"Error during remote storage scrub: {e}")
                 cleanup_error = e
@@ -3659,9 +3659,9 @@ class SafekeeperHttpClient(requests.Session):
 
 
 class S3Scrubber:
-    def __init__(self, log_dir: Path, env: NeonEnvBuilder):
+    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
         self.env = env
-        self.log_dir = log_dir
+        self.log_dir = log_dir or env.test_output_dir
 
     def scrubber_cli(self, args: list[str], timeout) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -3682,7 +3682,7 @@ class S3Scrubber:
         args = base_args + args
 
         (output_path, stdout, status_code) = subprocess_capture(
-            self.log_dir,
+            self.env.test_output_dir,
             args,
             echo_stderr=True,
             echo_stdout=True,
@@ -265,9 +265,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
     # Having written a mixture of generation-aware and legacy index_part.json,
     # ensure the scrubber handles the situation as expected.
-    metadata_summary = S3Scrubber(
-        neon_env_builder.test_output_dir, neon_env_builder
-    ).scan_metadata()
+    metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
     assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
     assert metadata_summary["timeline_count"] == 1
     assert metadata_summary["timeline_shard_count"] == 1
@@ -498,7 +498,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     # Scrub the remote storage
     # ========================
     # This confirms that the scrubber isn't upset by the presence of the heatmap
-    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
+    S3Scrubber(neon_env_builder).scan_metadata()
 
     # Detach secondary and delete tenant
     # ===================================
@@ -9,6 +9,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
+    S3Scrubber,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
@@ -19,12 +20,13 @@ from fixtures.pageserver.utils import (
     assert_prefix_not_empty,
     poll_for_remote_storage_iterations,
     tenant_delete_wait_completed,
+    wait_for_upload,
     wait_tenant_status_404,
     wait_until_tenant_active,
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
-from fixtures.types import TenantId, TimelineId
+from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout
@@ -669,3 +671,39 @@ def test_tenant_delete_races_timeline_creation(
 
     # Zero tenants remain (we deleted the default tenant)
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
+
+
+def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+    """
+    Validate that creating and then deleting the tenant both survive the scrubber,
+    and that one can run the scrubber without problems.
+    """
+
+    remote_storage_kind = RemoteStorageKind.MOCK_S3
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    scrubber = S3Scrubber(neon_env_builder)
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    ps_http = env.pageserver.http_client()
+    # create a tenant separate from the main tenant so that we have one remaining
+    # after we deleted it, as the scrubber treats empty buckets as an error.
+    (tenant_id, timeline_id) = env.neon_cli.create_tenant()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+    env.stop()
+
+    result = scrubber.scan_metadata()
+    assert result["with_warnings"] == []
+
+    env.start()
+    ps_http = env.pageserver.http_client()
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    env.stop()
+
+    result = scrubber.scan_metadata()
+    assert result["with_warnings"] == []