Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-08 05:52:55 +00:00)
s3_scrubber: prepare for scrubbing buckets with generation-aware content (#5700)
## Problem

The scrubber didn't know how to find the latest index_part when generations were in use.

## Summary of changes

- Teach the scrubber to do the same dance that the pageserver does when finding the latest index_part.json (see the sketch below).
- Teach the scrubber how to understand layer files with generation suffixes.
- General improvement to testability: scan_metadata has a machine-readable output that the testing `S3Scrubber` wrapper can read.
- Existing test coverage of the scrubber was false-passing because it simply saw no data, due to prefixing of data in the bucket. Fix that.

This is an incremental improvement: the more confidence we can have in the scrubber, the more we can use it in integration tests to validate the state of remote storage.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
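For context on the first bullet: the pageserver's "dance" is, roughly, to list the candidate `index_part.json` objects for a timeline and prefer the one from the newest generation, falling back to the legacy un-suffixed name. Below is a minimal Python sketch of that selection; the 8-hex-digit generation suffix format and the helper name are assumptions for illustration, not the scrubber's actual (Rust) implementation.

```python
import re
from typing import Optional

# Hypothetical helper: pick the most recent index_part.json among keys like
# "index_part.json" (legacy) and "index_part.json-000000ab" (generation-suffixed).
# The 8-hex-digit suffix format is an assumption for illustration.
INDEX_RE = re.compile(r"^index_part\.json(?:-(?P<gen>[0-9a-f]{8}))?$")

def pick_latest_index(keys: list[str]) -> Optional[str]:
    best_key = None
    best_gen = -1  # the legacy, un-suffixed index_part.json counts as generation -1
    for key in keys:
        m = INDEX_RE.match(key)
        if m is None:
            continue
        gen = int(m.group("gen"), 16) if m.group("gen") else -1
        if gen > best_gen:
            best_gen, best_key = gen, key
    return best_key

# Example: the file from the newest generation wins over the legacy one.
assert pick_latest_index(["index_part.json", "index_part.json-00000002"]) == "index_part.json-00000002"
```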
```diff
@@ -2968,24 +2968,33 @@ class S3Scrubber:
         self.env = env
         self.log_dir = log_dir
 
-    def scrubber_cli(self, args, timeout):
+    def scrubber_cli(self, args: list[str], timeout) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
         s3_storage = self.env.pageserver_remote_storage
 
         env = {
             "REGION": s3_storage.bucket_region,
             "BUCKET": s3_storage.bucket_name,
             "BUCKET_PREFIX": s3_storage.prefix_in_bucket,
             "RUST_LOG": "DEBUG",
         }
         env.update(s3_storage.access_env_vars())
 
         if s3_storage.endpoint is not None:
             env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
 
-        base_args = [self.env.neon_binpath / "s3_scrubber"]
+        base_args = [str(self.env.neon_binpath / "s3_scrubber")]
         args = base_args + args
 
-        (output_path, _, status_code) = subprocess_capture(
-            self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
+        (output_path, stdout, status_code) = subprocess_capture(
+            self.log_dir,
+            args,
+            echo_stderr=True,
+            echo_stdout=True,
+            env=env,
+            check=False,
+            capture_stdout=True,
+            timeout=timeout,
         )
         if status_code:
             log.warning(f"Scrub command {args} failed")
```
```diff
@@ -2994,8 +3003,18 @@ class S3Scrubber:
 
             raise RuntimeError("Remote storage scrub failed")
 
-    def scan_metadata(self):
-        self.scrubber_cli(["scan-metadata"], timeout=30)
+        assert stdout is not None
+        return stdout
+
+    def scan_metadata(self) -> Any:
+        stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
+
+        try:
+            return json.loads(stdout)
+        except:
+            log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
+            log.error(stdout)
+            raise
 
 
 def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
```
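A note on the new contract: `scrubber_cli` now returns the scrubber's captured stdout, and `scan_metadata` runs `scan-metadata --json` and parses that output with `json.loads`. The test added later in this commit asserts on the keys `count`, `with_errors`, and `with_warnings`; below is a minimal sketch of consuming such a parsed summary, where the value types in the hand-written example are assumptions.

```python
from typing import Any

def assert_scrub_clean(summary: dict[str, Any], expected_count: int) -> None:
    """Mirror the assertions the new test makes against a parsed scan-metadata
    summary. Only count / with_errors / with_warnings are keys known from this
    commit; the value types in the example below are assumed."""
    assert summary["count"] == expected_count, f"unexpected timeline count: {summary}"
    assert not summary["with_errors"], f"scrubber reported errors: {summary}"
    assert not summary["with_warnings"], f"scrubber reported warnings: {summary}"

# Example with a hand-written summary of the assumed shape:
assert_scrub_clean({"count": 1, "with_errors": [], "with_warnings": []}, expected_count=1)
```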
```diff
@@ -35,6 +35,7 @@ def subprocess_capture(
     echo_stderr=False,
     echo_stdout=False,
     capture_stdout=False,
+    timeout=None,
     **kwargs: Any,
 ) -> Tuple[str, Optional[str], int]:
     """Run a process and bifurcate its output to files and the `log` logger
```
```diff
@@ -104,7 +105,7 @@ def subprocess_capture(
         stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
         stderr_handler.start()
 
-        r = p.wait()
+        r = p.wait(timeout=timeout)
 
         stdout_handler.join()
         stderr_handler.join()
```
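One standard-library detail behind the `timeout` plumbing: `Popen.wait(timeout=...)` raises `subprocess.TimeoutExpired` rather than returning an exit code once the deadline passes, so callers of `subprocess_capture` should expect an exception, not a status code, on timeout. A standalone sketch of that behavior (not the fixture code itself):

```python
import subprocess

# Standard-library behaviour that the new `timeout` argument relies on:
# Popen.wait(timeout=...) raises subprocess.TimeoutExpired instead of
# returning an exit code once the deadline passes.
p = subprocess.Popen(["sleep", "5"])
try:
    returncode = p.wait(timeout=1)
except subprocess.TimeoutExpired:
    p.kill()  # the child keeps running unless the caller stops it
    returncode = p.wait()
print("exit status:", returncode)
```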
```diff
@@ -21,6 +21,7 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
+    S3Scrubber,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
```
```diff
@@ -234,8 +235,22 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     assert len(suffixed_objects) > 0
     assert len(legacy_objects) > 0
 
+    # Flush through deletions to get a clean state for scrub: we are implicitly validating
+    # that our generations-enabled pageserver was able to do deletions of layers
+    # from earlier which don't have a generation.
+    env.pageserver.http_client().deletion_queue_flush(execute=True)
+
+    assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
+
+    # Having written a mixture of generation-aware and legacy index_part.json,
+    # ensure the scrubber handles the situation as expected.
+    metadata_summary = S3Scrubber(
+        neon_env_builder.test_output_dir, neon_env_builder
+    ).scan_metadata()
+    assert metadata_summary["count"] == 1  # Scrubber should have seen our timeline
+    assert not metadata_summary["with_errors"]
+    assert not metadata_summary["with_warnings"]
 
 
 def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_generations = True
```