s3_scrubber: prepare for scrubbing buckets with generation-aware content (#5700)

## Problem

The scrubber didn't know how to find the latest index_part when
generations were in use.

## Summary of changes

- Teach the scrubber to do the same dance that pageserver does when
finding the latest index_part.json
- Teach the scrubber how to understand layer files with generation
suffixes.
- General improvement to testability: scan_metadata has machine-readable
output that the testing `S3Scrubber` wrapper can read.
- Existing test coverage of the scrubber was falsely passing because it
saw no data, due to the prefixing of data in the bucket. Fix that.

This is an incremental improvement: the more confidence we can have in the
scrubber, the more we can use it in integration tests to validate the
state of remote storage.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
This commit is contained in:
John Spray
2023-11-03 17:36:02 +00:00
committed by GitHub
parent 5ceccdc7de
commit 306c4f9967
13 changed files with 209 additions and 82 deletions

View File

@@ -2968,24 +2968,33 @@ class S3Scrubber:
self.env = env
self.log_dir = log_dir
def scrubber_cli(self, args, timeout):
def scrubber_cli(self, args: list[str], timeout) -> str:
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
s3_storage = self.env.pageserver_remote_storage
env = {
"REGION": s3_storage.bucket_region,
"BUCKET": s3_storage.bucket_name,
"BUCKET_PREFIX": s3_storage.prefix_in_bucket,
"RUST_LOG": "DEBUG",
}
env.update(s3_storage.access_env_vars())
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [self.env.neon_binpath / "s3_scrubber"]
base_args = [str(self.env.neon_binpath / "s3_scrubber")]
args = base_args + args
(output_path, _, status_code) = subprocess_capture(
self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
(output_path, stdout, status_code) = subprocess_capture(
self.log_dir,
args,
echo_stderr=True,
echo_stdout=True,
env=env,
check=False,
capture_stdout=True,
timeout=timeout,
)
if status_code:
log.warning(f"Scrub command {args} failed")
@@ -2994,8 +3003,18 @@ class S3Scrubber:
raise RuntimeError("Remote storage scrub failed")
def scan_metadata(self):
self.scrubber_cli(["scan-metadata"], timeout=30)
assert stdout is not None
return stdout
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
try:
return json.loads(stdout)
except:
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
log.error(stdout)
raise
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:

View File

@@ -35,6 +35,7 @@ def subprocess_capture(
echo_stderr=False,
echo_stdout=False,
capture_stdout=False,
timeout=None,
**kwargs: Any,
) -> Tuple[str, Optional[str], int]:
"""Run a process and bifurcate its output to files and the `log` logger
@@ -104,7 +105,7 @@ def subprocess_capture(
stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
stderr_handler.start()
r = p.wait()
r = p.wait(timeout=timeout)
stdout_handler.join()
stderr_handler.join()

View File

@@ -21,6 +21,7 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
S3Scrubber,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
@@ -234,8 +235,22 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
assert len(suffixed_objects) > 0
assert len(legacy_objects) > 0
# Flush through deletions to get a clean state for scrub: we are implicitly validating
# that our generations-enabled pageserver was able to do deletions of layers
# from earlier which don't have a generation.
env.pageserver.http_client().deletion_queue_flush(execute=True)
assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
metadata_summary = S3Scrubber(
neon_env_builder.test_output_dir, neon_env_builder
).scan_metadata()
assert metadata_summary["count"] == 1 # Scrubber should have seen our timeline
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_generations = True