From fcff7528517b47f79a55334c22dc6dc89c113be1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 13 Dec 2024 17:28:21 +0100 Subject: [PATCH] fix(test_timeline_archival_chaos): flakiness caused by orphan layers (#10083) The test was failing with the scary but generic message `Remote storage metadata corrupted`. The underlying scrubber error is `Orphan layer detected: ...`. The test kills pageserver at random points, hence it's expected that we leak layers if we're killed in the window after layer upload but before it's referenced from index part. Refer to generation numbers RFC for details. Refs: - fixes https://github.com/neondatabase/neon/issues/9988 - root-cause analysis https://github.com/neondatabase/neon/issues/9988#issuecomment-2520673167 --- test_runner/regress/test_timeline_archive.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index e808dd1396..addf702893 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -435,6 +435,14 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): ] ) + env.storage_scrubber.allowed_errors.extend( + [ + # Unclean shutdowns of pageserver can legitimately result in orphan layers + # (https://github.com/neondatabase/neon/issues/9988#issuecomment-2520558211) + f".*Orphan layer detected: tenants/{tenant_id}/.*" + ] + ) + class TimelineState: def __init__(self): self.timeline_id = TimelineId.generate()