scrubber: fix spurious "Missed some shards" errors (#8661)

## Problem The storage scrubber was reporting warnings for lots of timelines like: ``` WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c ``` These were spurious: these tenants are fine. There was a bug in accumulating the ShardIndex for each tenant, whereby multiple timelines would lead us to add the same ShardIndex more than once. Closes: #8646 ## Summary of changes - Accumulate ShardIndex in a BTreeSet instead of a Vec - Extend the test to reproduce the issue
2026-05-21 23:20:40 +00:00 · 2024-08-14 09:29:06 +01:00
parent 7a1736ddcf
commit 4049d2b7e1
3 changed files with 29 additions and 16 deletions
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors(
        },
    )

+    # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
    # Make sure the original shard has some layers
    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
@@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors(
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
    env.storage_controller.reconcile_until_idle()  # Move shards to their final locations immediately

+    # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
    # Make sure child shards have some layers.  Do not force upload, because the test helper calls checkpoint, which
    # compacts, and we only want to do that explicitly later in the test.
    workload.write_rows(100, upload=False)
@@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
    # Make sure the original shard has some layers
    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
-    workload.write_rows(100)
+    workload.write_rows(100, upload=False)
+    workload.stop()

    new_shard_count = 4
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
+        ps.http_client().timeline_checkpoint(
+            shard, timeline_id, compact=False, wait_until_uploaded=True
+        )
+
+        ps.http_client().deletion_queue_flush(execute=True)

    # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
    #
@@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
        PgVersion.NOT_SET, tenant_id, other_timeline_id
    )

-    # Write after split so that child shards have some indices in S3
-    workload.write_rows(100, upload=False)
-    for shard in shards:
-        ps = env.get_tenant_pageserver(shard)
-        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
-        ps.http_client().timeline_checkpoint(
-            shard, timeline_id, compact=False, wait_until_uploaded=True
-        )
-
    # The timeline still exists in child shards and they reference its layers, so scrubbing
    # now shouldn't delete anything.
    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")