scrubber: fix spurious "Missed some shards" errors (#8661)

## Problem

The storage scrubber was reporting warnings for lots of timelines like:
```
WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c
```

These were spurious: these tenants are fine. There was a bug in
accumulating the ShardIndex for each tenant, whereby multiple timelines
would lead us to add the same ShardIndex more than once.

Closes: #8646 

## Summary of changes

- Accumulate ShardIndex in a BTreeSet instead of a Vec
- Extend the test to reproduce the issue
This commit is contained in:
John Spray
2024-08-14 09:29:06 +01:00
committed by GitHub
parent 7a1736ddcf
commit 4049d2b7e1
3 changed files with 29 additions and 16 deletions

View File

@@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors(
},
)
# Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines
env.storage_controller.pageserver_api().timeline_create(
env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
)
# Make sure the original shard has some layers
workload = Workload(env, tenant_id, timeline_id)
workload.init()
@@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors(
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately
# Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
env.storage_controller.pageserver_api().timeline_create(
env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
)
# Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
# compacts, and we only want to do that explicitly later in the test.
workload.write_rows(100, upload=False)
@@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
# Make sure the original shard has some layers
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(100)
workload.write_rows(100, upload=False)
workload.stop()
new_shard_count = 4
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
for shard in shards:
ps = env.get_tenant_pageserver(shard)
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
ps.http_client().timeline_checkpoint(
shard, timeline_id, compact=False, wait_until_uploaded=True
)
ps.http_client().deletion_queue_flush(execute=True)
# Create a second timeline so that when we delete the first one, child shards still have some content in S3.
#
@@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
PgVersion.NOT_SET, tenant_id, other_timeline_id
)
# Write after split so that child shards have some indices in S3
workload.write_rows(100, upload=False)
for shard in shards:
ps = env.get_tenant_pageserver(shard)
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
ps.http_client().timeline_checkpoint(
shard, timeline_id, compact=False, wait_until_uploaded=True
)
# The timeline still exists in child shards and they reference its layers, so scrubbing
# now shouldn't delete anything.
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")