mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 23:20:40 +00:00
scrubber: fix spurious "Missed some shards" errors (#8661)
## Problem The storage scrubber was reporting warnings for lots of timelines like: ``` WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c ``` These were spurious: these tenants are fine. There was a bug in accumulating the ShardIndex for each tenant, whereby multiple timelines would lead us to add the same ShardIndex more than once. Closes: #8646 ## Summary of changes - Accumulate ShardIndex in a BTreeSet instead of a Vec - Extend the test to reproduce the issue
This commit is contained in:
@@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors(
|
||||
},
|
||||
)
|
||||
|
||||
# Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines
|
||||
env.storage_controller.pageserver_api().timeline_create(
|
||||
env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
|
||||
)
|
||||
|
||||
# Make sure the original shard has some layers
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
@@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors(
|
||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||
env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately
|
||||
|
||||
# Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
|
||||
env.storage_controller.pageserver_api().timeline_create(
|
||||
env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
|
||||
)
|
||||
|
||||
# Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
|
||||
# compacts, and we only want to do that explicitly later in the test.
|
||||
workload.write_rows(100, upload=False)
|
||||
@@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
|
||||
# Make sure the original shard has some layers
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(100)
|
||||
workload.write_rows(100, upload=False)
|
||||
workload.stop()
|
||||
|
||||
new_shard_count = 4
|
||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
|
||||
ps.http_client().timeline_checkpoint(
|
||||
shard, timeline_id, compact=False, wait_until_uploaded=True
|
||||
)
|
||||
|
||||
ps.http_client().deletion_queue_flush(execute=True)
|
||||
|
||||
# Create a second timeline so that when we delete the first one, child shards still have some content in S3.
|
||||
#
|
||||
@@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
|
||||
PgVersion.NOT_SET, tenant_id, other_timeline_id
|
||||
)
|
||||
|
||||
# Write after split so that child shards have some indices in S3
|
||||
workload.write_rows(100, upload=False)
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
|
||||
ps.http_client().timeline_checkpoint(
|
||||
shard, timeline_id, compact=False, wait_until_uploaded=True
|
||||
)
|
||||
|
||||
# The timeline still exists in child shards and they reference its layers, so scrubbing
|
||||
# now shouldn't delete anything.
|
||||
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
|
||||
|
||||
Reference in New Issue
Block a user