scrubber: fix spurious "Missed some shards" errors (#8661)

## Problem

The storage scrubber was reporting warnings like the following for many tenants:
```
WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c
```

These warnings were spurious: the tenants are fine. The bug was in how we accumulated the
ShardIndex for each tenant: every timeline we visited added that tenant's shard index again,
so a tenant with multiple timelines had the same ShardIndex recorded more than once and
appeared to have more shards than it really does.
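
A minimal, standalone sketch of the effect and of the fix (a plain tuple stands in for the real `ShardIndex` type; this is illustrative only, not the scrubber's actual code):

```rust
use std::collections::BTreeSet;

// Hypothetical stand-in for ShardIndex: (shard_number, shard_count).
type ShardIndexLike = (u8, u8);

fn main() {
    // An unsharded tenant (ShardCount(0)) that the scrubber visits once per timeline.
    let timelines = 3;
    let this_shard_idx: ShardIndexLike = (0, 0);

    // Old behaviour: every timeline visit pushes the same shard index again...
    let mut seen_vec: Vec<ShardIndexLike> = Vec::new();
    for _ in 0..timelines {
        seen_vec.push(this_shard_idx);
    }
    // ...so the tenant looks like it has 3 shards at ShardCount(0) instead of 1,
    // which is what trips the "Missed some shards" check.
    assert_eq!(seen_vec.len(), 3);

    // New behaviour: a BTreeSet keeps one entry per distinct shard index,
    // so revisiting the same shard for every timeline is idempotent.
    let mut seen_set: BTreeSet<ShardIndexLike> = BTreeSet::new();
    for _ in 0..timelines {
        seen_set.insert(this_shard_idx);
    }
    assert_eq!(seen_set.len(), 1);
}
```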

Closes: #8646 

## Summary of changes

- Accumulate ShardIndex in a BTreeSet instead of a Vec
- Extend the test to reproduce the issue

Commit: 4049d2b7e1 (parent: 7a1736ddcf)
Author: John Spray, committed via GitHub
Date: 2024-08-14 09:29:06 +01:00
3 changed files with 29 additions and 16 deletions


```diff
@@ -1787,9 +1787,11 @@ async fn timeline_checkpoint_handler(
         }
         if wait_until_uploaded {
+            tracing::info!("Waiting for uploads to complete...");
             timeline.remote_client.wait_completion().await
                 // XXX map to correct ApiError for the cases where it's due to shutdown
                 .context("wait completion").map_err(ApiError::InternalServerError)?;
+            tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0)));
         }
         json_response(StatusCode::OK, ())
```


```diff
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
@@ -117,7 +117,7 @@ use refs::AncestorRefs;
 // - Are there any refs to ancestor shards' layers?
 #[derive(Default)]
 struct TenantRefAccumulator {
-    shards_seen: HashMap<TenantId, Vec<ShardIndex>>,
+    shards_seen: HashMap<TenantId, BTreeSet<ShardIndex>>,
     // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
     ancestor_ref_shards: AncestorRefs,
@@ -130,7 +130,7 @@ impl TenantRefAccumulator {
             .shards_seen
             .entry(ttid.tenant_shard_id.tenant_id)
             .or_default())
-        .push(this_shard_idx);
+        .insert(this_shard_idx);
         let mut ancestor_refs = Vec::new();
         for (layer_name, layer_metadata) in &index_part.layer_metadata {
@@ -154,7 +154,7 @@ impl TenantRefAccumulator {
         summary: &mut GcSummary,
     ) -> (Vec<TenantShardId>, AncestorRefs) {
         let mut ancestors_to_gc = Vec::new();
-        for (tenant_id, mut shard_indices) in self.shards_seen {
+        for (tenant_id, shard_indices) in self.shards_seen {
             // Find the highest shard count
             let latest_count = shard_indices
                 .iter()
@@ -162,6 +162,7 @@ impl TenantRefAccumulator {
                 .max()
                 .expect("Always at least one shard");
+            let mut shard_indices = shard_indices.iter().collect::<Vec<_>>();
             let (mut latest_shards, ancestor_shards) = {
                 let at =
                     itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
@@ -174,7 +175,7 @@ impl TenantRefAccumulator {
             // to scan the S3 bucket halfway through a shard split.
             if latest_shards.len() != latest_count.count() as usize {
                 // This should be extremely rare, so we warn on it.
-                tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count);
+                tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count);
                 continue;
             }
@@ -212,7 +213,7 @@ impl TenantRefAccumulator {
             .iter()
             .map(|s| s.tenant_shard_id.to_index())
             .collect();
-        if controller_indices != latest_shards {
+        if !controller_indices.iter().eq(latest_shards.iter().copied()) {
             tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
             continue;
         }
```
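
One knock-on effect of the switch: the per-tenant shard indices are now collected out of the `BTreeSet` as a `Vec` of references, so the comparison against the controller's owned `Vec` is done element-wise via iterators rather than with `!=`. A small sketch of that pattern, with `u32` as a hypothetical stand-in for `ShardIndex`:

```rust
use std::collections::BTreeSet;

fn main() {
    // Owned shard indices as reported by the storage controller (u32 stands in for ShardIndex).
    let controller_indices: Vec<u32> = vec![0, 1, 2, 3];

    // Shard indices the scrubber saw in S3, accumulated in a BTreeSet and then
    // collected as references, mirroring the change above.
    let shards_seen: BTreeSet<u32> = [0, 1, 2, 3].into_iter().collect();
    let latest_shards: Vec<&u32> = shards_seen.iter().collect();

    // A plain `controller_indices != latest_shards` would not type-check
    // (Vec<u32> vs Vec<&u32>), so compare the elements instead; `.copied()`
    // turns the &&u32 iterator items back into &u32.
    assert!(controller_indices.iter().eq(latest_shards.iter().copied()));
}
```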


```diff
@@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors(
         },
     )
 
+    # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
     # Make sure the original shard has some layers
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
@@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors(
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
     env.storage_controller.reconcile_until_idle()  # Move shards to their final locations immediately
 
+    # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
     # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
     # compacts, and we only want to do that explicitly later in the test.
     workload.write_rows(100, upload=False)
@@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
     # Make sure the original shard has some layers
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
-    workload.write_rows(100)
+    workload.write_rows(100, upload=False)
+    workload.stop()
 
     new_shard_count = 4
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
+        ps.http_client().timeline_checkpoint(
+            shard, timeline_id, compact=False, wait_until_uploaded=True
+        )
+        ps.http_client().deletion_queue_flush(execute=True)
 
     # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
     #
@@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
         PgVersion.NOT_SET, tenant_id, other_timeline_id
     )
 
-    # Write after split so that child shards have some indices in S3
-    workload.write_rows(100, upload=False)
-    for shard in shards:
-        ps = env.get_tenant_pageserver(shard)
-        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
-        ps.http_client().timeline_checkpoint(
-            shard, timeline_id, compact=False, wait_until_uploaded=True
-        )
-
     # The timeline still exists in child shards and they reference its layers, so scrubbing
     # now shouldn't delete anything.
     gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
```