mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-26 23:59:58 +00:00
scrubber: fix spurious "Missed some shards" errors (#8661)
## Problem The storage scrubber was reporting warnings for lots of timelines like: ``` WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c ``` These were spurious: these tenants are fine. There was a bug in accumulating the ShardIndex for each tenant, whereby multiple timelines would lead us to add the same ShardIndex more than one. Closes: #8646 ## Summary of changes - Accumulate ShardIndex in a BTreeSet instead of a Vec - Extend the test to reproduce the issue
This commit is contained in:
@@ -1787,9 +1787,11 @@ async fn timeline_checkpoint_handler(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if wait_until_uploaded {
|
if wait_until_uploaded {
|
||||||
|
tracing::info!("Waiting for uploads to complete...");
|
||||||
timeline.remote_client.wait_completion().await
|
timeline.remote_client.wait_completion().await
|
||||||
// XXX map to correct ApiError for the cases where it's due to shutdown
|
// XXX map to correct ApiError for the cases where it's due to shutdown
|
||||||
.context("wait completion").map_err(ApiError::InternalServerError)?;
|
.context("wait completion").map_err(ApiError::InternalServerError)?;
|
||||||
|
tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0)));
|
||||||
}
|
}
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use std::collections::{BTreeMap, HashMap};
|
use std::collections::{BTreeMap, BTreeSet, HashMap};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, SystemTime};
|
use std::time::{Duration, SystemTime};
|
||||||
|
|
||||||
@@ -117,7 +117,7 @@ use refs::AncestorRefs;
|
|||||||
// - Are there any refs to ancestor shards' layers?
|
// - Are there any refs to ancestor shards' layers?
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct TenantRefAccumulator {
|
struct TenantRefAccumulator {
|
||||||
shards_seen: HashMap<TenantId, Vec<ShardIndex>>,
|
shards_seen: HashMap<TenantId, BTreeSet<ShardIndex>>,
|
||||||
|
|
||||||
// For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
|
// For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
|
||||||
ancestor_ref_shards: AncestorRefs,
|
ancestor_ref_shards: AncestorRefs,
|
||||||
@@ -130,7 +130,7 @@ impl TenantRefAccumulator {
|
|||||||
.shards_seen
|
.shards_seen
|
||||||
.entry(ttid.tenant_shard_id.tenant_id)
|
.entry(ttid.tenant_shard_id.tenant_id)
|
||||||
.or_default())
|
.or_default())
|
||||||
.push(this_shard_idx);
|
.insert(this_shard_idx);
|
||||||
|
|
||||||
let mut ancestor_refs = Vec::new();
|
let mut ancestor_refs = Vec::new();
|
||||||
for (layer_name, layer_metadata) in &index_part.layer_metadata {
|
for (layer_name, layer_metadata) in &index_part.layer_metadata {
|
||||||
@@ -154,7 +154,7 @@ impl TenantRefAccumulator {
|
|||||||
summary: &mut GcSummary,
|
summary: &mut GcSummary,
|
||||||
) -> (Vec<TenantShardId>, AncestorRefs) {
|
) -> (Vec<TenantShardId>, AncestorRefs) {
|
||||||
let mut ancestors_to_gc = Vec::new();
|
let mut ancestors_to_gc = Vec::new();
|
||||||
for (tenant_id, mut shard_indices) in self.shards_seen {
|
for (tenant_id, shard_indices) in self.shards_seen {
|
||||||
// Find the highest shard count
|
// Find the highest shard count
|
||||||
let latest_count = shard_indices
|
let latest_count = shard_indices
|
||||||
.iter()
|
.iter()
|
||||||
@@ -162,6 +162,7 @@ impl TenantRefAccumulator {
|
|||||||
.max()
|
.max()
|
||||||
.expect("Always at least one shard");
|
.expect("Always at least one shard");
|
||||||
|
|
||||||
|
let mut shard_indices = shard_indices.iter().collect::<Vec<_>>();
|
||||||
let (mut latest_shards, ancestor_shards) = {
|
let (mut latest_shards, ancestor_shards) = {
|
||||||
let at =
|
let at =
|
||||||
itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
|
itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
|
||||||
@@ -174,7 +175,7 @@ impl TenantRefAccumulator {
|
|||||||
// to scan the S3 bucket halfway through a shard split.
|
// to scan the S3 bucket halfway through a shard split.
|
||||||
if latest_shards.len() != latest_count.count() as usize {
|
if latest_shards.len() != latest_count.count() as usize {
|
||||||
// This should be extremely rare, so we warn on it.
|
// This should be extremely rare, so we warn on it.
|
||||||
tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count);
|
tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,7 +213,7 @@ impl TenantRefAccumulator {
|
|||||||
.iter()
|
.iter()
|
||||||
.map(|s| s.tenant_shard_id.to_index())
|
.map(|s| s.tenant_shard_id.to_index())
|
||||||
.collect();
|
.collect();
|
||||||
if controller_indices != latest_shards {
|
if !controller_indices.iter().eq(latest_shards.iter().copied()) {
|
||||||
tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
|
tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors(
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines
|
||||||
|
env.storage_controller.pageserver_api().timeline_create(
|
||||||
|
env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
|
||||||
|
)
|
||||||
|
|
||||||
# Make sure the original shard has some layers
|
# Make sure the original shard has some layers
|
||||||
workload = Workload(env, tenant_id, timeline_id)
|
workload = Workload(env, tenant_id, timeline_id)
|
||||||
workload.init()
|
workload.init()
|
||||||
@@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors(
|
|||||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||||
env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately
|
env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately
|
||||||
|
|
||||||
|
# Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
|
||||||
|
env.storage_controller.pageserver_api().timeline_create(
|
||||||
|
env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
|
||||||
|
)
|
||||||
|
|
||||||
# Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
|
# Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
|
||||||
# compacts, and we only want to do tha explicitly later in the test.
|
# compacts, and we only want to do tha explicitly later in the test.
|
||||||
workload.write_rows(100, upload=False)
|
workload.write_rows(100, upload=False)
|
||||||
@@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
|
|||||||
# Make sure the original shard has some layers
|
# Make sure the original shard has some layers
|
||||||
workload = Workload(env, tenant_id, timeline_id)
|
workload = Workload(env, tenant_id, timeline_id)
|
||||||
workload.init()
|
workload.init()
|
||||||
workload.write_rows(100)
|
workload.write_rows(100, upload=False)
|
||||||
|
workload.stop()
|
||||||
|
|
||||||
new_shard_count = 4
|
new_shard_count = 4
|
||||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||||
|
for shard in shards:
|
||||||
|
ps = env.get_tenant_pageserver(shard)
|
||||||
|
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
|
||||||
|
ps.http_client().timeline_checkpoint(
|
||||||
|
shard, timeline_id, compact=False, wait_until_uploaded=True
|
||||||
|
)
|
||||||
|
|
||||||
|
ps.http_client().deletion_queue_flush(execute=True)
|
||||||
|
|
||||||
# Create a second timeline so that when we delete the first one, child shards still have some content in S3.
|
# Create a second timeline so that when we delete the first one, child shards still have some content in S3.
|
||||||
#
|
#
|
||||||
@@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
|
|||||||
PgVersion.NOT_SET, tenant_id, other_timeline_id
|
PgVersion.NOT_SET, tenant_id, other_timeline_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Write after split so that child shards have some indices in S3
|
|
||||||
workload.write_rows(100, upload=False)
|
|
||||||
for shard in shards:
|
|
||||||
ps = env.get_tenant_pageserver(shard)
|
|
||||||
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
|
|
||||||
ps.http_client().timeline_checkpoint(
|
|
||||||
shard, timeline_id, compact=False, wait_until_uploaded=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# The timeline still exists in child shards and they reference its layers, so scrubbing
|
# The timeline still exists in child shards and they reference its layers, so scrubbing
|
||||||
# now shouldn't delete anything.
|
# now shouldn't delete anything.
|
||||||
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
|
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
|
||||||
|
|||||||
Reference in New Issue
Block a user