scrubber: fix spurious "Missed some shards" errors (#8661)

## Problem

The storage scrubber was reporting warnings like the following for many tenants:
```
WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c
```

These warnings were spurious: the tenants are fine. The bug was in how we accumulated the
ShardIndex for each tenant: every timeline we visited added that tenant's shard index again,
so a tenant with multiple timelines had the same ShardIndex recorded more than once and
appeared to have more shards than it really does.
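
A minimal, standalone sketch of the effect and of the fix (a plain tuple stands in for the real `ShardIndex` type; this is illustrative only, not the scrubber's actual code):

```rust
use std::collections::BTreeSet;

// Hypothetical stand-in for ShardIndex: (shard_number, shard_count).
type ShardIndexLike = (u8, u8);

fn main() {
    // An unsharded tenant (ShardCount(0)) that the scrubber visits once per timeline.
    let timelines = 3;
    let this_shard_idx: ShardIndexLike = (0, 0);

    // Old behaviour: every timeline visit pushes the same shard index again...
    let mut seen_vec: Vec<ShardIndexLike> = Vec::new();
    for _ in 0..timelines {
        seen_vec.push(this_shard_idx);
    }
    // ...so the tenant looks like it has 3 shards at ShardCount(0) instead of 1,
    // which is what trips the "Missed some shards" check.
    assert_eq!(seen_vec.len(), 3);

    // New behaviour: a BTreeSet keeps one entry per distinct shard index,
    // so revisiting the same shard for every timeline is idempotent.
    let mut seen_set: BTreeSet<ShardIndexLike> = BTreeSet::new();
    for _ in 0..timelines {
        seen_set.insert(this_shard_idx);
    }
    assert_eq!(seen_set.len(), 1);
}
```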

Closes: #8646 

## Summary of changes

- Accumulate ShardIndex in a BTreeSet instead of a Vec
- Extend the test to reproduce the issue

Commit: 4049d2b7e1 (parent: 7a1736ddcf)
Author: John Spray, committed via GitHub
Date: 2024-08-14 09:29:06 +01:00
3 changed files with 29 additions and 16 deletions


```diff
@@ -1787,9 +1787,11 @@ async fn timeline_checkpoint_handler(
         }
         if wait_until_uploaded {
+            tracing::info!("Waiting for uploads to complete...");
             timeline.remote_client.wait_completion().await
                 // XXX map to correct ApiError for the cases where it's due to shutdown
                 .context("wait completion").map_err(ApiError::InternalServerError)?;
+            tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0)));
         }
         json_response(StatusCode::OK, ())
```


```diff
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
@@ -117,7 +117,7 @@ use refs::AncestorRefs;
 // - Are there any refs to ancestor shards' layers?
 #[derive(Default)]
 struct TenantRefAccumulator {
-    shards_seen: HashMap<TenantId, Vec<ShardIndex>>,
+    shards_seen: HashMap<TenantId, BTreeSet<ShardIndex>>,
     // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
     ancestor_ref_shards: AncestorRefs,
@@ -130,7 +130,7 @@ impl TenantRefAccumulator {
             .shards_seen
             .entry(ttid.tenant_shard_id.tenant_id)
             .or_default())
-        .push(this_shard_idx);
+        .insert(this_shard_idx);
         let mut ancestor_refs = Vec::new();
         for (layer_name, layer_metadata) in &index_part.layer_metadata {
@@ -154,7 +154,7 @@ impl TenantRefAccumulator {
         summary: &mut GcSummary,
     ) -> (Vec<TenantShardId>, AncestorRefs) {
         let mut ancestors_to_gc = Vec::new();
-        for (tenant_id, mut shard_indices) in self.shards_seen {
+        for (tenant_id, shard_indices) in self.shards_seen {
             // Find the highest shard count
             let latest_count = shard_indices
                 .iter()
@@ -162,6 +162,7 @@ impl TenantRefAccumulator {
                 .max()
                 .expect("Always at least one shard");
+            let mut shard_indices = shard_indices.iter().collect::<Vec<_>>();
             let (mut latest_shards, ancestor_shards) = {
                 let at =
                     itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
@@ -174,7 +175,7 @@ impl TenantRefAccumulator {
             // to scan the S3 bucket halfway through a shard split.
             if latest_shards.len() != latest_count.count() as usize {
                 // This should be extremely rare, so we warn on it.
-                tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count);
+                tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count);
                 continue;
             }
@@ -212,7 +213,7 @@ impl TenantRefAccumulator {
             .iter()
             .map(|s| s.tenant_shard_id.to_index())
             .collect();
-        if controller_indices != latest_shards {
+        if !controller_indices.iter().eq(latest_shards.iter().copied()) {
             tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
             continue;
         }
```
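
One knock-on effect of the switch: the per-tenant shard indices are now collected out of the `BTreeSet` as a `Vec` of references, so the comparison against the controller's owned `Vec` is done element-wise via iterators rather than with `!=`. A small sketch of that pattern, with `u32` as a hypothetical stand-in for `ShardIndex`:

```rust
use std::collections::BTreeSet;

fn main() {
    // Owned shard indices as reported by the storage controller (u32 stands in for ShardIndex).
    let controller_indices: Vec<u32> = vec![0, 1, 2, 3];

    // Shard indices the scrubber saw in S3, accumulated in a BTreeSet and then
    // collected as references, mirroring the change above.
    let shards_seen: BTreeSet<u32> = [0, 1, 2, 3].into_iter().collect();
    let latest_shards: Vec<&u32> = shards_seen.iter().collect();

    // A plain `controller_indices != latest_shards` would not type-check
    // (Vec<u32> vs Vec<&u32>), so compare the elements instead; `.copied()`
    // turns the &&u32 iterator items back into &u32.
    assert!(controller_indices.iter().eq(latest_shards.iter().copied()));
}
```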


```diff
@@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors(
         },
     )
 
+    # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
     # Make sure the original shard has some layers
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
@@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors(
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
     env.storage_controller.reconcile_until_idle()  # Move shards to their final locations immediately
 
+    # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors
+    env.storage_controller.pageserver_api().timeline_create(
+        env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
+
     # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
     # compacts, and we only want to do that explicitly later in the test.
     workload.write_rows(100, upload=False)
@@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
     # Make sure the original shard has some layers
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
-    workload.write_rows(100)
+    workload.write_rows(100, upload=False)
+    workload.stop()
 
     new_shard_count = 4
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
+        ps.http_client().timeline_checkpoint(
+            shard, timeline_id, compact=False, wait_until_uploaded=True
+        )
+        ps.http_client().deletion_queue_flush(execute=True)
 
     # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
     #
@@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
         PgVersion.NOT_SET, tenant_id, other_timeline_id
     )
 
-    # Write after split so that child shards have some indices in S3
-    workload.write_rows(100, upload=False)
-    for shard in shards:
-        ps = env.get_tenant_pageserver(shard)
-        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
-        ps.http_client().timeline_checkpoint(
-            shard, timeline_id, compact=False, wait_until_uploaded=True
-        )
-
     # The timeline still exists in child shards and they reference its layers, so scrubbing
     # now shouldn't delete anything.
     gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
```