From c08759f367063fde2558f979b83f6f9209741c7a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 Jan 2025 10:55:15 +0000 Subject: [PATCH] storcon: verbose logs in rare case of shards not attached yet (#10262) ## Problem When we do a timeline CRUD operation, we check that the shards we need to mutate are currently attached to a pageserver, by reading `generation` and `generation_pageserver` from the database. If any don't appear to be attached, we respond with a a 503 and "One or more shards in tenant is not yet attached". This is happening more often than expected, and it's not obvious with current logging what's going on: specifically which shard has a problem, and exactly what we're seeing in these persistent generation columns. (Aside: it's possible that we broke something with the change in #10011 which clears generation_pageserver when we detach a shard, although if so the mechanism isn't trivial: what should happen is that if we stamp on generation_pageserver if a reconciler is running, then it shouldn't matter because we're about to ## Summary of changes - When we are in Attached mode but find that generation_pageserver/generation are unset, output details while looping over shards. --- storage_controller/src/service.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c0c5bc371a..222cb9fdd4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3572,6 +3572,11 @@ impl Service { .iter() .any(|i| i.generation.is_none() || i.generation_pageserver.is_none()) { + let shard_generations = generations + .into_iter() + .map(|i| (i.tenant_shard_id, (i.generation, i.generation_pageserver))) + .collect::>(); + // One or more shards has not been attached to a pageserver. Check if this is because it's configured // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) let locked = self.inner.read().unwrap(); @@ -3582,6 +3587,28 @@ impl Service { PlacementPolicy::Attached(_) => { // This shard is meant to be attached: the caller is not wrong to try and // use this function, but we can't service the request right now. + let Some(generation) = shard_generations.get(shard_id) else { + // This can only happen if there is a split brain controller modifying the database. This should + // never happen when testing, and if it happens in production we can only log the issue. + debug_assert!(false); + tracing::error!("Shard {shard_id} not found in generation state! Is another rogue controller running?"); + continue; + }; + let (generation, generation_pageserver) = generation; + if let Some(generation) = generation { + if generation_pageserver.is_none() { + // This is legitimate only in a very narrow window where the shard was only just configured into + // Attached mode after being created in Secondary or Detached mode, and it has had its generation + // set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver). + tracing::warn!("Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?"); + } + } else { + // This should never happen: a shard with no generation is only permitted when it was created in some state + // other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory) + debug_assert!(false); + tracing::error!("Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!"); + continue; + } } PlacementPolicy::Secondary | PlacementPolicy::Detached => { return Err(ApiError::Conflict(format!(