mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 14:00:38 +00:00
storcon: verbose logs in rare case of shards not attached yet (#10262)
## Problem

When we do a timeline CRUD operation, we check that the shards we need to mutate are currently attached to a pageserver, by reading `generation` and `generation_pageserver` from the database. If any don't appear to be attached, we respond with a 503 and "One or more shards in tenant is not yet attached".

This is happening more often than expected, and it's not obvious with current logging what's going on: specifically which shard has a problem, and exactly what we're seeing in these persistent generation columns.

(Aside: it's possible that we broke something with the change in #10011 which clears generation_pageserver when we detach a shard, although if so the mechanism isn't trivial: what should happen is that if we stamp on generation_pageserver if a reconciler is running, then it shouldn't matter because we're about to […])

## Summary of changes

- When we are in Attached mode but find that generation_pageserver/generation are unset, output details while looping over shards.
This commit is contained in:
@@ -3572,6 +3572,11 @@ impl Service {
|
||||
.iter()
|
||||
.any(|i| i.generation.is_none() || i.generation_pageserver.is_none())
|
||||
{
|
||||
let shard_generations = generations
|
||||
.into_iter()
|
||||
.map(|i| (i.tenant_shard_id, (i.generation, i.generation_pageserver)))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
// One or more shards has not been attached to a pageserver. Check if this is because it's configured
|
||||
// to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry)
|
||||
let locked = self.inner.read().unwrap();
|
||||
@@ -3582,6 +3587,28 @@ impl Service {
|
||||
PlacementPolicy::Attached(_) => {
|
||||
// This shard is meant to be attached: the caller is not wrong to try and
|
||||
// use this function, but we can't service the request right now.
|
||||
let Some(generation) = shard_generations.get(shard_id) else {
|
||||
// This can only happen if there is a split brain controller modifying the database. This should
|
||||
// never happen when testing, and if it happens in production we can only log the issue.
|
||||
debug_assert!(false);
|
||||
tracing::error!("Shard {shard_id} not found in generation state! Is another rogue controller running?");
|
||||
continue;
|
||||
};
|
||||
let (generation, generation_pageserver) = generation;
|
||||
if let Some(generation) = generation {
|
||||
if generation_pageserver.is_none() {
|
||||
// This is legitimate only in a very narrow window where the shard was only just configured into
|
||||
// Attached mode after being created in Secondary or Detached mode, and it has had its generation
|
||||
// set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver).
|
||||
tracing::warn!("Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?");
|
||||
}
|
||||
} else {
|
||||
// This should never happen: a shard with no generation is only permitted when it was created in some state
|
||||
// other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory)
|
||||
debug_assert!(false);
|
||||
tracing::error!("Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
PlacementPolicy::Secondary | PlacementPolicy::Detached => {
|
||||
return Err(ApiError::Conflict(format!(
|
||||
|
||||
Reference in New Issue
Block a user