From 14e05276a3e0882777c4ee38252bed25324c93e8 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 5 Feb 2025 16:05:12 +0000
Subject: [PATCH] storcon: fix a case where optimise could get stuck on
 unschedulable node (#10648)

## Problem

When a shard has two secondary locations, but one of them is on a node with
MaySchedule::No, the optimiser would get stuck because it couldn't decide
which secondary to remove.

This is generally okay if a node is offline, but it is a problem if a node is
left in Pause mode for a long period of time.

Closes: https://github.com/neondatabase/neon/issues/10646

## Summary of changes

- Instead of insisting on finding a node in the wrong AZ to remove, find an
  available node in the _right_ AZ to keep, and remove the others. This
  ensures that if there is one live suitable node, other offline/paused nodes
  cannot hold things up.
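
Roughly, the new retain/drop rule behaves like the sketch below. This is
illustrative only: `pick_secondary_to_remove`, its parameters and the `u64`
score type are simplified stand-ins rather than the storage controller's real
API; the actual change is in `storage_controller/src/tenant_shard.rs` (see the
diff below).

```rust
// Illustrative sketch only: NodeId, the u64 score and this helper are
// simplified stand-ins for the storage controller's real types.
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct NodeId(u64);

/// Decide which surplus secondary to drop. `scores` holds `None` for nodes
/// whose score cannot currently be computed (offline, or scheduling paused).
fn pick_secondary_to_remove(
    secondaries: &[NodeId],
    scores: &HashMap<NodeId, Option<u64>>,
    in_preferred_az: impl Fn(NodeId) -> bool,
) -> Option<NodeId> {
    // Trivial case: a single surplus secondary is always the one to drop.
    if secondaries.len() == 1 {
        return secondaries.first().copied();
    }

    // Find one secondary worth keeping: it is available (has a score) and is
    // outside the preferred AZ, which is where a secondary belongs while the
    // shard is attached in its home AZ. If none qualifies, change nothing.
    let retain = secondaries.iter().find(|n| {
        let available = scores.get(*n).copied().flatten().is_some();
        available && !in_preferred_az(**n)
    })?;

    // Any other secondary is a valid victim: unavailable nodes no longer
    // block the decision.
    secondaries.iter().find(|n| *n != retain).copied()
}

fn main() {
    // Example: node 2 (offline, no score) and node 3 (home AZ) are candidates
    // to drop; node 4 is kept because it is available and in a non-home AZ.
    let secondaries = [NodeId(2), NodeId(3), NodeId(4)];
    let scores = HashMap::from([
        (NodeId(2), None),
        (NodeId(3), Some(10)),
        (NodeId(4), Some(12)),
    ]);
    let in_preferred_az = |n: NodeId| n == NodeId(3);
    assert_eq!(
        pick_secondary_to_remove(&secondaries, &scores, in_preferred_az),
        Some(NodeId(2))
    );
}
```

Keeping one known-good secondary and dropping the rest means a single live,
suitable node is enough to unblock the optimiser, regardless of how many other
candidate nodes are offline or paused.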
---
 storage_controller/src/tenant_shard.rs | 150 +++++++++++++++++++++++--
 1 file changed, 141 insertions(+), 9 deletions(-)

diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 302104dc97..219c0dffe7 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -707,6 +707,7 @@ impl TenantShard {
         if let Some(node_id) = self.intent.get_attached() {
             // Populate secondary by demoting the attached node
             self.intent.demote_attached(scheduler, *node_id);
+            modified = true;
         } else if self.intent.secondary.is_empty() {
             // Populate secondary by scheduling a fresh node
@@ -979,24 +980,51 @@ impl TenantShard {
                         ),
                     )
                 })
-                .collect::<Vec<_>>();
+                .collect::<HashMap<_, _>>();
 
             if secondary_scores.iter().any(|score| score.1.is_none()) {
-                // Don't have full list of scores, so can't make a good decision about which to drop unless
-                // there is an obvious one in the wrong AZ
-                for secondary in self.intent.get_secondary() {
-                    if scheduler.get_node_az(secondary) == self.intent.preferred_az_id {
+                // Trivial case: if we only have one secondary, drop that one
+                if self.intent.get_secondary().len() == 1 {
+                    return Some(ScheduleOptimization {
+                        sequence: self.sequence,
+                        action: ScheduleOptimizationAction::RemoveSecondary(
+                            *self.intent.get_secondary().first().unwrap(),
+                        ),
+                    });
+                }
+
+                // Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state
+                // where its score can't be calculated), and drop the others. This enables us to make progress in
+                // most cases, even if some nodes are offline or have scheduling=pause set.
+
+                debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this
+                // logic presumes we are in a mode where we want secondaries to be in non-home AZ
+                if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| {
+                    let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id;
+                    let is_available = secondary_scores
+                        .get(n)
+                        .expect("Built from same list of nodes")
+                        .is_some();
+                    is_available && !in_home_az
+                }) {
+                    // Great, we found one to retain. Pick some other to drop.
+                    if let Some(victim) = self
+                        .intent
+                        .get_secondary()
+                        .iter()
+                        .find(|n| n != &retain_secondary)
+                    {
                         return Some(ScheduleOptimization {
                             sequence: self.sequence,
-                            action: ScheduleOptimizationAction::RemoveSecondary(*secondary),
+                            action: ScheduleOptimizationAction::RemoveSecondary(*victim),
                         });
                     }
                 }
 
                 // Fall through: we didn't identify one to remove. This ought to be rare.
                 tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)",
-                self.intent.get_secondary()
-            );
+                    self.intent.get_secondary()
+                );
             } else {
                 let victim = secondary_scores
                     .iter()
@@ -1005,7 +1033,7 @@ impl TenantShard {
                     .0;
                 return Some(ScheduleOptimization {
                     sequence: self.sequence,
-                    action: ScheduleOptimizationAction::RemoveSecondary(victim),
+                    action: ScheduleOptimizationAction::RemoveSecondary(*victim),
                 });
             }
         }
@@ -2379,6 +2407,110 @@ pub(crate) mod tests {
         Ok(())
     }
 
+    /// Test how the optimisation code behaves with an extra secondary
+    #[test]
+    fn optimize_removes_secondary() -> anyhow::Result<()> {
+        let az_a_tag = AvailabilityZone("az-a".to_string());
+        let az_b_tag = AvailabilityZone("az-b".to_string());
+        let mut nodes = make_test_nodes(
+            4,
+            &[
+                az_a_tag.clone(),
+                az_b_tag.clone(),
+                az_a_tag.clone(),
+                az_b_tag.clone(),
+            ],
+        );
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut schedule_context = ScheduleContext::default();
+
+        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        shard_a.intent.preferred_az_id = Some(az_a_tag.clone());
+        shard_a
+            .schedule(&mut scheduler, &mut schedule_context)
+            .unwrap();
+
+        // Attached on node 1, secondary on node 2
+        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
+        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]);
+
+        // Initially optimiser is idle
+        assert_eq!(
+            shard_a.optimize_attachment(&mut scheduler, &schedule_context),
+            None
+        );
+        assert_eq!(
+            shard_a.optimize_secondary(&mut scheduler, &schedule_context),
+            None
+        );
+
+        // A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over
+        // to our new location
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
+        let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3))
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
+
+        // A spare secondary in the non-home AZ, and one of them is offline
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(4));
+        nodes
+            .get_mut(&NodeId(4))
+            .unwrap()
+            .set_availability(NodeAvailability::Offline);
+        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
+        let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4))
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
+
+        // A spare secondary when should have none
+        shard_a.policy = PlacementPolicy::Attached(0);
+        let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2))
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
+        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
+        assert_eq!(shard_a.intent.get_secondary(), &vec![]);
+
+        // Check that in secondary mode, we preserve the secondary in the preferred AZ
+        let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule()
+        shard_a.policy = PlacementPolicy::Secondary;
+        shard_a
+            .schedule(&mut scheduler, &mut schedule_context)
+            .unwrap();
+        assert_eq!(shard_a.intent.get_attached(), &None);
+        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
+        assert_eq!(
+            shard_a.optimize_attachment(&mut scheduler, &schedule_context),
+            None
+        );
+        assert_eq!(
+            shard_a.optimize_secondary(&mut scheduler, &schedule_context),
+            None
+        );
+
+        shard_a.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
     // Optimize til quiescent: this emulates what Service::optimize_all does, when
     // called repeatedly in the background.
     // Returns the applied optimizations