diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 752fb2c161..388e0eadc8 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5364,7 +5364,6 @@ impl Service {
         let mut last_inspected_shard: Option<TenantShardId> = None;
         let mut inspected_all_shards = false;
         let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();
 
         while !inspected_all_shards {
             if cancel.is_cancelled() {
@@ -5419,28 +5418,32 @@ impl Service {
                     }
                 };
 
-                if tenant_shard.intent.demote_attached(scheduler, node_id) {
-                    match tenant_shard.schedule(scheduler, &mut schedule_context) {
-                        Err(e) => {
-                            tracing::warn!(
-                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                "Scheduling error when draining pageserver {} : {e}", node_id
-                            );
-                        }
-                        Ok(()) => {
-                            let scheduled_to = tenant_shard.intent.get_attached();
-                            tracing::info!(
-                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                "Rescheduled shard while draining node {}: {} -> {:?}",
-                                node_id,
-                                node_id,
-                                scheduled_to
-                            );
+                // If the shard is not attached to the node being drained, skip it.
+                if *tenant_shard.intent.get_attached() != Some(node_id) {
+                    last_inspected_shard = Some(*tid);
+                    continue;
+                }
 
-                            let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
-                            if let Some(some) = waiter {
-                                waiters.push(some);
-                            }
+                match tenant_shard.reschedule_to_secondary(None, scheduler) {
+                    Err(e) => {
+                        tracing::warn!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Scheduling error when draining pageserver {} : {e}", node_id
+                        );
+                    }
+                    Ok(()) => {
+                        let scheduled_to = tenant_shard.intent.get_attached();
+                        tracing::info!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Rescheduled shard while draining node {}: {} -> {:?}",
+                            node_id,
+                            node_id,
+                            scheduled_to
+                        );
+
+                        let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
+                        if let Some(some) = waiter {
+                            waiters.push(some);
                         }
                     }
                 }
@@ -5603,9 +5606,7 @@ impl Service {
         // secondaries are warm. This is not always true (e.g. we just migrated the
         // tenant). Take that into consideration by checking the secondary status.
         let mut tids_to_promote = self.fill_node_plan(node_id);
-
         let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();
 
         // Execute the plan we've composed above. Before applying each move from the plan,
         // we validate to ensure that it has not gone stale in the meantime.
@@ -5655,9 +5656,7 @@ impl Service {
             }
 
             let previously_attached_to = *tenant_shard.intent.get_attached();
-
-            tenant_shard.intent.promote_attached(scheduler, node_id);
-            match tenant_shard.schedule(scheduler, &mut schedule_context) {
+            match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
                 Err(e) => {
                     tracing::warn!(
                         tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 840bcbb81d..45295bc59b 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -646,6 +646,46 @@ impl TenantShard {
         Ok(())
     }
 
+    /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
+    /// if the swap is not possible, and leaves the intent state in its original state on failure.
+    ///
+    /// Arguments:
+    /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
+    /// the scheduler to recommend a node.
+    pub(crate) fn reschedule_to_secondary(
+        &mut self,
+        promote_to: Option<NodeId>,
+        scheduler: &mut Scheduler,
+    ) -> Result<(), ScheduleError> {
+        let promote_to = match promote_to {
+            Some(node) => node,
+            None => match scheduler.node_preferred(self.intent.get_secondary()) {
+                Some(node) => node,
+                None => {
+                    return Err(ScheduleError::ImpossibleConstraint);
+                }
+            },
+        };
+
+        assert!(self.intent.get_secondary().contains(&promote_to));
+
+        if let Some(node) = self.intent.get_attached() {
+            let demoted = self.intent.demote_attached(scheduler, *node);
+            if !demoted {
+                return Err(ScheduleError::ImpossibleConstraint);
+            }
+        }
+
+        self.intent.promote_attached(scheduler, promote_to);
+
+        // Increment the sequence number for the edge case where a
+        // reconciler is already running, so that callers spawn and wait
+        // on a fresh reconcile instead of the in-flight one.
+        self.sequence = self.sequence.next();
+
+        Ok(())
+    }
+
     /// Optimize attachments: if a shard has a secondary location that is preferable to
     /// its primary location based on soft constraints, switch that secondary location
     /// to be attached.
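
The sketch below is a minimal, self-contained model of the demote-then-promote swap that `reschedule_to_secondary` performs, for readers who don't have the intent-state machinery paged in. Everything in it is a hypothetical stand-in invented for illustration (a `u64` `NodeId` alias, a `ToyIntent` struct, a free function instead of a `TenantShard` method); it is not the storage controller's real `IntentState`/`Scheduler` API, and the fallback to the first secondary stands in for the real call to `scheduler.node_preferred`.

// Hypothetical toy model of the reschedule_to_secondary control flow.
// None of these types exist in the storage controller.

type NodeId = u64;

#[derive(Debug)]
enum ScheduleError {
    ImpossibleConstraint,
}

#[derive(Debug)]
struct ToyIntent {
    attached: Option<NodeId>,
    secondary: Vec<NodeId>,
}

impl ToyIntent {
    // Stand-in for demoting the attached location into the secondary set.
    fn demote_attached(&mut self, node: NodeId) -> bool {
        if self.attached == Some(node) {
            self.attached = None;
            self.secondary.push(node);
            true
        } else {
            false
        }
    }

    // Stand-in for promoting a secondary into the attached slot.
    fn promote_attached(&mut self, node: NodeId) {
        self.secondary.retain(|n| *n != node);
        self.attached = Some(node);
    }
}

fn reschedule_to_secondary(
    intent: &mut ToyIntent,
    promote_to: Option<NodeId>,
    sequence: &mut u64,
) -> Result<(), ScheduleError> {
    // No explicit target: fall back to the first secondary. The real code
    // asks the scheduler for a recommendation here instead.
    let promote_to = match promote_to.or_else(|| intent.secondary.first().copied()) {
        Some(node) => node,
        None => return Err(ScheduleError::ImpossibleConstraint),
    };

    assert!(intent.secondary.contains(&promote_to));

    // Demote first; bail out before promoting if the demotion fails,
    // leaving the intent unchanged.
    if let Some(node) = intent.attached {
        if !intent.demote_attached(node) {
            return Err(ScheduleError::ImpossibleConstraint);
        }
    }

    intent.promote_attached(promote_to);

    // Bump the sequence so callers wait on a freshly spawned reconcile
    // rather than one that may already be running.
    *sequence += 1;

    Ok(())
}

fn main() {
    let mut intent = ToyIntent {
        attached: Some(1),
        secondary: vec![2, 3],
    };
    let mut sequence = 0u64;

    // Explicit target, as in the fill path (Some(node_id)).
    reschedule_to_secondary(&mut intent, Some(2), &mut sequence).unwrap();
    assert_eq!(intent.attached, Some(2));
    assert!(intent.secondary.contains(&1));

    // No target, as in the drain path (None): any secondary will do.
    reschedule_to_secondary(&mut intent, None, &mut sequence).unwrap();
    println!("final intent: {intent:?}, sequence: {sequence}");
}

The property the sketch preserves is the one the doc comment promises: every error path returns before any mutation, so a failed swap leaves the intent state exactly as it was.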