Mirror of https://github.com/neondatabase/neon.git
# storcon: improve drain and fill shard placement (#8119)
## Problem

While adapting the storage controller scale test to do graceful rolling restarts via drain and fill, I noticed that secondaries were also being rescheduled, which in turn caused the storage controller to optimise attachments.

## Summary of changes

* Introduce a transactional-looking rescheduling primitive (i.e. "try to schedule to this secondary, but leave everything as is if you can't").
* Use it for the drain and fill stages to avoid calling into `Scheduler::schedule` and having secondaries move around.
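The core of the new primitive is ordering: validate every precondition before touching the intent, so that a failed attempt changes nothing. Below is a minimal, self-contained sketch of that pattern using toy types only; `ToyIntent`, `swap_attached_to`, and `SwapError` are invented for illustration and are not the storage controller's real intent/scheduler API.

```rust
#[derive(Debug, PartialEq)]
enum SwapError {
    NotASecondary,
    NotAttached,
}

/// Toy stand-in for a shard's intended placement: one attached node plus secondaries.
#[derive(Debug, Clone, PartialEq)]
struct ToyIntent {
    attached: Option<u64>,
    secondary: Vec<u64>,
}

impl ToyIntent {
    /// "Transactional-looking" swap: either the attachment moves to `promote_to`
    /// and the old attachment becomes a secondary, or nothing changes at all.
    fn swap_attached_to(&mut self, promote_to: u64) -> Result<(), SwapError> {
        // Validate everything up front, before mutating any state.
        if !self.secondary.contains(&promote_to) {
            return Err(SwapError::NotASecondary);
        }
        let Some(old_attached) = self.attached else {
            return Err(SwapError::NotAttached);
        };

        // Only now do we mutate: demote the old attachment, promote the new one.
        self.secondary.retain(|n| *n != promote_to);
        self.secondary.push(old_attached);
        self.attached = Some(promote_to);
        Ok(())
    }
}

fn main() {
    let mut intent = ToyIntent {
        attached: Some(1),
        secondary: vec![2],
    };

    // A failed attempt (node 3 is not a secondary) leaves the intent untouched.
    let before = intent.clone();
    assert_eq!(intent.swap_attached_to(3), Err(SwapError::NotASecondary));
    assert_eq!(intent, before);

    // Promoting an actual secondary swaps the attachment.
    assert_eq!(intent.swap_attached_to(2), Ok(()));
    assert_eq!(intent.attached, Some(2));
    assert_eq!(intent.secondary, vec![1]);
}
```

The real primitive in this PR, `reschedule_to_secondary` (shown in the last hunk below), follows the same shape, with the extra step of asking the scheduler for a preferred secondary when no explicit target is given.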
@@ -5364,7 +5364,6 @@ impl Service {
         let mut last_inspected_shard: Option<TenantShardId> = None;
         let mut inspected_all_shards = false;
         let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();
 
         while !inspected_all_shards {
             if cancel.is_cancelled() {
@@ -5419,28 +5418,32 @@ impl Service {
                 }
             };
 
-            if tenant_shard.intent.demote_attached(scheduler, node_id) {
-                match tenant_shard.schedule(scheduler, &mut schedule_context) {
-                    Err(e) => {
-                        tracing::warn!(
-                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                            "Scheduling error when draining pageserver {} : {e}", node_id
-                        );
-                    }
-                    Ok(()) => {
-                        let scheduled_to = tenant_shard.intent.get_attached();
-                        tracing::info!(
-                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                            "Rescheduled shard while draining node {}: {} -> {:?}",
-                            node_id,
-                            node_id,
-                            scheduled_to
-                        );
+            // If the shard is not attached to the node being drained, skip it.
+            if *tenant_shard.intent.get_attached() != Some(node_id) {
+                last_inspected_shard = Some(*tid);
+                continue;
+            }
 
-                        let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
-                        if let Some(some) = waiter {
-                            waiters.push(some);
-                        }
+            match tenant_shard.reschedule_to_secondary(None, scheduler) {
+                Err(e) => {
+                    tracing::warn!(
+                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                        "Scheduling error when draining pageserver {} : {e}", node_id
+                    );
+                }
+                Ok(()) => {
+                    let scheduled_to = tenant_shard.intent.get_attached();
+                    tracing::info!(
+                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                        "Rescheduled shard while draining node {}: {} -> {:?}",
+                        node_id,
+                        node_id,
+                        scheduled_to
+                    );
+
+                    let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
+                    if let Some(some) = waiter {
+                        waiters.push(some);
                     }
                 }
             }
@@ -5603,9 +5606,7 @@ impl Service {
         // secondaries are warm. This is not always true (e.g. we just migrated the
         // tenant). Take that into consideration by checking the secondary status.
         let mut tids_to_promote = self.fill_node_plan(node_id);
-
         let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();
 
         // Execute the plan we've composed above. Before applying each move from the plan,
         // we validate to ensure that it has not gone stale in the meantime.
@@ -5655,9 +5656,7 @@ impl Service {
                     }
 
                 let previously_attached_to = *tenant_shard.intent.get_attached();
-
-                tenant_shard.intent.promote_attached(scheduler, node_id);
-                match tenant_shard.schedule(scheduler, &mut schedule_context) {
+                match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
                     Err(e) => {
                         tracing::warn!(
                             tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),

@@ -646,6 +646,48 @@ impl TenantShard {
         Ok(())
     }
 
+    /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
+    /// if the swap is not possible and leaves the intent state in its original state.
+    ///
+    /// Arguments:
+    /// `attached_to`: the currently attached location matching the intent state (may be None if the
+    /// shard is not attached)
+    /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
+    /// the scheduler to recommend a node
+    pub(crate) fn reschedule_to_secondary(
+        &mut self,
+        promote_to: Option<NodeId>,
+        scheduler: &mut Scheduler,
+    ) -> Result<(), ScheduleError> {
+        let promote_to = match promote_to {
+            Some(node) => node,
+            None => match scheduler.node_preferred(self.intent.get_secondary()) {
+                Some(node) => node,
+                None => {
+                    return Err(ScheduleError::ImpossibleConstraint);
+                }
+            },
+        };
+
+        assert!(self.intent.get_secondary().contains(&promote_to));
+
+        if let Some(node) = self.intent.get_attached() {
+            let demoted = self.intent.demote_attached(scheduler, *node);
+            if !demoted {
+                return Err(ScheduleError::ImpossibleConstraint);
+            }
+        }
+
+        self.intent.promote_attached(scheduler, promote_to);
+
+        // Increment the sequence number for the edge case where a
+        // reconciler is already running to avoid waiting on the
+        // current reconcile instead of spawning a new one.
+        self.sequence = self.sequence.next();
+
+        Ok(())
+    }
+
     /// Optimize attachments: if a shard has a secondary location that is preferable to
     /// its primary location based on soft constraints, switch that secondary location
     /// to be attached.
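For reference, the drain and fill call sites in the hunks above differ only in the `promote_to` argument. The following is a condensed, non-runnable sketch of the calling convention; `tenant_shard`, `scheduler`, and `node_id` are bindings from the surrounding drain/fill loops, and the comments paraphrase how those hunks handle each outcome.

```rust
// Drain: no explicit target, so the scheduler recommends the preferred secondary.
match tenant_shard.reschedule_to_secondary(None, scheduler) {
    Ok(()) => { /* intent now points at a secondary; kick off a reconcile */ }
    Err(e) => { /* intent left untouched; log `e` and move on to the next shard */ }
}

// Fill: promote the node being filled, which must already hold a secondary.
match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
    Ok(()) => { /* attachment moved onto node_id */ }
    Err(e) => { /* intent left untouched; log `e` and continue */ }
}
```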