storcon: improve drain and fill shard placement (#8119)

## Problem
While adapting the storage controller scale test to do graceful rolling restarts via drain and fill, I noticed that secondaries were also being rescheduled, which in turn caused the storage controller to optimise attachments.

## Summary of changes
* Introduce a transaction-like rescheduling primitive (i.e. "try to schedule to this secondary, but leave everything as is if you can't"); see the sketch below.
* Use it for the drain and fill stages to avoid calling into `Scheduler::schedule` and having secondaries move around.
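
For intuition, here is a minimal, self-contained sketch of that contract. The `Placement` type and its fields are invented for this example (the real primitive is `TenantShard::reschedule_to_secondary`, shown in the second file's diff below); the point is the ordering: validate every precondition first, and mutate state only once the whole swap is known to succeed.

```rust
#[derive(Debug, PartialEq)]
enum ScheduleError {
    ImpossibleConstraint,
}

/// Toy stand-in for a shard's intent state: one attached node, some secondaries.
struct Placement {
    attached: Option<u32>,
    secondary: Vec<u32>,
}

impl Placement {
    /// Try to promote `candidate` to attached. Either the whole swap happens,
    /// or the placement is left exactly as it was (the "transaction-like" part).
    fn reschedule_to_secondary(&mut self, candidate: u32) -> Result<(), ScheduleError> {
        // Validate up front: nothing is mutated until all checks pass.
        let idx = self
            .secondary
            .iter()
            .position(|n| *n == candidate)
            .ok_or(ScheduleError::ImpossibleConstraint)?;

        // All checks passed: perform the swap, atomic from the caller's view.
        let previous = self.attached.replace(candidate);
        self.secondary.remove(idx);
        if let Some(prev) = previous {
            self.secondary.push(prev); // old attached location becomes a secondary
        }
        Ok(())
    }
}

fn main() {
    let mut p = Placement { attached: Some(1), secondary: vec![2, 3] };

    // Promoting a node that is not a secondary fails and changes nothing.
    assert_eq!(p.reschedule_to_secondary(9), Err(ScheduleError::ImpossibleConstraint));
    assert_eq!(p.attached, Some(1));

    // Promoting an actual secondary swaps it with the attached location.
    assert!(p.reschedule_to_secondary(2).is_ok());
    assert_eq!(p.attached, Some(2));
    assert_eq!(p.secondary, vec![3, 1]);
}
```

The real method additionally works against the scheduler's node statistics and bumps the shard's sequence number, as discussed in the diff below.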
Author: Vlad Lazar
Date: 2024-06-22 15:20:58 +01:00
Committed by: GitHub
Parent: 8776089c70
Commit: 8fe3f17c47
2 changed files with 68 additions and 27 deletions


@@ -5364,7 +5364,6 @@ impl Service {
         let mut last_inspected_shard: Option<TenantShardId> = None;
         let mut inspected_all_shards = false;
         let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();

         while !inspected_all_shards {
             if cancel.is_cancelled() {
@@ -5419,28 +5418,32 @@ impl Service {
                 }
             };

-            if tenant_shard.intent.demote_attached(scheduler, node_id) {
-                match tenant_shard.schedule(scheduler, &mut schedule_context) {
-                    Err(e) => {
-                        tracing::warn!(
-                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                            "Scheduling error when draining pageserver {} : {e}", node_id
-                        );
-                    }
-                    Ok(()) => {
-                        let scheduled_to = tenant_shard.intent.get_attached();
-                        tracing::info!(
-                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                            "Rescheduled shard while draining node {}: {} -> {:?}",
-                            node_id,
-                            node_id,
-                            scheduled_to
-                        );
+            // If the shard is not attached to the node being drained, skip it.
+            if *tenant_shard.intent.get_attached() != Some(node_id) {
+                last_inspected_shard = Some(*tid);
+                continue;
+            }

-                        let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
-                        if let Some(some) = waiter {
-                            waiters.push(some);
-                        }
+            match tenant_shard.reschedule_to_secondary(None, scheduler) {
+                Err(e) => {
+                    tracing::warn!(
+                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                        "Scheduling error when draining pageserver {} : {e}", node_id
+                    );
+                }
+                Ok(()) => {
+                    let scheduled_to = tenant_shard.intent.get_attached();
+                    tracing::info!(
+                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                        "Rescheduled shard while draining node {}: {} -> {:?}",
+                        node_id,
+                        node_id,
+                        scheduled_to
+                    );
+
+                    let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
+                    if let Some(some) = waiter {
+                        waiters.push(some);
+                    }
                 }
             }
@@ -5603,9 +5606,7 @@ impl Service {
         // secondaries are warm. This is not always true (e.g. we just migrated the
         // tenant). Take that into consideration by checking the secondary status.
         let mut tids_to_promote = self.fill_node_plan(node_id);
         let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();

         // Execute the plan we've composed above. Before applying each move from the plan,
         // we validate to ensure that it has not gone stale in the meantime.
@@ -5655,9 +5656,7 @@ impl Service {
             }

             let previously_attached_to = *tenant_shard.intent.get_attached();
-            tenant_shard.intent.promote_attached(scheduler, node_id);
-            match tenant_shard.schedule(scheduler, &mut schedule_context) {
+            match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
                 Err(e) => {
                     tracing::warn!(
                         tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),


@@ -646,6 +646,48 @@ impl TenantShard {
         Ok(())
     }

+    /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling
+    /// error if the swap is not possible, and in that case leaves the intent state unchanged.
+    ///
+    /// Arguments:
+    /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
+    /// the scheduler to recommend a node.
+    pub(crate) fn reschedule_to_secondary(
+        &mut self,
+        promote_to: Option<NodeId>,
+        scheduler: &mut Scheduler,
+    ) -> Result<(), ScheduleError> {
+        let promote_to = match promote_to {
+            Some(node) => node,
+            None => match scheduler.node_preferred(self.intent.get_secondary()) {
+                Some(node) => node,
+                None => {
+                    return Err(ScheduleError::ImpossibleConstraint);
+                }
+            },
+        };
+
+        assert!(self.intent.get_secondary().contains(&promote_to));
+
+        if let Some(node) = self.intent.get_attached() {
+            let demoted = self.intent.demote_attached(scheduler, *node);
+            if !demoted {
+                return Err(ScheduleError::ImpossibleConstraint);
+            }
+        }
+
+        self.intent.promote_attached(scheduler, promote_to);
+
+        // Increment the sequence number for the edge case where a
+        // reconciler is already running, to avoid waiting on the
+        // current reconcile instead of spawning a new one.
+        self.sequence = self.sequence.next();
+
+        Ok(())
+    }
+
     /// Optimize attachments: if a shard has a secondary location that is preferable to
     /// its primary location based on soft constraints, switch that secondary location
     /// to be attached.
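
As a closing note, the sequence bump at the end of `reschedule_to_secondary` is what makes `maybe_reconcile_shard` spawn a fresh reconcile rather than latch onto one already in flight. Below is a toy model of that edge case, assuming (as the code comment implies) that a waiter is satisfied by any reconcile whose sequence is at least the awaited one; the `Sequence` type here is invented for the sketch and only stands in for the storage controller's real one.

```rust
/// Toy stand-in for the shard's monotonically increasing sequence number.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
struct Sequence(u64);

impl Sequence {
    fn next(self) -> Sequence {
        Sequence(self.0 + 1)
    }
}

fn main() {
    let running_reconcile = Sequence(7); // a reconciler already in flight
    let mut shard_seq = Sequence(7);

    // Without the bump, a waiter created for `shard_seq` would be satisfied
    // by the in-flight reconcile, which predates the placement change.
    assert!(running_reconcile >= shard_seq);

    // After the bump, only a freshly spawned reconcile can satisfy the waiter.
    shard_seq = shard_seq.next();
    assert!(running_reconcile < shard_seq);
}
```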