storage controller: API-driven graceful migrations (#10913)

## Problem

The current migration API does a live migration, but if the destination
doesn't already have a secondary, that live migration is unlikely to be
able to warm up a tenant properly within its timeout (full warmup of a
big tenant can take tens of minutes).

Background optimisation code knows how to do this gracefully by creating
a secondary first, but we don't currently give a human a way to trigger
that.

Closes: https://github.com/neondatabase/neon/issues/10540

## Summary of changes

- Add `preferred_node` parameter to TenantShard, which is respected by
optimize_attachment
- Modify migration API to have optional prewarm=true mode, in which we
set preferred_node and call optimize_attachment, rather than directly
modifying intentstate
- Require override_scheduler=true flag if migrating somewhere that is a
less-than-optimal scheduling location (e.g. wrong AZ)
- Add `origin_node_id` to migration API so that callers can ensure
they're moving from where they think they're moving from
- Add tests for the above

The storcon_cli wrapper for this has a 'watch' mode that waits for
eventual cutover. It doesn't show how the warmth of the secondary evolves,
because we don't currently have an API for that in the controller, as
the passthrough API only targets attached locations, not secondaries. It
would be straightforward to add later as a dedicated endpoint for
getting secondary status, then extend the storcon_cli to consume that
and print a nice progress indicator.
This commit is contained in:
John Spray
2025-03-07 17:02:38 +00:00
committed by GitHub
parent 084fc4a757
commit 87e6117dfd
9 changed files with 707 additions and 120 deletions

View File

@@ -85,7 +85,9 @@ use crate::reconciler::{
attached_location_conf,
};
use crate::safekeeper::Safekeeper;
use crate::scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler};
use crate::scheduler::{
AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler,
};
use crate::tenant_shard::{
IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation,
ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter,
@@ -5299,12 +5301,93 @@ impl Service {
Ok((response, waiters))
}
/// Graceful migration: record the requested destination as the shard's preferred node and let
/// the optimisation machinery carry out the move in the background. This can take a long time,
/// because the destination is fully warmed up before the attachment is cut over.
///
/// The external API names this a 'prewarm=true' migration. Internally there is no dedicated
/// prewarm step: it is the same graceful procedure that background scheduling optimisations use.
///
/// Returns the first intent change proposed by `optimize_attachment`, if any. That may be the
/// creation of a secondary, or a cutover to a secondary that already exists. The caller must
/// validate the proposal (e.g. check that the secondary is warm enough) before applying it.
fn tenant_shard_migrate_with_prewarm(
    &self,
    migrate_req: &TenantShardMigrateRequest,
    shard: &mut TenantShard,
    scheduler: &mut Scheduler,
    schedule_context: ScheduleContext,
) -> Result<Option<ScheduleOptimization>, ApiError> {
    let destination = migrate_req.node_id;
    shard.set_preferred_node(Some(destination));

    // With the preference recorded, the optimiser yields the initial step towards it.
    let first_step = shard.optimize_attachment(scheduler, &schedule_context);
    Ok(first_step)
}
/// Immediate (non-graceful) migration: rewrite the shard's intent state straight away and kick
/// off a reconciler for it.
///
/// Returns whatever `maybe_configured_reconcile_shard` yields for the updated shard, or
/// `ApiError::BadRequest` when the shard's placement policy is `Detached`.
fn tenant_shard_migrate_immediate(
    &self,
    migrate_req: &TenantShardMigrateRequest,
    nodes: &Arc<HashMap<NodeId, Node>>,
    shard: &mut TenantShard,
    scheduler: &mut Scheduler,
) -> Result<Option<ReconcilerWaiter>, ApiError> {
    let destination = migrate_req.node_id;
    // Capture the attachment before the intent is modified, so we know what to demote later.
    let previous_attached = *shard.intent.get_attached();

    match shard.policy {
        PlacementPolicy::Detached => {
            return Err(ApiError::BadRequest(anyhow::anyhow!(
                "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
            )));
        }
        PlacementPolicy::Secondary => {
            shard.intent.clear(scheduler);
            shard.intent.push_secondary(scheduler, destination);
        }
        PlacementPolicy::Attached(secondary_count) => {
            // The destination may currently be a secondary: it must stop being one before it
            // becomes the attached location.
            shard.intent.remove_secondary(scheduler, destination);
            shard.intent.set_attached(scheduler, Some(destination));

            // Demote the previous attachment to a secondary, but only when the policy allows
            // any secondaries at all.
            if let Some(demoted) = previous_attached.filter(|_| secondary_count > 0) {
                // Evict existing secondaries until there is room for the demoted location.
                while shard.intent.get_secondary().len() >= secondary_count {
                    shard.intent.pop_secondary(scheduler);
                }
                shard.intent.push_secondary(scheduler, demoted);
            }
        }
    }

    tracing::info!("Migrating: new intent {:?}", shard.intent);
    shard.sequence = shard.sequence.next();

    // An immediate migration aborts any graceful migration that was in flight.
    shard.set_preferred_node(None);

    let reconciler_config = (&migrate_req.migration_config).into();
    Ok(self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config))
}
pub(crate) async fn tenant_shard_migrate(
&self,
tenant_shard_id: TenantShardId,
migrate_req: TenantShardMigrateRequest,
) -> Result<TenantShardMigrateResponse, ApiError> {
let waiter = {
// Depending on whether the migration is a change and whether it's graceful or immediate, we might
// get a different outcome to handle
enum MigrationOutcome {
Optimization(Option<ScheduleOptimization>),
Reconcile(Option<ReconcilerWaiter>),
}
let outcome = {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
@@ -5315,71 +5398,139 @@ impl Service {
)));
};
// Migration to unavailable node requires force flag
if !node.is_available() {
// Warn but proceed: the caller may intend to manually adjust the placement of
// a shard even if the node is down, e.g. if intervening during an incident.
tracing::warn!("Migrating to unavailable node {node}");
if migrate_req.migration_config.override_scheduler {
// Warn but proceed: the caller may intend to manually adjust the placement of
// a shard even if the node is down, e.g. if intervening during an incident.
tracing::warn!("Forcibly migrating to unavailable node {node}");
} else {
tracing::warn!("Node {node} is unavailable, refusing migration");
return Err(ApiError::PreconditionFailed(
format!("Node {node} is unavailable").into_boxed_str(),
));
}
}
// Calculate the ScheduleContext for this tenant
let mut schedule_context = ScheduleContext::default();
for (_shard_id, shard) in
tenants.range(TenantShardId::tenant_range(tenant_shard_id.tenant_id))
{
schedule_context.avoid(&shard.intent.all_pageservers());
}
// Look up the specific shard we will migrate
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant shard not found").into(),
));
};
// Migration to a node with unfavorable scheduling score requires a force flag, because it might just
// be migrated back by the optimiser.
if let Some(better_node) = shard.find_better_location::<AttachedShardTag>(
scheduler,
&schedule_context,
migrate_req.node_id,
&[],
) {
if !migrate_req.migration_config.override_scheduler {
return Err(ApiError::PreconditionFailed(
"Migration to a worse-scoring node".into(),
));
} else {
tracing::info!(
"Migrating to a worse-scoring node {} (optimiser would prefer {better_node})",
migrate_req.node_id
);
}
}
if let Some(origin_node_id) = migrate_req.origin_node_id {
if shard.intent.get_attached() != &Some(origin_node_id) {
return Err(ApiError::PreconditionFailed(
format!(
"Migration expected to originate from {} but shard is on {:?}",
origin_node_id,
shard.intent.get_attached()
)
.into(),
));
}
}
if shard.intent.get_attached() == &Some(migrate_req.node_id) {
// No-op case: we will still proceed to wait for reconciliation in case it is
// incomplete from an earlier update to the intent.
tracing::info!("Migrating: intent is unchanged {:?}", shard.intent);
// An instruction to migrate to the currently attached node should
// cancel any pending graceful migration
shard.set_preferred_node(None);
MigrationOutcome::Reconcile(self.maybe_configured_reconcile_shard(
shard,
nodes,
(&migrate_req.migration_config).into(),
))
} else if migrate_req.migration_config.prewarm {
MigrationOutcome::Optimization(self.tenant_shard_migrate_with_prewarm(
&migrate_req,
shard,
scheduler,
schedule_context,
)?)
} else {
let old_attached = *shard.intent.get_attached();
match shard.policy {
PlacementPolicy::Attached(n) => {
// If our new attached node was a secondary, it no longer should be.
shard
.intent
.remove_secondary(scheduler, migrate_req.node_id);
shard
.intent
.set_attached(scheduler, Some(migrate_req.node_id));
// If we were already attached to something, demote that to a secondary
if let Some(old_attached) = old_attached {
if n > 0 {
// Remove other secondaries to make room for the location we'll demote
while shard.intent.get_secondary().len() >= n {
shard.intent.pop_secondary(scheduler);
}
shard.intent.push_secondary(scheduler, old_attached);
}
}
}
PlacementPolicy::Secondary => {
shard.intent.clear(scheduler);
shard.intent.push_secondary(scheduler, migrate_req.node_id);
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
)));
}
}
tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();
MigrationOutcome::Reconcile(self.tenant_shard_migrate_immediate(
&migrate_req,
nodes,
shard,
scheduler,
)?)
}
let reconciler_config = match migrate_req.migration_config {
Some(cfg) => (&cfg).into(),
None => ReconcilerConfig::new(ReconcilerPriority::High),
};
self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config)
};
// We may need to validate + apply an optimisation, or we may need to just retrieve a reconcile waiter
let waiter = match outcome {
MigrationOutcome::Optimization(Some(optimization)) => {
// Validate and apply the optimization -- this would happen anyway in background reconcile loop, but
// we might as well do it more promptly as this is a direct external request.
let mut validated = self
.optimize_all_validate(vec![(tenant_shard_id, optimization)])
.await;
if let Some((_shard_id, optimization)) = validated.pop() {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
// Rare but possible: tenant is removed between generating optimisation and validating it.
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant shard not found").into(),
));
};
if !shard.apply_optimization(scheduler, optimization) {
// This can happen but is unusual enough to warn on: something else changed in the shard that made the optimisation stale
// and therefore not applied.
tracing::warn!(
"Schedule optimisation generated during graceful migration was not applied, shard changed?"
);
}
self.maybe_configured_reconcile_shard(
shard,
nodes,
(&migrate_req.migration_config).into(),
)
} else {
None
}
}
MigrationOutcome::Optimization(None) => None,
MigrationOutcome::Reconcile(waiter) => waiter,
};
// Finally, wait for any reconcile we started to complete. In the case of immediate-mode migrations to cold
// locations, this has a good chance of timing out.
if let Some(waiter) = waiter {
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
} else {
@@ -6959,6 +7110,10 @@ impl Service {
ShardSchedulingPolicy::Active => {
// Ok to do optimization
}
ShardSchedulingPolicy::Essential if shard.get_preferred_node().is_some() => {
// Ok to do optimization: we are executing a graceful migration that
// has set preferred_node
}
ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause
| ShardSchedulingPolicy::Stop => {

View File

@@ -132,6 +132,10 @@ pub(crate) struct TenantShard {
/// of state that we publish externally in an eventually consistent way.
pub(crate) pending_compute_notification: bool,
/// To do a graceful migration, set this field to the destination pageserver, and optimization
/// functions will consider this node the best location and react appropriately.
preferred_node: Option<NodeId>,
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
// be set to a non-active state to avoid making changes while the issue is fixed.
scheduling_policy: ShardSchedulingPolicy,
@@ -555,6 +559,7 @@ impl TenantShard {
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: ShardSchedulingPolicy::default(),
preferred_node: None,
}
}
@@ -809,6 +814,15 @@ impl TenantShard {
return None;
};
// If the candidate is our preferred node, then it is better than the current location, as long
// as it is online -- the online check is part of the score calculation we did above, so it's
// important that this check comes after that one.
if let Some(preferred) = self.preferred_node.as_ref() {
if preferred == &candidate {
return Some(true);
}
}
match scheduler.compute_node_score::<T::Score>(
current,
&self.intent.preferred_az_id,
@@ -847,13 +861,22 @@ impl TenantShard {
}
}
fn find_better_location<T: ShardTag>(
pub(crate) fn find_better_location<T: ShardTag>(
&self,
scheduler: &mut Scheduler,
schedule_context: &ScheduleContext,
current: NodeId,
hard_exclude: &[NodeId],
) -> Option<NodeId> {
// If we have a migration hint, then that is our better location
if let Some(hint) = self.preferred_node.as_ref() {
if hint == &current {
return None;
}
return Some(*hint);
}
// Look for a lower-scoring location to attach to
let Ok(candidate_node) = scheduler.schedule_shard::<T>(
hard_exclude,
@@ -887,6 +910,13 @@ impl TenantShard {
scheduler: &mut Scheduler,
schedule_context: &ScheduleContext,
) -> bool {
// Tenant with preferred node: check if it is not already at the preferred node
if let Some(preferred) = self.preferred_node.as_ref() {
if Some(preferred) != self.intent.get_attached().as_ref() {
return true;
}
}
// Sharded tenant: check if any locations have a nonzero affinity score
if self.shard.count >= ShardCount(1) {
let schedule_context = schedule_context.project_detach(self);
@@ -927,6 +957,9 @@ impl TenantShard {
/// Optimize attachments: if a shard has a secondary location that is preferable to
/// its primary location based on soft constraints, switch that secondary location
/// to be attached.
///
/// `schedule_context` should have been populated with all shards in the tenant, including
/// the one we're trying to optimize (this function will subtract its own contribution before making scoring decisions)
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_attachment(
&self,
@@ -1055,7 +1088,8 @@ impl TenantShard {
//
// This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes
// there are too overloaded for scheduler to suggest them, more should be provisioned eventually).
if self.intent.preferred_az_id.is_some()
if self.preferred_node.is_none()
&& self.intent.preferred_az_id.is_some()
&& scheduler.get_node_az(&replacement) != self.intent.preferred_az_id
{
tracing::debug!(
@@ -1161,6 +1195,27 @@ impl TenantShard {
None
}
/// Begin a graceful migration of this shard towards `node`, or abort one by passing `None`.
///
/// The preferred node is consumed by the other optimisation functions, which are biased to
/// move the shard to it. Replacing an existing, different preference is permitted but logged
/// as a warning.
pub(crate) fn set_preferred_node(&mut self, node: Option<NodeId>) {
    match self.preferred_node {
        Some(hint) if Some(hint) != node => {
            // Legal, but a little surprising: administrators are not usually expected to
            // change their mind about a migration destination.
            tracing::warn!(
                "Changing migration destination from {hint} to {node:?} (current intent {:?})",
                self.intent
            );
        }
        _ => {}
    }

    self.preferred_node = node;
}
/// Return the node most recently stored via `set_preferred_node` and not yet cleared, or
/// `None` when no preferred node is set.
pub(crate) fn get_preferred_node(&self) -> Option<NodeId> {
self.preferred_node
}
/// Return true if the optimization was really applied: it will not be applied if the optimization's
/// sequence is behind this tenant shard's
pub(crate) fn apply_optimization(
@@ -1185,6 +1240,14 @@ impl TenantShard {
self.intent.demote_attached(scheduler, old_attached_node_id);
self.intent
.promote_attached(scheduler, new_attached_node_id);
if let Some(hint) = self.preferred_node.as_ref() {
if hint == &new_attached_node_id {
// The migration target is not a long term pin: once we are done with the migration, clear it.
tracing::info!("Graceful migration to {hint} complete");
self.preferred_node = None;
}
}
}
ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
old_node_id,
@@ -1703,6 +1766,10 @@ impl TenantShard {
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
if self.preferred_node == Some(node_id) {
self.preferred_node = None;
}
intent_modified
}
@@ -1750,6 +1817,7 @@ impl TenantShard {
pending_compute_notification: false,
delayed_reconcile: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
preferred_node: None,
})
}
@@ -2270,6 +2338,85 @@ pub(crate) mod tests {
Ok(())
}
#[test]
/// How the optimisation code handles a shard with a preferred node set; this is an example
/// of the multi-step migration, but driven by a different input.
///
/// Expected sequence of optimisations: create a secondary on the preferred node, migrate the
/// attachment onto it (which clears the preferred node), then remove the secondary left behind
/// on the old attached node.
fn optimize_attachment_multi_preferred_node() -> anyhow::Result<()> {
// Four nodes: the first two in az-a, the last two in az-b
let nodes = make_test_nodes(
4,
&[
AvailabilityZone("az-a".to_string()),
AvailabilityZone("az-a".to_string()),
AvailabilityZone("az-b".to_string()),
AvailabilityZone("az-b".to_string()),
],
);
let mut scheduler = Scheduler::new(nodes.values());
// A single shard of a tenant that wants to be in AZ A
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string()));
// Initially attached in a stable location
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
// Set the preferred node to node 2, an equally high scoring node to its current location
shard_a.preferred_node = Some(NodeId(2));
// Rebuild the schedule context from the shard's current intent; called again after each
// applied optimisation because the intent changes at every step.
fn make_schedule_context(shard_a: &TenantShard) -> ScheduleContext {
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context
}
// Step 1: the preferred node has no location yet, so a secondary is created there first
let schedule_context = make_schedule_context(&shard_a);
let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization_a_prepare,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::CreateSecondary(NodeId(2))
})
);
shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap());
// The first step of the optimisation should not have cleared the preferred node
assert_eq!(shard_a.preferred_node, Some(NodeId(2)));
// Step 2: cut the attachment over from node 1 to the preferred node
let schedule_context = make_schedule_context(&shard_a);
let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization_a_migrate,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(2)
})
})
);
shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap());
// The cutover step of the optimisation should have cleared the preferred node
assert_eq!(shard_a.preferred_node, None);
// Step 3: the old attached node (node 1) is now a surplus secondary and gets removed
let schedule_context = make_schedule_context(&shard_a);
let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization_a_cleanup,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1))
})
);
shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap());
// Clear the shard's intent against the scheduler before the shard is dropped
shard_a.intent.clear(&mut scheduler);
Ok(())
}
#[test]
/// Check that multi-step migration works when moving to somewhere that is only better by
/// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary