storage controller: API-driven graceful migrations (#10913)

## Problem The current migration API does a live migration, but if the destination doesn't already have a secondary, that live migration is unlikely to be able to warm up a tenant properly within its timeout (full warmup of a big tenant can take tens of minutes). Background optimisation code knows how to do this gracefully by creating a secondary first, but we don't currently give a human a way to trigger that. Closes: https://github.com/neondatabase/neon/issues/10540 ## Summary of changes - Add `preferred_node` field to TenantShard, which is respected by optimize_attachment - Modify migration API to have optional prewarm=true mode, in which we set preferred_node and call optimize_attachment, rather than directly modifying the intent state - Require override_scheduler=true flag if migrating somewhere that is a less-than-optimal scheduling location (e.g. wrong AZ) - Add `origin_node_id` to migration API so that callers can ensure they're moving from where they think they're moving from - Add tests for the above The storcon_cli wrapper for this has a 'watch' mode that waits for eventual cutover. This doesn't show how the warmth of the secondary evolves, because we don't currently have an API for that in the controller, as the passthrough API only targets attached locations, not secondaries. It would be straightforward to add later as a dedicated endpoint for getting secondary status, then extend the storcon_cli to consume that and print a nice progress indicator.
2026-05-27 10:00:38 +00:00 · 2025-03-07 17:02:38 +00:00
parent 084fc4a757
commit 87e6117dfd
9 changed files with 707 additions and 120 deletions
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -85,7 +85,9 @@ use crate::reconciler::{
    attached_location_conf,
 };
 use crate::safekeeper::Safekeeper;
-use crate::scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler};
+use crate::scheduler::{
+    AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler,
+};
 use crate::tenant_shard::{
    IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation,
    ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter,
@@ -5299,12 +5301,93 @@ impl Service {
        Ok((response, waiters))
    }

+    /// A graceful migration: update the preferred node and let optimisation handle the migration
+    /// in the background (may take a long time as it will fully warm up a location before cutting over)
+    ///
+    /// Our external API calls this a 'prewarm=true' migration, but internally it isn't a special prewarm step: it's
+    /// just a migration that uses the same graceful procedure as our background scheduling optimisations would use.
+    fn tenant_shard_migrate_with_prewarm(
+        &self,
+        migrate_req: &TenantShardMigrateRequest,
+        shard: &mut TenantShard,
+        scheduler: &mut Scheduler,
+        schedule_context: ScheduleContext,
+    ) -> Result<Option<ScheduleOptimization>, ApiError> {
+        shard.set_preferred_node(Some(migrate_req.node_id));
+
+        // Generate whatever the initial change to the intent is: this could be creation of a secondary, or
+        // cutting over to an existing secondary.  Caller is responsible for validating this before applying it,
+        // e.g. by checking secondary is warm enough.
+        Ok(shard.optimize_attachment(scheduler, &schedule_context))
+    }
+
+    /// Immediate migration: directly update the intent state and kick off a reconciler
+    fn tenant_shard_migrate_immediate(
+        &self,
+        migrate_req: &TenantShardMigrateRequest,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+        shard: &mut TenantShard,
+        scheduler: &mut Scheduler,
+    ) -> Result<Option<ReconcilerWaiter>, ApiError> {
+        // Non-graceful migration: update the intent state immediately
+        let old_attached = *shard.intent.get_attached();
+        match shard.policy {
+            PlacementPolicy::Attached(n) => {
+                // If our new attached node was a secondary, it no longer should be.
+                shard
+                    .intent
+                    .remove_secondary(scheduler, migrate_req.node_id);
+
+                shard
+                    .intent
+                    .set_attached(scheduler, Some(migrate_req.node_id));
+
+                // If we were already attached to something, demote that to a secondary
+                if let Some(old_attached) = old_attached {
+                    if n > 0 {
+                        // Remove other secondaries to make room for the location we'll demote
+                        while shard.intent.get_secondary().len() >= n {
+                            shard.intent.pop_secondary(scheduler);
+                        }
+
+                        shard.intent.push_secondary(scheduler, old_attached);
+                    }
+                }
+            }
+            PlacementPolicy::Secondary => {
+                shard.intent.clear(scheduler);
+                shard.intent.push_secondary(scheduler, migrate_req.node_id);
+            }
+            PlacementPolicy::Detached => {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
+                )));
+            }
+        }
+
+        tracing::info!("Migrating: new intent {:?}", shard.intent);
+        shard.sequence = shard.sequence.next();
+        shard.set_preferred_node(None); // Abort any in-flight graceful migration
+        Ok(self.maybe_configured_reconcile_shard(
+            shard,
+            nodes,
+            (&migrate_req.migration_config).into(),
+        ))
+    }
+
    pub(crate) async fn tenant_shard_migrate(
        &self,
        tenant_shard_id: TenantShardId,
        migrate_req: TenantShardMigrateRequest,
    ) -> Result<TenantShardMigrateResponse, ApiError> {
-        let waiter = {
+        // Depending on whether the migration is a change and whether it's graceful or immediate, we might
+        // get a different outcome to handle
+        enum MigrationOutcome {
+            Optimization(Option<ScheduleOptimization>),
+            Reconcile(Option<ReconcilerWaiter>),
+        }
+
+        let outcome = {
            let mut locked = self.inner.write().unwrap();
            let (nodes, tenants, scheduler) = locked.parts_mut();

@@ -5315,71 +5398,139 @@ impl Service {
                )));
            };

+            // Migration to unavailable node requires force flag
            if !node.is_available() {
-                // Warn but proceed: the caller may intend to manually adjust the placement of
-                // a shard even if the node is down, e.g. if intervening during an incident.
-                tracing::warn!("Migrating to unavailable node {node}");
+                if migrate_req.migration_config.override_scheduler {
+                    // Warn but proceed: the caller may intend to manually adjust the placement of
+                    // a shard even if the node is down, e.g. if intervening during an incident.
+                    tracing::warn!("Forcibly migrating to unavailable node {node}");
+                } else {
+                    tracing::warn!("Node {node} is unavailable, refusing migration");
+                    return Err(ApiError::PreconditionFailed(
+                        format!("Node {node} is unavailable").into_boxed_str(),
+                    ));
+                }
            }

+            // Calculate the ScheduleContext for this tenant
+            let mut schedule_context = ScheduleContext::default();
+            for (_shard_id, shard) in
+                tenants.range(TenantShardId::tenant_range(tenant_shard_id.tenant_id))
+            {
+                schedule_context.avoid(&shard.intent.all_pageservers());
+            }
+
+            // Look up the specific shard we will migrate
            let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
                return Err(ApiError::NotFound(
                    anyhow::anyhow!("Tenant shard not found").into(),
                ));
            };

+            // Migration to a node with unfavorable scheduling score requires a force flag, because it might just
+            // be migrated back by the optimiser.
+            if let Some(better_node) = shard.find_better_location::<AttachedShardTag>(
+                scheduler,
+                &schedule_context,
+                migrate_req.node_id,
+                &[],
+            ) {
+                if !migrate_req.migration_config.override_scheduler {
+                    return Err(ApiError::PreconditionFailed(
+                        "Migration to a worse-scoring node".into(),
+                    ));
+                } else {
+                    tracing::info!(
+                        "Migrating to a worse-scoring node {} (optimiser would prefer {better_node})",
+                        migrate_req.node_id
+                    );
+                }
+            }
+
+            if let Some(origin_node_id) = migrate_req.origin_node_id {
+                if shard.intent.get_attached() != &Some(origin_node_id) {
+                    return Err(ApiError::PreconditionFailed(
+                        format!(
+                            "Migration expected to originate from {} but shard is on {:?}",
+                            origin_node_id,
+                            shard.intent.get_attached()
+                        )
+                        .into(),
+                    ));
+                }
+            }
+
            if shard.intent.get_attached() == &Some(migrate_req.node_id) {
                // No-op case: we will still proceed to wait for reconciliation in case it is
                // incomplete from an earlier update to the intent.
                tracing::info!("Migrating: intent is unchanged {:?}", shard.intent);
+
+                // An instruction to migrate to the currently attached node should
+                // cancel any pending graceful migration
+                shard.set_preferred_node(None);
+
+                MigrationOutcome::Reconcile(self.maybe_configured_reconcile_shard(
+                    shard,
+                    nodes,
+                    (&migrate_req.migration_config).into(),
+                ))
+            } else if migrate_req.migration_config.prewarm {
+                MigrationOutcome::Optimization(self.tenant_shard_migrate_with_prewarm(
+                    &migrate_req,
+                    shard,
+                    scheduler,
+                    schedule_context,
+                )?)
            } else {
-                let old_attached = *shard.intent.get_attached();
-
-                match shard.policy {
-                    PlacementPolicy::Attached(n) => {
-                        // If our new attached node was a secondary, it no longer should be.
-                        shard
-                            .intent
-                            .remove_secondary(scheduler, migrate_req.node_id);
-
-                        shard
-                            .intent
-                            .set_attached(scheduler, Some(migrate_req.node_id));
-
-                        // If we were already attached to something, demote that to a secondary
-                        if let Some(old_attached) = old_attached {
-                            if n > 0 {
-                                // Remove other secondaries to make room for the location we'll demote
-                                while shard.intent.get_secondary().len() >= n {
-                                    shard.intent.pop_secondary(scheduler);
-                                }
-
-                                shard.intent.push_secondary(scheduler, old_attached);
-                            }
-                        }
-                    }
-                    PlacementPolicy::Secondary => {
-                        shard.intent.clear(scheduler);
-                        shard.intent.push_secondary(scheduler, migrate_req.node_id);
-                    }
-                    PlacementPolicy::Detached => {
-                        return Err(ApiError::BadRequest(anyhow::anyhow!(
-                            "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
-                        )));
-                    }
-                }
-
-                tracing::info!("Migrating: new intent {:?}", shard.intent);
-                shard.sequence = shard.sequence.next();
+                MigrationOutcome::Reconcile(self.tenant_shard_migrate_immediate(
+                    &migrate_req,
+                    nodes,
+                    shard,
+                    scheduler,
+                )?)
            }
-
-            let reconciler_config = match migrate_req.migration_config {
-                Some(cfg) => (&cfg).into(),
-                None => ReconcilerConfig::new(ReconcilerPriority::High),
-            };
-
-            self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config)
        };

+        // We may need to validate + apply an optimisation, or we may need to just retrieve a reconcile waiter
+        let waiter = match outcome {
+            MigrationOutcome::Optimization(Some(optimization)) => {
+                // Validate and apply the optimization -- this would happen anyway in background reconcile loop, but
+                // we might as well do it more promptly as this is a direct external request.
+                let mut validated = self
+                    .optimize_all_validate(vec![(tenant_shard_id, optimization)])
+                    .await;
+                if let Some((_shard_id, optimization)) = validated.pop() {
+                    let mut locked = self.inner.write().unwrap();
+                    let (nodes, tenants, scheduler) = locked.parts_mut();
+                    let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
+                        // Rare but possible: tenant is removed between generating optimisation and validating it.
+                        return Err(ApiError::NotFound(
+                            anyhow::anyhow!("Tenant shard not found").into(),
+                        ));
+                    };
+
+                    if !shard.apply_optimization(scheduler, optimization) {
+                        // This can happen but is unusual enough to warn on: something else changed in the shard that made the optimisation stale
+                        // and therefore not applied.
+                        tracing::warn!(
+                            "Schedule optimisation generated during graceful migration was not applied, shard changed?"
+                        );
+                    }
+                    self.maybe_configured_reconcile_shard(
+                        shard,
+                        nodes,
+                        (&migrate_req.migration_config).into(),
+                    )
+                } else {
+                    None
+                }
+            }
+            MigrationOutcome::Optimization(None) => None,
+            MigrationOutcome::Reconcile(waiter) => waiter,
+        };
+
+        // Finally, wait for any reconcile we started to complete.  In the case of immediate-mode migrations to cold
+        // locations, this has a good chance of timing out.
        if let Some(waiter) = waiter {
            waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
        } else {
@@ -6959,6 +7110,10 @@ impl Service {
                    ShardSchedulingPolicy::Active => {
                        // Ok to do optimization
                    }
+                    ShardSchedulingPolicy::Essential if shard.get_preferred_node().is_some() => {
+                        // Ok to do optimization: we are executing a graceful migration that
+                        // has set preferred_node
+                    }
                    ShardSchedulingPolicy::Essential
                    | ShardSchedulingPolicy::Pause
                    | ShardSchedulingPolicy::Stop => {
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -132,6 +132,10 @@ pub(crate) struct TenantShard {
    /// of state that we publish externally in an eventually consistent way.
    pub(crate) pending_compute_notification: bool,

+    /// To do a graceful migration, set this field to the destination pageserver, and optimization
+    /// functions will consider this node the best location and react appropriately.
+    preferred_node: Option<NodeId>,
+
    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
    // be set to a non-active state to avoid making changes while the issue is fixed.
    scheduling_policy: ShardSchedulingPolicy,
@@ -555,6 +559,7 @@ impl TenantShard {
            last_error: Arc::default(),
            pending_compute_notification: false,
            scheduling_policy: ShardSchedulingPolicy::default(),
+            preferred_node: None,
        }
    }

@@ -809,6 +814,15 @@ impl TenantShard {
            return None;
        };

+        // If the candidate is our preferred node, then it is better than the current location, as long
+        // as it is online -- the online check is part of the score calculation we did above, so it's
+        // important that this check comes after that one.
+        if let Some(preferred) = self.preferred_node.as_ref() {
+            if preferred == &candidate {
+                return Some(true);
+            }
+        }
+
        match scheduler.compute_node_score::<T::Score>(
            current,
            &self.intent.preferred_az_id,
@@ -847,13 +861,22 @@ impl TenantShard {
        }
    }

-    fn find_better_location<T: ShardTag>(
+    pub(crate) fn find_better_location<T: ShardTag>(
        &self,
        scheduler: &mut Scheduler,
        schedule_context: &ScheduleContext,
        current: NodeId,
        hard_exclude: &[NodeId],
    ) -> Option<NodeId> {
+        // If we have a migration hint, then that is our better location
+        if let Some(hint) = self.preferred_node.as_ref() {
+            if hint == &current {
+                return None;
+            }
+
+            return Some(*hint);
+        }
+
        // Look for a lower-scoring location to attach to
        let Ok(candidate_node) = scheduler.schedule_shard::<T>(
            hard_exclude,
@@ -887,6 +910,13 @@ impl TenantShard {
        scheduler: &mut Scheduler,
        schedule_context: &ScheduleContext,
    ) -> bool {
+        // Tenant with preferred node: check if it is not already at the preferred node
+        if let Some(preferred) = self.preferred_node.as_ref() {
+            if Some(preferred) != self.intent.get_attached().as_ref() {
+                return true;
+            }
+        }
+
        // Sharded tenant: check if any locations have a nonzero affinity score
        if self.shard.count >= ShardCount(1) {
            let schedule_context = schedule_context.project_detach(self);
@@ -927,6 +957,9 @@ impl TenantShard {
    /// Optimize attachments: if a shard has a secondary location that is preferable to
    /// its primary location based on soft constraints, switch that secondary location
    /// to be attached.
+    ///
+    /// `schedule_context` should have been populated with all shards in the tenant, including
+    /// the one we're trying to optimize (this function will subtract its own contribution before making scoring decisions)
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn optimize_attachment(
        &self,
@@ -1055,7 +1088,8 @@ impl TenantShard {
            //
            // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes
            // there are too overloaded for scheduler to suggest them, more should be provisioned eventually).
-            if self.intent.preferred_az_id.is_some()
+            if self.preferred_node.is_none()
+                && self.intent.preferred_az_id.is_some()
                && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id
            {
                tracing::debug!(
@@ -1161,6 +1195,27 @@ impl TenantShard {
        None
    }

+    /// Start or abort a graceful migration of this shard to another pageserver. This works on top of the
+    /// other optimisation functions, to bias them to move to the destination node.
+    pub(crate) fn set_preferred_node(&mut self, node: Option<NodeId>) {
+        if let Some(hint) = self.preferred_node.as_ref() {
+            if Some(hint) != node.as_ref() {
+                // This is legal but a bit surprising: we expect that administrators wouldn't usually
+                // change their mind about where to migrate something.
+                tracing::warn!(
+                    "Changing migration destination from {hint} to {node:?} (current intent {:?})",
+                    self.intent
+                );
+            }
+        }
+
+        self.preferred_node = node;
+    }
+
+    pub(crate) fn get_preferred_node(&self) -> Option<NodeId> {
+        self.preferred_node
+    }
+
    /// Return true if the optimization was really applied: it will not be applied if the optimization's
    /// sequence is behind this tenant shard's
    pub(crate) fn apply_optimization(
@@ -1185,6 +1240,14 @@ impl TenantShard {
                self.intent.demote_attached(scheduler, old_attached_node_id);
                self.intent
                    .promote_attached(scheduler, new_attached_node_id);
+
+                if let Some(hint) = self.preferred_node.as_ref() {
+                    if hint == &new_attached_node_id {
+                        // The migration target is not a long term pin: once we are done with the migration, clear it.
+                        tracing::info!("Graceful migration to {hint} complete");
+                        self.preferred_node = None;
+                    }
+                }
            }
            ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
                old_node_id,
@@ -1703,6 +1766,10 @@ impl TenantShard {

        debug_assert!(!self.intent.all_pageservers().contains(&node_id));

+        if self.preferred_node == Some(node_id) {
+            self.preferred_node = None;
+        }
+
        intent_modified
    }

@@ -1750,6 +1817,7 @@ impl TenantShard {
            pending_compute_notification: false,
            delayed_reconcile: false,
            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
+            preferred_node: None,
        })
    }

@@ -2270,6 +2338,85 @@ pub(crate) mod tests {
        Ok(())
    }

+    #[test]
+    /// How the optimisation code handles a shard with a preferred node set; this is an example
+    /// of the multi-step migration, but driven by a different input.
+    fn optimize_attachment_multi_preferred_node() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(
+            4,
+            &[
+                AvailabilityZone("az-a".to_string()),
+                AvailabilityZone("az-a".to_string()),
+                AvailabilityZone("az-b".to_string()),
+                AvailabilityZone("az-b".to_string()),
+            ],
+        );
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // A single shard of a tenant that wants to be in AZ A
+        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string()));
+
+        // Initially attached in a stable location
+        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
+
+        // Set the preferred node to node 2, an equally high scoring node to its current location
+        shard_a.preferred_node = Some(NodeId(2));
+
+        fn make_schedule_context(shard_a: &TenantShard) -> ScheduleContext {
+            let mut schedule_context = ScheduleContext::default();
+            schedule_context.avoid(&shard_a.intent.all_pageservers());
+            schedule_context
+        }
+
+        let schedule_context = make_schedule_context(&shard_a);
+        let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_prepare,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::CreateSecondary(NodeId(2))
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap());
+
+        // The first step of the optimisation should not have cleared the preferred node
+        assert_eq!(shard_a.preferred_node, Some(NodeId(2)));
+
+        let schedule_context = make_schedule_context(&shard_a);
+        let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_migrate,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                    old_attached_node_id: NodeId(1),
+                    new_attached_node_id: NodeId(2)
+                })
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap());
+
+        // The cutover step of the optimisation should have cleared the preferred node
+        assert_eq!(shard_a.preferred_node, None);
+
+        let schedule_context = make_schedule_context(&shard_a);
+        let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_cleanup,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1))
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap());
+
+        shard_a.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
    #[test]
    /// Check that multi-step migration works when moving to somewhere that is only better by
    /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary