storcon: use proper schedule context during node delete (#9958)

## Problem I was touching `test_storage_controller_node_deletion` because for AZ scheduling work I was adding a change to the storage controller (kick secondaries during optimisation) that made a FIXME in this test defunct. While looking at it I also realized that we can easily fix the way node deletion currently doesn't use a proper ScheduleContext, using the iterator type recently added for that purpose. ## Summary of changes - A testing-only behavior in storage controller where if a secondary location isn't yet ready during optimisation, it will be actively polled. - Remove workaround in `test_storage_controller_node_deletion` that previously was needed because optimisation would get stuck on cold secondaries. - Update node deletion code to use a `TenantShardContextIterator` and thereby a proper ScheduleContext
2026-05-31 12:00:42 +00:00 · 2024-12-03 08:59:38 +00:00
parent c4987b0b13
commit 4a488b3e24
3 changed files with 96 additions and 31 deletions
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5158,34 +5158,38 @@ impl Service {
                *nodes = Arc::new(nodes_mut);
            }

-            for (tenant_shard_id, shard) in tenants {
-                if shard.deref_node(node_id) {
-                    // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise
-                    // it won't properly do anti-affinity.
-                    let mut schedule_context = ScheduleContext::default();
+            for (_tenant_id, mut schedule_context, shards) in
+                TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
+            {
+                for shard in shards {
+                    if shard.deref_node(node_id) {
+                        if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
+                            // TODO: implement force flag to remove a node even if we can't reschedule
+                            // a tenant
+                            tracing::error!(
+                                "Refusing to delete node, shard {} can't be rescheduled: {e}",
+                                shard.tenant_shard_id
+                            );
+                            return Err(e.into());
+                        } else {
+                            tracing::info!(
+                                "Rescheduled shard {} away from node during deletion",
+                                shard.tenant_shard_id
+                            )
+                        }

-                    if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
-                        // TODO: implement force flag to remove a node even if we can't reschedule
-                        // a tenant
-                        tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}");
-                        return Err(e.into());
-                    } else {
-                        tracing::info!(
-                            "Rescheduled shard {tenant_shard_id} away from node during deletion"
-                        )
+                        self.maybe_reconcile_shard(shard, nodes);
                    }

-                    self.maybe_reconcile_shard(shard, nodes);
+                    // Here we remove an existing observed location for the node we're removing, and it will
+                    // not be re-added by a reconciler's completion because we filter out removed nodes in
+                    // process_result.
+                    //
+                    // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
+                    // means any reconciles we spawned will know about the node we're deleting, enabling them
+                    // to do live migrations if it's still online.
+                    shard.observed.locations.remove(&node_id);
                }
-
-                // Here we remove an existing observed location for the node we're removing, and it will
-                // not be re-added by a reconciler's completion because we filter out removed nodes in
-                // process_result.
-                //
-                // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
-                // means any reconciles we spawned will know about the node we're deleting, enabling them
-                // to do live migrations if it's still online.
-                shard.observed.locations.remove(&node_id);
            }

            scheduler.node_remove(node_id);
@@ -6279,6 +6283,14 @@ impl Service {
                            > DOWNLOAD_FRESHNESS_THRESHOLD
                    {
                        tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}");
+
+                        #[cfg(feature = "testing")]
+                        if progress.heatmap_mtime.is_none() {
+                            // No heatmap might mean the attached location has never uploaded one, or that
+                            // the secondary download hasn't happened yet.  This is relatively unusual in the field,
+                            // but fairly common in tests.
+                            self.kick_secondary_download(tenant_shard_id).await;
+                        }
                    } else {
                        // Location looks ready: proceed
                        tracing::info!(
@@ -6293,6 +6305,58 @@ impl Service {
        validated_work
    }

+    /// Some aspects of scheduling optimisation wait for secondary locations to be warm.  This
+    /// happens on multi-minute timescales in the field, which is fine because optimisation is meant
+    /// to be a lazy background thing. However, when testing, it is not practical to wait around, so
+    /// we have this helper to move things along faster.
+    #[cfg(feature = "testing")]
+    async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
+        let (attached_node, secondary_node) = {
+            let locked = self.inner.read().unwrap();
+            let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
+                return;
+            };
+            let (Some(attached), Some(secondary)) = (
+                shard.intent.get_attached(),
+                shard.intent.get_secondary().first(),
+            ) else {
+                return;
+            };
+            (
+                locked.nodes.get(attached).unwrap().clone(),
+                locked.nodes.get(secondary).unwrap().clone(),
+            )
+        };
+
+        // Make remote API calls to upload + download heatmaps: we ignore errors because this is just
+        // a 'kick' to let scheduling optimisation run more promptly.
+        attached_node
+            .with_client_retries(
+                |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
+                &self.config.jwt_token,
+                3,
+                10,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await;
+
+        secondary_node
+            .with_client_retries(
+                |client| async move {
+                    client
+                        .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
+                        .await
+                },
+                &self.config.jwt_token,
+                3,
+                10,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await;
+    }
+
    /// Look for shards which are oversized and in need of splitting
    async fn autosplit_tenants(self: &Arc<Self>) {
        let Some(split_threshold) = self.config.split_threshold else {
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -519,6 +519,13 @@ def test_sharding_split_smoke(
    # We will have 2 shards per pageserver once done (including secondaries)
    neon_env_builder.num_pageservers = split_shard_count

+    # Two AZs
+    def assign_az(ps_cfg):
+        az = f"az-{(ps_cfg['id'] - 1) % 2}"
+        ps_cfg["availability_zone"] = az
+
+    neon_env_builder.pageserver_config_override = assign_az
+
    # 1MiB stripes: enable getting some meaningful data distribution without
    # writing large quantities of data in this test.  The stripe size is given
    # in number of 8KiB pages.
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion(
            assert victim.id not in shard["node_secondary"]

    # Reconciles running during deletion should all complete
-    # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting
-    # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3
-    # test that hasn't uploaded any heatmaps for secondaries.
-    # In the interim, just do a reconcile_all to enable the consistency check.
-    # env.storage_controller.reconcile_until_idle()
-    env.storage_controller.reconcile_all()
+    env.storage_controller.reconcile_until_idle()

    # Controller should pass its own consistency checks
    env.storage_controller.consistency_check()
@@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion(
    env.storage_controller.stop()
    env.storage_controller.start()
    assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
-    env.storage_controller.reconcile_all()  # FIXME: workaround for optimizations happening on startup, see FIXME above.
    env.storage_controller.consistency_check()