storcon: adjust AZ selection for heterogeneous AZs (#12296)

## Problem

The scheduler uses the total shard count per AZ to select the AZ for
newly created or attached tenants.

This leads to bad decisions when node counts differ between AZs -- we
might have 2 very busy pageservers in one AZ and 4 more lightly loaded
pageservers in another, yet the scheduler picks the busy AZ because its
total shard count is lower.
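
To make the failure mode concrete, here is a minimal Rust sketch with hypothetical AZ names, node counts, and shard totals (none of these numbers come from the storage controller itself):

```rust
fn main() {
    // Hypothetical topology: (AZ name, pageserver count, total shards homed in the AZ).
    let azs = [("az-a", 2u32, 180u32), ("az-b", 4, 200)];

    // Old heuristic: pick the AZ with the lowest *total* shard count -> az-a.
    let by_total = azs.iter().min_by_key(|&&(_, _, shards)| shards).unwrap();

    // Per-node load tells the opposite story: az-a carries 90 shards per
    // pageserver while az-b carries only 50, so az-b is the better choice.
    let by_per_node = azs
        .iter()
        .min_by_key(|&&(_, nodes, shards)| shards / nodes)
        .unwrap();

    println!("by total: {}, by per-node load: {}", by_total.0, by_per_node.0);
    // by total: az-a, by per-node load: az-b
}
```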

## Summary of changes

- Divide the shard count by the number of nodes in the AZ when scoring
in `get_az_for_new_tenant` (see the sketch below)
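
Conceptually, the change swaps the raw per-AZ total for a per-node average. A rough sketch of the two scoring keys, using illustrative names rather than the real storcon types:

```rust
/// Illustrative per-AZ statistics (a stand-in for the scheduler's internal bookkeeping).
struct AzStats {
    home_shard_count: usize,
    node_count: usize,
}

/// Old key: total shards homed in the AZ, regardless of how many nodes share them.
fn old_key(az: &AzStats) -> usize {
    az.home_shard_count
}

/// New key: shards per node, scaled up by the largest node count across AZs so
/// that integer division does not round small counts down to zero.
fn new_key(az: &AzStats, max_node_count: usize) -> usize {
    (az.home_shard_count * max_node_count) / az.node_count
}
```

The real implementation keeps the AZ identifier as a tiebreaker alongside this key, as the diff below shows.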

---------

Co-authored-by: John Spray <john.spray@databricks.com>

@@ -825,6 +825,7 @@ impl Scheduler {
         struct AzScore {
             home_shard_count: usize,
             scheduleable: bool,
+            node_count: usize,
         }

         let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
@@ -832,6 +833,7 @@
             let az = azs.entry(&node.az).or_default();
             az.home_shard_count += node.home_shard_count;
             az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
+            az.node_count += 1;
         }

         // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
@@ -840,10 +842,20 @@
             azs.retain(|_, i| i.scheduleable);
         }

+        // We will multiply up shard counts by the max node count for scoring, before dividing
+        // by the per-AZ node count, to get a normalized score that doesn't collapse to zero
+        // when the absolute shard count is less than the node count.
+        let max_node_count = azs.values().map(|i| i.node_count).max().unwrap_or(0);
+
         // Find the AZ with the lowest number of shards currently allocated
         Some(
             azs.into_iter()
-                .min_by_key(|i| (i.1.home_shard_count, i.0))
+                .min_by_key(|i| {
+                    (
+                        (i.1.home_shard_count * max_node_count) / i.1.node_count,
+                        i.0,
+                    )
+                })
                 .unwrap()
                 .0
                 .clone(),
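
The `max_node_count` multiplier matters because the key is computed with integer arithmetic; a small worked example with hypothetical counts:

```rust
fn main() {
    // One shard homed in a 2-node AZ vs. one shard homed in a 4-node AZ.
    let (shards_a, nodes_a) = (1usize, 2usize);
    let (shards_b, nodes_b) = (1usize, 4usize);
    let max_node_count = nodes_a.max(nodes_b); // 4

    // Plain integer division rounds both per-node loads down to zero, so the
    // AZs would tie and the choice would fall through to the name tiebreaker.
    assert_eq!(shards_a / nodes_a, 0);
    assert_eq!(shards_b / nodes_b, 0);

    // Scaling by the maximum node count before dividing preserves the ordering:
    // the 4-node AZ scores lower and is correctly preferred.
    assert_eq!((shards_a * max_node_count) / nodes_a, 2);
    assert_eq!((shards_b * max_node_count) / nodes_b, 1);
}
```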