From 6c3aba7c44e070a25064b113651b934cb7460e67 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 23 Jun 2025 08:50:31 -0700 Subject: [PATCH] storcon: adjust AZ selection for heterogeneous AZs (#12296) ## Problem The scheduler uses total shards per AZ to select the AZ for newly created or attached tenants. This makes bad decisions when we have different node counts per AZ -- we might have 2 very busy pageservers in one AZ, and 4 more lightly loaded pageservers in other AZs, and the scheduler picks the busy pageservers because the total shard count in their AZ is lower. ## Summary of changes - Divide the shard count by the number of nodes in the AZ when scoring in `get_az_for_new_tenant` --------- Co-authored-by: John Spray --- storage_controller/src/scheduler.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3fa25443da..b3656c33d4 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -825,6 +825,7 @@ impl Scheduler { struct AzScore { home_shard_count: usize, scheduleable: bool, + node_count: usize, } let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); @@ -832,6 +833,7 @@ impl Scheduler { let az = azs.entry(&node.az).or_default(); az.home_shard_count += node.home_shard_count; az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); + az.node_count += 1; } // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where @@ -840,10 +842,20 @@ impl Scheduler { azs.retain(|_, i| i.scheduleable); } + // We will multiply up shard counts by the max node count for scoring, before dividing + // by each AZ's own node count, to get a normalized score that doesn't collapse to zero + // when the absolute shard count is less than the node count. 
+ let max_node_count = azs.values().map(|i| i.node_count).max().unwrap_or(0); + // Find the AZ with the lowest number of shards currently allocated Some( azs.into_iter() - .min_by_key(|i| (i.1.home_shard_count, i.0)) + .min_by_key(|i| { + ( + (i.1.home_shard_count * max_node_count) / i.1.node_count, + i.0, + ) + }) .unwrap() .0 .clone(),