diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3fa25443da..b3656c33d4 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -825,6 +825,7 @@ impl Scheduler { struct AzScore { home_shard_count: usize, scheduleable: bool, + node_count: usize, } let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); @@ -832,6 +833,7 @@ impl Scheduler { let az = azs.entry(&node.az).or_default(); az.home_shard_count += node.home_shard_count; az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); + az.node_count += 1; } // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where @@ -840,10 +842,20 @@ impl Scheduler { azs.retain(|_, i| i.scheduleable); } + // We will multiply up shard counts by the max node count for scoring, before dividing + // by each AZ's own node count, to get a normalized score that doesn't collapse to zero + // when the absolute shard count is less than the node count. + let max_node_count = azs.values().map(|i| i.node_count).max().unwrap_or(0); + // Find the AZ with the lowest number of shards currently allocated Some( azs.into_iter() - .min_by_key(|i| (i.1.home_shard_count, i.0)) + .min_by_key(|i| { + ( + (i.1.home_shard_count * max_node_count) / i.1.node_count, + i.0, + ) + }) .unwrap() .0 .clone(),