diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 0bd2eeac35..843159010d 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -391,7 +391,7 @@ impl Scheduler {
             return Err(ScheduleError::NoPageservers);
         }
 
-        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
             .nodes
             .iter()
             .filter_map(|(k, v)| {
@@ -402,6 +402,7 @@ impl Scheduler {
                         *k,
                         context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
                         v.shard_count,
+                        v.attached_shard_count,
                     ))
                 }
             })
@@ -409,9 +410,12 @@ impl Scheduler {
 
         // Sort by, in order of precedence:
         //  1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
-        //  3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
+        //  2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with
+        //       the least number of attached shards.
+        //  3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes
+        //       with the lower total shard count.
+        //  4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
 
         if scores.is_empty() {
             // After applying constraints, no pageservers were left.
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index d1b632755f..840bcbb81d 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1632,14 +1632,10 @@ pub(crate) mod tests {
 
         // We should see equal number of locations on the two nodes.
         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        // Scheduling does not consider the number of attachments picking the initial
-        // pageserver to attach to (hence the assertion that all primaries are on the
-        // same node)
-        // TODO: Tweak the scheduling to evenly distribute attachments for new shards.
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
 
         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
 
         // Add another two nodes: we should see the shards spread out when their optimize
         // methods are called
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 30f96ceee8..c6450df186 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1527,13 +1527,7 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
     )
 
     # Give things a chance to settle.
-    # A call to `reconcile_until_idle` could be used here instead,
-    # however since all attachments are placed on the same node,
-    # we'd have to wait for a long time (2 minutes-ish) for optimizations
-    # to quiesce.
-    # TODO: once the initial attachment selection is fixed, update this
-    # to use `reconcile_until_idle`.
-    time.sleep(2)
+    env.storage_controller.reconcile_until_idle(timeout_secs=30)
 
     nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
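
To illustrate the effect of the new four-part sort key, here is a minimal, self-contained sketch. It does not use the storage controller's real types: `NodeId` and `AffinityScore` below are simplified stand-ins, and the candidate values are made up. It only demonstrates how ties on affinity are now broken by attached shard count before total shard count and node ID.

```rust
// Simplified stand-ins for illustration only (not the storage controller's types).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct NodeId(u64);

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct AffinityScore(usize);

fn main() {
    // Tuple layout mirrors the patched scheduler:
    // (node id, affinity score, total shard count, attached shard count)
    let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = vec![
        (NodeId(1), AffinityScore(0), 4, 4), // same affinity/total, but all shards attached here
        (NodeId(2), AffinityScore(0), 4, 0), // same affinity/total, no attachments yet
        (NodeId(3), AffinityScore(1), 1, 0), // lightly loaded, but worse affinity
    ];

    // Precedence: affinity, then attached shard count, then total shard count, then node ID.
    scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));

    // Node 2 wins: it ties node 1 on affinity and total shards, but has fewer attachments.
    assert_eq!(scores[0].0, NodeId(2));
    println!("selected {:?}", scores[0].0);
}
```

Under the old key `(i.1, i.2, i.0)`, nodes 1 and 2 would have tied on affinity and total shard count and node 1 would have won on node ID, piling further attachments onto the node that already holds them; the new key steers new attachments to the emptier node, which is what the updated `tenant_shard.rs` assertions and the switch to `reconcile_until_idle` in the Python test rely on.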