storcon: use attached shard counts for initial shard placement (#8061)

## Problem
When creating a new shard, the storage controller schedules it via
Scheduler::schedule_shard. This does not take the number of attached
shards per node into account; it only considers node affinity: when a
shard is scheduled, all of its nodes (primaries and secondaries) get
their affinity incremented.

For two-node clusters and shards with one secondary, we hit a
pathological case where all primaries are scheduled on the same node.
Now that we track the count of attached shards per node, this is trivial
to fix. Still, the "proper" fix is to use the pageserver's utilization
score.
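For illustration, a minimal sketch of the pathology (hypothetical simplified types, not the storage controller's real Scheduler; affinity is assumed equal because each new tenant starts from a fresh schedule context): under the old key of (affinity, total shard count, node id), the two nodes stay tied on totals, so the node-id tie-breaker attaches every shard to node 1.

```rust
use std::collections::BTreeMap;

// Hypothetical stand-in for the per-node counters; not the real storage controller types.
#[derive(Default)]
struct FakeNode {
    shard_count: usize,          // attached + secondary locations
    attached_shard_count: usize, // attached locations only
}

fn main() {
    let mut nodes: BTreeMap<u64, FakeNode> = BTreeMap::new();
    nodes.insert(1, FakeNode::default());
    nodes.insert(2, FakeNode::default());

    for _tenant in 0..4 {
        // Old key: (affinity, total shard count, node id). Affinity ties for a fresh
        // tenant, totals tie because every tenant adds one location to each node,
        // so the node id always picks node 1 for the attached location.
        let mut ranked: Vec<(usize, u64)> =
            nodes.iter().map(|(id, n)| (n.shard_count, *id)).collect();
        ranked.sort();
        let (attached, secondary) = (ranked[0].1, ranked[1].1);

        nodes.get_mut(&attached).unwrap().attached_shard_count += 1;
        nodes.get_mut(&attached).unwrap().shard_count += 1;
        nodes.get_mut(&secondary).unwrap().shard_count += 1;
    }

    for (id, n) in &nodes {
        // Prints: node 1 ends up with all 4 attachments, node 2 with none.
        println!("node {id}: total={} attached={}", n.shard_count, n.attached_shard_count);
    }
}
```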

Closes https://github.com/neondatabase/neon/issues/8041

## Summary of changes
Use attached shard count when deciding which node to schedule a fresh
shard on.
Author: Vlad Lazar (2024-06-20 17:32:01 +01:00), committed by GitHub
Commit: f8ac3b0e0e (parent: 02ecdd137b)
3 changed files with 11 additions and 17 deletions


@@ -391,7 +391,7 @@ impl Scheduler {
             return Err(ScheduleError::NoPageservers);
         }
-        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
             .nodes
             .iter()
             .filter_map(|(k, v)| {
@@ -402,6 +402,7 @@ impl Scheduler {
                         *k,
                         context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
                         v.shard_count,
+                        v.attached_shard_count,
                     ))
                 }
             })
@@ -409,9 +410,12 @@ impl Scheduler {
         // Sort by, in order of precedence:
         //  1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
-        //  3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
+        //  2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with
+        //  the least number of attached shards.
+        //  3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes
+        //  with the lower total shard count.
+        //  4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
         if scores.is_empty() {
             // After applying constraints, no pageservers were left.
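Since Rust tuples compare lexicographically, the new sort_by_key expresses exactly this precedence. A small standalone illustration, with plain integers standing in for NodeId and AffinityScore (not the real types):

```rust
fn main() {
    // Tuple layout mirrors the diff: (node id, affinity, total shard count, attached shard count).
    let mut scores: Vec<(u64, u32, usize, usize)> = vec![
        (1, 0, 4, 2), // node 1 already holds two attachments
        (2, 0, 4, 1), // node 2: same affinity and total shard count, fewer attachments
    ];
    // Affinity first, then attached count, then total count, then node id.
    scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
    assert_eq!(scores[0].0, 2); // node 2 receives the next attachment
}
```

This resolves the two-node tie without consulting pageserver utilization, which the Problem section above calls out as the longer-term fix.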


@@ -1632,14 +1632,10 @@ pub(crate) mod tests {
         // We should see equal number of locations on the two nodes.
         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        // Scheduling does not consider the number of attachments picking the initial
-        // pageserver to attach to (hence the assertion that all primaries are on the
-        // same node)
-        // TODO: Tweak the scheduling to evenly distribute attachments for new shards.
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
         // Add another two nodes: we should see the shards spread out when their optimize
         // methods are called