storcon: use attached shard counts for initial shard placement (#8061)

## Problem
When creating a new shard, the storage controller schedules it via
Scheduler::schedule_shard. This does not take the number of attached
shards per node into account; it only considers node affinity: when a
shard is scheduled, all of its nodes (primaries and secondaries) get
their affinity incremented.

For two-node clusters and shards with one secondary, we hit a
pathological case where all primaries are scheduled on the same node.
Now that we track the count of attached shards per node, this is trivial
to fix. Still, the "proper" fix is to use the pageserver's utilization
score.
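For illustration, a minimal sketch of the pathology (hypothetical simplified types, not the storage controller's real Scheduler; affinity is assumed equal because each new tenant starts from a fresh schedule context): under the old key of (affinity, total shard count, node id), the two nodes stay tied on totals, so the node-id tie-breaker attaches every shard to node 1.

```rust
use std::collections::BTreeMap;

// Hypothetical stand-in for the per-node counters; not the real storage controller types.
#[derive(Default)]
struct FakeNode {
    shard_count: usize,          // attached + secondary locations
    attached_shard_count: usize, // attached locations only
}

fn main() {
    let mut nodes: BTreeMap<u64, FakeNode> = BTreeMap::new();
    nodes.insert(1, FakeNode::default());
    nodes.insert(2, FakeNode::default());

    for _tenant in 0..4 {
        // Old key: (affinity, total shard count, node id). Affinity ties for a fresh
        // tenant, totals tie because every tenant adds one location to each node,
        // so the node id always picks node 1 for the attached location.
        let mut ranked: Vec<(usize, u64)> =
            nodes.iter().map(|(id, n)| (n.shard_count, *id)).collect();
        ranked.sort();
        let (attached, secondary) = (ranked[0].1, ranked[1].1);

        nodes.get_mut(&attached).unwrap().attached_shard_count += 1;
        nodes.get_mut(&attached).unwrap().shard_count += 1;
        nodes.get_mut(&secondary).unwrap().shard_count += 1;
    }

    for (id, n) in &nodes {
        // Prints: node 1 ends up with all 4 attachments, node 2 with none.
        println!("node {id}: total={} attached={}", n.shard_count, n.attached_shard_count);
    }
}
```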

Closes https://github.com/neondatabase/neon/issues/8041

## Summary of changes
Use attached shard count when deciding which node to schedule a fresh
shard on.
Author: Vlad Lazar (2024-06-20 17:32:01 +01:00), committed by GitHub
Commit: f8ac3b0e0e (parent: 02ecdd137b)
3 changed files with 11 additions and 17 deletions


@@ -391,7 +391,7 @@ impl Scheduler {
             return Err(ScheduleError::NoPageservers);
         }
-        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
             .nodes
             .iter()
             .filter_map(|(k, v)| {
@@ -402,6 +402,7 @@ impl Scheduler {
                         *k,
                         context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
                         v.shard_count,
+                        v.attached_shard_count,
                     ))
                 }
             })
@@ -409,9 +410,12 @@ impl Scheduler {
         // Sort by, in order of precedence:
         //  1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
-        //  3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
+        //  2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with
+        //  the least number of attached shards.
+        //  3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes
+        //  with the lower total shard count.
+        //  4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
         if scores.is_empty() {
             // After applying constraints, no pageservers were left.
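Since Rust tuples compare lexicographically, the new sort_by_key expresses exactly this precedence. A small standalone illustration, with plain integers standing in for NodeId and AffinityScore (not the real types):

```rust
fn main() {
    // Tuple layout mirrors the diff: (node id, affinity, total shard count, attached shard count).
    let mut scores: Vec<(u64, u32, usize, usize)> = vec![
        (1, 0, 4, 2), // node 1 already holds two attachments
        (2, 0, 4, 1), // node 2: same affinity and total shard count, fewer attachments
    ];
    // Affinity first, then attached count, then total count, then node id.
    scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
    assert_eq!(scores[0].0, 2); // node 2 receives the next attachment
}
```

This resolves the two-node tie without consulting pageserver utilization, which the Problem section above calls out as the longer-term fix.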


@@ -1632,14 +1632,10 @@ pub(crate) mod tests {
         // We should see equal number of locations on the two nodes.
         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        // Scheduling does not consider the number of attachments picking the initial
-        // pageserver to attach to (hence the assertion that all primaries are on the
-        // same node)
-        // TODO: Tweak the scheduling to evenly distribute attachments for new shards.
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
         // Add another two nodes: we should see the shards spread out when their optimize
         // methods are called