https://github.com/neondatabase/neon.git
storcon: avoid promoting too many shards of the same tenant
@@ -43,7 +43,7 @@ use utils::id::TenantId;
 pub struct ShardNumber(pub u8);
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(u8);
+pub struct ShardCount(pub u8);
 
 /// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
 /// when we need to know which shard we're dealing with, but do not need to know the full
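
Not part of the diff, but some context on the hunk above: the only change is making the ShardCount tuple field public, which is presumably what lets the fill planner later in this commit read the raw count as `tid.shard_count.0`. A minimal standalone sketch (hypothetical names, not the full neon definitions):

    // Newtype over the raw shard count. With a private field, `count.0` is
    // only readable inside the defining module; `pub u8` opens it up to code
    // outside that module.
    #[derive(Clone, Copy, Debug)]
    pub struct ShardCount(pub u8);

    fn main() {
        let count = ShardCount(8);
        println!("raw shard count: {}", count.0);
    }
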
@@ -5394,6 +5394,9 @@ impl Service {
     /// throughout the cluster. We achieve this by picking tenant shards from each node,
     /// starting from the ones with the largest number of attached shards, until the node
     /// reaches the expected cluster average.
+    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
+    /// for the number of shards from the same tenant promoted to the node being filled is:
+    /// shard count for the tenant divided by the number of nodes in the cluster.
     fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
         let mut locked = self.inner.write().unwrap();
         let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
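
To make the bound described in the new doc comment concrete: the cap is the tenant's shard count divided (integer division) by the number of nodes, clamped to at least one. A hedged sketch of just that arithmetic follows; the helper function is mine, while the diff computes the same expression inline into a local called `max_promote_for_tenant`.

    // Upper bound on shards of a single tenant promoted to one node being
    // filled: shard_count / node_count, but never less than one so that small
    // tenants can still be promoted.
    fn max_promote_for_tenant(shard_count: usize, node_count: usize) -> usize {
        std::cmp::max(shard_count / node_count, 1)
    }

    fn main() {
        assert_eq!(max_promote_for_tenant(8, 4), 2); // 8-shard tenant, 4 nodes
        assert_eq!(max_promote_for_tenant(4, 8), 1); // floor would be 0, clamped to 1
        assert_eq!(max_promote_for_tenant(1, 3), 1); // unsharded tenant
    }
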
@@ -5415,8 +5418,18 @@ impl Service {
         let expected_attached = locked.scheduler.expected_attached_shard_count();
         let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
+
+        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
         let mut plan = Vec::new();
 
         for (node_id, attached) in nodes_by_load {
+            let available = locked
+                .nodes
+                .get(&node_id)
+                .map_or(false, |n| n.is_available());
+            if !available {
+                continue;
+            }
+
             if plan.len() >= fill_requirement
                 || tids_by_node.is_empty()
                 || attached <= expected_attached
@@ -5424,13 +5437,20 @@ impl Service {
                 break;
             }
 
-            let can_take = attached - expected_attached;
+            let mut can_take = attached - expected_attached;
             let mut remove_node = false;
-            for _ in 0..can_take {
+            while can_take > 0 {
                 match tids_by_node.get_mut(&node_id) {
                     Some(tids) => match tids.pop() {
                         Some(tid) => {
-                            plan.push(tid);
+                            let max_promote_for_tenant =
+                                std::cmp::max(tid.shard_count.0 as usize / locked.nodes.len(), 1);
+                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
+                            if *promoted < max_promote_for_tenant {
+                                plan.push(tid);
+                                *promoted += 1;
+                                can_take -= 1;
+                            }
                         }
                         None => {
                             remove_node = true;
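
Read together, the new loop only adds a popped shard to the plan while its tenant's counter stays under the cap, and `can_take` only shrinks when a shard is actually planned. The following self-contained sketch mirrors that behaviour with placeholder types (plain integers instead of the storage controller's real IDs), so it is an illustration of the idea rather than the actual implementation:

    use std::collections::HashMap;

    // Simplified stand-in for the fill planning loop. `candidates` holds
    // (tenant_id, shard_count) pairs available on an overloaded node.
    fn plan_fill(
        mut candidates: Vec<(u64, u8)>,
        node_count: usize,
        mut can_take: usize,
    ) -> Vec<(u64, u8)> {
        let mut promoted_per_tenant: HashMap<u64, usize> = HashMap::new();
        let mut plan = Vec::new();
        while can_take > 0 {
            let Some((tenant_id, shard_count)) = candidates.pop() else {
                // Mirrors the `None => remove_node = true` arm: nothing left
                // to take from this node.
                break;
            };
            let cap = std::cmp::max(shard_count as usize / node_count, 1);
            let promoted = promoted_per_tenant.entry(tenant_id).or_default();
            if *promoted < cap {
                plan.push((tenant_id, shard_count));
                *promoted += 1;
                can_take -= 1;
            }
        }
        plan
    }

    fn main() {
        // Eight shards of tenant 1 and one shard of tenant 2 sit on an
        // overloaded node in a three-node cluster. Even with room for five
        // more shards, the plan takes only two shards of tenant 1 (8 / 3 = 2)
        // plus tenant 2's single shard.
        let candidates = vec![(1, 8); 8].into_iter().chain([(2, 1)]).collect();
        let plan = plan_fill(candidates, 3, 5);
        assert_eq!(plan.len(), 3);
    }
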