From 3a74253bcb4f897d4fc902f776ede2dd942e86ef Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 19 Jun 2024 14:12:50 +0100
Subject: [PATCH] storcon: avoid promoting too many shards of the same tenant

---
 libs/pageserver_api/src/shard.rs  |  2 +-
 storage_controller/src/service.rs | 26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 8c5a4e6168..32c1db2046 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -43,7 +43,7 @@ use utils::id::TenantId;
 pub struct ShardNumber(pub u8);
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(u8);
+pub struct ShardCount(pub u8);
 
 /// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
 /// when we need to know which shard we're dealing with, but do not need to know the full
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 8475bf46d2..df5e03da49 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5394,6 +5394,9 @@ impl Service {
     ///    throughout the cluster. We achieve this by picking tenant shards from each node,
     ///    starting from the ones with the largest number of attached shards, until the node
     ///    reaches the expected cluster average.
+    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
+    ///    for the number of shards from the same tenant promoted to the node being filled is:
+    ///    shard count for the tenant divided by the number of nodes in the cluster.
     fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
         let mut locked = self.inner.write().unwrap();
         let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
@@ -5415,8 +5418,18 @@ impl Service {
         let expected_attached = locked.scheduler.expected_attached_shard_count();
         let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
 
+        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
         let mut plan = Vec::new();
+
         for (node_id, attached) in nodes_by_load {
+            let available = locked
+                .nodes
+                .get(&node_id)
+                .map_or(false, |n| n.is_available());
+            if !available {
+                continue;
+            }
+
             if plan.len() >= fill_requirement
                 || tids_by_node.is_empty()
                 || attached <= expected_attached
@@ -5423,13 +5436,20 @@ impl Service {
             {
                 break;
             }
-            let can_take = attached - expected_attached;
+            let mut can_take = attached - expected_attached;
             let mut remove_node = false;
-            for _ in 0..can_take {
+            while can_take > 0 {
                 match tids_by_node.get_mut(&node_id) {
                     Some(tids) => match tids.pop() {
                         Some(tid) => {
-                            plan.push(tid);
+                            let max_promote_for_tenant =
+                                std::cmp::max(tid.shard_count.0 as usize / locked.nodes.len(), 1);
+                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
+                            if *promoted < max_promote_for_tenant {
+                                plan.push(tid);
+                                *promoted += 1;
+                                can_take -= 1;
+                            }
                         }
                         None => {
                             remove_node = true;
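
Reviewer note (not part of the patch): a minimal standalone sketch of the per-tenant promotion cap described in the new doc comment, assuming a hypothetical helper name max_promote_for_tenant; the patch itself computes the same expression inline from tid.shard_count and locked.nodes.len().

// Illustrative sketch only: mirrors std::cmp::max(shard_count / node_count, 1).
fn max_promote_for_tenant(shard_count: usize, node_count: usize) -> usize {
    std::cmp::max(shard_count / node_count, 1)
}

fn main() {
    // A tenant with 8 shards in a 4-node cluster: at most 2 of its shards
    // may be promoted onto the node being filled.
    assert_eq!(max_promote_for_tenant(8, 4), 2);
    // A tenant with 4 shards in an 8-node cluster: the cap bottoms out at 1,
    // so a single shard can still be promoted.
    assert_eq!(max_promote_for_tenant(4, 8), 1);
}

The max(..., 1) floor matters: without it, any tenant whose shard count is smaller than the node count would never have a shard promoted during a fill.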