From 01399621d53aa70edcc8f89976d2ae2fba5723e1 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Fri, 21 Jun 2024 10:19:01 +0100
Subject: [PATCH] storcon: avoid promoting too many shards of the same tenant
 (#8099)

## Problem

The fill planner introduced in https://github.com/neondatabase/neon/pull/8014
selects tenant shards to promote strictly based on attached shard count load
(tenant shards on the nodes with the highest attached shard counts are
considered first). This approach runs the risk of migrating too many shards
belonging to the same tenant onto the same primary node. This is bad for
availability and causes extra reconciles via the storage controller's
background optimisations.

Also see https://github.com/neondatabase/neon/pull/8014#discussion_r1642456241.

## Summary of changes

Refine the fill plan to avoid promoting too many shards belonging to the same
tenant onto the same node. We allow up to `max(1, shard_count / node_count)`
shards belonging to the same tenant to be promoted to the node being filled.
---
 storage_controller/src/service.rs | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 6ed6c16347..792f68cc5a 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5395,6 +5395,9 @@ impl Service {
     /// throughout the cluster. We achieve this by picking tenant shards from each node,
     /// starting from the ones with the largest number of attached shards, until the node
     /// reaches the expected cluster average.
+    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
+    /// for the number of shards of the same tenant promoted to the node being filled is:
+    /// the shard count of the tenant divided by the number of nodes in the cluster.
     fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
         let mut locked = self.inner.write().unwrap();
         let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
@@ -5416,8 +5419,18 @@ impl Service {
         let expected_attached = locked.scheduler.expected_attached_shard_count();
         let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
 
+        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
         let mut plan = Vec::new();
+
         for (node_id, attached) in nodes_by_load {
             let available = locked
                 .nodes
                 .get(&node_id)
                 .map_or(false, |n| n.is_available());
             if !available {
                 continue;
             }
+
             if plan.len() >= fill_requirement
                 || tids_by_node.is_empty()
                 || attached <= expected_attached
             {
                 break;
             }
 
-            let can_take = attached - expected_attached;
+            let mut can_take = attached - expected_attached;
             let mut remove_node = false;
-            for _ in 0..can_take {
+            while can_take > 0 {
                 match tids_by_node.get_mut(&node_id) {
                     Some(tids) => match tids.pop() {
                         Some(tid) => {
-                            plan.push(tid);
+                            let max_promote_for_tenant = std::cmp::max(
+                                tid.shard_count.count() as usize / locked.nodes.len(),
+                                1,
+                            );
+                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
+                            if *promoted < max_promote_for_tenant {
+                                plan.push(tid);
+                                *promoted += 1;
+                                can_take -= 1;
+                            }
                         }
                         None => {
                             remove_node = true;
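
For context, below is a minimal, self-contained sketch of the per-tenant promotion cap described above. The names `ShardCandidate`, `max_promote_for_tenant` (as a free function), and `plan_fill` are hypothetical simplifications invented for illustration; in the patch itself the equivalent logic lives inside `Service::fill_node_plan` and operates on the storage controller's real scheduler and tenant shard types.

```rust
use std::collections::HashMap;

// Hypothetical, simplified stand-ins for the storage controller's types;
// they exist only to make this sketch self-contained and runnable.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct TenantId(u32);

#[derive(Clone, Copy, Debug)]
struct ShardCandidate {
    tenant_id: TenantId,
    shard_count: usize, // total number of shards this tenant has
}

/// Cap on how many shards of one tenant may be promoted to the node being
/// filled: max(1, shard_count / node_count), as in the patch above.
fn max_promote_for_tenant(shard_count: usize, node_count: usize) -> usize {
    std::cmp::max(shard_count / node_count, 1)
}

/// Drain candidates into a fill plan, skipping shards whose tenant has
/// already hit its per-tenant cap or once the fill requirement is met.
fn plan_fill(
    candidates: &[ShardCandidate],
    node_count: usize,
    fill_requirement: usize,
) -> Vec<ShardCandidate> {
    let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
    let mut plan = Vec::new();

    for cand in candidates {
        if plan.len() >= fill_requirement {
            break;
        }
        let cap = max_promote_for_tenant(cand.shard_count, node_count);
        let promoted = promoted_per_tenant.entry(cand.tenant_id).or_default();
        if *promoted < cap {
            plan.push(*cand);
            *promoted += 1;
        }
    }

    plan
}

fn main() {
    // Tenant 0 has 4 shards in a 2-node cluster -> at most 2 of its shards
    // may be promoted here; tenant 1 has 1 shard -> cap of 1.
    let candidates = vec![
        ShardCandidate { tenant_id: TenantId(0), shard_count: 4 },
        ShardCandidate { tenant_id: TenantId(0), shard_count: 4 },
        ShardCandidate { tenant_id: TenantId(0), shard_count: 4 },
        ShardCandidate { tenant_id: TenantId(1), shard_count: 1 },
    ];
    let plan = plan_fill(&candidates, 2, 4);
    assert_eq!(plan.len(), 3); // 2 shards of tenant 0 + 1 shard of tenant 1
    println!("{plan:?}");
}
```

The key property of the `max(1, ...)` bound is that a tenant with fewer shards than the cluster has nodes still gets a cap of one, so a single fill never concentrates a small tenant's shards on one node.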