From 01399621d53aa70edcc8f89976d2ae2fba5723e1 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Fri, 21 Jun 2024 10:19:01 +0100
Subject: [PATCH] storcon: avoid promoting too many shards of the same tenant
 (#8099)

## Problem

The fill planner introduced in https://github.com/neondatabase/neon/pull/8014
selects tenant shards to promote strictly based on attached shard count load
(tenant shards on the nodes with the highest attached shard counts are
considered first). This approach runs the risk of migrating too many shards
belonging to the same tenant onto the same primary node. This is bad for
availability and causes extra reconciles via the storage controller's
background optimisations.

Also see https://github.com/neondatabase/neon/pull/8014#discussion_r1642456241.

## Summary of changes

Refine the fill plan to avoid promoting too many shards belonging to the same
tenant onto the same node. We allow up to `max(1, shard_count / node_count)`
shards belonging to the same tenant to be promoted to the node being filled.
---
 storage_controller/src/service.rs | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 6ed6c16347..792f68cc5a 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5395,6 +5395,9 @@ impl Service {
     /// throughout the cluster. We achieve this by picking tenant shards from each node,
     /// starting from the ones with the largest number of attached shards, until the node
     /// reaches the expected cluster average.
+    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
+    /// for the number of shards of the same tenant promoted to the node being filled is:
+    /// the shard count of the tenant divided by the number of nodes in the cluster.
     fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
         let mut locked = self.inner.write().unwrap();
         let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
@@ -5416,8 +5419,18 @@ impl Service {
         let expected_attached = locked.scheduler.expected_attached_shard_count();
         let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
 
+        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
         let mut plan = Vec::new();
+
         for (node_id, attached) in nodes_by_load {
             let available = locked
                 .nodes
                 .get(&node_id)
                 .map_or(false, |n| n.is_available());
             if !available {
                 continue;
             }
+
             if plan.len() >= fill_requirement
                 || tids_by_node.is_empty()
                 || attached <= expected_attached
             {
                 break;
             }
 
-            let can_take = attached - expected_attached;
+            let mut can_take = attached - expected_attached;
             let mut remove_node = false;
-            for _ in 0..can_take {
+            while can_take > 0 {
                 match tids_by_node.get_mut(&node_id) {
                     Some(tids) => match tids.pop() {
                         Some(tid) => {
-                            plan.push(tid);
+                            let max_promote_for_tenant = std::cmp::max(
+                                tid.shard_count.count() as usize / locked.nodes.len(),
+                                1,
+                            );
+                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
+                            if *promoted < max_promote_for_tenant {
+                                plan.push(tid);
+                                *promoted += 1;
+                                can_take -= 1;
+                            }
                         }
                         None => {
                             remove_node = true;
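
For context, below is a minimal, self-contained sketch of the per-tenant promotion cap described above. The names `ShardCandidate`, `max_promote_for_tenant` (as a free function), and `plan_fill` are hypothetical simplifications invented for illustration; in the patch itself the equivalent logic lives inside `Service::fill_node_plan` and operates on the storage controller's real scheduler and tenant shard types.

```rust
use std::collections::HashMap;

// Hypothetical, simplified stand-ins for the storage controller's types;
// they exist only to make this sketch self-contained and runnable.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct TenantId(u32);

#[derive(Clone, Copy, Debug)]
struct ShardCandidate {
    tenant_id: TenantId,
    shard_count: usize, // total number of shards this tenant has
}

/// Cap on how many shards of one tenant may be promoted to the node being
/// filled: max(1, shard_count / node_count), as in the patch above.
fn max_promote_for_tenant(shard_count: usize, node_count: usize) -> usize {
    std::cmp::max(shard_count / node_count, 1)
}

/// Drain candidates into a fill plan, skipping shards whose tenant has
/// already hit its per-tenant cap or once the fill requirement is met.
fn plan_fill(
    candidates: &[ShardCandidate],
    node_count: usize,
    fill_requirement: usize,
) -> Vec<ShardCandidate> {
    let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
    let mut plan = Vec::new();

    for cand in candidates {
        if plan.len() >= fill_requirement {
            break;
        }
        let cap = max_promote_for_tenant(cand.shard_count, node_count);
        let promoted = promoted_per_tenant.entry(cand.tenant_id).or_default();
        if *promoted < cap {
            plan.push(*cand);
            *promoted += 1;
        }
    }

    plan
}

fn main() {
    // Tenant 0 has 4 shards in a 2-node cluster -> at most 2 of its shards
    // may be promoted here; tenant 1 has 1 shard -> cap of 1.
    let candidates = vec![
        ShardCandidate { tenant_id: TenantId(0), shard_count: 4 },
        ShardCandidate { tenant_id: TenantId(0), shard_count: 4 },
        ShardCandidate { tenant_id: TenantId(0), shard_count: 4 },
        ShardCandidate { tenant_id: TenantId(1), shard_count: 1 },
    ];
    let plan = plan_fill(&candidates, 2, 4);
    assert_eq!(plan.len(), 3); // 2 shards of tenant 0 + 1 shard of tenant 1
    println!("{plan:?}");
}
```

The key property of the `max(1, ...)` bound is that a tenant with fewer shards than the cluster has nodes still gets a cap of one, so a single fill never concentrates a small tenant's shards on one node.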