From 3a74253bcb4f897d4fc902f776ede2dd942e86ef Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 19 Jun 2024 14:12:50 +0100
Subject: [PATCH] storcon: avoid promoting too many shards of the same tenant

---
 libs/pageserver_api/src/shard.rs  |  2 +-
 storage_controller/src/service.rs | 26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 8c5a4e6168..32c1db2046 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -43,7 +43,7 @@ use utils::id::TenantId;
 pub struct ShardNumber(pub u8);
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(u8);
+pub struct ShardCount(pub u8);
 
 /// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
 /// when we need to know which shard we're dealing with, but do not need to know the full
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 8475bf46d2..df5e03da49 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5394,6 +5394,9 @@ impl Service {
     ///    throughout the cluster. We achieve this by picking tenant shards from each node,
     ///    starting from the ones with the largest number of attached shards, until the node
     ///    reaches the expected cluster average.
+    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
+    ///    for the number of shards from the same tenant promoted to the node being filled is:
+    ///    shard count for the tenant divided by the number of nodes in the cluster.
     fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
         let mut locked = self.inner.write().unwrap();
         let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
@@ -5415,8 +5418,18 @@ impl Service {
         let expected_attached = locked.scheduler.expected_attached_shard_count();
         let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
 
+        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
         let mut plan = Vec::new();
+
         for (node_id, attached) in nodes_by_load {
+            let available = locked
+                .nodes
+                .get(&node_id)
+                .map_or(false, |n| n.is_available());
+            if !available {
+                continue;
+            }
+
             if plan.len() >= fill_requirement
                 || tids_by_node.is_empty()
                 || attached <= expected_attached
@@ -5423,13 +5436,20 @@ impl Service {
             {
                 break;
             }
-            let can_take = attached - expected_attached;
+            let mut can_take = attached - expected_attached;
             let mut remove_node = false;
-            for _ in 0..can_take {
+            while can_take > 0 {
                 match tids_by_node.get_mut(&node_id) {
                     Some(tids) => match tids.pop() {
                         Some(tid) => {
-                            plan.push(tid);
+                            let max_promote_for_tenant =
+                                std::cmp::max(tid.shard_count.0 as usize / locked.nodes.len(), 1);
+                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
+                            if *promoted < max_promote_for_tenant {
+                                plan.push(tid);
+                                *promoted += 1;
+                                can_take -= 1;
+                            }
                         }
                         None => {
                             remove_node = true;
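
Reviewer note (not part of the patch): a minimal standalone sketch of the per-tenant promotion cap described in the new doc comment, assuming a hypothetical helper name max_promote_for_tenant; the patch itself computes the same expression inline from tid.shard_count and locked.nodes.len().

// Illustrative sketch only: mirrors std::cmp::max(shard_count / node_count, 1).
fn max_promote_for_tenant(shard_count: usize, node_count: usize) -> usize {
    std::cmp::max(shard_count / node_count, 1)
}

fn main() {
    // A tenant with 8 shards in a 4-node cluster: at most 2 of its shards
    // may be promoted onto the node being filled.
    assert_eq!(max_promote_for_tenant(8, 4), 2);
    // A tenant with 4 shards in an 8-node cluster: the cap bottoms out at 1,
    // so a single shard can still be promoted.
    assert_eq!(max_promote_for_tenant(4, 8), 1);
}

The max(..., 1) floor matters: without it, any tenant whose shard count is smaller than the node count would never have a shard promoted during a fill.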