From 58340f9dbf0adb57c9d4dd3e719702d31f586be1 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Tue, 11 Jun 2024 15:00:36 +0100
Subject: [PATCH] storcon: add node fill algorithm

---
 storage_controller/src/scheduler.rs |  39 +++++++
 storage_controller/src/service.rs   | 153 ++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+)

diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 4ab85509dc..0bd2eeac35 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,4 +1,5 @@
 use crate::{node::Node, tenant_shard::TenantShard};
+use itertools::Itertools;
 use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -283,6 +284,44 @@ impl Scheduler {
         }
     }

+    // Compute how far the number of shards attached to the given node lags behind
+    // the cluster average. A non-zero result means the node should be filled.
+    pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
+        let Some(node) = self.nodes.get(&node_id) else {
+            debug_assert!(false);
+            tracing::error!("Scheduler missing node {node_id}");
+            return 0;
+        };
+        assert!(!self.nodes.is_empty());
+        let expected_attached_shards_per_node = self.expected_attached_shard_count();
+
+        for (node_id, node) in self.nodes.iter() {
+            tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
+        }
+
+        if node.attached_shard_count < expected_attached_shards_per_node {
+            expected_attached_shards_per_node - node.attached_shard_count
+        } else {
+            0
+        }
+    }
+
+    pub(crate) fn expected_attached_shard_count(&self) -> usize {
+        let total_attached_shards: usize =
+            self.nodes.values().map(|n| n.attached_shard_count).sum();
+
+        assert!(!self.nodes.is_empty());
+        total_attached_shards / self.nodes.len()
+    }
+
+    pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
+        self.nodes
+            .iter()
+            .map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
+            .sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
+            .collect()
+    }
+
     pub(crate) fn node_upsert(&mut self, node: &Node) {
         use std::collections::hash_map::Entry::*;
         match self.nodes.entry(node.get_id()) {
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index f74017c172..f65b17ace0 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5092,4 +5092,157 @@ impl Service {

         Ok(())
     }
+
+    pub(crate) async fn fill_node(
+        &self,
+        node_id: NodeId,
+        cancel: CancellationToken,
+    ) -> Result<(), OperationError> {
+        // TODO(vlad): Currently this operates on the assumption that all
+        // secondaries are warm. This is not always true (e.g. we just migrated the
+        // tenant). Take that into consideration by checking the secondary status.
+
+        tracing::info!(%node_id, "Starting fill background operation");
+
+        // Create a fill plan (pick secondaries to promote) that meets the following requirements:
+        // 1. The node should be filled until it reaches the expected cluster average of
+        //    attached shards. If there are not enough secondaries on the node, the plan stops early.
+        // 2. Select tenant shards to promote such that the number of attached shards is balanced
+        //    throughout the cluster. We achieve this by picking tenant shards from each node,
+        //    starting from the ones with the largest number of attached shards, until the node
+        //    being filled reaches the expected cluster average.
+        let mut tids_to_promote = {
+            let mut locked = self.inner.write().unwrap();
+            let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
+
+            let mut tids_by_node = locked
+                .tenants
+                .iter_mut()
+                .filter_map(|(tid, tenant_shard)| {
+                    if tenant_shard.intent.get_secondary().contains(&node_id) {
+                        if let Some(primary) = tenant_shard.intent.get_attached() {
+                            return Some((*primary, *tid));
+                        }
+                    }
+
+                    None
+                })
+                .into_group_map();
+
+            let expected_attached = locked.scheduler.expected_attached_shard_count();
+            let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
+
+            let mut plan = Vec::new();
+            for (node_id, attached) in nodes_by_load {
+                if plan.len() >= fill_requirement
+                    || tids_by_node.is_empty()
+                    || attached <= expected_attached
+                {
+                    break;
+                }
+
+                let can_take = attached - expected_attached;
+                let mut remove_node = false;
+                for _ in 0..can_take {
+                    match tids_by_node.get_mut(&node_id) {
+                        Some(tids) => match tids.pop() {
+                            Some(tid) => {
+                                plan.push(tid);
+                            }
+                            None => {
+                                remove_node = true;
+                                break;
+                            }
+                        },
+                        None => {
+                            break;
+                        }
+                    }
+                }
+
+                if remove_node {
+                    tids_by_node.remove(&node_id);
+                }
+            }
+
+            plan
+        };
+
+        let mut waiters = Vec::new();
+        let mut schedule_context = ScheduleContext::default();
+
+        // Execute the plan we've composed above. Before applying each move from the plan,
+        // we validate that it has not gone stale in the meantime.
+        while !tids_to_promote.is_empty() {
+            if cancel.is_cancelled() {
+                return Err(OperationError::Cancelled);
+            }
+
+            {
+                let mut locked = self.inner.write().unwrap();
+                let (nodes, tenants, scheduler) = locked.parts_mut();
+
+                let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
+                    format!("node {node_id} was removed").into(),
+                ))?;
+
+                let current_policy = node.get_scheduling();
+                if !matches!(current_policy, NodeSchedulingPolicy::Filling) {
+                    // TODO(vlad): maybe cancel pending reconciles before erroring out. Need to
+                    // think about it.
+                    return Err(OperationError::NodeStateChanged(
+                        format!("node {node_id} changed state to {current_policy:?}").into(),
+                    ));
+                }
+
+                while waiters.len() < MAX_RECONCILES_PER_OPERATION {
+                    if let Some(tid) = tids_to_promote.pop() {
+                        if let Some(tenant_shard) = tenants.get_mut(&tid) {
+                            // If the node being filled is not a secondary anymore,
+                            // skip the promotion.
+                            if !tenant_shard.intent.get_secondary().contains(&node_id) {
+                                continue;
+                            }
+
+                            tenant_shard.intent.promote_attached(scheduler, node_id);
+                            match tenant_shard.schedule(scheduler, &mut schedule_context) {
+                                Err(e) => {
+                                    tracing::warn!(%tid, "Scheduling error when filling pageserver {} : {e}", node_id);
+                                }
+                                Ok(()) => {
+                                    if let Some(waiter) =
+                                        self.maybe_reconcile_shard(tenant_shard, nodes)
+                                    {
+                                        waiters.push(waiter);
+                                    }
+                                }
+                            }
+                        }
+                    } else {
+                        break;
+                    }
+                }
+            }
+
+            waiters = self.kick_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await;
+        }
+
+        while !waiters.is_empty() {
+            waiters = self.kick_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await;
+        }
+
+        if let Err(err) = self
+            .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+            .await
+        {
+            // This isn't a huge issue since the filling process starts upon request. However, it
+            // will prevent the next drain from starting. The only case in which this can fail
+            // is database unavailability. Such a case will require manual intervention.
+            tracing::error!(%node_id, "Failed to finalise fill by setting scheduling policy: {err}");
+        }
+
+        tracing::info!(%node_id, "Completed fill background operation");
+
+        Ok(())
+    }
 }
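
For reference, the fill-plan construction above can be read in isolation as the following self-contained sketch. The names and types here (NodeId as a bare u64, ShardId, expected_attached, build_fill_plan) are simplified stand-ins invented for illustration, not the storage controller's real API, and locking, secondary warmth checks, reconciliation and cancellation are omitted.

use std::collections::HashMap;

// Simplified stand-ins for the controller's real identifier types.
type NodeId = u64;
type ShardId = u64;

// Cluster-wide average of attached shards per node (integer division, as in
// the scheduler helper above).
fn expected_attached(attached_per_node: &HashMap<NodeId, usize>) -> usize {
    assert!(!attached_per_node.is_empty());
    attached_per_node.values().sum::<usize>() / attached_per_node.len()
}

// Pick shards to promote onto `fill_node`: walk donor nodes from most to least
// loaded and take candidates until either the fill requirement is met or the
// donor would drop below the cluster average.
fn build_fill_plan(
    fill_node: NodeId,
    attached_per_node: &HashMap<NodeId, usize>,
    // Shards with a secondary on `fill_node`, keyed by the node currently
    // holding their attached location.
    mut candidates_by_donor: HashMap<NodeId, Vec<ShardId>>,
) -> Vec<ShardId> {
    let expected = expected_attached(attached_per_node);
    let fill_requirement = attached_per_node
        .get(&fill_node)
        .map(|attached| expected.saturating_sub(*attached))
        .unwrap_or(0);

    // Donor nodes ordered by attached shard count, most loaded first.
    let mut donors: Vec<(NodeId, usize)> = attached_per_node
        .iter()
        .filter(|(id, _)| **id != fill_node)
        .map(|(id, attached)| (*id, *attached))
        .collect();
    donors.sort_by(|lhs, rhs| rhs.1.cmp(&lhs.1));

    let mut plan = Vec::new();
    for (donor, attached) in donors {
        if plan.len() >= fill_requirement || attached <= expected {
            break;
        }

        // Take at most enough shards to bring this donor down to the average.
        let can_take = attached - expected;
        if let Some(candidates) = candidates_by_donor.get_mut(&donor) {
            for _ in 0..can_take {
                match candidates.pop() {
                    Some(shard) => plan.push(shard),
                    None => break,
                }
            }
        }
    }

    plan
}

fn main() {
    // Node 3 is empty while nodes 1 and 2 hold all attached shards.
    let attached = HashMap::from([(1, 6), (2, 3), (3, 0)]);
    let candidates = HashMap::from([(1, vec![10, 11, 12, 13]), (2, vec![20])]);

    // Average is 3, so three promotions are planned, all from node 1.
    println!("{:?}", build_fill_plan(3, &attached, candidates));
}

As in the patch, the cluster average uses integer division and each donor is only drained down to that average, so a node without enough promotable secondaries ends up filled less than requested. For example, with nodes holding 6, 3 and 0 attached shards, the average is 3 and the empty node is assigned three promotions, all taken from the most loaded donor.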