controller: use PageserverUtilization for scheduling (#8711)

## Problem

Previously, the controller used only shard counts for scheduling. That
works well when hosting only many-sharded tenants, but much less well
when hosting single-sharded tenants, whose size per shard varies far
more widely.

Closes: https://github.com/neondatabase/neon/issues/7798

## Summary of changes

- Instead of UtilizationScore, carry the full PageserverUtilization
through to the Scheduler.
- Use PageserverUtilization::score() rather than the shard count when
ordering nodes during scheduling (see the sketch below).
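
A rough sketch of what that ordering means in practice, assuming simplified stand-in types (NodeId, PageserverUtilization, the score() weighting, and pick_node here are illustrative, not the controller's real definitions):

```rust
// Illustrative only: order candidate nodes by a utilization score instead of
// by attached shard count alone. The real PageserverUtilization carries more
// detail; the weighting below is a made-up example.

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct NodeId(u64);

#[derive(Clone, Debug)]
struct PageserverUtilization {
    disk_usage_bytes: u64,
    disk_capacity_bytes: u64,
    shard_count: u32,
    max_shard_count: u32,
}

impl PageserverUtilization {
    /// Collapse utilization into a single sortable number; lower is better.
    fn score(&self) -> u64 {
        // Express disk fullness and shard headroom in parts-per-million and
        // take whichever dimension is more constrained.
        let disk_ppm = self.disk_usage_bytes * 1_000_000 / self.disk_capacity_bytes.max(1);
        let shard_ppm = self.shard_count as u64 * 1_000_000 / self.max_shard_count.max(1) as u64;
        disk_ppm.max(shard_ppm)
    }
}

/// Pick the least-utilized candidate, analogous to ordering nodes by
/// PageserverUtilization::score() rather than by shard count.
fn pick_node(candidates: &[(NodeId, PageserverUtilization)]) -> Option<NodeId> {
    candidates
        .iter()
        .min_by_key(|(_, util)| util.score())
        .map(|(id, _)| *id)
}

fn main() {
    let nodes = vec![
        (
            NodeId(1),
            PageserverUtilization {
                disk_usage_bytes: 900,
                disk_capacity_bytes: 1000,
                shard_count: 3,
                max_shard_count: 100,
            },
        ),
        (
            NodeId(2),
            PageserverUtilization {
                disk_usage_bytes: 100,
                disk_capacity_bytes: 1000,
                shard_count: 10,
                max_shard_count: 100,
            },
        ),
    ];
    // By shard count alone, node 1 would win; by utilization score, node 2
    // wins because it has far more disk headroom despite holding more shards.
    assert_eq!(pick_node(&nodes), Some(NodeId(2)));
}
```

The exact weighting inside the real PageserverUtilization::score() is beside the point here; what matters is that scheduling now sees disk pressure as well as shard count.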

Q: Why did test_sharding_split_smoke need updating in this PR?
A: There's an interesting side effect during shard splits: because we do
not decrement the shard count in the utilization when we de-schedule the
pre-split shards, the controller now prefers to pick _different_ nodes
for shards than the ones that held secondaries before the split. We
could use our knowledge of splitting to fix up the utilizations more
actively in this situation, but I'm leaning toward keeping the code
simpler, since in practical systems the impact of one shard on a node's
utilization should be fairly low (single-digit percent).
Author: John Spray
Date: 2024-08-23 18:32:56 +01:00 (committed via GitHub)
Commit: b65a95f12e (parent c1cb7a0fa0)
11 changed files with 340 additions and 101 deletions


```diff
@@ -92,15 +92,15 @@ impl Node {
         }
     }
-    pub(crate) fn get_availability(&self) -> NodeAvailability {
-        self.availability
+    pub(crate) fn get_availability(&self) -> &NodeAvailability {
+        &self.availability
     }
     pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
         use AvailabilityTransition::*;
         use NodeAvailability::WarmingUp;
-        match self.get_availability_transition(availability) {
+        match self.get_availability_transition(&availability) {
             ToActive => {
                 // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
                 // users of previously-cloned copies of the node will still see the old cancellation
@@ -115,8 +115,8 @@ impl Node {
             Unchanged | ToWarmingUpFromOffline => {}
         }
-        if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
-            self.availability = WarmingUp(std::cmp::max(crnt, proposed));
+        if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) {
+            self.availability = WarmingUp(std::cmp::max(*crnt, *proposed));
         } else {
             self.availability = availability;
         }
@@ -126,12 +126,12 @@ impl Node {
     /// into a description of the transition.
     pub(crate) fn get_availability_transition(
         &self,
-        availability: NodeAvailability,
+        availability: &NodeAvailability,
     ) -> AvailabilityTransition {
         use AvailabilityTransition::*;
         use NodeAvailability::*;
-        match (self.availability, availability) {
+        match (&self.availability, availability) {
             (Offline, Active(_)) => ToActive,
             (Active(_), Offline) => ToOffline,
             (Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
@@ -153,15 +153,15 @@ impl Node {
     /// Is this node elegible to have work scheduled onto it?
     pub(crate) fn may_schedule(&self) -> MaySchedule {
-        let score = match self.availability {
-            NodeAvailability::Active(score) => score,
+        let utilization = match &self.availability {
+            NodeAvailability::Active(u) => u.clone(),
             NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
         };
         match self.scheduling {
-            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
             NodeSchedulingPolicy::Draining => MaySchedule::No,
-            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
             NodeSchedulingPolicy::Pause => MaySchedule::No,
             NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
         }
@@ -285,7 +285,7 @@ impl Node {
     pub(crate) fn describe(&self) -> NodeDescribeResponse {
         NodeDescribeResponse {
             id: self.id,
-            availability: self.availability.into(),
+            availability: self.availability.clone().into(),
             scheduling: self.scheduling,
             listen_http_addr: self.listen_http_addr.clone(),
             listen_http_port: self.listen_http_port,
```
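
The signature changes above follow one theme: the old UtilizationScore was a small Copy value, while PageserverUtilization is a larger struct that is not Copy, so accessors return references and call sites clone explicitly where they need ownership. A minimal sketch of that pattern, using simplified stand-in types rather than the controller's real definitions:

```rust
// Illustrative only: when the payload inside NodeAvailability stops being
// Copy, getters hand out references and callers clone where they need an
// owned value (mirroring the `u.clone()` and `.clone().into()` edits above).

use std::time::Instant;

#[derive(Clone, Debug)]
struct PageserverUtilization {
    disk_usage_bytes: u64,
    shard_count: u32,
}

#[derive(Clone, Debug)]
enum NodeAvailability {
    Active(PageserverUtilization),
    WarmingUp(Instant),
    Offline,
}

enum MaySchedule {
    Yes(PageserverUtilization),
    No,
}

struct Node {
    availability: NodeAvailability,
}

impl Node {
    // Returning &NodeAvailability avoids cloning the utilization on every read.
    fn get_availability(&self) -> &NodeAvailability {
        &self.availability
    }

    // A caller that needs to keep the utilization beyond the borrow clones it.
    fn may_schedule(&self) -> MaySchedule {
        match self.get_availability() {
            NodeAvailability::Active(u) => MaySchedule::Yes(u.clone()),
            NodeAvailability::WarmingUp(_) | NodeAvailability::Offline => MaySchedule::No,
        }
    }
}

fn main() {
    let node = Node {
        availability: NodeAvailability::Active(PageserverUtilization {
            disk_usage_bytes: 1 << 30,
            shard_count: 4,
        }),
    };
    match node.may_schedule() {
        MaySchedule::Yes(u) => println!("schedulable: {} shards attached", u.shard_count),
        MaySchedule::No => println!("not schedulable"),
    }
}
```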