From 8270b58f3925340a2a32b008d559902aa0457de5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 14 Jun 2024 11:28:11 +0100 Subject: [PATCH] storcon: handle reattach and heartbeat race Consider the case when the storage controller handles the re-attach of a node before the heartbeats detect that the node is back online. We still need to reconfigure the node (by calling `Service::node_configure`) to migrate attachments back onto the node. In order to determine if node reconfiguration is required, we call into `Node::get_availability_transition`. This commit updates the function to consider the transition from "node just re-attached" (with no utilisation score) to "node responded to the first heartbeat after a period of unavailability" (with some utilisation score). --- storage_controller/src/node.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 7b5513c908..f7a034bc8b 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, + TenantLocateResponseShard, UtilizationScore, }, shard::TenantShardId, }; @@ -116,6 +116,15 @@ impl Node { match (self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, + // Consider the case when the storage controller handles the re-attach of a node + // before the heartbeats detect that the node is back online. We still need + // [`Service::node_configure`] to migrate attachments back onto the node. + // The unsavoury match arm below handles this situation. + (Active(lhs), Active(rhs)) + if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => + { + ToActive + } _ => Unchanged, } }