From 677c1662a42ce2a73bcb7f76879d04d9a8548341 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 14 Jun 2024 11:32:04 +0100 Subject: [PATCH] storcon: do not detach tenants when all nodes are unavailable Previously, when all nodes in the cluster became unavailable at the same time, we would detach all tenant shards. This is due to a bug in `Service::node_configure`. If all nodes are unavailable, there's no chance of rescheduling anything, so we should leave the intent states untouched. This commit adds a special case which detects this situation and skips any rescheduling. --- storage_controller/src/service.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 1e81b5c5a2..ff9c1434f3 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4312,6 +4312,13 @@ impl Service { continue; } + if !new_nodes.values().any(Node::is_available) { + // Special case for when all nodes are unavailable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. + continue; + } + if tenant_shard.intent.demote_attached(scheduler, node_id) { tenant_shard.sequence = tenant_shard.sequence.next();