storcon: handle entire cluster going unavailable correctly (#8060)

## Problem

A period of unavailability for all pageservers in a cluster produced the following fallout in staging: all tenants became detached and required manual intervention to re-attach. Manually restarting the storage controller re-attached all tenants due to a consistency bug.

It turns out there are two related bugs which caused the issue:

1. Pageserver re-attach can be processed before the first heartbeat. Hence, when handling the availability delta produced by the heartbeater, `Node::get_availability_transition` claims that there's no need to reconfigure the node.
2. We would still attempt to reschedule tenant shards when handling offline transitions even if the entire cluster is down. This puts tenant shards into a state where the reconciler believes they have to be detached (no pageserver shows up in their intent state). This is doubly wrong because we don't mark the tenant shards as detached in the database, thus causing memory vs database consistency issues. Luckily, this bug allowed all tenant shards to re-attach after restart.

## Summary of changes

* For (1), abuse the fact that re-attach requests do not contain a utilisation score and use that to differentiate from a node that replied to heartbeats.
* For (2), introduce a special case that skips any rescheduling if the entire cluster is unavailable.
* Update the storage controller heartbeat test with an extra scenario where the entire cluster goes for lunch.

Fixes https://github.com/neondatabase/neon/issues/8044
2026-01-06 04:52:55 +00:00 · 2024-06-17 11:40:35 +01:00
parent 2ba414525e
commit 16d80128ee
4 changed files with 141 additions and 57 deletions
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -31,6 +31,7 @@ pub(crate) enum PageserverState {
    Available {
        last_seen_at: Instant,
        utilization: PageserverUtilization,
+        new: bool,
    },
    Offline,
 }
@@ -127,6 +128,7 @@ impl HeartbeaterTask {
            heartbeat_futs.push({
                let jwt_token = self.jwt_token.clone();
                let cancel = self.cancel.clone();
+                let new_node = !self.state.contains_key(node_id);

                // Clone the node and mark it as available such that the request
                // goes through to the pageserver even when the node is marked offline.
@@ -159,6 +161,7 @@ impl HeartbeaterTask {
                        PageserverState::Available {
                            last_seen_at: Instant::now(),
                            utilization,
+                            new: new_node,
                        }
                    } else {
                        PageserverState::Offline
@@ -220,6 +223,7 @@ impl HeartbeaterTask {
                    }
                },
                Vacant(_) => {
+                    // This is a new node (no previous state recorded for it): always emit a
+                    // delta carrying its initial state.
                    deltas.push((node_id, ps_state.clone()));
                }
            }
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration};
 use pageserver_api::{
    controller_api::{
        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
-        TenantLocateResponseShard,
+        TenantLocateResponseShard, UtilizationScore,
    },
    shard::TenantShardId,
 };
@@ -116,6 +116,16 @@ impl Node {
        match (self.availability, availability) {
            (Offline, Active(_)) => ToActive,
            (Active(_), Offline) => ToOffline,
+            // Consider the case when the storage controller handles the re-attach of a node
+            // before the heartbeats detect that the node is back online. We still need
+            // [`Service::node_configure`] to attempt reconciliations for shards with an
+            // unknown observed location.
+            // The unsavoury match arm below handles this situation.
+            (Active(lhs), Active(rhs))
+                if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() =>
+            {
+                ToActive
+            }
            _ => Unchanged,
        }
    }
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -12,7 +12,7 @@ use crate::{
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
-    scheduler::{ScheduleContext, ScheduleMode},
+    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
        MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction,
    },
@@ -747,29 +747,61 @@ impl Service {
            let res = self.heartbeater.heartbeat(nodes).await;
            if let Ok(deltas) = res {
                for (node_id, state) in deltas.0 {
-                    let new_availability = match state {
-                        PageserverState::Available { utilization, .. } => NodeAvailability::Active(
-                            UtilizationScore(utilization.utilization_score),
+                    let (new_node, new_availability) = match state {
+                        PageserverState::Available {
+                            utilization, new, ..
+                        } => (
+                            new,
+                            NodeAvailability::Active(UtilizationScore(
+                                utilization.utilization_score,
+                            )),
                        ),
-                        PageserverState::Offline => NodeAvailability::Offline,
+                        PageserverState::Offline => (false, NodeAvailability::Offline),
                    };
-                    let res = self
-                        .node_configure(node_id, Some(new_availability), None)
-                        .await;

-                    match res {
-                        Ok(()) => {}
-                        Err(ApiError::NotFound(_)) => {
-                            // This should be rare, but legitimate since the heartbeats are done
-                            // on a snapshot of the nodes.
-                            tracing::info!("Node {} was not found after heartbeat round", node_id);
+                    if new_node {
+                        // When the heartbeats detect a newly added node, we don't wish
+                        // to attempt to reconcile the shards assigned to it. The node
+                        // is likely handling its re-attach response, so reconciling now
+                        // would be counterproductive.
+                        //
+                        // Instead, update the in-memory state with the details learned about the
+                        // node.
+                        let mut locked = self.inner.write().unwrap();
+                        let (nodes, _tenants, scheduler) = locked.parts_mut();
+
+                        let mut new_nodes = (**nodes).clone();
+
+                        if let Some(node) = new_nodes.get_mut(&node_id) {
+                            node.set_availability(new_availability);
+                            scheduler.node_upsert(node);
                        }
-                        Err(err) => {
-                            tracing::error!(
-                                "Failed to update node {} after heartbeat round: {}",
-                                node_id,
-                                err
-                            );
+
+                        locked.nodes = Arc::new(new_nodes);
+                    } else {
+                        // This is the code path for genuine availability transitions (i.e. node
+                        // goes unavailable and/or comes back online).
+                        let res = self
+                            .node_configure(node_id, Some(new_availability), None)
+                            .await;
+
+                        match res {
+                            Ok(()) => {}
+                            Err(ApiError::NotFound(_)) => {
+                                // This should be rare, but legitimate since the heartbeats are done
+                                // on a snapshot of the nodes.
+                                tracing::info!(
+                                    "Node {} was not found after heartbeat round",
+                                    node_id
+                                );
+                            }
+                            Err(err) => {
+                                tracing::error!(
+                                    "Failed to update node {} after heartbeat round: {}",
+                                    node_id,
+                                    err
+                                );
+                            }
                        }
                    }
                }
@@ -4316,6 +4348,16 @@ impl Service {
                        continue;
                    }

+                    if !new_nodes
+                        .values()
+                        .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_)))
+                    {
+                        // Special case for when all nodes are unavailable and/or unschedulable: there is no point
+                        // trying to reschedule since there's nowhere else to go. Without this
+                        // branch we incorrectly detach tenants in response to node unavailability.
+                        continue;
+                    }
+
                    if tenant_shard.intent.demote_attached(scheduler, node_id) {
                        tenant_shard.sequence = tenant_shard.sequence.next();

@@ -4353,6 +4395,12 @@ impl Service {
                // When a node comes back online, we must reconcile any tenant that has a None observed
                // location on the node.
                for tenant_shard in locked.tenants.values_mut() {
+                    // If a reconciliation is already in progress, rely on the previous scheduling
+                    // decision and skip triggering a new reconciliation.
+                    if tenant_shard.reconciler.is_some() {
+                        continue;
+                    }
+
                    if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
                        if observed_loc.conf.is_none() {
                            self.maybe_reconcile_shard(tenant_shard, &new_nodes);