storage_controller: fix node flap detach race (#10298)

## Problem

Removal of entries from the observed state may race with the inline
observed-state updates performed by `Service::node_activate_reconcile`.

This was intended to work as follows (the sketch after the list makes the
race concrete):
1. Detaches issued while the node is unavailable remove the entry from
   the observed state.
2. `Service::node_activate_reconcile` diffs the locations reported by
   the pageserver against the observed state and detaches inline
   when required.
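
Below is a minimal, self-contained sketch of the race with illustrative types (a `HashMap` behind a `Mutex` stands in for the shard's observed state; the real controller code is more involved):

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::thread;

// Illustrative stand-in for a shard's observed state: node id -> location.
type Observed = Arc<Mutex<HashMap<u64, &'static str>>>;

fn main() {
    let observed: Observed = Arc::new(Mutex::new(HashMap::from([(1, "attached")])));

    // Step (1): a detach while node 1 is unavailable removes the entry.
    let detach = {
        let observed = Arc::clone(&observed);
        thread::spawn(move || {
            observed.lock().unwrap().remove(&1);
        })
    };

    // Step (2): node 1 reactivates and the inline update writes back whatever
    // the pageserver reported, possibly resurrecting the entry removed above.
    let activate = {
        let observed = Arc::clone(&observed);
        thread::spawn(move || {
            observed.lock().unwrap().insert(1, "attached");
        })
    };

    detach.join().unwrap();
    activate.join().unwrap();

    // Depending on which thread ran last, node 1 may still look attached:
    // the outcome of the detach is not deterministic.
    println!("{:?}", observed.lock().unwrap().get(&1));
}
```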

## Summary of changes

This PR removes step (1) and lets background reconciliation resolve the
resulting mismatch between the intent and observed states (a rough sketch
of that check follows below). A follow-up will attempt to remove
`Service::node_activate_reconcile` altogether.
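
Sketch of the mismatch check, with illustrative enums rather than the storage controller's actual types:

```rust
// Illustrative types; the real intent/observed states model locations in
// far more detail.
#[derive(Debug, PartialEq)]
enum Intent {
    Attached,
    Detached,
}

#[derive(Debug, PartialEq)]
enum Observed {
    Attached,
    Detached,
}

/// With step (1) removed, detaching via an unavailable node leaves the
/// observed entry in place, so intent != observed until a background
/// reconciliation performs the real detach once the node is back.
fn needs_reconcile(intent: &Intent, observed: Option<&Observed>) -> bool {
    match (intent, observed) {
        (Intent::Detached, Some(Observed::Attached)) => true,
        (Intent::Attached, Some(Observed::Detached)) | (Intent::Attached, None) => true,
        _ => false,
    }
}

fn main() {
    // The node-flap case: intent says detached, but the stale observed entry
    // still says attached, so a background pass must reconcile.
    assert!(needs_reconcile(&Intent::Detached, Some(&Observed::Attached)));
    assert!(!needs_reconcile(&Intent::Detached, Some(&Observed::Detached)));
}
```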

Closes https://github.com/neondatabase/neon/issues/10253


@@ -14,7 +14,6 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
 use utils::backoff::exponential_backoff;
-use utils::failpoint_support;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 use utils::lsn::Lsn;
@@ -212,11 +211,12 @@ impl Reconciler {
         lazy: bool,
     ) -> Result<(), ReconcileError> {
         if !node.is_available() && config.mode == LocationConfigMode::Detached {
-            // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
-            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
-            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
-            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
-            self.observed.locations.remove(&node.get_id());
+            // [`crate::service::Service::node_activate_reconcile`] will update the observed state
+            // when the node comes back online. At that point, the intent and observed states will
+            // be mismatched and a background reconciliation will detach.
+            tracing::info!(
+                "Node {node} is unavailable during detach: proceeding anyway, it will be detached via background reconciliation"
+            );
             return Ok(());
         }
@@ -749,6 +749,8 @@ impl Reconciler {
         };
 
         if increment_generation {
+            pausable_failpoint!("reconciler-pre-increment-generation");
+
             let generation = self
                 .persistence
                 .increment_generation(self.tenant_shard_id, node.get_id())
@@ -824,7 +826,7 @@ impl Reconciler {
                 .handle_detach(self.tenant_shard_id, self.shard.stripe_size);
         }
 
-        failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
+        pausable_failpoint!("reconciler-epilogue");
 
         Ok(())
     }
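
Both sleep-based failpoints are replaced by pausable ones here. Assuming these macros build on the standard `fail` crate (fail-rs), as the `pausable_failpoint` import from `utils` in the next file suggests, a test could park and release the reconciler roughly like this (a sketch, not the repository's actual test harness):

```rust
// Sketch only: requires the `fail` crate with its `failpoints` feature
// enabled; otherwise the failpoints compile to no-ops.

// A "pause" action blocks whatever hits the failpoint until the
// failpoint is reconfigured.
fn pause_before_generation_increment() {
    fail::cfg("reconciler-pre-increment-generation", "pause").unwrap();
}

// Setting the action to "off" releases anything parked at the failpoint.
fn resume_reconciler() {
    fail::cfg("reconciler-pre-increment-generation", "off").unwrap();
}
```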


@@ -83,6 +83,7 @@ use utils::{
     generation::Generation,
     http::error::ApiError,
     id::{NodeId, TenantId, TimelineId},
+    pausable_failpoint,
     sync::gate::Gate,
 };
@@ -1024,6 +1025,8 @@ impl Service {
             )
             .await;
 
+            pausable_failpoint!("heartbeat-pre-node-state-configure");
+
             // This is the code path for genuine availability transitions (i.e. node
             // goes unavailable and/or comes back online).
             let res = self
@@ -2492,6 +2495,7 @@ impl Service {
 
         // Persist updates
         // Ordering: write to the database before applying changes in-memory, so that
         // we will not appear to time-travel backwards on a restart.
+        let mut schedule_context = ScheduleContext::default();
         for ShardUpdate {
             tenant_shard_id,
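
The ordering comment in the last hunk states a general invariant worth spelling out: persist first, then mutate memory, so a restart never observes in-memory state ahead of the database. A small sketch of the pattern with stand-in types (not the real persistence layer):

```rust
// Stand-ins for illustration; the real `Persistence` is backed by a database.
struct Persistence;

impl Persistence {
    fn write_update(&self, _shard: u32, _generation: u64) -> Result<(), String> {
        Ok(()) // imagine a durable database write here
    }
}

struct InMemoryState {
    generation: u64,
}

fn apply_update(
    persistence: &Persistence,
    state: &mut InMemoryState,
    shard: u32,
    new_generation: u64,
) -> Result<(), String> {
    // 1. Durable write first ...
    persistence.write_update(shard, new_generation)?;
    // 2. ... then the in-memory change. If we crash between the two steps,
    //    memory lags the database and is rebuilt from it on restart; it can
    //    never claim a generation the database has not yet recorded.
    state.generation = new_generation;
    Ok(())
}

fn main() {
    let persistence = Persistence;
    let mut state = InMemoryState { generation: 0 };
    apply_update(&persistence, &mut state, 0, 1).unwrap();
    assert_eq!(state.generation, 1);
}
```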