storage_controller: make leadership protocol more robust (#11703)

## Problem

We saw the following scenario in staging:
1. Pod A starts up. Becomes leader and steps down the previous pod
cleanly.
2. Pod B starts up (deployment).
3. Step down request from pod B to pod A times out. Pod A did not manage
to stop its reconciliations within 10 seconds and exited with return
code 1
([code](7ba8519b43/storage_controller/src/service.rs (L8686-L8702))).
4. Pod B marks itself as the leader and finishes start-up.
5. k8s restarts pod A.
6. k8s marks pod B as ready.
7. The restarted pod A sends a step down request to pod B. This succeeds, so
pod A is now the leader and pod B is stepped down.
8. k8s kills pod A because it considers pod B healthy and pod A part of
the old replica set.

We end up in a situation where the only remaining pod (B) is stepped down
and attempts to forward requests to a leader that no longer exists. k8s
can't detect that pod B is in a bad state because the /status endpoint
simply returns 200 as long as the process is running.
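
For illustration, a health endpoint of the shape below can only report that the process is up; it has no way to distinguish a stepped-down pod from a healthy leader. This is a minimal sketch using axum, which is an assumption here rather than the controller's actual HTTP stack or handler:

```rust
use axum::{http::StatusCode, routing::get, Router};

// Illustrative only: a /status-style handler that answers 200 whenever
// the process is running, saying nothing about leadership.
async fn status() -> StatusCode {
    StatusCode::OK
}

// Hypothetical wiring of the handler into a router.
fn status_router() -> Router {
    Router::new().route("/status", get(status))
}
```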

## Summary of changes

This PR includes a number of robustness improvements to the leadership
protocol:
* use a single step down task per controller
* add a new endpoint to be used as the k8s liveness probe and check
leadership status there (see the sketch after this list)
* handle restarts explicitly (i.e. don't step yourself down)
* increase the step down retry count
* don't kill the process on long step down since k8s will just restart
it
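
The liveness-probe bullet is the piece that lets k8s notice a stepped-down pod. One plausible shape for such a probe handler, sketched with axum and a hypothetical `/live` route (both are assumptions, not the controller's actual HTTP stack or path), is to report failure once this pod has stepped down so that k8s restarts it instead of leaving a leaderless deployment in service:

```rust
// Sketch dependency assumption: axum = "0.7"
use std::sync::{Arc, RwLock};

use axum::{extract::State, http::StatusCode, routing::get, Router};

// Hypothetical, simplified leadership state for illustration.
#[derive(Clone, Copy)]
enum LeadershipStatus {
    Leader,
    SteppedDown,
}

#[derive(Clone)]
struct AppState {
    leadership: Arc<RwLock<LeadershipStatus>>,
}

// Liveness probe: fail once this pod has stepped down, so Kubernetes
// restarts it rather than keeping around a pod that can only forward
// requests to a leader which may no longer exist.
async fn live(State(state): State<AppState>) -> StatusCode {
    match *state.leadership.read().unwrap() {
        LeadershipStatus::Leader => StatusCode::OK,
        LeadershipStatus::SteppedDown => StatusCode::INTERNAL_SERVER_ERROR,
    }
}

// Hypothetical wiring; the real controller registers its own routes.
fn liveness_router(state: AppState) -> Router {
    Router::new().route("/live", get(live)).with_state(state)
}
```

The exact route and status codes in the PR may differ; the point is that the liveness signal now reflects leadership state, which is precisely what k8s was missing in the staging incident above.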

Author: Vlad Lazar
Date: 2025-04-24 17:59:56 +01:00 (committed by GitHub)
Commit: 6f7e3c18e4 (parent: 8afb783708)
5 changed files with 142 additions and 89 deletions

```diff
@@ -11,7 +11,7 @@ use std::num::NonZeroU32;
 use std::ops::{Deref, DerefMut};
 use std::path::PathBuf;
 use std::str::FromStr;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};

 use anyhow::Context;
```

```diff
@@ -524,6 +524,9 @@ pub struct Service {
     /// HTTP client with proper CA certs.
     http_client: reqwest::Client,
+
+    /// Handle for the step down background task if one was ever requested
+    step_down_barrier: OnceLock<tokio::sync::watch::Receiver<Option<GlobalObservedState>>>,
 }

 impl From<ReconcileWaitError> for ApiError {
```

```diff
@@ -1745,6 +1748,7 @@ impl Service {
             tenant_op_locks: Default::default(),
             node_op_locks: Default::default(),
             http_client,
+            step_down_barrier: Default::default(),
         });

         let result_task_this = this.clone();
```
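
The `step_down_barrier` added in the two hunks above pairs `OnceLock` with a `tokio::sync::watch` channel: the first step down request spawns the single background task, and every request (first or repeated) waits on the same channel for that task's result. Below is a self-contained sketch of the pattern with simplified placeholder types (`ObservedState` and `Controller` stand in for `GlobalObservedState` and `Service`; this is not the controller's code):

```rust
// Sketch dependency assumption: tokio = { version = "1", features = ["full"] }
use std::ops::Deref;
use std::sync::{Arc, OnceLock};

/// Simplified stand-in for the controller's observed state.
#[derive(Clone, Debug)]
struct ObservedState(&'static str);

struct Controller {
    // Set exactly once, by whichever caller wins the race to start step down.
    step_down_barrier: OnceLock<tokio::sync::watch::Receiver<Option<ObservedState>>>,
}

impl Controller {
    async fn step_down(self: &Arc<Self>) -> ObservedState {
        // The first caller spawns the single step down task; later callers
        // reuse the stored receiver and simply wait for the same result.
        let rx = self.step_down_barrier.get_or_init(|| {
            let this = self.clone();
            let (tx, rx) = tokio::sync::watch::channel(None);
            tokio::spawn(async move {
                let state = this.step_down_task().await;
                // The receiver lives in `self`, so this only fails if the
                // whole controller is gone, in which case nobody is waiting.
                let _ = tx.send(Some(state));
            });
            rx
        });

        // Every caller blocks here until the one task publishes its result.
        rx.clone()
            .wait_for(|state| state.is_some())
            .await
            .expect("step down task always publishes a value")
            .deref()
            .clone()
            .expect("checked is_some above")
    }

    async fn step_down_task(&self) -> ObservedState {
        // The real task stops reconciliations and collects observed state.
        ObservedState("stepped down")
    }
}

#[tokio::main]
async fn main() {
    let ctrl = Arc::new(Controller {
        step_down_barrier: OnceLock::new(),
    });
    // Concurrent step down requests all resolve to the single task's result.
    let (a, b) = tokio::join!(ctrl.step_down(), ctrl.step_down());
    println!("{a:?} {b:?}");
}
```

Because the receiver is stored on the controller itself, the watch channel retains the one task's result, so retried step down requests from the peer see the same answer instead of kicking off a second shutdown.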

```diff
@@ -8886,27 +8890,59 @@ impl Service {
         self.inner.read().unwrap().get_leadership_status()
     }

-    pub(crate) async fn step_down(&self) -> GlobalObservedState {
+    /// Handler for step down requests
+    ///
+    /// Step down runs in separate task since once it's called it should
+    /// be driven to completion. Subsequent requests will wait on the same
+    /// step down task.
+    pub(crate) async fn step_down(self: &Arc<Self>) -> GlobalObservedState {
+        let handle = self.step_down_barrier.get_or_init(|| {
+            let step_down_self = self.clone();
+            let (tx, rx) = tokio::sync::watch::channel::<Option<GlobalObservedState>>(None);
+            tokio::spawn(async move {
+                let state = step_down_self.step_down_task().await;
+                tx.send(Some(state))
+                    .expect("Task Arc<Service> keeps receiver alive");
+            });
+
+            rx
+        });
+
+        handle
+            .clone()
+            .wait_for(|observed_state| observed_state.is_some())
+            .await
+            .expect("Task Arc<Service> keeps sender alive")
+            .deref()
+            .clone()
+            .expect("Checked above")
+    }
+
+    async fn step_down_task(&self) -> GlobalObservedState {
         tracing::info!("Received step down request from peer");
         failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");

         self.inner.write().unwrap().step_down();

-        // Wait for reconciliations to stop, or terminate this process if they
-        // fail to stop in time (this indicates a bug in shutdown)
-        tokio::select! {
-            _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
-                tracing::info!("Reconciliations stopped, proceeding with step down");
-            }
-            _ = async {
-                failpoint_support::sleep_millis_async!("step-down-delay-timeout");
-                tokio::time::sleep(Duration::from_secs(10)).await
-            } => {
-                tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
+        let stop_reconciliations =
+            self.stop_reconciliations(StopReconciliationsReason::SteppingDown);
+        let mut stop_reconciliations = std::pin::pin!(stop_reconciliations);
-                // The caller may proceed to act as leader when it sees this request fail: reduce the chance
-                // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
-                std::process::exit(1);
+
+        let started_at = Instant::now();
+
+        // Wait for reconciliations to stop and warn if that's taking a long time
+        loop {
+            tokio::select! {
+                _ = &mut stop_reconciliations => {
+                    tracing::info!("Reconciliations stopped, proceeding with step down");
+                    break;
+                }
+                _ = tokio::time::sleep(Duration::from_secs(10)) => {
+                    tracing::warn!(
+                        elapsed_sec=%started_at.elapsed().as_secs(),
+                        "Stopping reconciliations during step down is taking too long"
+                    );
+                }
+            }
+        }
```