storage controller: sleep between compute notify retries (#8869)

## Problem

Live migration retries when it fails to notify the compute of the new location, but it does so without pausing. It should sleep between attempts.

Closes: https://github.com/neondatabase/neon/issues/8820

## Summary of changes

- Do an `exponential_backoff` in the retry loop for compute notifications
2025-12-25 23:29:59 +00:00 · 2024-08-30 11:44:13 +01:00
parent 72aa6b02da
commit 20f82f9169
1 changed files with 13 additions and 0 deletions
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -12,6 +12,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
+use utils::backoff::exponential_backoff;
 use utils::failpoint_support;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
@@ -568,6 +569,7 @@ impl Reconciler {

        // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
        // the origin without notifying compute, we will render the tenant unavailable.
+        let mut notify_attempts = 0;
        while let Err(e) = self.compute_notify().await {
            match e {
                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
@@ -578,6 +580,17 @@ impl Reconciler {
                    );
                }
            }
+
+            exponential_backoff(
+                notify_attempts,
+                // Generous waits: control plane operations which might be blocking us usually complete on the order
+                // of hundreds to thousands of milliseconds, so no point busy polling.
+                1.0,
+                10.0,
+                &self.cancel,
+            )
+            .await;
+            notify_attempts += 1;
        }

        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then