From 20f82f91698fc64265b18e12cd7482b141e0832c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 30 Aug 2024 11:44:13 +0100 Subject: [PATCH] storage controller: sleep between compute notify retries (#8869) ## Problem Live migration retries when it fails to notify the compute of the new location. It should sleep between attempts. Closes: https://github.com/neondatabase/neon/issues/8820 ## Summary of changes - Do an `exponential_backoff` in the retry loop for compute notifications --- storage_controller/src/reconciler.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 94db879ade..102a3124d2 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -12,6 +12,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::backoff::exponential_backoff; use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; @@ -568,6 +569,7 @@ impl Reconciler { // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the tenant unavailable. + let mut notify_attempts = 0; while let Err(e) = self.compute_notify().await { match e { NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), @@ -578,6 +580,17 @@ impl Reconciler { ); } } + + exponential_backoff( + notify_attempts, + // Generous waits: control plane operations which might be blocking us usually complete on the order + // of hundreds to thousands of milliseconds, so no point busy polling. + 1.0, + 10.0, + &self.cancel, + ) + .await; + notify_attempts += 1; } // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then