From 3bd2a4fd56803b0aabb87e9076872ceff0147a77 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 7 Feb 2024 19:14:18 +0000 Subject: [PATCH] control_plane: avoid feedback loop with /location_config if compute hook fails. (#6668) ## Problem The existing behavior isn't exactly incorrect, but is operationally risky: if the control plane compute hook breaks, then all the control plane operations trying to call /location_config will end up retrying forever, which could put more load on the system. ## Summary of changes - Treat 404s as fatal errors to do fewer retries: a 404 either indicates we have the wrong URL, or some control plane bug is failing to recognize our tenant ID as existing. - Do not return an error on reconcilation errors in a non-creating /location_config response: this allows the control plane to finish its Operation (and we will eventually retry the compute notification later) --- control_plane/attachment_service/src/compute_hook.rs | 2 +- control_plane/attachment_service/src/service.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index 0d3610aafa..5bd1b6bf09 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -240,7 +240,7 @@ impl ComputeHook { let client = reqwest::Client::new(); backoff::retry( || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), - |e| matches!(e, NotifyError::Fatal(_)), + |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)), 3, 10, "Send compute notification", diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index febee1aa0d..1db1906df8 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -989,7 +989,15 @@ impl Service { .collect(); } else { // This was an update, wait for reconciliation - self.await_waiters(waiters).await?; + if let Err(e) = self.await_waiters(waiters).await { + // Do not treat a reconcile error as fatal: we have already applied any requested + // Intent changes, and the reconcile can fail for external reasons like unavailable + // compute notification API. In these cases, it is important that we do not + // cause the cloud control plane to retry forever on this API. + tracing::warn!( + "Failed to reconcile after /location_config: {e}, returning success anyway" + ); + } } Ok(result)