From 80e974d05b7689f98ca0e83496895e821840e263 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 26 Sep 2024 15:42:17 +0100 Subject: [PATCH] fix(compute_ctl): race condition in configurator (#9162) There was a tricky race condition in compute_ctl, that sometimes makes configurator skip updates. It makes a deadlock because: - control-plane cannot configure compute, because it's in ConfigurationPending state - compute_ctl doesn't do any reconfiguration because `configurator_main_loop` missed notification for it Full sequence that reproduces the issue: 1. `start_compute` finishes works and changes status `self.set_status(ComputeStatus::Running);` 2. configurator received update about `Running` state and dropped the mutex lock in the iteration 3. `/configure` request was triggered at the same time as step 1, and got the mutex lock 4. same `/configure` request set the spec and updated the state to `ConfigurationPending`, also sent a notification 5. next iteration in configurator got the mutex lock, but missed the notification There are more details in this slack thread: https://neondb.slack.com/archives/C03438W3FLZ/p1727281028478689?thread_ts=1727261220.483799&cid=C03438W3FLZ --------- Co-authored-by: Alexey Kondratov --- compute_tools/src/configurator.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index 274a221ac7..7bd0e4938d 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -11,9 +11,17 @@ use crate::compute::ComputeNode; fn configurator_main_loop(compute: &Arc) { info!("waiting for reconfiguration requests"); loop { - let state = compute.state.lock().unwrap(); - let mut state = compute.state_changed.wait(state).unwrap(); + let mut state = compute.state.lock().unwrap(); + // We have to re-check the status after re-acquiring the lock because it could be that + // the status has changed while we were waiting for the lock, and we might not need to + // wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e. + // we are waiting for a condition variable that will never be signaled. + if state.status != ComputeStatus::ConfigurationPending { + state = compute.state_changed.wait(state).unwrap(); + } + + // Re-check the status after waking up if state.status == ComputeStatus::ConfigurationPending { info!("got configuration request"); state.status = ComputeStatus::Configuration;