mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-01 04:20:39 +00:00
fix(compute_ctl): race condition in configurator (#9162)
There was a tricky race condition in compute_ctl, that sometimes makes configurator skip updates. It makes a deadlock because: - control-plane cannot configure compute, because it's in ConfigurationPending state - compute_ctl doesn't do any reconfiguration because `configurator_main_loop` missed notification for it Full sequence that reproduces the issue: 1. `start_compute` finishes works and changes status `self.set_status(ComputeStatus::Running);` 2. configurator received update about `Running` state and dropped the mutex lock in the iteration 3. `/configure` request was triggered at the same time as step 1, and got the mutex lock 4. same `/configure` request set the spec and updated the state to `ConfigurationPending`, also sent a notification 5. next iteration in configurator got the mutex lock, but missed the notification There are more details in this slack thread: https://neondb.slack.com/archives/C03438W3FLZ/p1727281028478689?thread_ts=1727261220.483799&cid=C03438W3FLZ --------- Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
This commit is contained in:
committed by
GitHub
parent
7fdf1ab5b6
commit
80e974d05b
@@ -11,9 +11,17 @@ use crate::compute::ComputeNode;
|
||||
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
info!("waiting for reconfiguration requests");
|
||||
loop {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let mut state = compute.state_changed.wait(state).unwrap();
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
|
||||
// We have to re-check the status after re-acquiring the lock because it could be that
|
||||
// the status has changed while we were waiting for the lock, and we might not need to
|
||||
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
|
||||
// we are waiting for a condition variable that will never be signaled.
|
||||
if state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
}
|
||||
|
||||
// Re-check the status after waking up
|
||||
if state.status == ComputeStatus::ConfigurationPending {
|
||||
info!("got configuration request");
|
||||
state.status = ComputeStatus::Configuration;
|
||||
|
||||
Reference in New Issue
Block a user