mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 03:50:37 +00:00
There was a tricky race condition in compute_ctl that sometimes made the configurator skip updates. It caused a deadlock because: - the control plane cannot configure the compute, because it's in the ConfigurationPending state - compute_ctl doesn't do any reconfiguration, because `configurator_main_loop` missed the notification for it Full sequence that reproduces the issue: 1. `start_compute` finishes its work and changes the status with `self.set_status(ComputeStatus::Running);` 2. the configurator received the update about the `Running` state and dropped the mutex lock in that iteration 3. a `/configure` request was triggered at the same time as step 1, and got the mutex lock 4. the same `/configure` request set the spec and updated the state to `ConfigurationPending`, and also sent a notification 5. the next iteration in the configurator got the mutex lock, but missed the notification There are more details in this Slack thread: https://neondb.slack.com/archives/C03438W3FLZ/p1727281028478689?thread_ts=1727261220.483799&cid=C03438W3FLZ --------- Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
63 lines
2.2 KiB
Rust
63 lines
2.2 KiB
Rust
use std::sync::Arc;
|
|
use std::thread;
|
|
|
|
use tracing::{error, info, instrument};
|
|
|
|
use compute_api::responses::ComputeStatus;
|
|
|
|
use crate::compute::ComputeNode;
|
|
|
|
#[instrument(skip_all)]
|
|
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
|
info!("waiting for reconfiguration requests");
|
|
loop {
|
|
let mut state = compute.state.lock().unwrap();
|
|
|
|
// We have to re-check the status after re-acquiring the lock because it could be that
|
|
// the status has changed while we were waiting for the lock, and we might not need to
|
|
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
|
|
// we are waiting for a condition variable that will never be signaled.
|
|
if state.status != ComputeStatus::ConfigurationPending {
|
|
state = compute.state_changed.wait(state).unwrap();
|
|
}
|
|
|
|
// Re-check the status after waking up
|
|
if state.status == ComputeStatus::ConfigurationPending {
|
|
info!("got configuration request");
|
|
state.status = ComputeStatus::Configuration;
|
|
compute.state_changed.notify_all();
|
|
drop(state);
|
|
|
|
let mut new_status = ComputeStatus::Failed;
|
|
if let Err(e) = compute.reconfigure() {
|
|
error!("could not configure compute node: {}", e);
|
|
} else {
|
|
new_status = ComputeStatus::Running;
|
|
info!("compute node configured");
|
|
}
|
|
|
|
// XXX: used to test that API is blocking
|
|
// std::thread::sleep(std::time::Duration::from_millis(10000));
|
|
|
|
compute.set_status(new_status);
|
|
} else if state.status == ComputeStatus::Failed {
|
|
info!("compute node is now in Failed state, exiting");
|
|
break;
|
|
} else {
|
|
info!("woken up for compute status: {:?}, sleeping", state.status);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
|
let compute = Arc::clone(compute);
|
|
|
|
thread::Builder::new()
|
|
.name("compute-configurator".into())
|
|
.spawn(move || {
|
|
configurator_main_loop(&compute);
|
|
info!("configurator thread is exited");
|
|
})
|
|
.expect("cannot launch configurator thread")
|
|
}
|