added changes from hadron configurator

This commit is contained in:
Suhas Thalanki
2025-07-22 16:02:28 -05:00
parent f859354466
commit 3cffaedaba
7 changed files with 198 additions and 15 deletions

View File

@@ -1,10 +1,16 @@
use std::sync::Arc;
use std::fs::File;
use std::thread;
use std::{path::Path, sync::Arc};
use anyhow::Result;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use tracing::{error, info, instrument};
use crate::compute::ComputeNode;
use crate::{
compute::{ComputeNode, ParsedSpec},
spec::get_config_from_control_plane,
};
#[instrument(skip_all)]
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
@@ -12,12 +18,27 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
loop {
let mut state = compute.state.lock().unwrap();
// We have to re-check the status after re-acquiring the lock because it could be that
// the status has changed while we were waiting for the lock, and we might not need to
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
// we are waiting for a condition variable that will never be signaled.
if state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
if compute.params.lakebase_mode {
/* BEGIN_HADRON */
// RefreshConfiguration should only be used inside the loop
assert_ne!(state.status, ComputeStatus::RefreshConfiguration);
/* END_HADRON */
while state.status != ComputeStatus::ConfigurationPending
&& state.status != ComputeStatus::RefreshConfigurationPending
&& state.status != ComputeStatus::Failed
{
info!("configurator: compute status: {:?}, sleeping", state.status);
state = compute.state_changed.wait(state).unwrap();
}
} else {
// We have to re-check the status after re-acquiring the lock because it could be that
// the status has changed while we were waiting for the lock, and we might not need to
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
// we are waiting for a condition variable that will never be signaled.
if state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
}
}
// Re-check the status after waking up
@@ -26,17 +47,146 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
state.set_status(ComputeStatus::Configuration, &compute.state_changed);
drop(state);
let mut new_status = ComputeStatus::Failed;
let mut _new_status = ComputeStatus::Failed;
if let Err(e) = compute.reconfigure() {
error!("could not configure compute node: {}", e);
// TODO(BRC-1726): Remove this panic once we fix the state machine to allow futher
// configuration attempts after a failed configuration attempt.
error!("Compute node exiting due to configuration failure.");
std::process::exit(1);
} else {
new_status = ComputeStatus::Running;
_new_status = ComputeStatus::Running;
info!("compute node configured");
}
// XXX: used to test that API is blocking
// std::thread::sleep(std::time::Duration::from_millis(10000));
compute.set_status(_new_status);
} else if state.status == ComputeStatus::RefreshConfigurationPending {
info!(
"compute node suspects its configuration is out of date, now refreshing configuration"
);
state.set_status(ComputeStatus::RefreshConfiguration, &compute.state_changed);
// Drop the lock guard here to avoid holding the lock while downloading spec from the control plane / HCC.
// This is the only thread that can move compute_ctl out of the `RefreshConfiguration` state, so it
// is safe to drop the lock like this.
drop(state);
let get_spec_result: anyhow::Result<Option<ComputeSpec>> =
if let Some(sp) = &compute.params.spec_path_test_only {
// This path is only to make testing easier. In production we always get the spec from the HCM.
info!("reloading spec.json from path: {:?}", sp);
let path = Path::new(sp);
if let Ok(file) = File::open(path) {
match serde_json::from_reader(file) {
Ok(spec) => Ok(Some(spec)),
Err(e) => {
error!("could not parse spec file: {}", e);
Err(anyhow::anyhow!("could not parse spec file: {}", e))
}
}
} else {
error!("could not open spec file at path: {:?}", sp);
Err(anyhow::anyhow!(
"could not open spec file at path: {:?}",
sp
))
}
} else if let Some(control_plane_uri) = &compute.params.control_plane_uri {
get_config_from_control_plane(control_plane_uri, &compute.params.compute_id).map(
|(spec_opt, _)| {
info!("got spec from control plane: {:?}", spec_opt);
spec_opt
},
)
} else {
Err(anyhow::anyhow!("spec_path_test_only is not set"))
};
// Parse any received ComputeSpec and transpose the result into a Result<Option<ParsedSpec>>.
let parsed_spec_result: Result<Option<ParsedSpec>> = get_spec_result.and_then(|spec| {
if let Some(spec) = spec {
if let Ok(pspec) = ParsedSpec::try_from(spec) {
Ok(Some(pspec))
} else {
Err(anyhow::anyhow!("could not parse spec"))
}
} else {
Ok(None)
}
});
let new_status: ComputeStatus;
match parsed_spec_result {
// Control plane (HCM) returned a spec and we were able to parse it.
Ok(Some(pspec)) => {
{
let mut state = compute.state.lock().unwrap();
// Defensive programming to make sure this thread is indeed the only one that can move the compute
// node out of the `RefreshConfiguration` state. Would be nice if we can encode this invariant
// into the type system.
assert_eq!(state.status, ComputeStatus::RefreshConfiguration);
if state.pspec.as_ref().map(|ps| ps.pageserver_connstr.clone())
== Some(pspec.pageserver_connstr.clone())
{
info!(
"Refresh configuration: Retrieved spec is the same as the current spec. Waiting for control plane to update the spec before attempting reconfiguration."
);
state.status = ComputeStatus::Running;
compute.state_changed.notify_all();
drop(state);
std::thread::sleep(std::time::Duration::from_secs(5));
continue;
}
// state.pspec is consumed by compute.reconfigure() below. Note that compute.reconfigure() will acquire
// the compute.state lock again so we need to have the lock guard go out of scope here. We could add a
// "locked" variant of compute.reconfigure() that takes the lock guard as an argument to make this cleaner,
// but it's not worth forking the codebase too much for this minor point alone right now.
ComputeNode::set_spec(&compute.params, &mut state, pspec);
}
match compute.reconfigure() {
Ok(_) => {
info!("Refresh configuration: compute node configured");
new_status = ComputeStatus::Running;
}
Err(e) => {
error!(
"Refresh configuration: could not configure compute node: {}",
e
);
// Set the compute node back to the `RefreshConfigurationPending` state if the configuration
// was not successful. It should be okay to treat this situation the same as if the loop
// hasn't executed yet as long as the detection side keeps notifying.
new_status = ComputeStatus::RefreshConfigurationPending;
}
}
}
// Control plane (HCM)'s response does not contain a spec. This is the "Empty" attachment case.
Ok(None) => {
info!(
"Compute Manager signaled that this compute is no longer attached to any storage. Exiting."
);
// We just immediately terminate the whole compute_ctl in this case. It's not necessary to attempt a
// clean shutdown as Postgres is probably not responding anyway (which is why we are in this refresh
// configuration state).
std::process::exit(1);
}
// Various error cases:
// - The request to the control plane (HCM) either failed or returned a malformed spec.
// - compute_ctl itself is configured incorrectly (e.g., compute_id is not set).
Err(e) => {
error!(
"Refresh configuration: error getting a parsed spec: {:?}",
e
);
new_status = ComputeStatus::RefreshConfigurationPending;
// We may be dealing with an overloaded HCM if we end up in this path. Backoff 5 seconds before
// retrying to avoid hammering the HCM.
std::thread::sleep(std::time::Duration::from_secs(5));
}
}
compute.set_status(new_status);
} else if state.status == ComputeStatus::Failed {
info!("compute node is now in Failed state, exiting");