mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
storcon: print viability of --timelines-onto-safekeepers (#12485)
The `--timelines-onto-safekeepers` flag is highly consequential because it controls every single timeline creation. However, we have no automatic insight into whether enabling the option will break things or not. The main way things can break is through misconfigured safekeepers, for example safekeepers that are marked as paused in the storcon db. The best information we can obtain so far comes from manually connecting via storcon_cli and listing the safekeepers, but this is cumbersome and manual, and therefore prone to human error. So at storcon startup, do a simulated "test creation" in which we call `timelines_onto_safekeepers` with the configuration provided to us, and print whether it was successful or not. No actual timeline is created, and nothing is written into the storcon db. The heartbeat info will not have reached us at that point yet, but that's okay, because we still fall back to safekeepers that don't have any heartbeat. Also print some general scheduling policy stats on initial safekeeper load. Part of #11670.
This commit is contained in:
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
|
||||
pub safekeepers: Vec<SafekeeperInfo>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperInfo {
|
||||
pub id: NodeId,
|
||||
pub hostname: String,
|
||||
|
||||
@@ -1677,7 +1677,21 @@ impl Service {
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
let safekeepers: HashMap<NodeId, Safekeeper> =
|
||||
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
|
||||
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
|
||||
let count_policy = |policy| {
|
||||
safekeepers
|
||||
.iter()
|
||||
.filter(|sk| sk.1.scheduling_policy() == policy)
|
||||
.count()
|
||||
};
|
||||
let active_sk_count = count_policy(SkSchedulingPolicy::Active);
|
||||
let activating_sk_count = count_policy(SkSchedulingPolicy::Activating);
|
||||
let pause_sk_count = count_policy(SkSchedulingPolicy::Pause);
|
||||
let decom_sk_count = count_policy(SkSchedulingPolicy::Decomissioned);
|
||||
tracing::info!(
|
||||
"Loaded {} safekeepers from database. Active {active_sk_count}, activating {activating_sk_count}, \
|
||||
paused {pause_sk_count}, decomissioned {decom_sk_count}.",
|
||||
safekeepers.len()
|
||||
);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_safekeeper_nodes
|
||||
@@ -1969,6 +1983,14 @@ impl Service {
|
||||
}
|
||||
});
|
||||
|
||||
// Check that there are enough safekeepers configured for us to create new timelines
|
||||
let test_sk_res = this.safekeepers_for_new_timeline().await;
|
||||
tracing::info!(
|
||||
timeline_safekeeper_count = config.timeline_safekeeper_count,
|
||||
timelines_onto_safekeepers = config.timelines_onto_safekeepers,
|
||||
"viability test result (test timeline creation on safekeepers): {test_sk_res:?}",
|
||||
);
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user