mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 09:30:37 +00:00
storcon: sk heartbeat fixes (#10891)
This PR does the following things: * The initial heartbeat round blocks the storage controller from becoming online again. If all safekeepers are unresponsive, this can cause storage controller startup to be very slow. The original intent of #10583 was that heartbeats don't affect normal functionality of the storage controller. So add a short timeout to prevent it from impeding storcon functionality. * Fix the URL of the utilization endpoint. * Don't send heartbeats to safekeepers that are decommissioned. Part of https://github.com/neondatabase/neon/issues/9011 context: https://neondb.slack.com/archives/C033RQ5SPDH/p1739966807592589
This commit is contained in:
@@ -819,7 +819,9 @@ impl Service {
|
||||
.heartbeater_ps
|
||||
.heartbeat(Arc::new(nodes_to_heartbeat))
|
||||
.await;
|
||||
let res_sk = self.heartbeater_sk.heartbeat(all_sks).await;
|
||||
// Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime
|
||||
const SK_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
let res_sk = tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)).await;
|
||||
|
||||
let mut online_nodes = HashMap::new();
|
||||
if let Ok(deltas) = res_ps {
|
||||
@@ -837,7 +839,7 @@ impl Service {
|
||||
}
|
||||
|
||||
let mut online_sks = HashMap::new();
|
||||
if let Ok(deltas) = res_sk {
|
||||
if let Ok(Ok(deltas)) = res_sk {
|
||||
for (node_id, status) in deltas.0 {
|
||||
match status {
|
||||
SafekeeperState::Available {
|
||||
@@ -7960,7 +7962,7 @@ impl Service {
|
||||
let sk = safekeepers
|
||||
.get_mut(&node_id)
|
||||
.ok_or(DatabaseError::Logical("Not found".to_string()))?;
|
||||
sk.skp.scheduling_policy = String::from(scheduling_policy);
|
||||
sk.set_scheduling_policy(scheduling_policy);
|
||||
|
||||
locked.safekeepers = Arc::new(safekeepers);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user