storcon: sk heartbeat fixes (#10891)

This PR does the following things:

* The initial heartbeat round blocks the storage controller from coming
online again. If all safekeepers are unresponsive, this can make storage
controller startup very slow. The original intent of #10583 was that
heartbeats don't affect normal functionality of the storage controller, so
add a short timeout to the initial round to keep it from impeding storcon
startup (the timeout pattern is sketched below this list).

* Fix the URL of the safekeeper utilization endpoint (drop the trailing slash).

* Don't send heartbeats to safekeepers that are decommissioned.
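
Not part of the commit itself: a minimal sketch of the timeout pattern from the
first bullet, with a placeholder heartbeat_round() standing in for the real
self.heartbeater_sk.heartbeat(all_sks) call. tokio::time::timeout wraps the
future, so the caller sees a nested Result (timeout error on the outside, the
heartbeat's own result inside), which is why the service code below matches
Ok(Ok(deltas)).

use std::time::Duration;

// Hypothetical stand-in for the initial safekeeper heartbeat round.
async fn heartbeat_round() -> Result<Vec<u64>, &'static str> {
    Ok(vec![1, 2, 3])
}

#[tokio::main]
async fn main() {
    // Bound the initial round so unresponsive safekeepers cannot stall startup.
    const SK_TIMEOUT: Duration = Duration::from_secs(5);
    let res = tokio::time::timeout(SK_TIMEOUT, heartbeat_round()).await;

    // Outer Ok: the future finished before the deadline; inner Ok: the
    // heartbeat itself succeeded.
    if let Ok(Ok(deltas)) = res {
        println!("initial safekeeper heartbeats: {deltas:?}");
    } else {
        // Timed out or failed: continue startup without the initial deltas.
        println!("proceeding without initial safekeeper heartbeats");
    }
}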

Part of https://github.com/neondatabase/neon/issues/9011

context: https://neondb.slack.com/archives/C033RQ5SPDH/p1739966807592589
Author: Arpad Müller
Date: 2025-02-19 17:57:11 +01:00
Committed by: GitHub
Parent: 1f9511dbd9
Commit: 9ba2a87e69
5 changed files with 36 additions and 12 deletions

@@ -137,7 +137,7 @@ impl Client {
     }

     pub async fn utilization(&self) -> Result<SafekeeperUtilization> {
-        let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint);
+        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
         let resp = self.get(&uri).await?;
         resp.json().await.map_err(Error::ReceiveBody)
     }
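
Not part of the diff: a toy exact-match router (not the safekeeper's actual
HTTP stack) illustrating how a trailing slash can miss an exact-match route.
Under exact matching, /v1/utilization and /v1/utilization/ are different
paths, so the old client URL would never reach the handler.

// Toy router with a hypothetical handler name; exact string matching only.
fn route(path: &str) -> Option<&'static str> {
    match path {
        "/v1/utilization" => Some("utilization handler"),
        _ => None, // 404
    }
}

fn main() {
    assert!(route("/v1/utilization").is_some());
    assert!(route("/v1/utilization/").is_none()); // the old, broken client URL
}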

@@ -10,7 +10,10 @@ use std::{
 };

 use tokio_util::sync::CancellationToken;

-use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};
+use pageserver_api::{
+    controller_api::{NodeAvailability, SkSchedulingPolicy},
+    models::PageserverUtilization,
+};
 use thiserror::Error;
 use utils::{id::NodeId, logging::SecretString};
@@ -311,6 +314,9 @@ impl HeartBeat<Safekeeper, SafekeeperState> for HeartbeaterTask<Safekeeper, Safe
         let mut heartbeat_futs = FuturesUnordered::new();

         for (node_id, sk) in &*safekeepers {
+            if sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned {
+                continue;
+            }
             heartbeat_futs.push({
                 let jwt_token = self
                     .jwt_token

@@ -18,12 +18,14 @@ pub struct Safekeeper {
     cancel: CancellationToken,
     listen_http_addr: String,
     listen_http_port: u16,
+    scheduling_policy: SkSchedulingPolicy,
     id: NodeId,
     availability: SafekeeperState,
 }

 impl Safekeeper {
     pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self {
+        let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap();
         Self {
             cancel,
             listen_http_addr: skp.host.clone(),
@@ -31,6 +33,7 @@ impl Safekeeper {
             id: NodeId(skp.id as u64),
             skp,
             availability: SafekeeperState::Offline,
+            scheduling_policy,
         }
     }
     pub(crate) fn base_url(&self) -> String {
@@ -46,6 +49,13 @@ impl Safekeeper {
     pub(crate) fn set_availability(&mut self, availability: SafekeeperState) {
         self.availability = availability;
     }
+    pub(crate) fn scheduling_policy(&self) -> SkSchedulingPolicy {
+        self.scheduling_policy
+    }
+    pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) {
+        self.scheduling_policy = scheduling_policy;
+        self.skp.scheduling_policy = String::from(scheduling_policy);
+    }
     /// Perform an operation (which is given a [`SafekeeperClient`]) with retries
     pub(crate) async fn with_client_retries<T, O, F>(
         &self,
@@ -129,10 +139,8 @@ impl Safekeeper {
                 self.id.0
             );
         }
-        self.skp = crate::persistence::SafekeeperPersistence::from_upsert(
-            record,
-            SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(),
-        );
+        self.skp =
+            crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy);
         self.listen_http_port = http_port as u16;
         self.listen_http_addr = host;
     }
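
Not part of the diff: a self-contained sketch of the pattern the Safekeeper
change uses, with toy stand-in types (Policy, Persisted, Node) rather than the
real storage controller definitions. The persisted string is parsed once at
load time, the typed value is the source of truth afterwards, and every change
goes through a setter that keeps the in-memory copy and the persisted string
in sync, so later upserts no longer re-parse (and unwrap) the string.

use std::str::FromStr;

#[derive(Clone, Copy, Debug)]
enum Policy {
    Active,
    Pause,
    Decomissioned,
}

impl FromStr for Policy {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Policy::Active),
            "pause" => Ok(Policy::Pause),
            "decomissioned" => Ok(Policy::Decomissioned),
            other => Err(format!("unknown policy: {other}")),
        }
    }
}

struct Persisted {
    scheduling_policy: String,
}

struct Node {
    persisted: Persisted,
    scheduling_policy: Policy,
}

impl Node {
    fn from_persistence(persisted: Persisted) -> Self {
        // Parse once when loading, like Safekeeper::from_persistence above.
        let scheduling_policy = Policy::from_str(&persisted.scheduling_policy).unwrap();
        Self {
            persisted,
            scheduling_policy,
        }
    }

    fn set_scheduling_policy(&mut self, policy: Policy) {
        // Keep the typed copy and the persisted string in sync, like
        // Safekeeper::set_scheduling_policy above.
        self.scheduling_policy = policy;
        self.persisted.scheduling_policy = format!("{policy:?}").to_lowercase();
    }
}

fn main() {
    let mut node = Node::from_persistence(Persisted {
        scheduling_policy: "pause".to_string(),
    });
    node.set_scheduling_policy(Policy::Active);
    assert_eq!(node.persisted.scheduling_policy, "active");
}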

@@ -819,7 +819,9 @@ impl Service {
             .heartbeater_ps
             .heartbeat(Arc::new(nodes_to_heartbeat))
             .await;
-        let res_sk = self.heartbeater_sk.heartbeat(all_sks).await;
+        // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime
+        const SK_TIMEOUT: Duration = Duration::from_secs(5);
+        let res_sk = tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)).await;

         let mut online_nodes = HashMap::new();
         if let Ok(deltas) = res_ps {
@@ -837,7 +839,7 @@ impl Service {
         }

         let mut online_sks = HashMap::new();
-        if let Ok(deltas) = res_sk {
+        if let Ok(Ok(deltas)) = res_sk {
             for (node_id, status) in deltas.0 {
                 match status {
                     SafekeeperState::Available {
@@ -7960,7 +7962,7 @@ impl Service {
             let sk = safekeepers
                 .get_mut(&node_id)
                 .ok_or(DatabaseError::Logical("Not found".to_string()))?;
-            sk.skp.scheduling_policy = String::from(scheduling_policy);
+            sk.set_scheduling_policy(scheduling_policy);

             locked.safekeepers = Arc::new(safekeepers);
         }

@@ -3238,12 +3238,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
     newest_info = target.get_safekeeper(inserted["id"])
     assert newest_info
     assert newest_info["scheduling_policy"] == "Pause"
-    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
+    target.safekeeper_scheduling_policy(inserted["id"], "Active")
     newest_info = target.get_safekeeper(inserted["id"])
     assert newest_info
-    assert newest_info["scheduling_policy"] == "Decomissioned"
+    assert newest_info["scheduling_policy"] == "Active"
     # Ensure idempotency
-    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
+    target.safekeeper_scheduling_policy(inserted["id"], "Active")
+    newest_info = target.get_safekeeper(inserted["id"])
+    assert newest_info
+    assert newest_info["scheduling_policy"] == "Active"
+    # change back to paused again
+    target.safekeeper_scheduling_policy(inserted["id"], "Pause")

     def storcon_heartbeat():
         assert env.storage_controller.log_contains(
@@ -3252,6 +3257,9 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):

     wait_until(storcon_heartbeat)

+    # Now decomission it
+    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
+

 def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
     compared = [dict(a), dict(b)]