storcon: add https metrics for pageservers/safekeepers (#11460)

## Problem
Storcon will not start up if `use_https` is on and some pageservers or
safekeepers in the database have no https port. Metrics counting how many
nodes in the DB have an https port configured will help us make sure that
`use_https` can be turned on safely.
- Part of https://github.com/neondatabase/cloud/issues/25526

## Summary of changes
- Add `storage_controller_https_pageserver_nodes`,
`storage_controller_safekeeper_nodes` and
`storage_controller_https_safekeeper_nodes` Prometheus metrics.
This commit is contained in:
Dmitrii Kovalkov
2025-04-09 12:33:49 +04:00
committed by GitHub
parent c610f3584d
commit cf62017a5b
5 changed files with 55 additions and 0 deletions

View File

@@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup {
/// Size of the in-memory map of pageserver_nodes
pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
/// Count of how many pageserver nodes from in-memory map have https configured
pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
/// Size of the in-memory map of safekeeper_nodes
pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
/// Count of how many safekeeper nodes from in-memory map have https configured
pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
/// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,

View File

@@ -89,6 +89,10 @@ impl Node {
self.scheduling = scheduling
}
/// Returns `true` when this node's `listen_https_port` is set (i.e. is `Some`).
pub(crate) fn has_https_port(&self) -> bool {
    let https_port = &self.listen_https_port;
    https_port.is_some()
}
/// Does this registration request match `self`? This is used when deciding whether a registration
/// request should be allowed to update an existing record with the same node ID.
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {

View File

@@ -89,6 +89,9 @@ impl Safekeeper {
pub(crate) fn availability(&self) -> SafekeeperState {
self.availability.clone()
}
/// Returns `true` when this safekeeper's `listen_https_port` is set (i.e. is `Some`).
pub(crate) fn has_https_port(&self) -> bool {
    let https_port = &self.listen_https_port;
    https_port.is_some()
}
/// Perform an operation (which is given a [`SafekeeperClient`]) with retries
#[allow(clippy::too_many_arguments)]
pub(crate) async fn with_client_retries<T, O, F>(

View File

@@ -1509,6 +1509,10 @@ impl Service {
.metrics_group
.storage_controller_pageserver_nodes
.set(nodes.len() as i64);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_https_pageserver_nodes
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
tracing::info!("Loading safekeepers from database...");
let safekeepers = persistence
@@ -1526,6 +1530,14 @@ impl Service {
let safekeepers: HashMap<NodeId, Safekeeper> =
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_safekeeper_nodes
.set(safekeepers.len() as i64);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_https_safekeeper_nodes
.set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64);
tracing::info!("Loading shards from database...");
let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
@@ -6254,6 +6266,10 @@ impl Service {
.metrics_group
.storage_controller_pageserver_nodes
.set(locked.nodes.len() as i64);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_https_pageserver_nodes
.set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
locked.scheduler.node_remove(node_id);
@@ -6345,6 +6361,10 @@ impl Service {
.metrics_group
.storage_controller_pageserver_nodes
.set(nodes.len() as i64);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_https_pageserver_nodes
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
}
}
@@ -6569,6 +6589,10 @@ impl Service {
.metrics_group
.storage_controller_pageserver_nodes
.set(locked.nodes.len() as i64);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_https_pageserver_nodes
.set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
match registration_status {
RegistrationStatus::New => {

View File

@@ -5,6 +5,7 @@ use std::time::Duration;
use super::safekeeper_reconciler::ScheduleRequest;
use crate::heartbeater::SafekeeperState;
use crate::metrics;
use crate::persistence::{
DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
};
@@ -590,6 +591,20 @@ impl Service {
}
}
locked.safekeepers = Arc::new(safekeepers);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_safekeeper_nodes
.set(locked.safekeepers.len() as i64);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_https_safekeeper_nodes
.set(
locked
.safekeepers
.values()
.filter(|s| s.has_https_port())
.count() as i64,
);
}
Ok(())
}