mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
storcon: add https metrics for pageservers/safekeepers (#11460)
## Problem

Storcon will not start up if `use_https` is on and there are some pageservers or safekeepers without an https port in the database. A metric for "how many nodes with https we have in the DB" will help us make sure that `use_https` can be turned on safely.

- Part of https://github.com/neondatabase/cloud/issues/25526

## Summary of changes

- Add `storage_controller_https_pageserver_nodes`, `storage_controller_safekeeper_nodes` and `storage_controller_https_safekeeper_nodes` Prometheus metrics.
This commit is contained in:
@@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup {
|
|||||||
/// Size of the in-memory map of pageserver_nodes
|
/// Size of the in-memory map of pageserver_nodes
|
||||||
pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
|
pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
|
||||||
|
|
||||||
|
/// Count of how many pageserver nodes from in-memory map have https configured
|
||||||
|
pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
|
||||||
|
|
||||||
|
/// Size of the in-memory map of safekeeper_nodes
|
||||||
|
pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
|
||||||
|
|
||||||
|
/// Count of how many safekeeper nodes from in-memory map have https configured
|
||||||
|
pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
|
||||||
|
|
||||||
/// Reconciler tasks completed, broken down by success/failure/cancelled
|
/// Reconciler tasks completed, broken down by success/failure/cancelled
|
||||||
pub(crate) storage_controller_reconcile_complete:
|
pub(crate) storage_controller_reconcile_complete:
|
||||||
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
||||||
|
|||||||
@@ -89,6 +89,10 @@ impl Node {
|
|||||||
self.scheduling = scheduling
|
self.scheduling = scheduling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when this node's `listen_https_port` is `Some`, i.e. an HTTPS port is set.
pub(crate) fn has_https_port(&self) -> bool {
|
||||||
|
self.listen_https_port.is_some()
|
||||||
|
}
|
||||||
|
|
||||||
/// Does this registration request match `self`? This is used when deciding whether a registration
|
/// Does this registration request match `self`? This is used when deciding whether a registration
|
||||||
/// request should be allowed to update an existing record with the same node ID.
|
/// request should be allowed to update an existing record with the same node ID.
|
||||||
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
|
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
|
||||||
|
|||||||
@@ -89,6 +89,9 @@ impl Safekeeper {
|
|||||||
pub(crate) fn availability(&self) -> SafekeeperState {
|
pub(crate) fn availability(&self) -> SafekeeperState {
|
||||||
self.availability.clone()
|
self.availability.clone()
|
||||||
}
|
}
|
||||||
|
/// Returns `true` when this safekeeper's `listen_https_port` is `Some`, i.e. an HTTPS port is set.
pub(crate) fn has_https_port(&self) -> bool {
|
||||||
|
self.listen_https_port.is_some()
|
||||||
|
}
|
||||||
/// Perform an operation (which is given a [`SafekeeperClient`]) with retries
|
/// Perform an operation (which is given a [`SafekeeperClient`]) with retries
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) async fn with_client_retries<T, O, F>(
|
pub(crate) async fn with_client_retries<T, O, F>(
|
||||||
|
|||||||
@@ -1509,6 +1509,10 @@ impl Service {
|
|||||||
.metrics_group
|
.metrics_group
|
||||||
.storage_controller_pageserver_nodes
|
.storage_controller_pageserver_nodes
|
||||||
.set(nodes.len() as i64);
|
.set(nodes.len() as i64);
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_https_pageserver_nodes
|
||||||
|
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||||
|
|
||||||
tracing::info!("Loading safekeepers from database...");
|
tracing::info!("Loading safekeepers from database...");
|
||||||
let safekeepers = persistence
|
let safekeepers = persistence
|
||||||
@@ -1526,6 +1530,14 @@ impl Service {
|
|||||||
let safekeepers: HashMap<NodeId, Safekeeper> =
|
let safekeepers: HashMap<NodeId, Safekeeper> =
|
||||||
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
|
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
|
||||||
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
|
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_safekeeper_nodes
|
||||||
|
.set(safekeepers.len() as i64);
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_https_safekeeper_nodes
|
||||||
|
.set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64);
|
||||||
|
|
||||||
tracing::info!("Loading shards from database...");
|
tracing::info!("Loading shards from database...");
|
||||||
let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
|
let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
|
||||||
@@ -6254,6 +6266,10 @@ impl Service {
|
|||||||
.metrics_group
|
.metrics_group
|
||||||
.storage_controller_pageserver_nodes
|
.storage_controller_pageserver_nodes
|
||||||
.set(locked.nodes.len() as i64);
|
.set(locked.nodes.len() as i64);
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_https_pageserver_nodes
|
||||||
|
.set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||||
|
|
||||||
locked.scheduler.node_remove(node_id);
|
locked.scheduler.node_remove(node_id);
|
||||||
|
|
||||||
@@ -6345,6 +6361,10 @@ impl Service {
|
|||||||
.metrics_group
|
.metrics_group
|
||||||
.storage_controller_pageserver_nodes
|
.storage_controller_pageserver_nodes
|
||||||
.set(nodes.len() as i64);
|
.set(nodes.len() as i64);
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_https_pageserver_nodes
|
||||||
|
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -6569,6 +6589,10 @@ impl Service {
|
|||||||
.metrics_group
|
.metrics_group
|
||||||
.storage_controller_pageserver_nodes
|
.storage_controller_pageserver_nodes
|
||||||
.set(locked.nodes.len() as i64);
|
.set(locked.nodes.len() as i64);
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_https_pageserver_nodes
|
||||||
|
.set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||||
|
|
||||||
match registration_status {
|
match registration_status {
|
||||||
RegistrationStatus::New => {
|
RegistrationStatus::New => {
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use super::safekeeper_reconciler::ScheduleRequest;
|
use super::safekeeper_reconciler::ScheduleRequest;
|
||||||
use crate::heartbeater::SafekeeperState;
|
use crate::heartbeater::SafekeeperState;
|
||||||
|
use crate::metrics;
|
||||||
use crate::persistence::{
|
use crate::persistence::{
|
||||||
DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
|
DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
|
||||||
};
|
};
|
||||||
@@ -590,6 +591,20 @@ impl Service {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
locked.safekeepers = Arc::new(safekeepers);
|
locked.safekeepers = Arc::new(safekeepers);
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_safekeeper_nodes
|
||||||
|
.set(locked.safekeepers.len() as i64);
|
||||||
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_https_safekeeper_nodes
|
||||||
|
.set(
|
||||||
|
locked
|
||||||
|
.safekeepers
|
||||||
|
.values()
|
||||||
|
.filter(|s| s.has_https_port())
|
||||||
|
.count() as i64,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user