storage controller: add metrics for tenant shard, node count (#9475)
## Problem

Previously, figuring out how many tenant shards were managed by a storage controller was typically done by peeking at the database or calling into the API. A metric makes it easier to monitor, as unexpectedly increasing shard counts can be indicative of problems elsewhere in the system.

## Summary of changes

- Add metric `storage_controller_pageserver_nodes`, updated on node CRUD operations from `Service`.
- Add metric `storage_controller_tenant_shards`, updated RAII-style from `TenantShard`.
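The RAII-style update means the shard gauge is incremented when a `TenantShard` is constructed and decremented in its `Drop` impl, so the value always tracks the number of live shards. A minimal sketch of that idea, using a plain `AtomicI64` in place of the `measured` crate's gauge; `TenantShardLike` is a stand-in, not the controller's real type:

```rust
use std::sync::atomic::{AtomicI64, Ordering};

/// Toy stand-in for the `storage_controller_tenant_shards` gauge.
static TENANT_SHARDS: AtomicI64 = AtomicI64::new(0);

struct TenantShardLike {
    // ... scheduling intent, observed state, etc. ...
}

impl TenantShardLike {
    fn new() -> Self {
        // Every constructor path bumps the gauge exactly once.
        TENANT_SHARDS.fetch_add(1, Ordering::Relaxed);
        Self {}
    }
}

impl Drop for TenantShardLike {
    fn drop(&mut self) {
        // Any removal path decrements it, so the gauge cannot drift
        // from the number of live shards.
        TENANT_SHARDS.fetch_sub(1, Ordering::Relaxed);
    }
}

fn main() {
    let a = TenantShardLike::new();
    let _b = TenantShardLike::new();
    assert_eq!(TENANT_SHARDS.load(Ordering::Relaxed), 2);
    drop(a);
    assert_eq!(TENANT_SHARDS.load(Ordering::Relaxed), 1);
}
```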
```diff
@@ -37,6 +37,12 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Count of how many times we spawn a reconcile task
     pub(crate) storage_controller_reconcile_spawn: measured::Counter,
 
+    /// Size of the in-memory map of tenant shards
+    pub(crate) storage_controller_tenant_shards: measured::Gauge,
+
+    /// Size of the in-memory map of pageserver_nodes
+    pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
+
     /// Reconciler tasks completed, broken down by success/failure/cancelled
     pub(crate) storage_controller_reconcile_complete:
         measured::CounterVec<ReconcileCompleteLabelGroupSet>,
```
```diff
@@ -934,7 +934,6 @@ impl Service {
         self.startup_complete.clone().wait().await;
 
         const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
 
         let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
         while !self.reconcilers_cancel.is_cancelled() {
             tokio::select! {
@@ -1272,6 +1271,10 @@ impl Service {
             .collect::<Vec<_>>();
         let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.get_id(), n)).collect();
         tracing::info!("Loaded {} nodes from database.", nodes.len());
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(nodes.len() as i64);
 
         tracing::info!("Loading shards from database...");
         let mut tenant_shard_persistence = persistence.list_tenant_shards().await?;
```
Since `TenantShard` now implements `Drop`, its fields can no longer be moved out of the old state (Rust forbids moving fields out of a type with a `Drop` impl), so `policy` and `config` are cloned here:

```diff
@@ -4110,9 +4113,9 @@ impl Service {
             (
                 old_attached,
                 generation,
-                old_state.policy,
+                old_state.policy.clone(),
                 old_state.shard,
-                old_state.config,
+                old_state.config.clone(),
             )
         };
 
```
```diff
@@ -5075,6 +5078,10 @@ impl Service {
             let mut nodes = (*locked.nodes).clone();
             nodes.remove(&node_id);
             locked.nodes = Arc::new(nodes);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_pageserver_nodes
+                .set(locked.nodes.len() as i64);
 
             locked.scheduler.node_remove(node_id);
 
@@ -5158,6 +5165,10 @@ impl Service {
                 removed_node.set_availability(NodeAvailability::Offline);
             }
             *nodes = Arc::new(nodes_mut);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_pageserver_nodes
+                .set(nodes.len() as i64);
         }
     }
 
```
```diff
@@ -5346,6 +5357,11 @@ impl Service {
 
         locked.nodes = Arc::new(new_nodes);
 
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(locked.nodes.len() as i64);
+
         tracing::info!(
             "Registered pageserver {}, now have {} pageservers",
             register_req.node_id,
```
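The node-count gauge is kept in sync differently: each CRUD path that swaps the copy-on-write node map re-derives the value from the map's length, as in the hunks above. A condensed, hypothetical sketch of that pattern (the `mutate_nodes` helper and toy types are illustrative, not the controller's API):

```rust
use std::collections::HashMap;
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;

type NodeId = u64;

#[derive(Clone)]
struct Node;

/// Toy stand-in for the `storage_controller_pageserver_nodes` gauge.
static PAGESERVER_NODES: AtomicI64 = AtomicI64::new(0);

/// Apply a mutation to the shared node map, then resync the gauge from the
/// map length so the metric matches the in-memory state.
fn mutate_nodes(
    nodes: &mut Arc<HashMap<NodeId, Node>>,
    mutate: impl FnOnce(&mut HashMap<NodeId, Node>),
) {
    let mut new_nodes = (**nodes).clone(); // copy-on-write, as in Service
    mutate(&mut new_nodes);
    *nodes = Arc::new(new_nodes);
    PAGESERVER_NODES.store(nodes.len() as i64, Ordering::Relaxed);
}

fn main() {
    let mut nodes: Arc<HashMap<NodeId, Node>> = Arc::new(HashMap::new());
    mutate_nodes(&mut nodes, |m| {
        m.insert(1, Node); // register
    });
    assert_eq!(PAGESERVER_NODES.load(Ordering::Relaxed), 1);
    mutate_nodes(&mut nodes, |m| {
        m.remove(&1); // delete
    });
    assert_eq!(PAGESERVER_NODES.load(Ordering::Relaxed), 0);
}
```

Deriving the value from `len()` is practical here because the node map is replaced in comparatively few places, whereas tenant shards are created and destroyed on many more code paths, which is why the shard gauge is maintained RAII-style instead.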
```diff
@@ -473,6 +473,11 @@ impl TenantShard {
         shard: ShardIdentity,
         policy: PlacementPolicy,
     ) -> Self {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
+
         Self {
             tenant_shard_id,
             policy,
@@ -1384,6 +1389,11 @@ impl TenantShard {
         let tenant_shard_id = tsp.get_tenant_shard_id()?;
         let shard_identity = tsp.get_shard_identity()?;
 
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
+
         Ok(Self {
             tenant_shard_id,
             shard: shard_identity,
```
```diff
@@ -1512,6 +1522,15 @@ impl TenantShard {
     }
 }
 
+impl Drop for TenantShard {
+    fn drop(&mut self) {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .dec();
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use std::{cell::RefCell, rc::Rc};
```
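One property worth noting about the `Drop`-based decrement: a shard that is constructed and then discarded on an error or early-return path still decrements the gauge, so the metric cannot leak counts. A small self-contained demonstration of that behaviour, again with a toy atomic gauge and illustrative names rather than the real `measured` registry:

```rust
use std::sync::atomic::{AtomicI64, Ordering};

static TENANT_SHARDS: AtomicI64 = AtomicI64::new(0);

struct ShardGuardDemo;

impl ShardGuardDemo {
    fn new() -> Self {
        TENANT_SHARDS.fetch_add(1, Ordering::Relaxed);
        Self
    }
}

impl Drop for ShardGuardDemo {
    fn drop(&mut self) {
        TENANT_SHARDS.fetch_sub(1, Ordering::Relaxed);
    }
}

/// A load path that fails halfway: the shard is constructed, then the
/// function bails out with an error before handing it to anyone.
fn load_shard_and_fail() -> Result<ShardGuardDemo, &'static str> {
    let _shard = ShardGuardDemo::new();
    Err("persistence row was malformed")
}

fn main() {
    assert_eq!(TENANT_SHARDS.load(Ordering::Relaxed), 0);
    let _ = load_shard_and_fail();
    // The Drop impl ran on the error path, so the gauge did not leak a count.
    assert_eq!(TENANT_SHARDS.load(Ordering::Relaxed), 0);
}
```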