storage controller: add metrics for tenant shard, node count (#9475)

## Problem

Previously, the only way to find out how many tenant shards a storage
controller was managing was to peek at the database or call into the
API. A metric makes this easier to monitor continuously: an
unexpectedly growing shard count can indicate problems elsewhere in
the system.

## Summary of changes

- Add metrics `storage_controller_pageserver_nodes` (updated on node
CRUD operations in `Service`) and `storage_controller_tenant_shards`
(updated RAII-style from `TenantShard`); see the sketch below.
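
The two update patterns can be illustrated with a minimal sketch. This is not the controller's actual code: the `Service`/`TenantShard` shapes are heavily simplified, and a plain `AtomicI64` stands in for the `measured::Gauge` fields that the real code reaches through `METRICS_REGISTRY.metrics_group`.

```rust
use std::collections::HashMap;
use std::sync::atomic::{AtomicI64, Ordering};

// Stand-in gauges; the real code uses `measured::Gauge` fields on the
// storage controller's metrics registry.
static PAGESERVER_NODES: AtomicI64 = AtomicI64::new(0);
static TENANT_SHARDS: AtomicI64 = AtomicI64::new(0);

struct TenantShard;

impl TenantShard {
    fn new() -> Self {
        // Every construction path increments the gauge...
        TENANT_SHARDS.fetch_add(1, Ordering::Relaxed);
        TenantShard
    }
}

impl Drop for TenantShard {
    fn drop(&mut self) {
        // ...and the matching decrement happens automatically when the
        // shard is dropped, which is what "RAII-style" means above.
        TENANT_SHARDS.fetch_sub(1, Ordering::Relaxed);
    }
}

struct Service {
    nodes: HashMap<u64, ()>,
}

impl Service {
    fn node_register(&mut self, node_id: u64) {
        self.nodes.insert(node_id, ());
        // The node gauge is set from the map length after each CRUD
        // operation, so it always mirrors the in-memory node map.
        PAGESERVER_NODES.store(self.nodes.len() as i64, Ordering::Relaxed);
    }
}

fn main() {
    let mut svc = Service { nodes: HashMap::new() };
    svc.node_register(1);
    assert_eq!(PAGESERVER_NODES.load(Ordering::Relaxed), 1);

    let shard = TenantShard::new();
    assert_eq!(TENANT_SHARDS.load(Ordering::Relaxed), 1);
    drop(shard); // gauge returns to 0 via Drop
    assert_eq!(TENANT_SHARDS.load(Ordering::Relaxed), 0);
}
```

Setting the node gauge from the map length keeps it consistent with the in-memory state regardless of which CRUD path ran, while the increment/decrement-on-drop pattern tracks however many `TenantShard` objects are alive without having to update every call site.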
commit 8dca188974 (parent b7fa93f6b7)
Author: John Spray
Date: 2024-10-22 19:43:02 +01:00 (committed by GitHub)

4 changed files with 53 additions and 3 deletions

@@ -37,6 +37,12 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Count of how many times we spawn a reconcile task
     pub(crate) storage_controller_reconcile_spawn: measured::Counter,
 
+    /// Size of the in-memory map of tenant shards
+    pub(crate) storage_controller_tenant_shards: measured::Gauge,
+
+    /// Size of the in-memory map of pageserver_nodes
+    pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
+
     /// Reconciler tasks completed, broken down by success/failure/cancelled
     pub(crate) storage_controller_reconcile_complete:
         measured::CounterVec<ReconcileCompleteLabelGroupSet>,

@@ -934,7 +934,6 @@ impl Service {
         self.startup_complete.clone().wait().await;
         const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
         let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
         while !self.reconcilers_cancel.is_cancelled() {
             tokio::select! {
@@ -1272,6 +1271,10 @@ impl Service {
             .collect::<Vec<_>>();
         let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.get_id(), n)).collect();
         tracing::info!("Loaded {} nodes from database.", nodes.len());
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(nodes.len() as i64);
         tracing::info!("Loading shards from database...");
         let mut tenant_shard_persistence = persistence.list_tenant_shards().await?;
@@ -4110,9 +4113,9 @@ impl Service {
             (
                 old_attached,
                 generation,
-                old_state.policy,
+                old_state.policy.clone(),
                 old_state.shard,
-                old_state.config,
+                old_state.config.clone(),
             )
         };
@@ -5075,6 +5078,10 @@ impl Service {
         let mut nodes = (*locked.nodes).clone();
         nodes.remove(&node_id);
         locked.nodes = Arc::new(nodes);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(locked.nodes.len() as i64);
         locked.scheduler.node_remove(node_id);
@@ -5158,6 +5165,10 @@ impl Service {
                 removed_node.set_availability(NodeAvailability::Offline);
             }
             *nodes = Arc::new(nodes_mut);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_pageserver_nodes
+                .set(nodes.len() as i64);
         }
     }
@@ -5346,6 +5357,11 @@ impl Service {
         locked.nodes = Arc::new(new_nodes);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(locked.nodes.len() as i64);
         tracing::info!(
             "Registered pageserver {}, now have {} pageservers",
             register_req.node_id,

@@ -473,6 +473,11 @@ impl TenantShard {
         shard: ShardIdentity,
         policy: PlacementPolicy,
     ) -> Self {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
         Self {
             tenant_shard_id,
             policy,
@@ -1384,6 +1389,11 @@ impl TenantShard {
         let tenant_shard_id = tsp.get_tenant_shard_id()?;
         let shard_identity = tsp.get_shard_identity()?;
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
         Ok(Self {
             tenant_shard_id,
             shard: shard_identity,
@@ -1512,6 +1522,15 @@ impl TenantShard {
     }
 }
 
+impl Drop for TenantShard {
+    fn drop(&mut self) {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .dec();
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use std::{cell::RefCell, rc::Rc};