From 8dca188974530b3c0c2160b22930615141e0236b Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 22 Oct 2024 19:43:02 +0100
Subject: [PATCH] storage controller: add metrics for tenant shard, node count
 (#9475)

## Problem

Previously, figuring out how many tenant shards were managed by a storage
controller was typically done by peeking at the database or calling into the
API. A metric makes this easier to monitor, and an unexpectedly increasing
shard count can be indicative of problems elsewhere in the system.

## Summary of changes

- Add metric `storage_controller_pageserver_nodes`, updated on node CRUD
  operations from `Service`
- Add metric `storage_controller_tenant_shards`, updated RAII-style from
  `TenantShard` (see the sketch below)
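
The `storage_controller_tenant_shards` gauge is kept accurate by tying it to
object lifetime rather than to individual call sites: it is incremented in the
`TenantShard` constructors and decremented in `Drop`. Below is a minimal,
self-contained sketch of that RAII pattern; the `FakeShard` type and the
`LIVE_SHARDS` atomic are hypothetical stand-ins for `TenantShard` and the
`measured::Gauge` used in the patch, not part of the storage controller code.

```rust
use std::sync::atomic::{AtomicI64, Ordering};

// Stand-in for the gauge registered as `storage_controller_tenant_shards`.
static LIVE_SHARDS: AtomicI64 = AtomicI64::new(0);

struct FakeShard {
    _id: u32,
}

impl FakeShard {
    fn new(id: u32) -> Self {
        // Mirrors the TenantShard constructors: the gauge goes up as soon as
        // an in-memory shard exists.
        LIVE_SHARDS.fetch_add(1, Ordering::Relaxed);
        Self { _id: id }
    }
}

impl Drop for FakeShard {
    fn drop(&mut self) {
        // Mirrors `impl Drop for TenantShard`: the gauge goes down whenever a
        // shard is dropped, regardless of which code path removed it.
        LIVE_SHARDS.fetch_sub(1, Ordering::Relaxed);
    }
}

fn main() {
    let a = FakeShard::new(1);
    let b = FakeShard::new(2);
    assert_eq!(LIVE_SHARDS.load(Ordering::Relaxed), 2);
    drop(a);
    assert_eq!(LIVE_SHARDS.load(Ordering::Relaxed), 1);
    drop(b);
    assert_eq!(LIVE_SHARDS.load(Ordering::Relaxed), 0);
}
```

The upside over calling `inc()`/`dec()` at every mutation site is that any
future code path that creates or drops a `TenantShard` keeps the gauge correct
for free; the node gauge, by contrast, is simply `set()` to the map size after
each CRUD operation.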
---
 storage_controller/src/metrics.rs             |  6 +++++
 storage_controller/src/service.rs             | 22 ++++++++++++++++---
 storage_controller/src/tenant_shard.rs        | 19 ++++++++++++++++
 .../regress/test_storage_controller.py        |  9 ++++++++
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 5989aeba91..a1f7bc2457 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -37,6 +37,12 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Count of how many times we spawn a reconcile task
     pub(crate) storage_controller_reconcile_spawn: measured::Counter,
 
+    /// Size of the in-memory map of tenant shards
+    pub(crate) storage_controller_tenant_shards: measured::Gauge,
+
+    /// Size of the in-memory map of pageserver_nodes
+    pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
+
     /// Reconciler tasks completed, broken down by success/failure/cancelled
     pub(crate) storage_controller_reconcile_complete: measured::CounterVec,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 01aa8f1dab..2cde1d6a3d 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -934,7 +934,6 @@ impl Service {
         self.startup_complete.clone().wait().await;
 
         const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
-
         let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
         while !self.reconcilers_cancel.is_cancelled() {
             tokio::select! {
@@ -1272,6 +1271,10 @@
             .collect::<Vec<_>>();
         let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.get_id(), n)).collect();
         tracing::info!("Loaded {} nodes from database.", nodes.len());
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(nodes.len() as i64);
 
         tracing::info!("Loading shards from database...");
         let mut tenant_shard_persistence = persistence.list_tenant_shards().await?;
@@ -4110,9 +4113,9 @@
             (
                 old_attached,
                 generation,
-                old_state.policy,
+                old_state.policy.clone(),
                 old_state.shard,
-                old_state.config,
+                old_state.config.clone(),
             )
         };
 
@@ -5075,6 +5078,10 @@
         let mut nodes = (*locked.nodes).clone();
         nodes.remove(&node_id);
         locked.nodes = Arc::new(nodes);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(locked.nodes.len() as i64);
 
         locked.scheduler.node_remove(node_id);
 
@@ -5158,6 +5165,10 @@
                 removed_node.set_availability(NodeAvailability::Offline);
             }
             *nodes = Arc::new(nodes_mut);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_pageserver_nodes
+                .set(nodes.len() as i64);
         }
     }
 
@@ -5346,6 +5357,11 @@
 
         locked.nodes = Arc::new(new_nodes);
 
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(locked.nodes.len() as i64);
+
         tracing::info!(
             "Registered pageserver {}, now have {} pageservers",
             register_req.node_id,
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 8a7ff866e6..e696c72ba7 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -473,6 +473,11 @@ impl TenantShard {
         shard: ShardIdentity,
         policy: PlacementPolicy,
     ) -> Self {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
+
         Self {
             tenant_shard_id,
             policy,
@@ -1384,6 +1389,11 @@ impl TenantShard {
         let tenant_shard_id = tsp.get_tenant_shard_id()?;
         let shard_identity = tsp.get_shard_identity()?;
 
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
+
         Ok(Self {
             tenant_shard_id,
             shard: shard_identity,
@@ -1512,6 +1522,15 @@ impl TenantShard {
     }
 }
 
+impl Drop for TenantShard {
+    fn drop(&mut self) {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .dec();
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use std::{cell::RefCell, rc::Rc};
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index a4e293da9e..d4bc4b1a4f 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -107,6 +107,15 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination)
     for tid in tenant_ids:
         env.create_tenant(tid, shard_count=shards_per_tenant)
 
+    # Validate high level metrics
+    assert (
+        env.storage_controller.get_metric_value("storage_controller_tenant_shards")
+        == len(tenant_ids) * shards_per_tenant
+    )
+    assert env.storage_controller.get_metric_value("storage_controller_pageserver_nodes") == len(
+        env.storage_controller.node_list()
+    )
+
     # Repeating a creation should be idempotent (we are just testing it doesn't return an error)
     env.storage_controller.tenant_create(
         tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant