diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1f7bc2457..6d5885eba6 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we make an optimization change to a tenant's scheduling pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// How many shards are not scheduled into their preferred AZ + pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + + /// How many shards would like to reconcile but were blocked by concurrency limits + pub(crate) storage_controller_pending_reconciles: measured::Gauge, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 636ccf11a1..631fdb4923 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6016,14 +6016,33 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); + // This function is an efficient place to update lazy statistics, since we are walking + // all tenants. + let mut pending_reconciles = 0; + let mut az_violations = 0; + let mut reconciles_spawned = 0; for shard in tenants.values_mut() { + // Accumulate scheduling statistics + if let (Some(attached), Some(preferred)) = + (shard.intent.get_attached(), shard.preferred_az()) + { + let node_az = nodes + .get(attached) + .expect("Nodes exist if referenced") + .get_availability_zone_id(); + if node_az != preferred { + az_violations += 1; + } + } + // Skip checking if this shard is already enqueued for reconciliation if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { // If there is something delayed, then return a nonzero count so that // callers like reconcile_all_now do not incorrectly get the impression // that the system is in a quiescent state. reconciles_spawned = std::cmp::max(1, reconciles_spawned); + pending_reconciles += 1; continue; } @@ -6031,9 +6050,22 @@ impl Service { // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; + } else if shard.delayed_reconcile { + // Shard wanted to reconcile but for some reason couldn't. + pending_reconciles += 1; } } + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_az_violation + .set(az_violations as i64); + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pending_reconciles + .set(pending_reconciles as i64); + reconciles_spawned }