storcon: add metric for AZ scheduling violations (#9949)

## Problem

We can't easily tell how far the current state of shards is from their AZ
preferences. Shards running outside their preferred AZ can cause
performance issues, so for diagnosability it's important that we can
easily see whether a significant number of shards are not running in
their preferred AZ.

Related: https://github.com/neondatabase/cloud/issues/15413

## Summary of changes

- In `reconcile_all`, count shards that are scheduled into the wrong AZ
(if they have a preference), and publish that count as a Prometheus gauge.
- Also calculate a statistic for how many shards wanted to reconcile but
couldn't because of concurrency limits.

This is, by design, a lazy calculation: `reconcile_all` only runs
periodically. But that's okay: shards being in the wrong AZ only matters
if they stay that way for some period of time.
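
As a rough sketch of the counting rule, using hypothetical stand-in types (`Node`, `Shard`, and `count_az_violations` are illustrative only, not the controller's real types):

```rust
use std::collections::HashMap;

// Hypothetical, simplified stand-ins for the storage controller's real types.
struct Node {
    az: String,
}

struct Shard {
    /// Node the shard is currently attached to, if any.
    attached_to: Option<u64>,
    /// AZ this shard prefers, if it has a preference.
    preferred_az: Option<String>,
}

/// Count shards whose attached node sits in a different AZ than their preference.
/// Shards with no attachment or no AZ preference are not counted.
fn count_az_violations(nodes: &HashMap<u64, Node>, shards: &[Shard]) -> usize {
    shards
        .iter()
        .filter(|shard| match (&shard.attached_to, &shard.preferred_az) {
            (Some(node_id), Some(preferred)) => nodes
                .get(node_id)
                .map(|node| &node.az != preferred)
                .unwrap_or(false),
            _ => false,
        })
        .count()
}

fn main() {
    let nodes = HashMap::from([
        (1u64, Node { az: "eu-west-1a".to_string() }),
        (2u64, Node { az: "eu-west-1b".to_string() }),
    ]);
    let shards = vec![
        // In its preferred AZ: not a violation.
        Shard { attached_to: Some(1), preferred_az: Some("eu-west-1a".to_string()) },
        // Attached in a different AZ than preferred: counted as a violation.
        Shard { attached_to: Some(2), preferred_az: Some("eu-west-1a".to_string()) },
        // No preference: ignored.
        Shard { attached_to: Some(2), preferred_az: None },
    ];
    assert_eq!(count_az_violations(&nodes, &shards), 1);
}
```

A shard only counts as a violation when it both has an attachment and states a preference; unattached shards and shards without a preferred AZ are ignored, which matches the intent of the diff below.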
Author: John Spray
Date: 2024-12-02 11:50:22 +00:00
Committed by: GitHub
Parent: 5330122049
Commit: bd09369198

2 changed files with 38 additions and 0 deletions


@@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup {
    /// Count of how many times we make an optimization change to a tenant's scheduling
    pub(crate) storage_controller_schedule_optimization: measured::Counter,

    /// How many shards are not scheduled into their preferred AZ
    pub(crate) storage_controller_schedule_az_violation: measured::Gauge,

    /// How many shards would like to reconcile but were blocked by concurrency limits
    pub(crate) storage_controller_pending_reconciles: measured::Gauge,

    /// HTTP request status counters for handled requests
    pub(crate) storage_controller_http_request_status:
        measured::CounterVec<HttpRequestStatusLabelGroupSet>,


@@ -6016,14 +6016,33 @@ impl Service {
        let (nodes, tenants, _scheduler) = locked.parts_mut();
        let pageservers = nodes.clone();

        // This function is an efficient place to update lazy statistics, since we are walking
        // all tenants.
        let mut pending_reconciles = 0;
        let mut az_violations = 0;

        let mut reconciles_spawned = 0;
        for shard in tenants.values_mut() {
            // Accumulate scheduling statistics
            if let (Some(attached), Some(preferred)) =
                (shard.intent.get_attached(), shard.preferred_az())
            {
                let node_az = nodes
                    .get(attached)
                    .expect("Nodes exist if referenced")
                    .get_availability_zone_id();
                if node_az != preferred {
                    az_violations += 1;
                }
            }

            // Skip checking if this shard is already enqueued for reconciliation
            if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
                // If there is something delayed, then return a nonzero count so that
                // callers like reconcile_all_now do not incorrectly get the impression
                // that the system is in a quiescent state.
                reconciles_spawned = std::cmp::max(1, reconciles_spawned);
                pending_reconciles += 1;
                continue;
            }
@@ -6031,9 +6050,22 @@
            // dirty, spawn another one
            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
                reconciles_spawned += 1;
            } else if shard.delayed_reconcile {
                // Shard wanted to reconcile but for some reason couldn't.
                pending_reconciles += 1;
            }
        }

        metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_schedule_az_violation
            .set(az_violations as i64);

        metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_pending_reconciles
            .set(pending_reconciles as i64);

        reconciles_spawned
    }
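
For intuition on the gauge semantics: each value is overwritten on every `reconcile_all` pass rather than accumulated, so the metric decays back down as soon as a later pass finds fewer violations. A minimal sketch of that pattern, using plain atomics as hypothetical stand-ins for the real `measured::Gauge` fields:

```rust
use std::sync::atomic::{AtomicI64, Ordering};

// Hypothetical stand-ins for the gauges in StorageControllerMetricGroup.
static AZ_VIOLATIONS: AtomicI64 = AtomicI64::new(0);
static PENDING_RECONCILES: AtomicI64 = AtomicI64::new(0);

/// Called once per reconcile_all pass: the gauges are set to the freshly
/// counted values, so they always reflect the most recent walk over all
/// tenants rather than a running total.
fn publish_lazy_stats(az_violations: usize, pending_reconciles: usize) {
    AZ_VIOLATIONS.store(az_violations as i64, Ordering::Relaxed);
    PENDING_RECONCILES.store(pending_reconciles as i64, Ordering::Relaxed);
}
```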