storcon: Add safekeeper request label group (#12239)

## Problem The metrics `storage_controller_safekeeper_request_error` and `storage_controller_safekeeper_request_latency` currently use `pageserver_id` as a label. This can be misleading, as the metrics are about safekeeper requests. We want to replace this with a more accurate label — either `safekeeper_id` or `node_id`. ## Summary of changes - Introduced `SafekeeperRequestLabelGroup` with `safekeeper_id`. - Updated the affected metrics to use the new label group. - Fixed incorrect metric usage in safekeeper_client.rs ## Follow-up - Review usage of these metrics in alerting rules and existing Grafana dashboards to ensure this change does not break something.
2026-01-04 03:52:56 +00:00 · 2025-06-17 17:33:01 +04:00
parent 48052477b4
commit 4a8f3508f9
2 changed files with 16 additions and 6 deletions
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -97,7 +97,7 @@ pub(crate) struct StorageControllerMetricGroup {
    /// Count of HTTP requests to the safekeeper that resulted in an error,
    /// broken down by the safekeeper node id, request name and method
    pub(crate) storage_controller_safekeeper_request_error:
-        measured::CounterVec<PageserverRequestLabelGroupSet>,
+        measured::CounterVec<SafekeeperRequestLabelGroupSet>,

    /// Latency of HTTP requests to the pageserver, broken down by pageserver
    /// node id, request name and method. This include both successful and unsuccessful
@@ -111,7 +111,7 @@ pub(crate) struct StorageControllerMetricGroup {
    /// requests.
    #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
    pub(crate) storage_controller_safekeeper_request_latency:
-        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
+        measured::HistogramVec<SafekeeperRequestLabelGroupSet, 5>,

    /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
    /// broken down by the pageserver node id, request name and method
@@ -219,6 +219,16 @@ pub(crate) struct PageserverRequestLabelGroup<'a> {
    pub(crate) method: Method,
 }

+#[derive(measured::LabelGroup, Clone)]
+#[label(set = SafekeeperRequestLabelGroupSet)]
+pub(crate) struct SafekeeperRequestLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) safekeeper_id: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) path: &'a str,
+    pub(crate) method: Method,
+}
+
 #[derive(measured::LabelGroup)]
 #[label(set = DatabaseQueryErrorLabelGroupSet)]
 pub(crate) struct DatabaseQueryErrorLabelGroup {
--- a/storage_controller/src/safekeeper_client.rs
+++ b/storage_controller/src/safekeeper_client.rs
@@ -5,7 +5,7 @@ use safekeeper_client::mgmt_api::{Client, Result};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;

-use crate::metrics::PageserverRequestLabelGroup;
+use crate::metrics::SafekeeperRequestLabelGroup;

 /// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage
 /// controller to collect metrics in a non-intrusive manner.
@@ -19,8 +19,8 @@ pub(crate) struct SafekeeperClient {

 macro_rules! measured_request {
    ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
-        let labels = PageserverRequestLabelGroup {
-            pageserver_id: $node_id,
+        let labels = SafekeeperRequestLabelGroup {
+            safekeeper_id: $node_id,
            path: $name,
            method: $method,
        };
@@ -35,7 +35,7 @@ macro_rules! measured_request {
        if res.is_err() {
            let error_counters = &crate::metrics::METRICS_REGISTRY
                .metrics_group
-                .storage_controller_pageserver_request_error;
+                .storage_controller_safekeeper_request_error;
            error_counters.inc(labels)
        }