mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 03:52:56 +00:00
storcon: Add safekeeper request label group (#12239)
## Problem The metrics `storage_controller_safekeeper_request_error` and `storage_controller_safekeeper_request_latency` currently use `pageserver_id` as a label. This can be misleading, as the metrics are about safekeeper requests. We want to replace this with a more accurate label — either `safekeeper_id` or `node_id`. ## Summary of changes - Introduced `SafekeeperRequestLabelGroup` with `safekeeper_id`. - Updated the affected metrics to use the new label group. - Fixed incorrect metric usage in safekeeper_client.rs ## Follow-up - Review usage of these metrics in alerting rules and existing Grafana dashboards to ensure this change does not break something.
This commit is contained in:
committed by
GitHub
parent
48052477b4
commit
4a8f3508f9
@@ -97,7 +97,7 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// Count of HTTP requests to the safekeeper that resulted in an error,
|
||||
/// broken down by the safekeeper node id, request name and method
|
||||
pub(crate) storage_controller_safekeeper_request_error:
|
||||
measured::CounterVec<PageserverRequestLabelGroupSet>,
|
||||
measured::CounterVec<SafekeeperRequestLabelGroupSet>,
|
||||
|
||||
/// Latency of HTTP requests to the pageserver, broken down by pageserver
|
||||
/// node id, request name and method. This include both successful and unsuccessful
|
||||
@@ -111,7 +111,7 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// requests.
|
||||
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
|
||||
pub(crate) storage_controller_safekeeper_request_latency:
|
||||
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||
measured::HistogramVec<SafekeeperRequestLabelGroupSet, 5>,
|
||||
|
||||
/// Count of pass-through HTTP requests to the pageserver that resulted in an error,
|
||||
/// broken down by the pageserver node id, request name and method
|
||||
@@ -219,6 +219,16 @@ pub(crate) struct PageserverRequestLabelGroup<'a> {
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup, Clone)]
|
||||
#[label(set = SafekeeperRequestLabelGroupSet)]
|
||||
pub(crate) struct SafekeeperRequestLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) safekeeper_id: &'a str,
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = DatabaseQueryErrorLabelGroupSet)]
|
||||
pub(crate) struct DatabaseQueryErrorLabelGroup {
|
||||
|
||||
@@ -5,7 +5,7 @@ use safekeeper_client::mgmt_api::{Client, Result};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::logging::SecretString;
|
||||
|
||||
use crate::metrics::PageserverRequestLabelGroup;
|
||||
use crate::metrics::SafekeeperRequestLabelGroup;
|
||||
|
||||
/// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage
|
||||
/// controller to collect metrics in a non-intrusive manner.
|
||||
@@ -19,8 +19,8 @@ pub(crate) struct SafekeeperClient {
|
||||
|
||||
macro_rules! measured_request {
|
||||
($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
|
||||
let labels = PageserverRequestLabelGroup {
|
||||
pageserver_id: $node_id,
|
||||
let labels = SafekeeperRequestLabelGroup {
|
||||
safekeeper_id: $node_id,
|
||||
path: $name,
|
||||
method: $method,
|
||||
};
|
||||
@@ -35,7 +35,7 @@ macro_rules! measured_request {
|
||||
if res.is_err() {
|
||||
let error_counters = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_request_error;
|
||||
.storage_controller_safekeeper_request_error;
|
||||
error_counters.inc(labels)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user