From 966213f429f2cddcb0907ae74c4a970beeef46e8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Mon, 17 Jul 2023 13:46:13 +0200
Subject: [PATCH] basebackup query metric: use same buckets as control plane (#4732)

The `CRITICAL_OP_BUCKETS` is not useful for getting an accurate
picture of basebackup latency because all the observations
that negatively affect our SLI fall into one bucket, i.e., 100ms-1s.

Use the same buckets as control plane instead.
---
 pageserver/src/metrics.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index dc0c1c03b7..ee8dfba69a 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -541,6 +541,17 @@ pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+// keep in sync with control plane Go code so that we can validate
+// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
+static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
+    // Go code uses milliseconds. Variable is called `computeStartupBuckets`
+    [
+        5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
+        1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
+    ]
+    .map(|ms| (ms as f64) / 1000.0)
+});
+
 pub struct BasebackupQueryTime(HistogramVec);
 pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
     BasebackupQueryTime({
@@ -548,7 +559,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
             "pageserver_basebackup_query_seconds",
             "Histogram of basebackup queries durations, by result type",
             &["result"],
-            CRITICAL_OP_BUCKETS.into(),
+            COMPUTE_STARTUP_BUCKETS.to_vec(),
         )
         .expect("failed to define a metric")
     })