From 606f14034ee591c3c7832c39a5628480935f05a6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 28 Apr 2025 13:52:44 +0200 Subject: [PATCH] pageserver: improve `pageserver_smgr_query_seconds` buckets (#11680) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The `pageserver_smgr_query_seconds` buckets are too coarse, using powers of 10: 1 µs, 10 µs, 100 µs, 1 ms, 10 ms, 100 ms, 1 s, 10 s, 100 s. This is one of our most crucial latency metrics, and needs better resolution. Touches #11594. ## Summary of changes This patch uses buckets with better resolution around 1 ms (the typical latency): * 0.6 ms * 1 ms * 3 ms * 6 ms * 10 ms * 30 ms * 100 ms * 1 s * 3 s These will be the same as the compute's `compute_getpage_wait_seconds`, to make them comparable across the compute and Pageserver: https://github.com/neondatabase/flux-fleet/pull/579. We sacrifice buckets above 3 s, since these can already be considered "too slow". This does not change the previously used `CRITICAL_OP_BUCKETS`, which is also used for other operations on different timescales (e.g. LSN waits). We should consider replacing this with more appropriate buckets for specific operations, since it covers a large span with low resolution. --- pageserver/src/metrics.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b16970c911..9a6c3f2378 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1774,8 +1774,12 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(| .expect("failed to define a metric") }); -// Alias so all histograms recording per-timeline smgr timings use the same buckets. -static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS; +/// Per-timeline smgr histogram buckets should be the same as the compute buckets, such that the +/// metrics are comparable across compute and Pageserver. See also: +/// +/// +static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = + &[0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.1, 1.0, 3.0]; static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!(