Reduce the # of histogram buckets in metrics. (#3850)

Shrinks the total number of metrics collected for each timeline by about 50%. See https://github.com/neondatabase/neon/issues/2848. This doesn't fully solve the problem, we still collect a lot of metrics even with this, but this gives us a lot of headroom.
2026-01-07 21:42:56 +00:00 · 2023-03-20 15:49:16 +02:00
parent 0c1228c37a
commit 1ddb9249aa
1 changed files with 25 additions and 35 deletions
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,22 +9,18 @@ use once_cell::sync::Lazy;
 use pageserver_api::models::state;
 use utils::id::{TenantId, TimelineId};

-/// Prometheus histogram buckets (in seconds) that capture the majority of
-/// latencies in the microsecond range but also extend far enough up to distinguish
-/// "bad" from "really bad".
-fn get_buckets_for_critical_operations() -> Vec<f64> {
-    let buckets_per_digit = 5;
-    let min_exponent = -6;
-    let max_exponent = 2;
-
-    let mut buckets = vec![];
-    // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
-    // because it's more numerically stable and doesn't result in numbers like 9.999999
-    for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
-        buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
-    }
-    buckets
-}
+/// Prometheus histogram buckets (in seconds) for operations in the critical
+/// path. In other words, operations that directly affect that latency of user
+/// queries.
+///
+/// The buckets capture the majority of latencies in the microsecond and
+/// millisecond range but also extend far enough up to distinguish "bad" from
+/// "really bad".
+const CRITICAL_OP_BUCKETS: &[f64] = &[
+    0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
+    0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
+    1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
+];

 // Metrics collected on operations on the storage repository.
 const STORAGE_TIME_OPERATIONS: &[&str] = &[
@@ -55,12 +51,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+// Buckets for background operations like compaction, GC, size calculation
+const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
+
 pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
        &["operation"],
-        get_buckets_for_critical_operations(),
+        STORAGE_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -71,7 +70,7 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value",
        &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -90,7 +89,7 @@ static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
        &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -196,14 +195,13 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
 });

 // Metrics collected on disk IO operations
+//
+// Roughly logarithmic scale.
 const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
-    0.000001, // 1 usec
-    0.00001,  // 10 usec
-    0.0001,   // 100 usec
-    0.001,    // 1 msec
-    0.01,     // 10 msec
-    0.1,      // 100 msec
-    1.0,      // 1 sec
+    0.000030, // 30 usec
+    0.001000, // 1000 usec
+    0.030,    // 30 ms
+    1.000,    // 1000 ms
 ];

 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -238,20 +236,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
    "get_db_size",
 ];

-const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
-    0.00001, // 1/100000 s
-    0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
-    0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
-    0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
-    0.1,  // 1/10 s
-];
-
 pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
        "Time spent on smgr query handling",
        &["smgr_query_type", "tenant_id", "timeline_id"],
-        SMGR_QUERY_TIME_BUCKETS.into()
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });