diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 4826a0b7ae..6a8aecfd25 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,22 +9,18 @@ use once_cell::sync::Lazy;
 use pageserver_api::models::state;
 use utils::id::{TenantId, TimelineId};
 
-/// Prometheus histogram buckets (in seconds) that capture the majority of
-/// latencies in the microsecond range but also extend far enough up to distinguish
-/// "bad" from "really bad".
-fn get_buckets_for_critical_operations() -> Vec<f64> {
-    let buckets_per_digit = 5;
-    let min_exponent = -6;
-    let max_exponent = 2;
-
-    let mut buckets = vec![];
-    // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
-    // because it's more numerically stable and doesn't result in numbers like 9.999999
-    for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
-        buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
-    }
-    buckets
-}
+/// Prometheus histogram buckets (in seconds) for operations in the critical
+/// path. In other words, operations that directly affect the latency of user
+/// queries.
+///
+/// The buckets capture the majority of latencies in the microsecond and
+/// millisecond range but also extend far enough up to distinguish "bad" from
+/// "really bad".
+const CRITICAL_OP_BUCKETS: &[f64] = &[
+    0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
+    0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
+    1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
+];
 
 // Metrics collected on operations on the storage repository.
 const STORAGE_TIME_OPERATIONS: &[&str] = &[
@@ -55,12 +51,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+// Buckets for background operations like compaction, GC, size calculation
+const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
+
 pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_storage_operations_seconds_global",
         "Time spent on storage operations",
         &["operation"],
-        get_buckets_for_critical_operations(),
+        STORAGE_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -71,7 +70,7 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
         "pageserver_getpage_reconstruct_seconds",
         "Time spent in reconstruct_value",
         &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -90,7 +89,7 @@ static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
         "pageserver_wait_lsn_seconds",
         "Time spent waiting for WAL to arrive",
         &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -196,14 +195,13 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
 });
 
 // Metrics collected on disk IO operations
+//
+// Roughly logarithmic scale.
 const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
-    0.000001, // 1 usec
-    0.00001,  // 10 usec
-    0.0001,   // 100 usec
-    0.001,    // 1 msec
-    0.01,     // 10 msec
-    0.1,      // 100 msec
-    1.0,      // 1 sec
+    0.000030, // 30 usec
+    0.001000, // 1000 usec
+    0.030,    // 30 ms
+    1.000,    // 1000 ms
 ];
 
 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -238,20 +236,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
     "get_db_size",
 ];
 
-const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
-    0.00001, // 1/100000 s
-    0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
-    0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
-    0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
-    0.1, // 1/10 s
-];
-
 pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds",
         "Time spent on smgr query handling",
         &["smgr_query_type", "tenant_id", "timeline_id"],
-        SMGR_QUERY_TIME_BUCKETS.into()
+        CRITICAL_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
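
Not part of the patch: below is a minimal standalone sketch illustrating what this change does to the critical-path buckets. It re-creates the removed get_buckets_for_critical_operations() and checks it against the new CRITICAL_OP_BUCKETS constant; the main() driver and assertions are illustrative additions, not code from the diff.

// Sketch: compare the removed bucket generator with the new constant.
// The old function emitted 5 buckets per decade from 1 us to 100 s
// (41 boundaries); the new constant keeps 1 per decade over the same span.

fn get_buckets_for_critical_operations() -> Vec<f64> {
    let buckets_per_digit = 5;
    let min_exponent = -6;
    let max_exponent = 2;
    (min_exponent * buckets_per_digit..=max_exponent * buckets_per_digit)
        .map(|exp| 10_f64.powf(exp as f64 / buckets_per_digit as f64))
        .collect()
}

const CRITICAL_OP_BUCKETS: &[f64] = &[
    0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
    0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
    1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
];

fn main() {
    let old = get_buckets_for_critical_operations();
    assert_eq!(old.len(), 41);
    assert_eq!(CRITICAL_OP_BUCKETS.len(), 9);
    // Same range (1e-6 s .. 1e2 s), coarser resolution: every new bucket
    // boundary is (up to float rounding) one of the old boundaries.
    for b in CRITICAL_OP_BUCKETS {
        assert!(old.iter().any(|o| (o - b).abs() / b < 1e-9));
    }
}

Since each histogram bucket is its own Prometheus time series per label combination, the coarser buckets (9 boundaries instead of 41) cut the metric cardinality considerably, at the cost of less precise quantile estimates.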