mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 21:42:56 +00:00
Reduce the # of histogram buckets in metrics. (#3850)
Shrinks the total number of metrics collected for each timeline by about 50%. See https://github.com/neondatabase/neon/issues/2848. This doesn't fully solve the problem, we still collect a lot of metrics even with this, but this gives us a lot of headroom.
This commit is contained in:
committed by
GitHub
parent
0c1228c37a
commit
1ddb9249aa
@@ -9,22 +9,18 @@ use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::state;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) that capture the majority of
|
||||
/// latencies in the microsecond range but also extend far enough up to distinguish
|
||||
/// "bad" from "really bad".
|
||||
fn get_buckets_for_critical_operations() -> Vec<f64> {
|
||||
let buckets_per_digit = 5;
|
||||
let min_exponent = -6;
|
||||
let max_exponent = 2;
|
||||
|
||||
let mut buckets = vec![];
|
||||
// Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
|
||||
// because it's more numerically stable and doesn't result in numbers like 9.999999
|
||||
for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
|
||||
buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
|
||||
}
|
||||
buckets
|
||||
}
|
||||
/// Prometheus histogram buckets (in seconds) for operations in the critical
|
||||
/// path. In other words, operations that directly affect that latency of user
|
||||
/// queries.
|
||||
///
|
||||
/// The buckets capture the majority of latencies in the microsecond and
|
||||
/// millisecond range but also extend far enough up to distinguish "bad" from
|
||||
/// "really bad".
|
||||
const CRITICAL_OP_BUCKETS: &[f64] = &[
|
||||
0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
|
||||
0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
|
||||
1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
|
||||
];
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
const STORAGE_TIME_OPERATIONS: &[&str] = &[
|
||||
@@ -55,12 +51,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds_global",
|
||||
"Time spent on storage operations",
|
||||
&["operation"],
|
||||
get_buckets_for_critical_operations(),
|
||||
STORAGE_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -71,7 +70,7 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value",
|
||||
&["tenant_id", "timeline_id"],
|
||||
get_buckets_for_critical_operations(),
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -90,7 +89,7 @@ static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"],
|
||||
get_buckets_for_critical_operations(),
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -196,14 +195,13 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
// Metrics collected on disk IO operations
|
||||
//
|
||||
// Roughly logarithmic scale.
|
||||
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
0.000001, // 1 usec
|
||||
0.00001, // 10 usec
|
||||
0.0001, // 100 usec
|
||||
0.001, // 1 msec
|
||||
0.01, // 10 msec
|
||||
0.1, // 100 msec
|
||||
1.0, // 1 sec
|
||||
0.000030, // 30 usec
|
||||
0.001000, // 1000 usec
|
||||
0.030, // 30 ms
|
||||
1.000, // 1000 ms
|
||||
];
|
||||
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
@@ -238,20 +236,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
|
||||
"get_db_size",
|
||||
];
|
||||
|
||||
const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
|
||||
0.00001, // 1/100000 s
|
||||
0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
|
||||
0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
|
||||
0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
|
||||
0.1, // 1/10 s
|
||||
];
|
||||
|
||||
pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent on smgr query handling",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
SMGR_QUERY_TIME_BUCKETS.into()
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user