Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-04 12:02:55 +00:00
# pageserver: add per-timeline read amp histogram (#10566)

## Problem

We don't have per-timeline observability for read amplification. Touches https://github.com/neondatabase/cloud/issues/23283.

## Summary of changes

Add a per-timeline `pageserver_layers_per_read` histogram. NB: per-timeline histograms are expensive, but probably worth it in this case.
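That cost note can be made concrete: in the Prometheus text format, each histogram child expands into one `_bucket` series per bound plus an implicit `le="+Inf"` bucket, a `_sum`, and a `_count`, so the six low-resolution bounds used here come to nine series per timeline (and the three suffixed names added to the per-tenant allow-list at the end of this diff). A minimal sketch of the pattern, assuming the stock `prometheus` and `once_cell` crates and illustrative label values:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, Encoder, HistogramVec, TextEncoder};

// Hypothetical stand-in for the metric added by this PR.
static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_layers_per_read",
        "Layers visited to serve a single read (read amplification).",
        &["tenant_id", "shard_id", "timeline_id"],
        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
});

fn main() {
    // One observation for an illustrative timeline.
    LAYERS_PER_READ
        .with_label_values(&["tenant-a", "0001", "timeline-b"])
        .observe(7.0);

    // Text exposition: 6 explicit `_bucket` bounds + `le="+Inf"` + `_sum`
    // + `_count` = 9 series for this single timeline.
    let mut buf = Vec::new();
    TextEncoder::new()
        .encode(&prometheus::gather(), &mut buf)
        .unwrap();
    print!("{}", String::from_utf8(buf).unwrap());
}
```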
```diff
@@ -122,6 +122,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
 /// are amortized across the batch, and some layers may not intersect with a given key, each visited
 /// layer contributes directly to the observed latency for every read in the batch, which is what we
 /// care about.
 pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_layers_per_read",
         "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
         &["tenant_id", "shard_id", "timeline_id"],
         // Low resolution to reduce cardinality.
         vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
     )
     .expect("failed to define a metric")
 });

 pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_layers_per_read_global",
```
```diff
@@ -2648,6 +2659,7 @@ pub(crate) struct TimelineMetrics {
     pub disk_consistent_lsn_gauge: IntGauge,
     pub pitr_history_size: UIntGauge,
     pub archival_size: UIntGauge,
     pub layers_per_read: Histogram,
     pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
     pub visible_physical_size_gauge: UIntGauge,
```
```diff
@@ -2745,6 +2757,10 @@ impl TimelineMetrics {
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();

         let layers_per_read = LAYERS_PER_READ
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();

         let standby_horizon_gauge = STANDBY_HORIZON
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
```
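Note that `TimelineMetrics` resolves the labeled child once, at construction, rather than looking it up on every read. A sketch of the difference, again assuming the stock `prometheus` crate with illustrative names; both calls land on the same child:

```rust
use prometheus::{histogram_opts, HistogramVec};

fn main() {
    let vec = HistogramVec::new(
        histogram_opts!(
            "layers_per_read_demo",
            "demo: layers visited per read",
            vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0]
        ),
        &["tenant_id", "shard_id", "timeline_id"],
    )
    .unwrap();

    // Per-call resolution: hashes the label values on every observation.
    vec.with_label_values(&["tenant-a", "0001", "timeline-b"])
        .observe(3.0);

    // Cached child, as in TimelineMetrics: resolve once, then observe with
    // no label lookup on the hot read path.
    let child = vec
        .get_metric_with_label_values(&["tenant-a", "0001", "timeline-b"])
        .unwrap();
    child.observe(5.0);

    assert_eq!(child.get_sample_count(), 2); // both observations hit the same child
}
```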
```diff
@@ -2809,6 +2825,7 @@ impl TimelineMetrics {
             disk_consistent_lsn_gauge,
             pitr_history_size,
             archival_size,
             layers_per_read,
             standby_horizon_gauge,
             resident_physical_size_gauge,
             visible_physical_size_gauge,
```
```diff
@@ -2978,6 +2995,8 @@ impl TimelineMetrics {
             }
         }

         let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);

         let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
```
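Since the per-timeline children live in process-global vecs, tearing a timeline down must also remove its series, or they would linger in every later scrape; the `let _ =` tolerates children that were never created or were already removed. A sketch of those semantics (stock `prometheus` crate, illustrative labels):

```rust
use prometheus::{histogram_opts, HistogramVec};

fn main() {
    let vec = HistogramVec::new(
        histogram_opts!(
            "layers_per_read_demo",
            "demo: layers visited per read",
            vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0]
        ),
        &["tenant_id", "shard_id", "timeline_id"],
    )
    .unwrap();

    let labels = ["tenant-a", "0001", "timeline-b"];
    vec.with_label_values(&labels).observe(3.0);

    // Timeline shutdown: drop this timeline's child so its series stop
    // being exported. Removing twice is an Err, hence the diff's `let _ =`.
    assert!(vec.remove_label_values(&labels).is_ok());
    assert!(vec.remove_label_values(&labels).is_err());

    // Re-resolving the same labels creates a fresh, empty child.
    assert_eq!(vec.with_label_values(&labels).get_sample_count(), 0);
}
```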
```diff
@@ -1242,6 +1242,7 @@ impl Timeline {
         }

         for _ in &results {
             self.metrics.layers_per_read.observe(layers_visited as f64);
             LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
         }
     }
```
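This loop implements the doc comment on `LAYERS_PER_READ`: every read in a batch observes the batch-wide `layers_visited`, since each visited layer adds latency to every read in the batch even when it does not intersect that read's key. A small demo of the resulting histogram contents (hypothetical batch, stock `prometheus` crate):

```rust
use prometheus::{histogram_opts, Histogram};

fn main() {
    let hist = Histogram::with_opts(histogram_opts!(
        "layers_per_read_demo",
        "demo: layers visited per read",
        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0]
    ))
    .unwrap();

    // A batch of three reads that together visited 12 layers.
    let results = ["a", "b", "c"];
    let layers_visited: u32 = 12;
    for _ in &results {
        hist.observe(layers_visited as f64);
    }

    assert_eq!(hist.get_sample_count(), 3); // one sample per read in the batch
    assert_eq!(hist.get_sample_sum(), 36.0); // 3 reads x 12 layers each
}
```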
```diff
@@ -158,6 +158,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     "pageserver_pitr_history_size",
     "pageserver_layer_bytes",
     "pageserver_layer_count",
     "pageserver_layers_per_read_bucket",
     "pageserver_layers_per_read_count",
     "pageserver_layers_per_read_sum",
     "pageserver_visible_physical_size",
     "pageserver_storage_operations_seconds_count_total",
     "pageserver_storage_operations_seconds_sum_total",
```