pageserver: add per-timeline read amp histogram (#10566)

## Problem

We don't have per-timeline observability for read amplification.

Touches https://github.com/neondatabase/cloud/issues/23283.

## Summary of changes

Add a per-timeline `pageserver_layers_per_read` histogram.

NB: per-timeline histograms are expensive, but probably worth it in this
case.
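
For context on the cardinality cost: a Prometheus histogram with 6 explicit buckets exports 7 bucket series (including `+Inf`) plus `_sum` and `_count`, i.e. 9 series per tenant/shard/timeline combination, which is why the bucket list is kept coarse. A minimal sketch of the pattern, assuming the `prometheus` and `once_cell` crates (the metric name and example labels here are illustrative, not the pageserver's actual definitions):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

// Illustrative per-timeline read-amplification histogram. With 6 explicit
// buckets it exports 7 bucket series (incl. +Inf) plus _sum and _count,
// i.e. 9 series per (tenant_id, shard_id, timeline_id) combination.
static EXAMPLE_LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_layers_per_read",
        "Layers visited to serve a single read (read amplification).",
        &["tenant_id", "shard_id", "timeline_id"],
        // Coarse buckets keep per-timeline cardinality low.
        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
});

fn main() {
    // Record a read that visited 12 layers on one timeline.
    EXAMPLE_LAYERS_PER_READ
        .with_label_values(&["tenant-a", "0001", "timeline-1"])
        .observe(12.0);
}
```
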
Author: Erik Grinaker
Date: 2025-01-30 12:24:49 +01:00
Committed by: GitHub
Parent: 8804d58943
Commit: 6a2afa0c02
3 changed files with 23 additions and 0 deletions

@@ -122,6 +122,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
/// are amortized across the batch, and some layers may not intersect with a given key, each visited
/// layer contributes directly to the observed latency for every read in the batch, which is what we
/// care about.
pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_layers_per_read",
        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
        &["tenant_id", "shard_id", "timeline_id"],
        // Low resolution to reduce cardinality.
        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
});

pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_layers_per_read_global",
@@ -2648,6 +2659,7 @@ pub(crate) struct TimelineMetrics {
pub disk_consistent_lsn_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub layers_per_read: Histogram,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
pub visible_physical_size_gauge: UIntGauge,
@@ -2745,6 +2757,10 @@ impl TimelineMetrics {
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();
let layers_per_read = LAYERS_PER_READ
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();
@@ -2809,6 +2825,7 @@ impl TimelineMetrics {
disk_consistent_lsn_gauge,
pitr_history_size,
archival_size,
layers_per_read,
standby_horizon_gauge,
resident_physical_size_gauge,
visible_physical_size_gauge,
@@ -2978,6 +2995,8 @@ impl TimelineMetrics {
}
}
let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
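
The removal above matters because these are per-timeline series: without explicit cleanup when a timeline is shut down or deleted, the registry would keep exporting stale series for timelines that no longer exist. A minimal sketch of the cleanup pattern, assuming the `prometheus` crate (the helper name is hypothetical, not the pageserver's actual shutdown path):

```rust
use prometheus::HistogramVec;

// Hypothetical cleanup helper: drop the per-timeline series so deleted
// timelines stop being exported. remove_label_values errors if the series
// was never created, which is safe to ignore here.
fn remove_timeline_series(
    layers_per_read: &HistogramVec,
    tenant_id: &str,
    shard_id: &str,
    timeline_id: &str,
) {
    let _ = layers_per_read.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
```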


@@ -1242,6 +1242,7 @@ impl Timeline {
}
for _ in &results {
    self.metrics.layers_per_read.observe(layers_visited as f64);
    LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
}
}
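
As the doc comment on the metric explains, reads are served in batches: the number of layers visited is accumulated once for the whole batch and then observed once per read, so the histogram's `_count` matches the number of reads while every sample reflects the full read amplification the batch incurred. A simplified sketch of that pattern (the function and setup here are illustrative, not the actual `Timeline` read path):

```rust
use prometheus::{register_histogram, Histogram};

// Illustrative batch observation: every read in the batch records the total
// number of layers the batch had to visit.
fn observe_batch_read_amp(histogram: &Histogram, layers_visited: usize, reads_in_batch: usize) {
    for _ in 0..reads_in_batch {
        histogram.observe(layers_visited as f64);
    }
}

fn main() {
    let hist = register_histogram!(
        "example_layers_per_read_global",
        "Layers visited per read, across all timelines.",
        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0]
    )
    .expect("failed to define a metric");

    // A batch of 8 reads that together visited 12 layers: each read observes 12.
    observe_batch_read_amp(&hist, 12, 8);
}
```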


@@ -158,6 +158,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_layers_per_read_bucket",
"pageserver_layers_per_read_count",
"pageserver_layers_per_read_sum",
"pageserver_visible_physical_size",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",