diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bf75ea3bc6..f9edf88553 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -122,6 +122,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { /// are amortized across the batch, and some layers may not intersect with a given key, each visited /// layer contributes directly to the observed latency for every read in the batch, which is what we /// care about. +pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_layers_per_read", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + &["tenant_id", "shard_id", "timeline_id"], + // Low resolution to reduce cardinality. + vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_per_read_global", @@ -2648,6 +2659,7 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub layers_per_read: Histogram, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2745,6 +2757,10 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layers_per_read = LAYERS_PER_READ + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2809,6 +2825,7 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, + layers_per_read, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2978,6 +2995,8 @@ impl TimelineMetrics { } } + let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a69376239..ab4b3cac63 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1242,6 +1242,7 @@ impl Timeline { } for _ in &results { + self.metrics.layers_per_read.observe(layers_visited as f64); LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd7e193778..83a1a87611 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -158,6 +158,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_layers_per_read_bucket", + "pageserver_layers_per_read_count", + "pageserver_layers_per_read_sum", "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total",