From ef3ebfaf67592d30462f585b6828ca86175be381 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 17 Jul 2024 21:55:20 +0100 Subject: [PATCH] pageserver: layer count & size metrics (#8410) ## Problem We lack insight into: - How much of a tenant's physical size is image vs. delta layers - Average sizes of image vs. delta layers - Total layer counts per timeline, indicating size of index_part object As well as general observability love, this is motivated by https://github.com/neondatabase/neon/issues/6738, where we need to define some sensible thresholds for storage amplification, and using total physical size may not work well (if someone does a lot of DROPs then it's legitimate for the physical-synthetic ratio to be huge), but the ratio between image layer size and delta layer size may be a better indicator of whether we're generating unreasonable quantities of image layers. ## Summary of changes - Add pageserver_layer_bytes and pageserver_layer_count metrics, labelled by timeline and `kind` (delta or image) - Add & subtract these with LayerInner's lifetime. I'm intentionally avoiding using a generic metric RAII guard object, to avoid bloating LayerInner: it already has all the information it needs to update metric on new+drop. --- pageserver/src/metrics.rs | 94 ++++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 21 +++++ test_runner/fixtures/metrics.py | 2 + 3 files changed, 117 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index abad4b44b8..753f5524c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_archive_size", @@ -2141,6 +2166,10 @@ pub(crate) struct TimelineMetrics { pub last_record_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2223,6 +2252,42 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2277,6 +2342,10 @@ impl TimelineMetrics { last_record_gauge, pitr_history_size, archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2338,6 +2407,31 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 4500bc94dd..dbf6c60aae 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -693,6 +693,18 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. } + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { return; } @@ -791,6 +803,15 @@ impl LayerInner { (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + LayerInner { conf, debug_str: { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c019cbbc77..4836d42db5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_sum", "pageserver_archive_size", "pageserver_pitr_history_size", + "pageserver_layer_bytes", + "pageserver_layer_count", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total",