diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index abad4b44b8..753f5524c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_archive_size", @@ -2141,6 +2166,10 @@ pub(crate) struct TimelineMetrics { pub last_record_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2223,6 +2252,42 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2277,6 +2342,10 @@ impl TimelineMetrics { last_record_gauge, pitr_history_size, archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2338,6 +2407,31 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 4500bc94dd..dbf6c60aae 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -693,6 +693,18 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. } + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { return; } @@ -791,6 +803,15 @@ impl LayerInner { (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + LayerInner { conf, debug_str: { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c019cbbc77..4836d42db5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_sum", "pageserver_archive_size", "pageserver_pitr_history_size", + "pageserver_layer_bytes", + "pageserver_layer_count", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total",