From ff298afb97253654f7daa5ed3c5977645a249a4f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 29 Jan 2025 22:10:56 +0100 Subject: [PATCH] pageserver: add `level` for timeline layer metrics (#10563) ## Problem We don't have good observability for per-timeline compaction debt, specifically the number of delta layers in the frozen, L0, and L1 levels. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes * Add a `level` label for `pageserver_layer_{count,bytes}` with values `l0`, `l1`, and `frozen`. * Track metrics for frozen layers. There is already a `kind={delta,image}` label. `kind=image` is only possible for `level=l1`. We don't include the currently open ephemeral layer, only frozen layers. There is always exactly 1 ephemeral layer, with a dynamic size which is already tracked in `pageserver_timeline_ephemeral_bytes`. --- pageserver/src/metrics.rs | 236 +++++++++++------- pageserver/src/tenant/storage_layer/layer.rs | 16 +- pageserver/src/tenant/timeline.rs | 2 +- .../src/tenant/timeline/layer_manager.rs | 8 + 4 files changed, 154 insertions(+), 108 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d2c778276d..77c0967afc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,4 +1,13 @@ +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::AtomicU64; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; +use std::time::{Duration, Instant}; + use enum_map::EnumMap; +use futures::Future; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -11,13 +20,26 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; +use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; 
+use pin_project_lite::pin_project; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; -use strum::{EnumCount, VariantNames}; + +use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; +use crate::config::PageServerConf; +use crate::context::{PageContentKind, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::layer_map::LayerMap; +use crate::tenant::mgr::TenantSlot; +use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::throttle::ThrottleResult; +use crate::tenant::Timeline; + /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user /// queries. @@ -443,18 +465,38 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] #[strum(serialize_all = "kebab_case")] -pub(crate) enum MetricLayerKind { +pub(crate) enum LayerKind { Delta, Image, } +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum LayerLevel { + // We don't track the currently open ephemeral layer, since there's always exactly 1 and its + // size changes. See `TIMELINE_EPHEMERAL_BYTES`. 
+ Frozen, + L0, + L1, +} + static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_bytes", - "Sum of layer physical sizes in bytes", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -462,8 +504,8 @@ static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_count", - "Number of layers that exist", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -2590,10 +2632,6 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, - pub(crate) layer_size_image: UIntGauge, - pub(crate) layer_count_image: UIntGauge, - pub(crate) layer_size_delta: UIntGauge, - pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2691,42 +2729,6 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let layer_size_image = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_count_image = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_size_delta = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - - let 
layer_count_delta = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2791,10 +2793,6 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, - layer_size_image, - layer_count_image, - layer_size_delta, - layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2837,6 +2835,92 @@ impl TimelineMetrics { .add(duration); } + /// Generates TIMELINE_LAYER labels for a persistent layer. + fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { + let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { + true => LayerLevel::L0, + false => LayerLevel::L1, + }; + let kind = match layer_desc.is_delta() { + true => LayerKind::Delta, + false => LayerKind::Image, + }; + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + level.into(), + kind.into(), + ] + } + + /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer. + fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] { + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + LayerLevel::Frozen.into(), + LayerKind::Delta.into(), // by definition + ] + } + + /// Removes a frozen ephemeral layer from TIMELINE_LAYER metrics. + pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(size); + } + + /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics. 
+ pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(size); + } + + /// Removes a persistent layer from TIMELINE_LAYER metrics. + pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(layer_desc.file_size); + } + + /// Adds a persistent layer to TIMELINE_LAYER metrics. + pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(layer_desc.file_size); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2869,30 +2953,14 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); + for ref 
level in LayerLevel::iter() { + for ref kind in LayerKind::iter() { + let labels: [&str; 5] = + [tenant_id, shard_id, timeline_id, level.into(), kind.into()]; + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels); + } + } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2974,24 +3042,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // we leave the BROKEN_TENANTS_SET entry if any } -use futures::Future; -use pin_project_lite::pin_project; -use std::collections::HashMap; -use std::num::NonZeroUsize; -use std::pin::Pin; -use std::sync::atomic::AtomicU64; -use std::sync::{Arc, Mutex}; -use std::task::{Context, Poll}; -use std::time::{Duration, Instant}; - -use crate::config::PageServerConf; -use crate::context::{PageContentKind, RequestContext}; -use crate::task_mgr::TaskKind; -use crate::tenant::mgr::TenantSlot; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; - /// Maintain a per timeline gauge in addition to the global gauge. 
pub(crate) struct PerTimelineRemotePhysicalSizeGauge { last_set: AtomicU64, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2a86885f6b..99e0ff1aa5 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -701,13 +701,7 @@ impl Drop for LayerInner { if let Some(timeline) = timeline.as_ref() { // Only need to decrement metrics if the timeline still exists: otherwise // it will have already de-registered these metrics via TimelineMetrics::shutdown - if self.desc.is_delta() { - timeline.metrics.layer_count_delta.dec(); - timeline.metrics.layer_size_delta.sub(self.desc.file_size); - } else { - timeline.metrics.layer_count_image.dec(); - timeline.metrics.layer_size_image.sub(self.desc.file_size); - } + timeline.metrics.dec_layer(&self.desc); if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { debug_assert!( @@ -817,13 +811,7 @@ impl LayerInner { }; // This object acts as a RAII guard on these metrics: increment on construction - if desc.is_delta() { - timeline.metrics.layer_count_delta.inc(); - timeline.metrics.layer_size_delta.add(desc.file_size); - } else { - timeline.metrics.layer_count_image.inc(); - timeline.metrics.layer_size_image.add(desc.file_size); - } + timeline.metrics.inc_layer(&desc); // New layers are visible by default. This metric is later updated on drop or in set_visibility timeline diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 24bc7890c6..b4b30fcd23 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3703,7 +3703,7 @@ impl Timeline { let mut guard = self.layers.write().await; guard .open_mut()? 
- .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics) .await }; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index f1cef7778c..cb7783d779 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -91,6 +91,7 @@ impl LayerManager { layer_map, layer_fmgr: LayerFileManager(hashmap), }) => { + // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown. let open = layer_map.open_layer.take(); let frozen = layer_map.frozen_layers.len(); let taken_writer_state = writer_state.take(); @@ -234,6 +235,7 @@ impl OpenLayerManager { lsn: Lsn, last_freeze_at: &AtomicLsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + metrics: &TimelineMetrics, ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); @@ -242,6 +244,11 @@ impl OpenLayerManager { let open_layer_rc = Arc::clone(open_layer); open_layer.freeze(end_lsn).await; + // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`. + // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a + // reference to the timeline metrics. Other methods use a metrics borrow as well. + metrics.inc_frozen_layer(open_layer); + // The layer is no longer open, update the layer map to reflect this. // We will replace it with on-disk historics below. self.layer_map.frozen_layers.push_back(open_layer_rc); @@ -298,6 +305,7 @@ impl OpenLayerManager { .frozen_layers .pop_front() .expect("there must be a inmem layer to flush"); + metrics.dec_frozen_layer(&inmem); // Only one task may call this function at a time (for this // timeline). If two tasks tried to flush the same frozen