From 74855f4b8c2fcb382a69f7b2dcefa94df442f83a Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Mon, 28 Apr 2025 18:13:45 +0200
Subject: [PATCH] pageserver: add `pitr_history_size` consumption metric

---
 pageserver/src/consumption_metrics/metrics.rs | 33 +++++++++++++-
 .../src/consumption_metrics/metrics/tests.rs  | 43 ++++++++++++++++---
 pageserver/src/consumption_metrics/upload.rs  |  2 +-
 pageserver/src/tenant/timeline.rs             |  2 +-
 4 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs
index 08ab69f349..9515f375ed 100644
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -10,6 +10,7 @@ use utils::lsn::Lsn;
 use super::{Cache, NewRawMetric};
 use crate::context::RequestContext;
 use crate::tenant::mgr::TenantManager;
+use crate::tenant::timeline::GcCutoffs;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 
 /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
@@ -24,7 +25,9 @@ pub(super) enum Name {
     /// Timeline last_record_lsn, incremental
     #[serde(rename = "written_data_bytes_delta")]
     WrittenSizeDelta,
-    /// Timeline logical size
+    /// Timeline last_record_lsn - gc_cutoffs.time (i.e. pitr_interval)
+    #[serde(rename = "pitr_history_size")]
+    PitrHistorySize,
     #[serde(rename = "timeline_logical_size")]
     LogicalSize,
     /// Tenant remote size
@@ -160,6 +163,22 @@ impl MetricsKey {
         .incremental_values()
     }
 
+    /// [`Timeline::get_last_record_lsn`] - [`GcCutoffs::time`] (i.e. `pitr_interval`).
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    /// [`GcCutoffs::time`]: crate::tenant::timeline::GcCutoffs::time
+    const fn pitr_history_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::PitrHistorySize,
+        }
+        .absolute_values()
+    }
+
     /// Exact [`Timeline::get_current_logical_size`].
     ///
     /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
@@ -371,6 +390,7 @@ struct TimelineSnapshot {
     loaded_at: (Lsn, SystemTime),
     last_record_lsn: Lsn,
     current_exact_logical_size: Option<u64>,
+    gc_cutoffs: GcCutoffs,
 }
 
 impl TimelineSnapshot {
@@ -390,6 +410,7 @@
         } else {
             let loaded_at = t.loaded_at;
             let last_record_lsn = t.get_last_record_lsn();
+            let gc_cutoffs = t.gc_info.read().unwrap().cutoffs; // NB: assume periodically updated
 
             let current_exact_logical_size = {
                 let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
@@ -410,6 +431,7 @@
                 loaded_at,
                 last_record_lsn,
                 current_exact_logical_size,
+                gc_cutoffs,
             }))
         }
     }
@@ -477,6 +499,15 @@
             });
         }
 
+        // Compute the PITR history size.
+        //
+        // TODO: verify that the GC cutoffs don't severely regress, e.g. to 0 such that we bill the
+        // entire history. Also verify that it's okay for this to regress on restart, unlike e.g.
+        // written_size above.
+        let pitr_history_size_key = MetricsKey::pitr_history_size(tenant_id, timeline_id);
+        let pitr_history_size = self.last_record_lsn.saturating_sub(self.gc_cutoffs.time);
+        metrics.push(pitr_history_size_key.at(now, pitr_history_size.into()));
+
         {
             let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
             let current_or_previous = self
diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs
index 52b4fb8680..b537efe4d8 100644
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -12,12 +12,18 @@ fn startup_collected_timeline_metrics_before_advancing() {
     let cache = HashMap::new();
 
     let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
     let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;
 
     let snap = TimelineSnapshot {
         loaded_at: (disk_consistent_lsn, SystemTime::now()),
         last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        current_exact_logical_size: Some(logical_size),
+        gc_cutoffs: GcCutoffs {
+            space: Lsn::INVALID,
+            time: pitr_cutoff,
+        },
     };
 
     let now = DateTime::<Utc>::from(SystemTime::now());
@@ -33,7 +39,9 @@
                 0
             ),
             MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::pitr_history_size(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
         ]
     );
 }
@@ -49,7 +57,9 @@
     let before = DateTime::<Utc>::from(before);
 
     let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
     let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;
 
     let mut metrics = Vec::new();
     let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
@@ -59,7 +69,11 @@
     let snap = TimelineSnapshot {
         loaded_at: (disk_consistent_lsn, init),
         last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        current_exact_logical_size: Some(logical_size),
+        gc_cutoffs: GcCutoffs {
+            space: Lsn::INVALID,
+            time: pitr_cutoff,
+        },
     };
 
     snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -69,7 +83,9 @@
         &[
             MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
             MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::pitr_history_size(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
         ]
     );
 }
@@ -86,7 +102,9 @@
     let before = DateTime::<Utc>::from(before);
 
     let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
     let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;
 
     let mut metrics = Vec::new();
     let cache = HashMap::from([
@@ -103,7 +121,11 @@
     let snap = TimelineSnapshot {
         loaded_at: (disk_consistent_lsn, init),
         last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        current_exact_logical_size: Some(logical_size),
+        gc_cutoffs: GcCutoffs {
+            space: Lsn::INVALID,
+            time: pitr_cutoff,
+        },
     };
 
     snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -113,7 +135,9 @@
         &[
             MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
             MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::pitr_history_size(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
         ]
     );
 }
@@ -141,6 +165,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
         loaded_at: (Lsn(50), at_restart),
         last_record_lsn: Lsn(50),
         current_exact_logical_size: None,
+        gc_cutoffs: GcCutoffs::default(),
     };
 
     let mut cache = HashMap::from([
@@ -169,6 +194,7 @@
                 0
             ),
             MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 50), // does regress
         ]
     );
 
@@ -183,6 +209,7 @@
         &[
             MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
             MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::pitr_history_size(tenant_id, timeline_id).at(later, 50),
         ]
     );
 }
@@ -202,6 +229,7 @@ fn post_restart_current_exact_logical_size_uses_cached() {
         loaded_at: (Lsn(50), at_restart),
         last_record_lsn: Lsn(50),
         current_exact_logical_size: None,
+        gc_cutoffs: GcCutoffs::default(),
     };
 
     let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
@@ -312,10 +340,11 @@ pub(crate) const fn metric_examples(
     tenant_id: TenantId,
     timeline_id: TimelineId,
     now: DateTime<Utc>,
     before: DateTime<Utc>,
-) -> [NewRawMetric; 6] {
+) -> [NewRawMetric; 7] {
     [
         MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
         MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+        MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 0),
         MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
         MetricsKey::remote_storage_size(tenant_id).at(now, 0),
         MetricsKey::resident_size(tenant_id).at(now, 0),
diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs
index 59e0145a5b..ce050f0cd8 100644
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -576,7 +576,7 @@ mod tests {
         super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
     }
 
-    fn metric_samples() -> [NewRawMetric; 6] {
+    fn metric_samples() -> [NewRawMetric; 7] {
         let tenant_id = TenantId::from_array([0; 16]);
         let timeline_id = TimelineId::from_array([0xff; 16]);
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index cfeab77598..33512145d4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -537,7 +537,7 @@ impl GcInfo {
 /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
 /// is a single number (the oldest LSN which we must retain), but it internally distinguishes
 /// between time-based and space-based retention for observability and consumption metrics purposes.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub(crate) struct GcCutoffs {
     /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
     /// history we must keep to retain a specified number of bytes of WAL.
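
For illustration, the new metric is an absolute value recomputed on every collection: the WAL distance between the time-based GC cutoff (which tracks `pitr_interval`) and the timeline's last record LSN. Below is a minimal self-contained sketch of that calculation, using simplified stand-ins for the pageserver's `Lsn` and `GcCutoffs` types rather than the real APIs:

// Simplified stand-ins for utils::lsn::Lsn and the pageserver's GcCutoffs;
// illustration only, not the actual pageserver types.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Lsn(u64);

impl Lsn {
    /// Mirrors the saturating subtraction used in the patch: the result never
    /// goes below zero, even if the cutoff is ahead of last_record_lsn.
    fn saturating_sub(self, other: Lsn) -> u64 {
        self.0.saturating_sub(other.0)
    }
}

#[derive(Clone, Copy, Debug)]
#[allow(dead_code)]
struct GcCutoffs {
    space: Lsn,
    time: Lsn,
}

/// PITR history size in bytes of WAL: last_record_lsn minus the time-based
/// (pitr_interval) GC cutoff.
fn pitr_history_size(last_record_lsn: Lsn, gc_cutoffs: &GcCutoffs) -> u64 {
    last_record_lsn.saturating_sub(gc_cutoffs.time)
}

fn main() {
    // Values mirroring the tests above: last_record_lsn = 0x20000, pitr cutoff = 0x11000.
    let last_record_lsn = Lsn(0x20000);
    let gc_cutoffs = GcCutoffs {
        space: Lsn(0),
        time: Lsn(0x11000),
    };
    // 0x20000 - 0x11000 = 0xF000 (61440) bytes of retained PITR history.
    assert_eq!(pitr_history_size(last_record_lsn, &gc_cutoffs), 0xF000);
    println!("pitr_history_size = {}", pitr_history_size(last_record_lsn, &gc_cutoffs));
}

Because the value is derived from the current GC cutoff rather than accumulated, it can shrink when the cutoff advances or resets, which appears to be why the post-restart test marks the reported 50-byte value with a `// does regress` comment while `written_size` is kept monotonic from the cache.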