diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index b8577256ce..128c3c7311 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -24,9 +24,12 @@ pub(super) enum Name { /// Timeline last_record_lsn, incremental #[serde(rename = "written_data_bytes_delta")] WrittenSizeDelta, - /// Timeline last_record_lsn - gc_cutoffs.time (i.e. pitr_interval) - #[serde(rename = "pitr_history_size")] - PitrHistorySize, + /// Timeline's PITR cutoff LSN (now - pitr_interval). If PITR is disabled + /// (pitr_interval = 0), this will equal WrittenSize. + /// + /// Updated periodically during GC. Does not regress. + #[serde(rename = "pitr_cutoff")] + PitrCutoff, /// Timeline logical size #[serde(rename = "timeline_logical_size")] LogicalSize, @@ -163,18 +166,14 @@ impl MetricsKey { .incremental_values() } - /// [`Timeline::get_last_record_lsn`] - [`GcCutoffs::time`] (i.e. `pitr_interval`). + /// [`GcCutoffs::time`] (i.e. `now` - `pitr_interval`) /// - /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn /// [`GcCutoffs::time`]: crate::tenant::timeline::GcCutoffs::time - const fn pitr_history_size( - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> AbsoluteValueFactory { + const fn pitr_cutoff(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory { MetricsKey { tenant_id, timeline_id: Some(timeline_id), - metric: Name::PitrHistorySize, + metric: Name::PitrCutoff, } .absolute_values() } @@ -501,17 +500,24 @@ impl TimelineSnapshot { }); } - // Compute the PITR history size. + // Compute the PITR cutoff. We have to make sure this doesn't regress: + // on restart, GC may not run for a while, and the PITR cutoff will be + // reported as 0 (which would bill the user for the entire history from + // 0 to last_record_lsn). We therefore clamp this to the cached value + // (which persists across restarts). 
// - // TODO: verify that the GC cutoffs don't severely regress, e.g. to 0 such that we bill the - // entire history. Also verify that it's okay for this to regress on restart, unlike e.g. - // written_size above. - let pitr_history_size_key = MetricsKey::pitr_history_size(tenant_id, timeline_id); - let pitr_history_size = self + // If pitr_interval is disabled (pitr_cutoff is None), the cutoff is the + // last record LSN, by definition. We don't use the written_size + // clamping from above, because our own caching should result in + // equivalent clamping. + let pitr_cutoff_key = MetricsKey::pitr_cutoff(tenant_id, timeline_id); + let mut pitr_cutoff = self .pitr_cutoff - .map(|pitr| self.last_record_lsn.saturating_sub(pitr)) - .unwrap_or_default(); - metrics.push(pitr_history_size_key.at(now, pitr_history_size.into())); + .unwrap_or(self.last_record_lsn /* PITR disabled */); + if let Some(prev) = cache.get(pitr_cutoff_key.key()) { + pitr_cutoff = pitr_cutoff.max(Lsn(prev.value)); // don't regress + } + metrics.push(pitr_cutoff_key.at(now, pitr_cutoff.into())); { let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id); diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index da30f7ecaf..2c2f9fe6c6 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -36,8 +36,7 @@ fn startup_collected_timeline_metrics_before_advancing() { 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::pitr_history_size(tenant_id, timeline_id) - .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, pitr_cutoff.0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); @@ -77,8 +76,7 @@ fn startup_collected_timeline_metrics_second_round() { &[ MetricsKey::written_size_delta(tenant_id, 
timeline_id).from_until(before, now, 0), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::pitr_history_size(tenant_id, timeline_id) - .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, pitr_cutoff.0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); @@ -126,15 +124,14 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::pitr_history_size(tenant_id, timeline_id) - .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, pitr_cutoff.0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); } #[test] -fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { +fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_pitr_cutoff() { // it can happen that we lose the inmemorylayer but have previously sent metrics and we // should never go backwards @@ -156,7 +153,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), current_exact_logical_size: None, - pitr_cutoff: Some(Lsn(0)), + pitr_cutoff: Some(Lsn(20)), }; let mut cache = HashMap::from([ @@ -171,6 +168,9 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { 999_999_999, ) .to_kv_pair(), + MetricsKey::pitr_cutoff(tenant_id, timeline_id) + .at(before_restart, 70) + .to_kv_pair(), ]); let mut metrics = Vec::new(); @@ -185,7 +185,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), - MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 50), // does regress + MetricsKey::pitr_cutoff(tenant_id, 
timeline_id).at(now, 70), ] ); @@ -200,7 +200,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), - MetricsKey::pitr_history_size(tenant_id, timeline_id).at(later, 50), + MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(later, 70), ] ); } @@ -309,16 +309,53 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] { times } +#[test] +fn pitr_cutoff_none_yields_last_record_lsn() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let mut metrics = Vec::new(); + let cache = HashMap::new(); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, SystemTime::now()), + last_record_lsn: disk_consistent_lsn, + current_exact_logical_size: None, + pitr_cutoff: None, + }; + + let now = DateTime::<Utc>::from(SystemTime::now()); + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + snap.loaded_at.1.into(), + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + ] + ); +} + pub(crate) const fn metric_examples_old( tenant_id: TenantId, timeline_id: TimelineId, now: DateTime<Utc>, before: DateTime<Utc>, -) -> [RawMetric; 6] { +) -> [RawMetric; 7] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), + MetricsKey::pitr_cutoff(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
MetricsKey::resident_size(tenant_id).at_old_format(now, 0), @@ -335,7 +372,7 @@ pub(crate) const fn metric_examples( [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), - MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 0), + MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), MetricsKey::resident_size(tenant_id).at(now, 0), diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index fd6e6ae891..1b55ee9b3c 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -568,7 +568,7 @@ mod tests { assert_eq!(upgraded_samples, new_samples); } - fn metric_samples_old() -> [RawMetric; 6] { + fn metric_samples_old() -> [RawMetric; 7] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]);