Rewrite as pitr_cutoff, don't regress

Erik Grinaker
2025-04-29 12:34:22 +02:00
parent 697c67ea85
commit c2afddbcba
3 changed files with 75 additions and 32 deletions

View File

@@ -24,9 +24,12 @@ pub(super) enum Name {
/// Timeline last_record_lsn, incremental
#[serde(rename = "written_data_bytes_delta")]
WrittenSizeDelta,
/// Timeline last_record_lsn - gc_cutoffs.time (i.e. pitr_interval)
#[serde(rename = "pitr_history_size")]
PitrHistorySize,
/// Timeline's PITR cutoff LSN (now - pitr_interval). If PITR is disabled
/// (pitr_interval = 0), this will equal WrittenSize.
///
/// Updated periodically during GC. Does not regress.
#[serde(rename = "pitr_cutoff")]
PitrCutoff,
/// Timeline logical size
#[serde(rename = "timeline_logical_size")]
LogicalSize,
@@ -163,18 +166,14 @@ impl MetricsKey {
.incremental_values()
}
/// [`Timeline::get_last_record_lsn`] - [`GcCutoffs::time`] (i.e. `pitr_interval`).
/// [`GcCutoffs::time`] (i.e. `now` - `pitr_interval`)
///
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
/// [`GcCutoffs::time`]: crate::tenant::timeline::GcCutoffs::time
const fn pitr_history_size(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
const fn pitr_cutoff(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::PitrHistorySize,
metric: Name::PitrCutoff,
}
.absolute_values()
}
@@ -501,17 +500,24 @@ impl TimelineSnapshot {
});
}
// Compute the PITR history size.
// Compute the PITR cutoff. We have to make sure this doesn't regress:
// on restart, GC may not run for a while, and the PITR cutoff will be
// reported as 0 (which would bill the user for the entire history from
// 0 to last_record_lsn). We therefore clamp this to the cached value
// (which persists across restarts).
//
// TODO: verify that the GC cutoffs don't severely regress, e.g. to 0 such that we bill the
// entire history. Also verify that it's okay for this to regress on restart, unlike e.g.
// written_size above.
let pitr_history_size_key = MetricsKey::pitr_history_size(tenant_id, timeline_id);
let pitr_history_size = self
// If pitr_interval is disabled (pitr_cutoff is None), the cutoff is the
// last record LSN, by definition. We don't use the written_size
// clamping from above, because our own caching should result in
// equivalent clamping.
let pitr_cutoff_key = MetricsKey::pitr_cutoff(tenant_id, timeline_id);
let mut pitr_cutoff = self
.pitr_cutoff
.map(|pitr| self.last_record_lsn.saturating_sub(pitr))
.unwrap_or_default();
metrics.push(pitr_history_size_key.at(now, pitr_history_size.into()));
.unwrap_or(self.last_record_lsn /* PITR disabled */);
if let Some(prev) = cache.get(pitr_cutoff_key.key()) {
pitr_cutoff = pitr_cutoff.max(Lsn(prev.value)); // don't regress
}
metrics.push(pitr_cutoff_key.at(now, pitr_cutoff.into()));
{
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
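Since the direct pitr_history_size metric goes away, a consumer that still wants the retained-history size has to derive it from the two absolute values that remain: the written-size metric and the new pitr_cutoff. A minimal sketch of that derivation, with illustrative struct and field names rather than the actual billing-pipeline types:

// Illustrative only: recover the old pitr_history_size value from the two
// absolute metrics this commit leaves behind. Names here are assumptions.
struct TimelinePoint {
    written_size: u64, // absolute written size (last_record_lsn as bytes)
    pitr_cutoff: u64,  // "pitr_cutoff" (never regresses, per this commit)
}

impl TimelinePoint {
    /// Bytes of history retained for PITR. Zero when PITR is disabled,
    /// since pitr_cutoff then equals the written size.
    fn pitr_history_size(&self) -> u64 {
        self.written_size.saturating_sub(self.pitr_cutoff)
    }
}

fn main() {
    let point = TimelinePoint { written_size: 100, pitr_cutoff: 70 };
    assert_eq!(point.pitr_history_size(), 30);
}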

View File

@@ -36,8 +36,7 @@ fn startup_collected_timeline_metrics_before_advancing() {
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::pitr_history_size(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
]
);
@@ -77,8 +76,7 @@ fn startup_collected_timeline_metrics_second_round() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::pitr_history_size(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
]
);
@@ -126,15 +124,14 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::pitr_history_size(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
]
);
}
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_pitr_cutoff() {
// it can happen that we lose the inmemorylayer but have previously sent metrics and we
// should never go backwards
@@ -156,7 +153,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
current_exact_logical_size: None,
pitr_cutoff: Some(Lsn(0)),
pitr_cutoff: Some(Lsn(20)),
};
let mut cache = HashMap::from([
@@ -171,6 +168,9 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
999_999_999,
)
.to_kv_pair(),
MetricsKey::pitr_cutoff(tenant_id, timeline_id)
.at(before_restart, 70)
.to_kv_pair(),
]);
let mut metrics = Vec::new();
@@ -185,7 +185,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 50), // does regress
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, 70),
]
);
@@ -200,7 +200,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
MetricsKey::pitr_history_size(tenant_id, timeline_id).at(later, 50),
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(later, 70),
]
);
}
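To make the expected values above concrete: GC reports a cutoff of Lsn(20), but 70 was already reported before the restart, so the clamp keeps 70 on both rounds. A plain-integer restatement of the clamp from the metrics.rs hunk, with a made-up helper name and no Lsn or cache types:

// Simplified restatement of the non-regressing cutoff computation.
// Illustrative only; the real code works on Lsn values and the metrics cache.
fn reported_pitr_cutoff(last_record_lsn: u64, gc_cutoff: Option<u64>, cached: Option<u64>) -> u64 {
    // No cutoff from GC means PITR is disabled: fall back to last_record_lsn.
    let mut cutoff = gc_cutoff.unwrap_or(last_record_lsn);
    // Never report less than a previously reported value.
    if let Some(prev) = cached {
        cutoff = cutoff.max(prev);
    }
    cutoff
}

fn main() {
    // The post-restart scenario above: the cached 70 wins over the fresh Lsn(20).
    assert_eq!(reported_pitr_cutoff(50, Some(20), Some(70)), 70);
    // With PITR disabled and nothing cached, last_record_lsn is reported as-is.
    assert_eq!(reported_pitr_cutoff(50, None, None), 50);
}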
@@ -309,16 +309,53 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
times
}
#[test]
fn pitr_cutoff_none_yields_last_record_lsn() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let mut metrics = Vec::new();
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: None,
pitr_cutoff: None,
};
let now = DateTime::<Utc>::from(SystemTime::now());
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
]
);
}
pub(crate) const fn metric_examples_old(
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [RawMetric; 6] {
) -> [RawMetric; 7] {
[
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_until_old_format(before, now, 0),
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
@@ -335,7 +372,7 @@ pub(crate) const fn metric_examples(
[
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::pitr_cutoff(tenant_id, timeline_id).at(now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
MetricsKey::resident_size(tenant_id).at(now, 0),

View File

@@ -568,7 +568,7 @@ mod tests {
assert_eq!(upgraded_samples, new_samples);
}
fn metric_samples_old() -> [RawMetric; 6] {
fn metric_samples_old() -> [RawMetric; 7] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);