pageserver: add pitr_history_size consumption metric

This commit is contained in:
Erik Grinaker
2025-04-28 18:13:45 +02:00
parent 11f6044338
commit 74855f4b8c
4 changed files with 70 additions and 10 deletions

View File

@@ -10,6 +10,7 @@ use utils::lsn::Lsn;
use super::{Cache, NewRawMetric};
use crate::context::RequestContext;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::GcCutoffs;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
@@ -24,7 +25,9 @@ pub(super) enum Name {
/// Timeline last_record_lsn, incremental
#[serde(rename = "written_data_bytes_delta")]
WrittenSizeDelta,
/// Timeline logical size
/// Timeline PITR history size: last_record_lsn - gc_cutoffs.time (the time-based cutoff, i.e. pitr_interval)
#[serde(rename = "pitr_history_size")]
PitrHistorySize,
#[serde(rename = "timeline_logical_size")]
LogicalSize,
/// Tenant remote size
@@ -160,6 +163,22 @@ impl MetricsKey {
.incremental_values()
}
/// Key factory for the PITR history size metric, i.e. [`Timeline::get_last_record_lsn`] minus
/// [`GcCutoffs::time`] (the cutoff for `pitr_interval`), reported as an absolute value.
///
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
/// [`GcCutoffs::time`]: crate::tenant::timeline::GcCutoffs::time
const fn pitr_history_size(
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> AbsoluteValueFactory {
    // This is a per-timeline metric, so the timeline id is always present in the key.
    let key = MetricsKey {
        tenant_id,
        timeline_id: Some(timeline_id),
        metric: Name::PitrHistorySize,
    };
    key.absolute_values()
}
/// Exact [`Timeline::get_current_logical_size`].
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
@@ -371,6 +390,7 @@ struct TimelineSnapshot {
loaded_at: (Lsn, SystemTime),
last_record_lsn: Lsn,
current_exact_logical_size: Option<u64>,
gc_cutoffs: GcCutoffs,
}
impl TimelineSnapshot {
@@ -390,6 +410,7 @@ impl TimelineSnapshot {
} else {
let loaded_at = t.loaded_at;
let last_record_lsn = t.get_last_record_lsn();
let gc_cutoffs = t.gc_info.read().unwrap().cutoffs; // NB: assume periodically updated
let current_exact_logical_size = {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
@@ -410,6 +431,7 @@ impl TimelineSnapshot {
loaded_at,
last_record_lsn,
current_exact_logical_size,
gc_cutoffs,
}))
}
}
@@ -477,6 +499,15 @@ impl TimelineSnapshot {
});
}
// Compute the PITR history size.
//
// TODO: verify that the GC cutoffs don't severely regress, e.g. to 0 such that we bill the
// entire history. Also verify that it's okay for this to regress on restart, unlike e.g.
// written_size above.
let pitr_history_size_key = MetricsKey::pitr_history_size(tenant_id, timeline_id);
let pitr_history_size = self.last_record_lsn.saturating_sub(self.gc_cutoffs.time);
metrics.push(pitr_history_size_key.at(now, pitr_history_size.into()));
{
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
let current_or_previous = self

View File

@@ -12,12 +12,18 @@ fn startup_collected_timeline_metrics_before_advancing() {
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let pitr_cutoff = Lsn(0x11000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let logical_size = 0x42000;
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
current_exact_logical_size: Some(logical_size),
gc_cutoffs: GcCutoffs {
space: Lsn::INVALID,
time: pitr_cutoff,
},
};
let now = DateTime::<Utc>::from(SystemTime::now());
@@ -33,7 +39,9 @@ fn startup_collected_timeline_metrics_before_advancing() {
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
MetricsKey::pitr_history_size(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
]
);
}
@@ -49,7 +57,9 @@ fn startup_collected_timeline_metrics_second_round() {
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let pitr_cutoff = Lsn(0x11000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let logical_size = 0x42000;
let mut metrics = Vec::new();
let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
@@ -59,7 +69,11 @@ fn startup_collected_timeline_metrics_second_round() {
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
current_exact_logical_size: Some(logical_size),
gc_cutoffs: GcCutoffs {
space: Lsn::INVALID,
time: pitr_cutoff,
},
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -69,7 +83,9 @@ fn startup_collected_timeline_metrics_second_round() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
MetricsKey::pitr_history_size(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
]
);
}
@@ -86,7 +102,9 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let pitr_cutoff = Lsn(0x11000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let logical_size = 0x42000;
let mut metrics = Vec::new();
let cache = HashMap::from([
@@ -103,7 +121,11 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
current_exact_logical_size: Some(logical_size),
gc_cutoffs: GcCutoffs {
space: Lsn::INVALID,
time: pitr_cutoff,
},
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -113,7 +135,9 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
MetricsKey::pitr_history_size(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
]
);
}
@@ -141,6 +165,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
current_exact_logical_size: None,
gc_cutoffs: GcCutoffs::default(),
};
let mut cache = HashMap::from([
@@ -169,6 +194,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 50), // does regress
]
);
@@ -183,6 +209,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
MetricsKey::pitr_history_size(tenant_id, timeline_id).at(later, 50),
]
);
}
@@ -202,6 +229,7 @@ fn post_restart_current_exact_logical_size_uses_cached() {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
current_exact_logical_size: None,
gc_cutoffs: GcCutoffs::default(),
};
let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
@@ -312,10 +340,11 @@ pub(crate) const fn metric_examples(
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [NewRawMetric; 6] {
) -> [NewRawMetric; 7] {
[
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::pitr_history_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
MetricsKey::resident_size(tenant_id).at(now, 0),

View File

@@ -576,7 +576,7 @@ mod tests {
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
}
fn metric_samples() -> [NewRawMetric; 6] {
fn metric_samples() -> [NewRawMetric; 7] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);

View File

@@ -537,7 +537,7 @@ impl GcInfo {
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
/// between time-based and space-based retention for observability and consumption metrics purposes.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Copy)]
pub(crate) struct GcCutoffs {
/// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
/// history we must keep to retain a specified number of bytes of WAL.