mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-16 01:42:55 +00:00
pageserver: add supplementary branch usage stats (#8131)
## Problem The metrics we have today aren't convenient for planning around the impact of timeline archival on costs. Closes: https://github.com/neondatabase/neon/issues/8108 ## Summary of changes - Add metric `pageserver_archive_size`, which indicates the logical bytes of data which we would expect to write into an archived branch. - Add metric `pageserver_pitr_history_size`, which indicates the distance between last_record_lsn and the PITR cutoff. These metrics are somewhat temporary: when we implement #8088 and associated consumption metric changes, these will reach a final form. For now, an "archived" branch is just any branch outside of its parent's PITR window: later, archival will become an explicit state (which will _usually_ correspond to falling outside the parent's PITR window). The overall volume of timeline metrics is something to watch, but we are removing many more in https://github.com/neondatabase/neon/pull/8245 than this PR is adding.
This commit is contained in:
@@ -661,6 +661,16 @@ pub struct TimelineInfo {
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_logical_size_non_incremental: Option<u64>,
|
||||
|
||||
/// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
|
||||
/// beyond the branch's branch point, we only count up to the branch point.
|
||||
pub pitr_history_size: u64,
|
||||
|
||||
/// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
|
||||
/// ancestor data used by this branch would have been retained anyway). If this is false, then
|
||||
/// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
|
||||
/// otherwise be able to GC.
|
||||
pub within_ancestor_pitr: bool,
|
||||
|
||||
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
||||
|
||||
pub wal_source_connstr: Option<String>,
|
||||
|
||||
@@ -406,6 +406,8 @@ async fn build_timeline_info_common(
|
||||
|
||||
let walreceiver_status = timeline.walreceiver_status();
|
||||
|
||||
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_shard_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
@@ -426,6 +428,8 @@ async fn build_timeline_info_common(
|
||||
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: None,
|
||||
pitr_history_size,
|
||||
within_ancestor_pitr,
|
||||
timeline_dir_layer_file_size_sum: None,
|
||||
wal_source_connstr,
|
||||
last_received_msg_lsn,
|
||||
|
||||
@@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_pitr_history_size",
|
||||
"Data written since PITR cutoff on this timeline",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_archive_size",
|
||||
"Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_standby_horizon",
|
||||
@@ -2106,6 +2124,8 @@ pub(crate) struct TimelineMetrics {
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub pitr_history_size: UIntGauge,
|
||||
pub archival_size: UIntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
@@ -2179,6 +2199,15 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let pitr_history_size = PITR_HISTORY_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let archival_size = TIMELINE_ARCHIVE_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -2231,6 +2260,8 @@ impl TimelineMetrics {
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
pitr_history_size,
|
||||
archival_size,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
@@ -2288,6 +2319,10 @@ impl TimelineMetrics {
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
|
||||
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
@@ -2874,6 +2874,7 @@ impl Tenant {
|
||||
{
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
// Cull any expired leases
|
||||
let now = SystemTime::now();
|
||||
target.leases.retain(|_, lease| !lease.is_expired(&now));
|
||||
|
||||
@@ -2882,6 +2883,31 @@ impl Tenant {
|
||||
.valid_lsn_lease_count_gauge
|
||||
.set(target.leases.len() as u64);
|
||||
|
||||
// Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
|
||||
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
|
||||
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
|
||||
target.within_ancestor_pitr =
|
||||
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
|
||||
}
|
||||
}
|
||||
|
||||
// Update metrics that depend on GC state
|
||||
timeline
|
||||
.metrics
|
||||
.archival_size
|
||||
.set(if target.within_ancestor_pitr {
|
||||
timeline.metrics.current_logical_size_gauge.get()
|
||||
} else {
|
||||
0
|
||||
});
|
||||
timeline.metrics.pitr_history_size.set(
|
||||
timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(target.cutoffs.pitr)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0,
|
||||
);
|
||||
|
||||
match gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
target.retain_lsns = branchpoints;
|
||||
@@ -7063,6 +7089,7 @@ mod tests {
|
||||
horizon: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -463,6 +463,9 @@ pub(crate) struct GcInfo {
|
||||
|
||||
/// Leases granted to particular LSNs.
|
||||
pub(crate) leases: BTreeMap<Lsn, LsnLease>,
|
||||
|
||||
/// Whether our branch point is within our ancestor's PITR interval (for cost estimation)
|
||||
pub(crate) within_ancestor_pitr: bool,
|
||||
}
|
||||
|
||||
impl GcInfo {
|
||||
@@ -851,6 +854,18 @@ impl Timeline {
|
||||
.map(|ancestor| ancestor.timeline_id)
|
||||
}
|
||||
|
||||
/// Get the bytes written since the PITR cutoff on this branch, and
|
||||
/// whether this branch's ancestor_lsn is within its parent's PITR.
|
||||
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let history = self
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(gc_info.cutoffs.pitr)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0;
|
||||
(history, gc_info.within_ancestor_pitr)
|
||||
}
|
||||
|
||||
/// Lock and get timeline's GC cutoff
|
||||
pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
|
||||
self.latest_gc_cutoff_lsn.read()
|
||||
|
||||
@@ -144,6 +144,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_smgr_query_seconds_bucket",
|
||||
"pageserver_smgr_query_seconds_count",
|
||||
"pageserver_smgr_query_seconds_sum",
|
||||
"pageserver_archive_size",
|
||||
"pageserver_pitr_history_size",
|
||||
"pageserver_storage_operations_seconds_count_total",
|
||||
"pageserver_storage_operations_seconds_sum_total",
|
||||
"pageserver_evictions_total",
|
||||
|
||||
Reference in New Issue
Block a user