From b47d3900b930c1be0dc2619b453da9f06480e6e7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sun, 20 Jul 2025 23:13:02 +0000 Subject: [PATCH] observability and debugging facilities --- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/routes.rs | 3 ++ pageserver/src/metrics.rs | 27 ++++++++-- pageserver/src/tenant/timeline.rs | 9 ++-- .../src/tenant/timeline/standby_horizon.rs | 53 +++++++++++++++---- 5 files changed, 74 insertions(+), 20 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16545364c1..e976467ee9 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1604,6 +1604,8 @@ pub struct TimelineInfo { /// Whether the timeline is invisible in synthetic size calculations. pub is_invisible: Option, + + pub standby_horizon: serde_json::Value, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2995a37089..36f7346c6f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -473,6 +473,8 @@ async fn build_timeline_info_common( *timeline.get_applied_gc_cutoff_lsn(), ); + let standby_horizon = timeline.standby_horizons.dump(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -508,6 +510,7 @@ async fn build_timeline_info_common( is_invisible: Some(is_invisible), walreceiver_status, + standby_horizon, }; Ok(info) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 93ce315edb..f14d93e5fb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -39,6 +39,7 @@ use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::throttle::ThrottleResult; +use crate::tenant::timeline::standby_horizon; /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user @@ -729,6 +730,14 @@ static STANDBY_HORIZON: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static STANDBY_HORIZON_LEASES: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_standby_horizon_leases", + "Gauge indicating current number of standby horizon leases, per timeline", + &["tenant_id", "shard_id", "timeline_id"] + ).expect("failed to define a metric") +}); + static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", @@ -3229,7 +3238,7 @@ pub(crate) struct TimelineMetrics { pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, pub layers_per_read: Histogram, - pub standby_horizon_gauge: IntGauge, + pub standby_horizon: standby_horizon::Metrics, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -3331,9 +3340,16 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let standby_horizon_gauge = STANDBY_HORIZON - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); + let standby_horizon = standby_horizon::Metrics { + legacy_value: STANDBY_HORIZON + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(), + leases_count_gauge: STANDBY_HORIZON_LEASES.get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + ]).unwrap(), + }; let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -3417,7 +3433,7 @@ impl TimelineMetrics { pitr_history_size, archival_size, layers_per_read, - standby_horizon_gauge, + standby_horizon, resident_physical_size_gauge, visible_physical_size_gauge, current_logical_size_gauge, @@ -3561,6 +3577,7 @@ impl TimelineMetrics { let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = STANDBY_HORIZON_LEASES.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7afbd14c10..232dbdc2e5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -11,7 +11,7 @@ pub mod layer_manager; pub(crate) mod logical_size; pub mod offload; pub mod span; -mod standby_horizon; +pub(crate) mod standby_horizon; pub mod uninit; mod walreceiver; @@ -1884,6 +1884,8 @@ impl Timeline { .checked_add(Duration::from_secs(5 * 60)) .unwrap()); } + let applied_gc_cutoff_lsn = + todo!("think about boundary conditions? we didn't have any before though"); let length = todo!("duplicate init lease deadline logic?"); self.standby_horizons .upsert_lease(lease_id, lsn, length) @@ -3170,9 +3172,7 @@ impl Timeline { l0_compaction_trigger: resources.l0_compaction_trigger, gc_lock: tokio::sync::Mutex::default(), - standby_horizons: standby_horizon::Horizons::new( - metrics.standby_horizon_gauge.clone(), - ), + standby_horizons: standby_horizon::Horizons::new(metrics.standby_horizon.clone()), pagestream_throttle: resources.pagestream_throttle, @@ -6554,6 +6554,7 @@ impl Timeline { }; let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default()); + // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. // TODO: revisit this once we've fully transitioned to leases. 10GiB isn't _that_ much diff --git a/pageserver/src/tenant/timeline/standby_horizon.rs b/pageserver/src/tenant/timeline/standby_horizon.rs index dc6d67f353..226452d731 100644 --- a/pageserver/src/tenant/timeline/standby_horizon.rs +++ b/pageserver/src/tenant/timeline/standby_horizon.rs @@ -15,18 +15,29 @@ use std::{ time::{Duration, SystemTime}, }; -use metrics::IntGauge; +use metrics::{IntGauge, UIntGauge}; use utils::lsn::Lsn; +use crate::assert_u64_eq_usize::UsizeIsU64; + pub struct Horizons { inner: std::sync::Mutex, } struct Inner { legacy: Option, - legacy_metric: IntGauge, leases_by_id: HashMap, + metrics: Metrics, } +#[derive(Clone)] +pub struct Metrics { + /// `pageserver_standby_horizon` + pub legacy_value: IntGauge, + /// `pageserver_standby_horizon_leases` + pub leases_count_gauge: UIntGauge, +} + +#[derive(Debug)] struct Lease { valid_until: SystemTime, lsn: Lsn, @@ -59,13 +70,18 @@ pub struct Mins { } impl Horizons { - pub fn new(legacy_metric: IntGauge) -> Self { - legacy_metric.set(Lsn::INVALID.0 as i64); + pub fn new(metrics: Metrics) -> Self { + let legacy = None; + metrics.legacy_value.set(Lsn::INVALID.0 as i64); + + let leases_by_id = HashMap::default(); + metrics.leases_count_gauge.set(0); + Self { inner: std::sync::Mutex::new(Inner { - legacy: None, - legacy_metric, - leases_by_id: Default::default(), + legacy, + leases_by_id, + metrics, }), } } @@ -74,7 +90,7 @@ impl Horizons { pub fn register_legacy_update(&self, lsn: Lsn) { let mut inner = self.inner.lock().unwrap(); inner.legacy = Some(lsn); - inner.legacy_metric.set(lsn.0 as i64); + inner.metrics.legacy_value.set(lsn.0 as i64); } /// Get the minimum standby horizon and clear the horizon propagated via the legacy mechanism @@ -88,7 +104,7 @@ impl Horizons { pub fn min_and_clear_legacy(&self) -> Mins { let mut inner = self.inner.lock().unwrap(); let legacy = { - inner.legacy_metric.set(Lsn::INVALID.0 as i64); + inner.metrics.legacy_value.set(Lsn::INVALID.0 as i64); inner.legacy.take() }; @@ -116,13 +132,28 @@ impl Horizons { } hash_map::Entry::Vacant(entry) => entry.insert(update), }; - Ok(LeaseInfo { + let res = LeaseInfo { valid_until: updated.valid_until, - }) + }; + inner.metrics.leases_count_gauge.set(inner.leases_by_id.len().into_u64()); + Ok(res) } pub fn cull_leases(&self, now: SystemTime) { let mut inner = self.inner.lock().unwrap(); inner.leases_by_id.retain(|_, l| l.valid_until <= now); } + + pub fn dump(&self) -> serde_json::Value { + let inner = self.inner.lock().unwrap(); + let Inner { + legacy, + leases_by_id, + metrics: _, + } = &*inner; + serde_json::json!({ + "legacy": format!("{legacy:?}"), + "leases_by_id": format!("{leases_by_id:?}"), + }) + } }