observability and debugging facilities

This commit is contained in:
Christian Schwarz
2025-07-20 23:13:02 +00:00
parent f4b38d5975
commit b47d3900b9
5 changed files with 74 additions and 20 deletions

View File

@@ -1604,6 +1604,8 @@ pub struct TimelineInfo {
/// Whether the timeline is invisible in synthetic size calculations.
pub is_invisible: Option<bool>,
pub standby_horizon: serde_json::Value,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -473,6 +473,8 @@ async fn build_timeline_info_common(
*timeline.get_applied_gc_cutoff_lsn(),
);
let standby_horizon = timeline.standby_horizons.dump();
let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
timeline_id: timeline.timeline_id,
@@ -508,6 +510,7 @@ async fn build_timeline_info_common(
is_invisible: Some(is_invisible),
walreceiver_status,
standby_horizon,
};
Ok(info)
}

View File

@@ -39,6 +39,7 @@ use crate::tenant::mgr::TenantSlot;
use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::throttle::ThrottleResult;
use crate::tenant::timeline::standby_horizon;
/// Prometheus histogram buckets (in seconds) for operations in the critical
/// path. In other words, operations that directly affect the latency of user
@@ -729,6 +730,14 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
/// Gauge vector backing `pageserver_standby_horizon_leases`: the current number of
/// standby horizon leases, labelled by tenant, shard and timeline.
static STANDBY_HORIZON_LEASES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_standby_horizon_leases",
        "Gauge indicating current number of standby horizon leases, per timeline",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
@@ -3229,7 +3238,7 @@ pub(crate) struct TimelineMetrics {
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub layers_per_read: Histogram,
pub standby_horizon_gauge: IntGauge,
pub standby_horizon: standby_horizon::Metrics,
pub resident_physical_size_gauge: UIntGauge,
pub visible_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -3331,9 +3340,16 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let standby_horizon = standby_horizon::Metrics {
legacy_value: STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap(),
leases_count_gauge: STANDBY_HORIZON_LEASES.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
]).unwrap(),
};
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -3417,7 +3433,7 @@ impl TimelineMetrics {
pitr_history_size,
archival_size,
layers_per_read,
standby_horizon_gauge,
standby_horizon,
resident_physical_size_gauge,
visible_physical_size_gauge,
current_logical_size_gauge,
@@ -3561,6 +3577,7 @@ impl TimelineMetrics {
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON_LEASES.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

View File

@@ -11,7 +11,7 @@ pub mod layer_manager;
pub(crate) mod logical_size;
pub mod offload;
pub mod span;
mod standby_horizon;
pub(crate) mod standby_horizon;
pub mod uninit;
mod walreceiver;
@@ -1884,6 +1884,8 @@ impl Timeline {
.checked_add(Duration::from_secs(5 * 60))
.unwrap());
}
let applied_gc_cutoff_lsn =
todo!("think about boundary conditions? we didn't have any before though");
let length = todo!("duplicate init lease deadline logic?");
self.standby_horizons
.upsert_lease(lease_id, lsn, length)
@@ -3170,9 +3172,7 @@ impl Timeline {
l0_compaction_trigger: resources.l0_compaction_trigger,
gc_lock: tokio::sync::Mutex::default(),
standby_horizons: standby_horizon::Horizons::new(
metrics.standby_horizon_gauge.clone(),
),
standby_horizons: standby_horizon::Horizons::new(metrics.standby_horizon.clone()),
pagestream_throttle: resources.pagestream_throttle,
@@ -6554,6 +6554,7 @@ impl Timeline {
};
let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
// TODO: revisit this once we've fully transitioned to leases. 10GiB isn't _that_ much

View File

@@ -15,18 +15,29 @@ use std::{
time::{Duration, SystemTime},
};
use metrics::IntGauge;
use metrics::{IntGauge, UIntGauge};
use utils::lsn::Lsn;
use crate::assert_u64_eq_usize::UsizeIsU64;
/// Standby horizon bookkeeping: the horizon reported via the legacy mechanism
/// plus the set of leases keyed by lease id.
///
/// All state is kept in `Inner` behind a `std::sync::Mutex`, so the methods
/// take `&self` and lock internally.
pub struct Horizons {
// The mutex-protected state; every method locks this for its whole duration.
inner: std::sync::Mutex<Inner>,
}
struct Inner {
legacy: Option<Lsn>,
legacy_metric: IntGauge,
leases_by_id: HashMap<String, Lease>,
metrics: Metrics,
}
/// The gauges that `Horizons` keeps up to date.
///
/// `Horizons::new` takes ownership of one instance and resets both gauges
/// (`legacy_value` to `Lsn::INVALID`, `leases_count_gauge` to zero).
#[derive(Clone)]
pub struct Metrics {
/// `pageserver_standby_horizon`
pub legacy_value: IntGauge,
/// `pageserver_standby_horizon_leases`
pub leases_count_gauge: UIntGauge,
}
#[derive(Debug)]
struct Lease {
valid_until: SystemTime,
lsn: Lsn,
@@ -59,13 +70,18 @@ pub struct Mins {
}
impl Horizons {
pub fn new(legacy_metric: IntGauge) -> Self {
legacy_metric.set(Lsn::INVALID.0 as i64);
pub fn new(metrics: Metrics) -> Self {
let legacy = None;
metrics.legacy_value.set(Lsn::INVALID.0 as i64);
let leases_by_id = HashMap::default();
metrics.leases_count_gauge.set(0);
Self {
inner: std::sync::Mutex::new(Inner {
legacy: None,
legacy_metric,
leases_by_id: Default::default(),
legacy,
leases_by_id,
metrics,
}),
}
}
@@ -74,7 +90,7 @@ impl Horizons {
pub fn register_legacy_update(&self, lsn: Lsn) {
let mut inner = self.inner.lock().unwrap();
inner.legacy = Some(lsn);
inner.legacy_metric.set(lsn.0 as i64);
inner.metrics.legacy_value.set(lsn.0 as i64);
}
/// Get the minimum standby horizon and clear the horizon propagated via the legacy mechanism
@@ -88,7 +104,7 @@ impl Horizons {
pub fn min_and_clear_legacy(&self) -> Mins {
let mut inner = self.inner.lock().unwrap();
let legacy = {
inner.legacy_metric.set(Lsn::INVALID.0 as i64);
inner.metrics.legacy_value.set(Lsn::INVALID.0 as i64);
inner.legacy.take()
};
@@ -116,13 +132,28 @@ impl Horizons {
}
hash_map::Entry::Vacant(entry) => entry.insert(update),
};
Ok(LeaseInfo {
let res = LeaseInfo {
valid_until: updated.valid_until,
})
};
inner.metrics.leases_count_gauge.set(inner.leases_by_id.len().into_u64());
Ok(res)
}
/// Remove every lease that has expired as of `now` and refresh the lease-count gauge.
///
/// A lease is considered expired once its `valid_until` is at or before `now`;
/// leases with `valid_until > now` are kept.
pub fn cull_leases(&self, now: SystemTime) {
    let mut inner = self.inner.lock().unwrap();
    // `retain` keeps the entries for which the predicate returns `true`, so the
    // predicate has to select the leases that are still valid, not the expired ones.
    inner.leases_by_id.retain(|_, l| l.valid_until > now);
    // Keep the gauge in sync with the map, as is done after a lease is inserted/updated.
    let remaining = inner.leases_by_id.len().into_u64();
    inner.metrics.leases_count_gauge.set(remaining);
}
/// Snapshot the current state as a JSON object for debugging.
///
/// The result has two string-valued keys, `"legacy"` and `"leases_by_id"`,
/// each holding the `Debug` rendering of the corresponding field.
pub fn dump(&self) -> serde_json::Value {
    let guard = self.inner.lock().unwrap();
    // Exhaustive destructuring: adding a field to `Inner` forces a decision
    // here on whether it belongs in the dump.
    let Inner { legacy, leases_by_id, metrics: _ } = &*guard;
    let legacy_repr = format!("{legacy:?}");
    let leases_repr = format!("{leases_by_id:?}");
    serde_json::json!({
        "legacy": legacy_repr,
        "leases_by_id": leases_repr,
    })
}
}