mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 15:02:56 +00:00
observability and debugging facilities
This commit is contained in:
@@ -1604,6 +1604,8 @@ pub struct TimelineInfo {
|
||||
|
||||
/// Whether the timeline is invisible in synthetic size calculations.
|
||||
pub is_invisible: Option<bool>,
|
||||
|
||||
pub standby_horizon: serde_json::Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -473,6 +473,8 @@ async fn build_timeline_info_common(
|
||||
*timeline.get_applied_gc_cutoff_lsn(),
|
||||
);
|
||||
|
||||
let standby_horizon = timeline.standby_horizons.dump();
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_shard_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
@@ -508,6 +510,7 @@ async fn build_timeline_info_common(
|
||||
is_invisible: Some(is_invisible),
|
||||
|
||||
walreceiver_status,
|
||||
standby_horizon,
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ use crate::tenant::mgr::TenantSlot;
|
||||
use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::throttle::ThrottleResult;
|
||||
use crate::tenant::timeline::standby_horizon;
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) for operations in the critical
|
||||
/// path. In other words, operations that directly affect that latency of user
|
||||
@@ -729,6 +730,14 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static STANDBY_HORIZON_LEASES: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_standby_horizon_leases",
|
||||
"Gauge indicating current number of standby horizon leases, per timeline",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
).expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_resident_physical_size",
|
||||
@@ -3229,7 +3238,7 @@ pub(crate) struct TimelineMetrics {
|
||||
pub pitr_history_size: UIntGauge,
|
||||
pub archival_size: UIntGauge,
|
||||
pub layers_per_read: Histogram,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub standby_horizon: standby_horizon::Metrics,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub visible_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
@@ -3331,9 +3340,16 @@ impl TimelineMetrics {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let standby_horizon = standby_horizon::Metrics {
|
||||
legacy_value: STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap(),
|
||||
leases_count_gauge: STANDBY_HORIZON_LEASES.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
]).unwrap(),
|
||||
};
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -3417,7 +3433,7 @@ impl TimelineMetrics {
|
||||
pitr_history_size,
|
||||
archival_size,
|
||||
layers_per_read,
|
||||
standby_horizon_gauge,
|
||||
standby_horizon,
|
||||
resident_physical_size_gauge,
|
||||
visible_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
@@ -3561,6 +3577,7 @@ impl TimelineMetrics {
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = STANDBY_HORIZON_LEASES.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
{
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
@@ -11,7 +11,7 @@ pub mod layer_manager;
|
||||
pub(crate) mod logical_size;
|
||||
pub mod offload;
|
||||
pub mod span;
|
||||
mod standby_horizon;
|
||||
pub(crate) mod standby_horizon;
|
||||
pub mod uninit;
|
||||
mod walreceiver;
|
||||
|
||||
@@ -1884,6 +1884,8 @@ impl Timeline {
|
||||
.checked_add(Duration::from_secs(5 * 60))
|
||||
.unwrap());
|
||||
}
|
||||
let applied_gc_cutoff_lsn =
|
||||
todo!("think about boundary conditions? we didn't have any before though");
|
||||
let length = todo!("duplicate init lease deadline logic?");
|
||||
self.standby_horizons
|
||||
.upsert_lease(lease_id, lsn, length)
|
||||
@@ -3170,9 +3172,7 @@ impl Timeline {
|
||||
l0_compaction_trigger: resources.l0_compaction_trigger,
|
||||
gc_lock: tokio::sync::Mutex::default(),
|
||||
|
||||
standby_horizons: standby_horizon::Horizons::new(
|
||||
metrics.standby_horizon_gauge.clone(),
|
||||
),
|
||||
standby_horizons: standby_horizon::Horizons::new(metrics.standby_horizon.clone()),
|
||||
|
||||
pagestream_throttle: resources.pagestream_throttle,
|
||||
|
||||
@@ -6554,6 +6554,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
|
||||
|
||||
// Hold GC for the standby, but as a safety guard do it only within some
|
||||
// reasonable lag.
|
||||
// TODO: revisit this once we've fully transitioned to leases. 10GiB isn't _that_ much
|
||||
|
||||
@@ -15,18 +15,29 @@ use std::{
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use metrics::IntGauge;
|
||||
use metrics::{IntGauge, UIntGauge};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::assert_u64_eq_usize::UsizeIsU64;
|
||||
|
||||
pub struct Horizons {
|
||||
inner: std::sync::Mutex<Inner>,
|
||||
}
|
||||
struct Inner {
|
||||
legacy: Option<Lsn>,
|
||||
legacy_metric: IntGauge,
|
||||
leases_by_id: HashMap<String, Lease>,
|
||||
metrics: Metrics,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Metrics {
|
||||
/// `pageserver_standby_horizon`
|
||||
pub legacy_value: IntGauge,
|
||||
/// `pageserver_standby_horizon_leases`
|
||||
pub leases_count_gauge: UIntGauge,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Lease {
|
||||
valid_until: SystemTime,
|
||||
lsn: Lsn,
|
||||
@@ -59,13 +70,18 @@ pub struct Mins {
|
||||
}
|
||||
|
||||
impl Horizons {
|
||||
pub fn new(legacy_metric: IntGauge) -> Self {
|
||||
legacy_metric.set(Lsn::INVALID.0 as i64);
|
||||
pub fn new(metrics: Metrics) -> Self {
|
||||
let legacy = None;
|
||||
metrics.legacy_value.set(Lsn::INVALID.0 as i64);
|
||||
|
||||
let leases_by_id = HashMap::default();
|
||||
metrics.leases_count_gauge.set(0);
|
||||
|
||||
Self {
|
||||
inner: std::sync::Mutex::new(Inner {
|
||||
legacy: None,
|
||||
legacy_metric,
|
||||
leases_by_id: Default::default(),
|
||||
legacy,
|
||||
leases_by_id,
|
||||
metrics,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -74,7 +90,7 @@ impl Horizons {
|
||||
pub fn register_legacy_update(&self, lsn: Lsn) {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.legacy = Some(lsn);
|
||||
inner.legacy_metric.set(lsn.0 as i64);
|
||||
inner.metrics.legacy_value.set(lsn.0 as i64);
|
||||
}
|
||||
|
||||
/// Get the minimum standby horizon and clear the horizon propagated via the legacy mechanism
|
||||
@@ -88,7 +104,7 @@ impl Horizons {
|
||||
pub fn min_and_clear_legacy(&self) -> Mins {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let legacy = {
|
||||
inner.legacy_metric.set(Lsn::INVALID.0 as i64);
|
||||
inner.metrics.legacy_value.set(Lsn::INVALID.0 as i64);
|
||||
inner.legacy.take()
|
||||
};
|
||||
|
||||
@@ -116,13 +132,28 @@ impl Horizons {
|
||||
}
|
||||
hash_map::Entry::Vacant(entry) => entry.insert(update),
|
||||
};
|
||||
Ok(LeaseInfo {
|
||||
let res = LeaseInfo {
|
||||
valid_until: updated.valid_until,
|
||||
})
|
||||
};
|
||||
inner.metrics.leases_count_gauge.set(inner.leases_by_id.len().into_u64());
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn cull_leases(&self, now: SystemTime) {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.leases_by_id.retain(|_, l| l.valid_until <= now);
|
||||
}
|
||||
|
||||
pub fn dump(&self) -> serde_json::Value {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let Inner {
|
||||
legacy,
|
||||
leases_by_id,
|
||||
metrics: _,
|
||||
} = &*inner;
|
||||
serde_json::json!({
|
||||
"legacy": format!("{legacy:?}"),
|
||||
"leases_by_id": format!("{leases_by_id:?}"),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user