mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
Log warning on slow WAL removal (#6432)
Also add `safekeeper_active_timelines` metric. Should help investigating #6403
This commit is contained in:
committed by
GitHub
parent
205b6111e6
commit
b41ee81308
@@ -110,7 +110,7 @@ pub static REMOVED_WAL_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static BACKED_UP_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"safekeeper_backed_up_segments_total",
|
||||
"Number of WAL segments backed up to the broker"
|
||||
"Number of WAL segments backed up to the S3"
|
||||
)
|
||||
.expect("Failed to register safekeeper_backed_up_segments_total counter")
|
||||
});
|
||||
@@ -337,6 +337,7 @@ pub struct TimelineCollector {
|
||||
flushed_wal_seconds: GaugeVec,
|
||||
collect_timeline_metrics: Gauge,
|
||||
timelines_count: IntGauge,
|
||||
active_timelines_count: IntGauge,
|
||||
}
|
||||
|
||||
impl Default for TimelineCollector {
|
||||
@@ -520,6 +521,13 @@ impl TimelineCollector {
|
||||
.unwrap();
|
||||
descs.extend(timelines_count.desc().into_iter().cloned());
|
||||
|
||||
let active_timelines_count = IntGauge::new(
|
||||
"safekeeper_active_timelines",
|
||||
"Total number of active timelines",
|
||||
)
|
||||
.unwrap();
|
||||
descs.extend(active_timelines_count.desc().into_iter().cloned());
|
||||
|
||||
TimelineCollector {
|
||||
descs,
|
||||
commit_lsn,
|
||||
@@ -540,6 +548,7 @@ impl TimelineCollector {
|
||||
flushed_wal_seconds,
|
||||
collect_timeline_metrics,
|
||||
timelines_count,
|
||||
active_timelines_count,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -572,6 +581,7 @@ impl Collector for TimelineCollector {
|
||||
|
||||
let timelines = GlobalTimelines::get_all();
|
||||
let timelines_count = timelines.len();
|
||||
let mut active_timelines_count = 0;
|
||||
|
||||
// Prometheus Collector is sync, and data is stored under async lock. To
|
||||
// bridge the gap with a crutch, collect data in spawned thread with
|
||||
@@ -590,6 +600,10 @@ impl Collector for TimelineCollector {
|
||||
let timeline_id = tli.ttid.timeline_id.to_string();
|
||||
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
|
||||
|
||||
if tli.timeline_is_active {
|
||||
active_timelines_count += 1;
|
||||
}
|
||||
|
||||
self.commit_lsn
|
||||
.with_label_values(labels)
|
||||
.set(tli.mem_state.commit_lsn.into());
|
||||
@@ -681,6 +695,8 @@ impl Collector for TimelineCollector {
|
||||
|
||||
// report total number of timelines
|
||||
self.timelines_count.set(timelines_count as i64);
|
||||
self.active_timelines_count
|
||||
.set(active_timelines_count as i64);
|
||||
mfs.extend(self.timelines_count.collect());
|
||||
|
||||
mfs
|
||||
|
||||
@@ -10,11 +10,15 @@ use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let wal_removal_interval = Duration::from_millis(5000);
|
||||
loop {
|
||||
let now = tokio::time::Instant::now();
|
||||
let mut active_timelines = 0;
|
||||
|
||||
let tlis = GlobalTimelines::get_all();
|
||||
for tli in &tlis {
|
||||
if !tli.is_active().await {
|
||||
continue;
|
||||
}
|
||||
active_timelines += 1;
|
||||
let ttid = tli.ttid;
|
||||
async {
|
||||
if let Err(e) = tli.maybe_persist_control_file().await {
|
||||
@@ -27,6 +31,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
.instrument(info_span!("WAL removal", ttid = %ttid))
|
||||
.await;
|
||||
}
|
||||
|
||||
let elapsed = now.elapsed();
|
||||
let total_timelines = tlis.len();
|
||||
|
||||
if elapsed > wal_removal_interval {
|
||||
info!(
|
||||
"WAL removal is too long, processed {} active timelines ({} total) in {:?}",
|
||||
active_timelines, total_timelines, elapsed
|
||||
);
|
||||
}
|
||||
|
||||
sleep(wal_removal_interval).await;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user