From b41ee81308734743497acf15e2e31cb59cd8a9fd Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 22 Jan 2024 21:38:05 +0300 Subject: [PATCH] Log warning on slow WAL removal (#6432) Also add `safekeeper_active_timelines` metric. Should help investigating #6403 --- safekeeper/src/metrics.rs | 18 +++++++++++++++++- safekeeper/src/remove_wal.rs | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 8810252e0b..fbba2e00fc 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -110,7 +110,7 @@ pub static REMOVED_WAL_SEGMENTS: Lazy = Lazy::new(|| { pub static BACKED_UP_SEGMENTS: Lazy = Lazy::new(|| { register_int_counter!( "safekeeper_backed_up_segments_total", - "Number of WAL segments backed up to the broker" + "Number of WAL segments backed up to the S3" ) .expect("Failed to register safekeeper_backed_up_segments_total counter") }); @@ -337,6 +337,7 @@ pub struct TimelineCollector { flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, timelines_count: IntGauge, + active_timelines_count: IntGauge, } impl Default for TimelineCollector { @@ -520,6 +521,13 @@ impl TimelineCollector { .unwrap(); descs.extend(timelines_count.desc().into_iter().cloned()); + let active_timelines_count = IntGauge::new( + "safekeeper_active_timelines", + "Total number of active timelines", + ) + .unwrap(); + descs.extend(active_timelines_count.desc().into_iter().cloned()); + TimelineCollector { descs, commit_lsn, @@ -540,6 +548,7 @@ impl TimelineCollector { flushed_wal_seconds, collect_timeline_metrics, timelines_count, + active_timelines_count, } } } @@ -572,6 +581,7 @@ impl Collector for TimelineCollector { let timelines = GlobalTimelines::get_all(); let timelines_count = timelines.len(); + let mut active_timelines_count = 0; // Prometheus Collector is sync, and data is stored under async lock. To // bridge the gap with a crutch, collect data in spawned thread with @@ -590,6 +600,10 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; + if tli.timeline_is_active { + active_timelines_count += 1; + } + self.commit_lsn .with_label_values(labels) .set(tli.mem_state.commit_lsn.into()); @@ -681,6 +695,8 @@ impl Collector for TimelineCollector { // report total number of timelines self.timelines_count.set(timelines_count as i64); + self.active_timelines_count + .set(active_timelines_count as i64); mfs.extend(self.timelines_count.collect()); mfs diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index d96eedf401..87c3ecaf60 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -10,11 +10,15 @@ use crate::{GlobalTimelines, SafeKeeperConf}; pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { let wal_removal_interval = Duration::from_millis(5000); loop { + let now = tokio::time::Instant::now(); + let mut active_timelines = 0; + let tlis = GlobalTimelines::get_all(); for tli in &tlis { if !tli.is_active().await { continue; } + active_timelines += 1; let ttid = tli.ttid; async { if let Err(e) = tli.maybe_persist_control_file().await { @@ -27,6 +31,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { .instrument(info_span!("WAL removal", ttid = %ttid)) .await; } + + let elapsed = now.elapsed(); + let total_timelines = tlis.len(); + + if elapsed > wal_removal_interval { + info!( + "WAL removal is too long, processed {} active timelines ({} total) in {:?}", + active_timelines, total_timelines, elapsed + ); + } + sleep(wal_removal_interval).await; } }