From a2a9c598beff888c3b097b28bbe06b87c947ff26 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 12 May 2023 18:00:06 +0200 Subject: [PATCH] add counter metric that increases whenever a background loop overruns its period (#4223) We already have the warn!() log line for this condition. This PR adds a corresponding metric on which we can have a dedicated alert. This is cheaper and more reliable than alerting on the logs, because we run into log rate limits from time to time these days. refs https://github.com/neondatabase/neon/issues/4222 --- pageserver/src/metrics.rs | 12 ++++++++++++ pageserver/src/tenant/tasks.rs | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c04f6e054b..bbd59c13bd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -489,6 +489,15 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_period_overrun_count", + "Incremented whenever warn_when_period_overrun() logs a warning.", + &["task", "period"], + ) + .expect("failed to define a metric") +}); + // walreceiver metrics pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { @@ -1231,4 +1240,7 @@ pub fn preinitialize_metrics() { // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); + + // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels. 
+ BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset(); } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7e7dbd3c5c..6bf26f1da1 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -259,6 +259,7 @@ pub(crate) async fn random_init_delay( } } +/// Attention: the `task` and `period` become labels of a pageserver-wide Prometheus metric. pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { // Duration::ZERO will happen because it's the "disable [bgtask]" value. if elapsed >= period && period != Duration::ZERO { @@ -271,5 +272,8 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task task, "task iteration took longer than the configured period" ); + crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT + .with_label_values(&[task, &format!("{}", period.as_secs())]) + .inc(); } }