add counter metric that increases whenever a background loop overruns its period (#4223)

We already have the warn!() log line for this condition. This PR adds a
corresponding metric on which we can have a dedicated alert. Cheaper and
more reliable than alerting on the logs, because, we run into log rate
limits from time to time these days.

refs https://github.com/neondatabase/neon/issues/4222
This commit is contained in:
Christian Schwarz
2023-05-12 18:00:06 +02:00
committed by GitHub
parent bb06d281ea
commit a2a9c598be
2 changed files with 16 additions and 0 deletions

View File

@@ -489,6 +489,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_period_overrun_count",
"Incremented whenever warn_when_period_overrun() logs a warning.",
&["task", "period"],
)
.expect("failed to define a metric")
});
// walreceiver metrics
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
@@ -1231,4 +1240,7 @@ pub fn preinitialize_metrics() {
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
}

View File

@@ -259,6 +259,7 @@ pub(crate) async fn random_init_delay(
}
}
/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
if elapsed >= period && period != Duration::ZERO {
@@ -271,5 +272,8 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task
task,
"task iteration took longer than the configured period"
);
crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
.with_label_values(&[task, &format!("{}", period.as_secs())])
.inc();
}
}