utils/logging: add critical! macro and metric (#10641)

## Problem

We don't currently have good alerts for critical errors, e.g. data
loss/corruption.

Touches #10094.

## Summary of changes

Add a `critical!` macro and corresponding
`libmetrics_tracing_event_count{level="critical"}` metric. This will:

* Emit an `ERROR` log message with prefix `"CRITICAL:"` and a backtrace.
* Increment `libmetrics_tracing_event_count{level="critical"}`, and
indirectly `level="error"`.
* Trigger a pageable alert (via the metric above).
* In debug builds, panic the process.

I'll add uses of the macro separately.
This commit is contained in:
Erik Grinaker
2025-02-03 20:23:12 +01:00
committed by GitHub
parent 715e20343a
commit 06b45fd0fd

View File

@@ -5,6 +5,24 @@ use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, VariantNames};
/// Logs a critical error, similarly to `tracing::error!`. This will:
///
/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
/// * Trigger a pageable alert (via the metric above).
/// * In debug builds, panic the process.
#[macro_export]
macro_rules! critical {
($($arg:tt)*) => {
if cfg!(debug_assertions) {
panic!($($arg)*);
}
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
let backtrace = std::backtrace::Backtrace::capture();
tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
};
}
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
#[strum(serialize_all = "snake_case")]
pub enum LogFormat {
@@ -25,7 +43,10 @@ impl LogFormat {
}
}
struct TracingEventCountMetric {
pub struct TracingEventCountMetric {
/// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
/// and also emit it as a regular error. These are thus double-counted, but that seems fine.
critical: IntCounter,
error: IntCounter,
warn: IntCounter,
info: IntCounter,
@@ -33,7 +54,7 @@ struct TracingEventCountMetric {
trace: IntCounter,
}
static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
"libmetrics_tracing_event_count",
"Number of tracing events, by level",
@@ -46,6 +67,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(||
impl TracingEventCountMetric {
fn new(vec: IntCounterVec) -> Self {
Self {
critical: vec.with_label_values(&["critical"]),
error: vec.with_label_values(&["error"]),
warn: vec.with_label_values(&["warn"]),
info: vec.with_label_values(&["info"]),
@@ -54,6 +76,11 @@ impl TracingEventCountMetric {
}
}
// Allow public access from `critical!` macro.
pub fn inc_critical(&self) {
self.critical.inc();
}
fn inc_for_level(&self, level: tracing::Level) {
let counter = match level {
tracing::Level::ERROR => &self.error,