From 06b45fd0fd38515cd431aab0d9baae7e13a52058 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 3 Feb 2025 20:23:12 +0100 Subject: [PATCH] utils/logging: add `critical!` macro and metric (#10641) ## Problem We don't currently have good alerts for critical errors, e.g. data loss/corruption. Touches #10094. ## Summary of changes Add a `critical!` macro and corresponding `libmetrics_tracing_event_count{level="critical"}` metric. This will: * Emit an `ERROR` log message with prefix `"CRITICAL:"` and a backtrace. * Increment `libmetrics_tracing_event_count{level="critical"}`, and indirectly `level="error"`. * Trigger a pageable alert (via the metric above). * In debug builds, panic the process. I'll add uses of the macro separately. --- libs/utils/src/logging.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index e205d60d74..753f05b6fd 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,6 +5,24 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +/// Logs a critical error, similarly to `tracing::error!`. This will: +/// +/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". +/// * Trigger a pageable alert (via the metric above). +/// * In debug builds, panic the process. +#[macro_export] +macro_rules! 
critical { + ($($arg:tt)*) => { + if cfg!(debug_assertions) { + panic!($($arg)*); + } + $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); + let backtrace = std::backtrace::Backtrace::capture(); + tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); + }; +} + #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { @@ -25,7 +43,10 @@ impl LogFormat { } } -struct TracingEventCountMetric { +pub struct TracingEventCountMetric { + /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro, + /// and also emit it as a regular error. These are thus double-counted, but that seems fine. + critical: IntCounter, error: IntCounter, warn: IntCounter, info: IntCounter, @@ -33,7 +54,7 @@ struct TracingEventCountMetric { trace: IntCounter, } -static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| { +pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| { let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", @@ -46,6 +67,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| impl TracingEventCountMetric { fn new(vec: IntCounterVec) -> Self { Self { + critical: vec.with_label_values(&["critical"]), error: vec.with_label_values(&["error"]), warn: vec.with_label_values(&["warn"]), info: vec.with_label_values(&["info"]), @@ -54,6 +76,11 @@ impl TracingEventCountMetric { } } + // Allow public access from `critical!` macro. + pub fn inc_critical(&self) { + self.critical.inc(); + } + fn inc_for_level(&self, level: tracing::Level) { let counter = match level { tracing::Level::ERROR => &self.error,