diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 920d3fd17e..e290828d37 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,7 +3,7 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use once_cell::sync::Lazy; -use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec}; +use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; @@ -17,6 +17,7 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; +use prometheus::{Registry, Result}; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; @@ -32,13 +33,27 @@ macro_rules! register_uint_gauge_vec { }}; } +/// Special internal registry, to collect metrics independently from the default registry. +/// Was introduced to fix deadlock with lazy registration of metrics in the default registry. +static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); + +/// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector +/// while holding the lock. +pub fn register_internal(c: Box) -> Result<()> { + INTERNAL_REGISTRY.register(c) +} + /// Gathers all Prometheus metrics and records the I/O stats just before that. /// /// Metrics gathering is a relatively simple and standalone operation, so /// it might be fine to do it this way to keep things simple. pub fn gather() -> Vec { update_rusage_metrics(); - prometheus::gather() + let mut mfs = prometheus::gather(); + let mut internal_mfs = INTERNAL_REGISTRY.gather(); + mfs.append(&mut internal_mfs); + mfs } static DISK_IO_BYTES: Lazy = Lazy::new(|| { diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4066791e2b..7a519929cf 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -9,6 +9,7 @@ use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; +use tokio::task::JoinError; use tracing::info; use std::future::Future; @@ -35,7 +36,13 @@ async fn prometheus_metrics_handler(_req: Request) -> Result, init: bo // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. - let registry = metrics::default_registry(); let timeline_collector = safekeeper::metrics::TimelineCollector::new(); - registry.register(Box::new(timeline_collector))?; + metrics::register_internal(Box::new(timeline_collector))?; let signals = signals::install_shutdown_handlers()?; let mut threads = vec![];