mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 14:02:55 +00:00
Fix deadlock in safekeeper metrics (#2566)
We had a problem where almost all of the threads were waiting on a futex syscall. More specifically: - `/metrics` handler was inside `TimelineCollector::collect()`, waiting on a mutex for a single Timeline - This exact timeline was inside `control_file::FileStorage::persist()`, waiting on a mutex for Lazy initialization of `PERSIST_CONTROL_FILE_SECONDS` - `PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram>` was blocked on `prometheus::register` - `prometheus::register` calls `DEFAULT_REGISTRY.write().register()` to take a write lock on Registry and add a new metric - `DEFAULT_REGISTRY` lock was already taken inside `DEFAULT_REGISTRY.gather()`, which was called by `/metrics` handler to collect all metrics This commit creates another Registry with a separate lock, to avoid deadlock in a case where `TimelineCollector` triggers registration of new metrics inside default registry.
This commit is contained in:
committed by
GitHub
parent
b99bed510d
commit
f25dd75be9
@@ -3,7 +3,7 @@
|
||||
//! Otherwise, we might not see all metrics registered via
|
||||
//! a default registry.
|
||||
use once_cell::sync::Lazy;
|
||||
use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
|
||||
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
|
||||
pub use prometheus::opts;
|
||||
pub use prometheus::register;
|
||||
pub use prometheus::{core, default_registry, proto};
|
||||
@@ -17,6 +17,7 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
|
||||
pub use prometheus::{register_int_gauge, IntGauge};
|
||||
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
|
||||
pub use prometheus::{Encoder, TextEncoder};
|
||||
use prometheus::{Registry, Result};
|
||||
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
@@ -32,13 +33,27 @@ macro_rules! register_uint_gauge_vec {
|
||||
}};
|
||||
}
|
||||
|
||||
/// Special internal registry, to collect metrics independently from the default registry.
|
||||
/// Was introduced to fix deadlock with lazy registration of metrics in the default registry.
|
||||
static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
|
||||
|
||||
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
||||
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
||||
/// while holding the lock.
|
||||
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
|
||||
INTERNAL_REGISTRY.register(c)
|
||||
}
|
||||
|
||||
/// Gathers all Prometheus metrics and records the I/O stats just before that.
|
||||
///
|
||||
/// Metrics gathering is a relatively simple and standalone operation, so
|
||||
/// it might be fine to do it this way to keep things simple.
|
||||
pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
|
||||
update_rusage_metrics();
|
||||
prometheus::gather()
|
||||
let mut mfs = prometheus::gather();
|
||||
let mut internal_mfs = INTERNAL_REGISTRY.gather();
|
||||
mfs.append(&mut internal_mfs);
|
||||
mfs
|
||||
}
|
||||
|
||||
static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
|
||||
@@ -9,6 +9,7 @@ use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::RequestInfo;
|
||||
use routerify::{Middleware, Router, RouterBuilder, RouterService};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::info;
|
||||
|
||||
use std::future::Future;
|
||||
@@ -35,7 +36,13 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
|
||||
let mut buffer = vec![];
|
||||
let encoder = TextEncoder::new();
|
||||
|
||||
let metrics = metrics::gather();
|
||||
let metrics = tokio::task::spawn_blocking(move || {
|
||||
// Currently we take a lot of mutexes while collecting metrics, so it's
|
||||
// better to spawn a blocking task to avoid blocking the event loop.
|
||||
metrics::gather()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
encoder.encode(&metrics, &mut buffer).unwrap();
|
||||
|
||||
let response = Response::builder()
|
||||
|
||||
@@ -291,9 +291,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
|
||||
// Register metrics collector for active timelines. It's important to do this
|
||||
// after daemonizing, otherwise process collector will be upset.
|
||||
let registry = metrics::default_registry();
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
registry.register(Box::new(timeline_collector))?;
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
|
||||
Reference in New Issue
Block a user