From 210be6b6aba377592aa9aaefcea51c6427d22dac Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 30 May 2023 16:08:02 +0300 Subject: [PATCH] Replace broker duration logs with metrics (#4370) I've added logs for broker push duration after every iteration in https://github.com/neondatabase/neon/pull/4142. This log has not found any real issues, so we can replace it with metrics, to slightly reduce log volume. LogQL query found that pushes longer that 500ms happened only 90 times for the last month. https://neonprod.grafana.net/goto/KTNj9UwVg?orgId=1 `{unit="safekeeper.service"} |= "timeline updates to broker in" | regexp "to broker in (?P.*)" | duration > 500ms` --- safekeeper/src/broker.rs | 12 ++++++++++-- safekeeper/src/metrics.rs | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 5e25d22ec1..48c56ee58f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -19,8 +19,10 @@ use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use crate::metrics::BROKER_ITERATION_TIMELINES; use crate::metrics::BROKER_PULLED_UPDATES; use crate::metrics::BROKER_PUSHED_UPDATES; +use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -61,8 +63,14 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { BROKER_PUSHED_UPDATES.inc(); } let elapsed = now.elapsed(); - // Log duration every second. Should be about 10MB of logs per day. - info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + + BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64()); + BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64); + + if elapsed > push_interval / 2 { + info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + } + sleep(push_interval).await; } }; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 189af2b044..235a88501d 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -125,6 +125,25 @@ pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_backup_errors_total counter") }); +pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_push_update_seconds", + "Seconds to push all timeline updates to the broker", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") +}); +pub const TIMELINES_COUNT_BUCKETS: &[f64] = &[ + 1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0, +]; +pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_iteration_timelines", + "Count of timelines pushed to the broker in a single iteration", + TIMELINES_COUNT_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec") +}); pub const LABEL_UNKNOWN: &str = "unknown";