mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 02:20:42 +00:00
page_service: emit periodic log message while response flush is slow (#10813)
The logic might seem a bit intricate / over-optimized, but I recently spent time benchmarking this code path in the context of a nightly pagebench regression (https://github.com/neondatabase/cloud/issues/21759) and I want to avoid regressing it any further.

Ideally, we would also log the socket send & recv queue length like we do on the compute side in
- https://github.com/neondatabase/neon/pull/10673

But that is proving difficult due to the Rust abstractions that wrap the socket fd. Work in progress on that is happening in
- https://github.com/neondatabase/neon/pull/10823

Regarding production impact, I am worried at a theoretical level that the additional logging may cause a downward spiral in the case where a pageserver is slow to flush because there is not enough CPU. The logging would consume more CPU and thereby slow down flushes even more. However, I don't think this matters practically speaking.

# Refs

- context: https://neondb.slack.com/archives/C08DE6Q9C3B/p1739464533762049?thread_ts=1739462628.361019&cid=C08DE6Q9C3B
- fixes https://github.com/neondatabase/neon/issues/10668
- part of https://github.com/neondatabase/cloud/issues/23515

# Testing

Tested locally by running

```
./target/debug/pagebench get-page-latest-lsn --num-clients=1000 --queue-depth=1000
```

in one terminal, waiting a bit, then

```
pkill -STOP pagebench
```

then wait for slow logs to show up in `pageserver.log`. To see that the completion log message is logged, run

```
pkill -CONT pagebench
```
This commit is contained in:
committed by
GitHub
parent
a82a6631fd
commit
fac5db3c8d
@@ -1439,27 +1439,43 @@ impl Drop for SmgrOpTimer {
|
||||
}
|
||||
|
||||
impl SmgrOpFlushInProgress {
|
||||
pub(crate) async fn measure<Fut, O>(self, mut started_at: Instant, mut fut: Fut) -> O
|
||||
pub(crate) async fn measure<Fut, O>(self, started_at: Instant, mut fut: Fut) -> O
|
||||
where
|
||||
Fut: std::future::Future<Output = O>,
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `last_counter_increment_at`.
|
||||
let mut logged = false;
|
||||
let mut last_counter_increment_at = started_at;
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
|is_timeout| {
|
||||
let now = Instant::now();
|
||||
let elapsed = now - started_at;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.per_timeline_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
started_at = now;
|
||||
|
||||
// Increment counter
|
||||
{
|
||||
let elapsed_since_last_observe = now - last_counter_increment_at;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
|
||||
self.per_timeline_micros
|
||||
.inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
|
||||
last_counter_increment_at = now;
|
||||
}
|
||||
|
||||
// Log something on every timeout, and on completion but only if we hit a timeout.
|
||||
if is_timeout || logged {
|
||||
logged = true;
|
||||
let elapsed_total = now - started_at;
|
||||
let msg = if is_timeout {
|
||||
"slow flush ongoing"
|
||||
} else {
|
||||
"slow flush completed or cancelled"
|
||||
};
|
||||
let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
|
||||
tracing::info!(elapsed_total_secs, msg);
|
||||
}
|
||||
},
|
||||
|mut observe| {
|
||||
observe();
|
||||
observe(false);
|
||||
},
|
||||
);
|
||||
|
||||
@@ -1467,7 +1483,7 @@ impl SmgrOpFlushInProgress {
|
||||
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
|
||||
Ok(v) => return v,
|
||||
Err(_timeout) => {
|
||||
(*observe_guard)();
|
||||
(*observe_guard)(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user