pageserver: better observability for slow wait_lsn (#11176)

# Problem We leave too few observability breadcrumbs in the case where wait_lsn is exceptionally slow. # Changes - refactor: extract the monitoring logic out of `log_slow` into `monitor_slow_future` - add global + per-timeline counter for time spent waiting for wait_lsn - It is updated while we're still waiting, similar to what we do for page_service response flush. - add per-timeline counterpair for started & finished wait_lsn count - add slow-logging to leave breadcrumbs in logs, not just metrics For the slow-logging, we need to consider not flooding the logs during a broker or network outage/blip. The solution is a "log-streak-level" concurrency limit per timeline. At any given time, there is at most one slow wait_lsn that is logging the "still running" and "completed" sequence of logs. Other concurrent slow wait_lsn's don't log at all. This leaves at least one breadcrumb in each timeline's logs if some wait_lsn was exceptionally slow during a given period. The full degree of slowness can then be determined by looking at the per-timeline metric. # Performance Reran the `bench_log_slow` benchmark, no difference, so, existing call sites are fine. We do use a Semaphore, but only try_acquire it _after_ things have already been determined to be slow. So, no baseline overhead anticipated. # Refs - https://github.com/neondatabase/cloud/issues/23486#issuecomment-2711587222
2026-01-05 20:42:54 +00:00 · 2025-03-13 16:03:53 +01:00
parent 3dec117572
commit ed31dd2a3c
6 changed files with 227 additions and 49 deletions
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -331,37 +331,90 @@ impl std::fmt::Debug for SecretString {
 ///
 /// TODO: consider upgrading this to a warning, but currently it fires too often.
 #[inline]
-pub async fn log_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
-    // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
-    // won't fit on the stack.
-    let mut f = Box::pin(f);
+pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
+where
+    F: Future<Output = O>,
+{
+    monitor_slow_future(
+        threshold,
+        threshold, // period = threshold
+        f,
+        |MonitorSlowFutureCallback {
+             ready,
+             is_slow,
+             elapsed_total,
+             elapsed_since_last_callback: _,
+         }| {
+            if !is_slow {
+                return;
+            }
+            if ready {
+                info!(
+                    "slow {name} completed after {:.3}s",
+                    elapsed_total.as_secs_f64()
+                );
+            } else {
+                info!(
+                    "slow {name} still running after {:.3}s",
+                    elapsed_total.as_secs_f64()
+                );
+            }
+        },
+    )
+    .await
+}

+/// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every
+/// `period` afterwards, and also unconditionally when the future completes.
+#[inline]
+pub async fn monitor_slow_future<F, O>(
+    threshold: Duration,
+    period: Duration,
+    mut fut: std::pin::Pin<&mut F>,
+    mut cb: impl FnMut(MonitorSlowFutureCallback),
+) -> O
+where
+    F: Future<Output = O>,
+{
    let started = Instant::now();
    let mut attempt = 1;
-
+    let mut last_cb = started;
    loop {
        // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
        // case where the timeout doesn't fire.
-        let deadline = started + attempt * threshold;
-        if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
-            // NB: we check if we exceeded the threshold even if the timeout never fired, because
-            // scheduling or execution delays may cause the future to succeed even if it exceeds the
-            // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
-            // false negatives.
-            let elapsed = started.elapsed();
-            if elapsed >= threshold {
-                info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
-            }
+        let deadline = started + threshold + (attempt - 1) * period;
+        // TODO: still call the callback if the future panics? Copy how we do it for the page_service flush_in_progress counter.
+        let res = tokio::time::timeout_at(deadline, &mut fut).await;
+        let now = Instant::now();
+        let elapsed_total = now - started;
+        cb(MonitorSlowFutureCallback {
+            ready: res.is_ok(),
+            is_slow: elapsed_total >= threshold,
+            elapsed_total,
+            elapsed_since_last_callback: now - last_cb,
+        });
+        last_cb = now;
+        if let Ok(output) = res {
            return output;
        }
-
-        let elapsed = started.elapsed().as_secs_f64();
-        info!("slow {name} still running after {elapsed:.3}s",);
-
        attempt += 1;
    }
 }

+/// See [`monitor_slow_future`].
+pub struct MonitorSlowFutureCallback {
+    /// Whether the future completed. If true, there will be no more callbacks.
+    pub ready: bool,
+    /// Whether the future is taking `>=` the specififed threshold duration to complete.
+    /// Monotonic: if true in one callback invocation, true in all subsequent onces.
+    pub is_slow: bool,
+    /// The time elapsed since the [`monitor_slow_future`] was first polled.
+    pub elapsed_total: Duration,
+    /// The time elapsed since the last callback invocation.
+    /// For the initial callback invocation, the time elapsed since the [`monitor_slow_future`] was first polled.
+    pub elapsed_since_last_callback: Duration,
+}
+
 #[cfg(test)]
 mod tests {
    use metrics::IntCounterVec;