pageserver: periodically log slow ongoing getpage requests (#10906)

## Problem

We don't have good observability for "stuck" getpage requests.

Resolves https://github.com/neondatabase/cloud/issues/23808.

## Summary of changes

Log a periodic warning (every 30 seconds) if GetPage request execution is slow to complete, to aid in debugging stuck GetPage requests.

This does not cover response flushing (we have separate logging for that), nor reading the request from the socket and batching it (expected to be insignificant and not straightforward to handle with the current protocol).

This costs 95 nanoseconds on the happy path when awaiting a `tokio::task::yield_now()`:

```
warn_slow/enabled=false time: [45.716 ns 46.116 ns 46.687 ns]
warn_slow/enabled=true  time: [141.53 ns 141.83 ns 142.18 ns]
```
2026-01-06 21:12:55 +00:00 · 2025-02-20 22:38:42 +01:00
parent 0b9b391ea0
commit 9b42d1ce1a
6 changed files with 139 additions and 16 deletions
--- a/libs/utils/benches/benchmarks.rs
+++ b/libs/utils/benches/benchmarks.rs
@@ -1,5 +1,18 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use std::time::Duration;
+
+use criterion::{criterion_group, criterion_main, Bencher, Criterion};
+use pprof::criterion::{Output, PProfProfiler};
 use utils::id;
+use utils::logging::warn_slow;
+
+// Register both benchmarks with Criterion, attaching a pprof profiler set to flamegraph output.
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_id_stringify,
+    bench_warn_slow,
+);
+criterion_main!(benches);

 pub fn bench_id_stringify(c: &mut Criterion) {
    // Can only use public methods.
@@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) {
    });
 }

-criterion_group!(benches, bench_id_stringify);
-criterion_main!(benches);
+pub fn bench_warn_slow(c: &mut Criterion) {
+    for enabled in [false, true] {
+        c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| {
+            run_bench(b, enabled).unwrap()
+        });
+    }

+    // Runs one variant: `enabled` selects whether the yielding future is wrapped in warn_slow.
+    fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> {
+        const THRESHOLD: Duration = Duration::from_secs(1);

+        // Use a multi-threaded runtime to avoid thread parking overhead when yielding.
+        let runtime = tokio::runtime::Builder::new_multi_thread()
+            .enable_all()
+            .build()?;

+        // Measure with and without warn_slow: the baseline captures Tokio's own scheduling cost, so
+        // warn_slow's overhead is the difference. The future yields once instead of being ready
+        // immediately, to avoid any scheduler fast paths for a ready future.
+        if enabled {
+            b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now())));
+        } else {
+            b.iter(|| runtime.block_on(tokio::task::yield_now()));
+        }

+        Ok(())
+    }
+}