pageserver: periodically log slow ongoing getpage requests (#10906)

## Problem

We don't have good observability for "stuck" getpage requests.

Resolves https://github.com/neondatabase/cloud/issues/23808.

## Summary of changes

Log a periodic warning (every 30 seconds) if GetPage request execution
is slow to complete, to aid in debugging stuck GetPage requests.

This does not cover response flushing (we have separate logging for
that), nor reading the request from the socket and batching it (expected
to be insignificant and not straightforward to handle with the current
protocol).

This costs 95 nanoseconds on the happy path when awaiting a
`tokio::task::yield_now()`:

```
warn_slow/enabled=false time:   [45.716 ns 46.116 ns 46.687 ns]
warn_slow/enabled=true  time:   [141.53 ns 141.83 ns 142.18 ns]
```
This commit is contained in:
Erik Grinaker
2025-02-20 22:38:42 +01:00
committed by GitHub
parent 0b9b391ea0
commit 9b42d1ce1a
6 changed files with 139 additions and 16 deletions

View File

@@ -1,5 +1,18 @@
use criterion::{criterion_group, criterion_main, Criterion};
use std::time::Duration;
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
use pprof::criterion::{Output, PProfProfiler};
use utils::id;
use utils::logging::warn_slow;
// Register benchmarks with Criterion.
criterion_group!(
name = benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = bench_id_stringify,
bench_warn_slow,
);
criterion_main!(benches);
pub fn bench_id_stringify(c: &mut Criterion) {
// Can only use public methods.
@@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) {
});
}
criterion_group!(benches, bench_id_stringify);
criterion_main!(benches);
pub fn bench_warn_slow(c: &mut Criterion) {
for enabled in [false, true] {
c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| {
run_bench(b, enabled).unwrap()
});
}
// The actual benchmark.
fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> {
const THRESHOLD: Duration = Duration::from_secs(1);
// Use a multi-threaded runtime to avoid thread parking overhead when yielding.
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()?;
// Test both with and without warn_slow, since we're essentially measuring Tokio scheduling
// performance too. Use a simple noop future that yields once, to avoid any scheduler fast
// paths for a ready future.
if enabled {
b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now())));
} else {
b.iter(|| runtime.block_on(tokio::task::yield_now()));
}
Ok(())
}
}