# Problem

We leave too few observability breadcrumbs in the case where wait_lsn is exceptionally slow.

# Changes

- refactor: extract the monitoring logic out of `log_slow` into `monitor_slow_future`
- add a global + per-timeline counter for time spent waiting in wait_lsn
  - it is updated while we're still waiting, similar to what we do for the page_service response flush (see the first sketch below)
- add a per-timeline counter pair for the started & finished wait_lsn counts
- add slow-logging to leave breadcrumbs in the logs, not just in metrics

For the slow-logging, we need to avoid flooding the logs during a broker or network outage/blip. The solution is a "log-streak-level" concurrency limit per timeline (second sketch below): at any given time, at most one slow wait_lsn logs the "still running" / "completed" sequence; other concurrent slow wait_lsn's don't log at all. This leaves at least one breadcrumb in each timeline's logs if some wait_lsn was exceptionally slow during a given period. The full degree of slowness can then be determined from the per-timeline metric.

# Performance

Reran the `bench_log_slow` benchmark (source below): no difference, so the existing call sites are fine. We do use a Semaphore, but we only try_acquire it _after_ a wait has already been determined to be slow, so no baseline overhead is anticipated.

# Refs

- https://github.com/neondatabase/cloud/issues/23486#issuecomment-2711587222
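The "updated while we're still waiting" part is the interesting bit of the counter change: during a long outage the metric should keep rising instead of jumping only once the wait finally completes. Below is a minimal sketch of that pattern, assuming tokio and using a plain `AtomicU64` of microseconds as a stand-in for the real per-timeline Prometheus counter; `WAIT_LSN_MICROS` and `measured_wait` are illustrative names, not the actual neon code.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

// Stand-in for the real per-timeline Prometheus counter (illustrative only).
static WAIT_LSN_MICROS: AtomicU64 = AtomicU64::new(0);

/// Drive `fut` to completion, flushing the elapsed wait time into the counter
/// once per second while it is still pending, plus a final flush on completion.
async fn measured_wait<F: std::future::Future>(fut: F) -> F::Output {
    tokio::pin!(fut);
    let mut last_flush = Instant::now();
    loop {
        tokio::select! {
            out = &mut fut => {
                // Account for the final partial interval.
                WAIT_LSN_MICROS.fetch_add(last_flush.elapsed().as_micros() as u64, Ordering::Relaxed);
                return out;
            }
            _ = tokio::time::sleep(Duration::from_secs(1)) => {
                // Still waiting: flush what we have so dashboards see the wait grow.
                WAIT_LSN_MICROS.fetch_add(last_flush.elapsed().as_micros() as u64, Ordering::Relaxed);
                last_flush = Instant::now();
            }
        }
    }
}
```

Flushing on a coarse tick keeps the hot path cheap: the counter is touched at most once per second per in-flight wait, and the final partial interval is accounted for when the future completes.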
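The log-streak limit described above can be pictured as a single-permit semaphore per timeline that is only consulted once a wait has already crossed the slow threshold. A minimal sketch under those assumptions follows; the function and field names are illustrative, not the actual neon API.

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;

const SLOW_THRESHOLD: Duration = Duration::from_secs(1);

/// Illustrative only: gate the "slow wait_lsn" log streak behind a per-timeline
/// single-permit semaphore, consulted only after the wait is already known slow.
async fn wait_lsn_with_breadcrumbs<F: std::future::Future>(
    slow_log_gate: Arc<Semaphore>, // one permit, held per timeline (hypothetical)
    fut: F,
) -> F::Output {
    tokio::pin!(fut);

    // Fast path: the wait finishes before the threshold; nothing is logged and
    // the semaphore is never touched.
    match tokio::time::timeout(SLOW_THRESHOLD, &mut fut).await {
        Ok(out) => return out,
        Err(_elapsed) => {}
    }

    // Slow path: at most one in-flight slow wait per timeline wins the permit
    // and produces the "still running" / "completed" breadcrumbs.
    let permit = slow_log_gate.try_acquire();
    if permit.is_ok() {
        tracing::info!("slow wait_lsn still running after {:?}", SLOW_THRESHOLD);
    }
    let out = fut.await;
    if permit.is_ok() {
        tracing::info!("slow wait_lsn completed");
    }
    out
}
```

Because `try_acquire` is non-blocking and only reached on the slow path, a failed acquisition simply means another slow wait on the same timeline is already producing the breadcrumb, which is the desired behaviour; the fast path never touches the semaphore, matching the "no baseline overhead" claim above.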
use std::time::Duration;

use criterion::{Bencher, Criterion, criterion_group, criterion_main};
use pprof::criterion::{Output, PProfProfiler};
use utils::id;
use utils::logging::log_slow;

// Register benchmarks with Criterion.
criterion_group!(
    name = benches;
    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
    targets = bench_id_stringify,
    bench_log_slow,
);
criterion_main!(benches);

pub fn bench_id_stringify(c: &mut Criterion) {
    // Can only use public methods.
    let ttid = id::TenantTimelineId::generate();

    c.bench_function("id.to_string", |b| {
        b.iter(|| {
            // FIXME measurement overhead?
            //for _ in 0..1000 {
            //    ttid.tenant_id.to_string();
            //}
            ttid.tenant_id.to_string();
        })
    });
}

pub fn bench_log_slow(c: &mut Criterion) {
    for enabled in [false, true] {
        c.bench_function(&format!("log_slow/enabled={enabled}"), |b| {
            run_bench(b, enabled).unwrap()
        });
    }

    // The actual benchmark.
    fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> {
        const THRESHOLD: Duration = Duration::from_secs(1);

        // Use a multi-threaded runtime to avoid thread parking overhead when yielding.
        let runtime = tokio::runtime::Builder::new_multi_thread()
            .enable_all()
            .build()?;

        // Test both with and without log_slow, since we're essentially measuring Tokio scheduling
        // performance too. Use a simple noop future that yields once, to avoid any scheduler fast
        // paths for a ready future.
        if enabled {
            b.iter(|| {
                runtime.block_on(log_slow(
                    "ready",
                    THRESHOLD,
                    std::pin::pin!(tokio::task::yield_now()),
                ))
            });
        } else {
            b.iter(|| runtime.block_on(tokio::task::yield_now()));
        }

        Ok(())
    }
}
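For reference, the benchmark above exercises `log_slow` with the calling convention name / threshold / pinned future. A hypothetical call site in the same shape, where `wait_for_lsn()` is a made-up stand-in for the real wait:

```rust
use std::time::Duration;
use utils::logging::log_slow;

// Stand-in for the real wait_lsn future; purely illustrative.
async fn wait_for_lsn() {
    tokio::task::yield_now().await;
}

async fn example() {
    // Leave breadcrumbs if the wait is still pending after 1s,
    // same call shape as in the benchmark above.
    log_slow("wait_lsn", Duration::from_secs(1), std::pin::pin!(wait_for_lsn())).await;
}
```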