pageserver: tweak slow GetPage logging (#10956)

## Problem We recently added slow GetPage request logging. However, this unintentionally included the flush time when logging (which we already have separate logging for). It also logs at WARN level, which is a bit aggressive since we see this fire quite frequently. Follows https://github.com/neondatabase/neon/pull/10906. ## Summary of changes * Only log the request execution time, not the flush time. * Extract a `pagestream_dispatch_batched_message()` helper. * Rename `warn_slow()` to `log_slow()` and downgrade to INFO.
2026-05-31 20:10:38 +00:00 · 2025-02-24 23:01:14 +01:00
parent 565a9e62a1
commit 6621be6b7b
4 changed files with 170 additions and 156 deletions
--- a/libs/utils/benches/README.md
+++ b/libs/utils/benches/README.md
@@ -10,14 +10,14 @@ cargo bench --package utils
 cargo bench --package utils --bench benchmarks

 # Specific benchmark.
-cargo bench --package utils --bench benchmarks warn_slow/enabled=true
+cargo bench --package utils --bench benchmarks log_slow/enabled=true

 # List available benchmarks.
 cargo bench --package utils --benches -- --list

 # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
 # Output in target/criterion/*/profile/flamegraph.svg.
-cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10
+cargo bench --package utils --bench benchmarks log_slow/enabled=true --profile-time 10
 ```

 Additional charts and statistics are available in `target/criterion/report/index.html`.
--- a/libs/utils/benches/benchmarks.rs
+++ b/libs/utils/benches/benchmarks.rs
@@ -3,14 +3,14 @@ use std::time::Duration;
 use criterion::{criterion_group, criterion_main, Bencher, Criterion};
 use pprof::criterion::{Output, PProfProfiler};
 use utils::id;
-use utils::logging::warn_slow;
+use utils::logging::log_slow;

 // Register benchmarks with Criterion.
 criterion_group!(
    name = benches;
    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
    targets = bench_id_stringify,
-    bench_warn_slow,
+    bench_log_slow,
 );
 criterion_main!(benches);

@@ -29,9 +29,9 @@ pub fn bench_id_stringify(c: &mut Criterion) {
    });
 }

-pub fn bench_warn_slow(c: &mut Criterion) {
+pub fn bench_log_slow(c: &mut Criterion) {
    for enabled in [false, true] {
-        c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| {
+        c.bench_function(&format!("log_slow/enabled={enabled}"), |b| {
            run_bench(b, enabled).unwrap()
        });
    }
@@ -45,11 +45,11 @@ pub fn bench_warn_slow(c: &mut Criterion) {
            .enable_all()
            .build()?;

-        // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling
+        // Test both with and without log_slow, since we're essentially measuring Tokio scheduling
        // performance too. Use a simple noop future that yields once, to avoid any scheduler fast
        // paths for a ready future.
        if enabled {
-            b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now())));
+            b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now())));
        } else {
            b.iter(|| runtime.block_on(tokio::task::yield_now()));
        }
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -7,7 +7,7 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};
 use tokio::time::Instant;
-use tracing::warn;
+use tracing::info;

 /// Logs a critical error, similarly to `tracing::error!`. This will:
 ///
@@ -322,11 +322,13 @@ impl std::fmt::Debug for SecretString {
    }
 }

-/// Logs a periodic warning if a future is slow to complete.
+/// Logs a periodic message if a future is slow to complete.
 ///
 /// This is performance-sensitive as it's used on the GetPage read path.
+///
+/// TODO: consider upgrading this to a warning, but currently it fires too often.
 #[inline]
-pub async fn warn_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
+pub async fn log_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
    // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
    // won't fit on the stack.
    let mut f = Box::pin(f);
@@ -345,13 +347,13 @@ pub async fn warn_slow<O>(name: &str, threshold: Duration, f: impl Future<Output
            // false negatives.
            let elapsed = started.elapsed();
            if elapsed >= threshold {
-                warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
+                info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
            }
            return output;
        }

        let elapsed = started.elapsed().as_secs_f64();
-        warn!("slow {name} still running after {elapsed:.3}s",);
+        info!("slow {name} still running after {elapsed:.3}s",);

        attempt += 1;
    }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -40,7 +40,7 @@ use tokio::io::{AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::logging::warn_slow;
+use utils::logging::log_slow;
 use utils::sync::gate::{Gate, GateGuard};
 use utils::sync::spsc_fold;
 use utils::{
@@ -84,8 +84,8 @@ use std::os::fd::AsRawFd;
 /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

-/// Threshold at which to log a warning about slow GetPage requests.
-const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
+/// Threshold at which to log slow GetPage requests.
+const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);

 ///////////////////////////////////////////////////////////////////////////////

@@ -1087,11 +1087,147 @@ impl PageServerHandler {
            batch
        };

-        // invoke handler function
-        let (mut handler_results, span): (
+        // Dispatch the batch to the appropriate request handler.
+        let (mut handler_results, span) = log_slow(
+            batch.as_static_str(),
+            LOG_SLOW_GETPAGE_THRESHOLD,
+            self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx),
+        )
+        .await?;
+
+        // We purposefully don't count flush time into the smgr operation timer.
+        //
+        // The reason is that current compute client will not perform protocol processing
+        // if the postgres backend process is doing things other than `->smgr_read()`.
+        // This is especially the case for prefetch.
+        //
+        // If the compute doesn't read from the connection, eventually TCP will backpressure
+        // all the way into our flush call below.
+        //
+        // The timer's underlying metric is used for a storage-internal latency SLO and
+        // we don't want to include latency in it that we can't control.
+        // And as pointed out above, in this case, we don't control the time that flush will take.
+        //
+        // We put each response in the batch onto the wire in a separate pgb_writer.flush()
+        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
+        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
+        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
+        let flush_timers = {
+            let flushing_start_time = Instant::now();
+            let mut flush_timers = Vec::with_capacity(handler_results.len());
+            for handler_result in &mut handler_results {
+                let flush_timer = match handler_result {
+                    Ok((_, timer)) => Some(
+                        timer
+                            .observe_execution_end(flushing_start_time)
+                            .expect("we are the first caller"),
+                    ),
+                    Err(_) => {
+                        // TODO: measure errors
+                        None
+                    }
+                };
+                flush_timers.push(flush_timer);
+            }
+            assert_eq!(flush_timers.len(), handler_results.len());
+            flush_timers
+        };
+
+        // Map handler result to protocol behavior.
+        // Some handler errors cause exit from pagestream protocol.
+        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
+        for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
+            let response_msg = match handler_result {
+                Err(e) => match &e.err {
+                    PageStreamError::Shutdown => {
+                        // If we fail to fulfil a request during shutdown, which may be _because_ of
+                        // shutdown, then do not send the error to the client.  Instead just drop the
+                        // connection.
+                        span.in_scope(|| info!("dropping connection due to shutdown"));
+                        return Err(QueryError::Shutdown);
+                    }
+                    PageStreamError::Reconnect(reason) => {
+                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                        return Err(QueryError::Reconnect);
+                    }
+                    PageStreamError::Read(_)
+                    | PageStreamError::LsnTimeout(_)
+                    | PageStreamError::NotFound(_)
+                    | PageStreamError::BadRequest(_) => {
+                        // print the all details to the log with {:#}, but for the client the
+                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                        // here includes cancellation which is not an error.
+                        let full = utils::error::report_compact_sources(&e.err);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
+
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            req: e.req,
+                            message: e.err.to_string(),
+                        })
+                    }
+                },
+                Ok((response_msg, _op_timer_already_observed)) => response_msg,
+            };
+
+            //
+            // marshal & transmit response message
+            //
+
+            pgb_writer.write_message_noflush(&BeMessage::CopyData(
+                &response_msg.serialize(protocol_version),
+            ))?;
+
+            failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel);
+
+            // what we want to do
+            let socket_fd = pgb_writer.socket_fd;
+            let flush_fut = pgb_writer.flush();
+            // metric for how long flushing takes
+            let flush_fut = match flushing_timer {
+                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
+                    Instant::now(),
+                    flush_fut,
+                    socket_fd,
+                )),
+                None => futures::future::Either::Right(flush_fut),
+            };
+            // do it while respecting cancellation
+            let _: () = async move {
+                tokio::select! {
+                    biased;
+                    _ = cancel.cancelled() => {
+                        // We were requested to shut down.
+                        info!("shutdown request received in page handler");
+                        return Err(QueryError::Shutdown)
+                    }
+                    res = flush_fut => {
+                        res?;
+                    }
+                }
+                Ok(())
+            }
+            .await?;
+        }
+        Ok(())
+    }
+
+    /// Helper which dispatches a batched message to the appropriate handler.
+    /// Returns a vec of results, along with the extracted trace span.
+    async fn pagestream_dispatch_batched_message(
+        &mut self,
+        batch: BatchedFeMessage,
+        io_concurrency: IoConcurrency,
+        ctx: &RequestContext,
+    ) -> Result<
+        (
            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
-            _,
-        ) = match batch {
+            Span,
+        ),
+        QueryError,
+    > {
+        Ok(match batch {
            BatchedFeMessage::Exists {
                span,
                timer,
@@ -1213,124 +1349,7 @@ impl PageServerHandler {
                // call the handler.
                (vec![Err(error)], span)
            }
-        };
-
-        // We purposefully don't count flush time into the smgr operation timer.
-        //
-        // The reason is that current compute client will not perform protocol processing
-        // if the postgres backend process is doing things other than `->smgr_read()`.
-        // This is especially the case for prefetch.
-        //
-        // If the compute doesn't read from the connection, eventually TCP will backpressure
-        // all the way into our flush call below.
-        //
-        // The timer's underlying metric is used for a storage-internal latency SLO and
-        // we don't want to include latency in it that we can't control.
-        // And as pointed out above, in this case, we don't control the time that flush will take.
-        //
-        // We put each response in the batch onto the wire in a separate pgb_writer.flush()
-        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
-        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
-        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
-        let flush_timers = {
-            let flushing_start_time = Instant::now();
-            let mut flush_timers = Vec::with_capacity(handler_results.len());
-            for handler_result in &mut handler_results {
-                let flush_timer = match handler_result {
-                    Ok((_, timer)) => Some(
-                        timer
-                            .observe_execution_end(flushing_start_time)
-                            .expect("we are the first caller"),
-                    ),
-                    Err(_) => {
-                        // TODO: measure errors
-                        None
-                    }
-                };
-                flush_timers.push(flush_timer);
-            }
-            assert_eq!(flush_timers.len(), handler_results.len());
-            flush_timers
-        };
-
-        // Map handler result to protocol behavior.
-        // Some handler errors cause exit from pagestream protocol.
-        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
-            let response_msg = match handler_result {
-                Err(e) => match &e.err {
-                    PageStreamError::Shutdown => {
-                        // If we fail to fulfil a request during shutdown, which may be _because_ of
-                        // shutdown, then do not send the error to the client.  Instead just drop the
-                        // connection.
-                        span.in_scope(|| info!("dropping connection due to shutdown"));
-                        return Err(QueryError::Shutdown);
-                    }
-                    PageStreamError::Reconnect(reason) => {
-                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                        return Err(QueryError::Reconnect);
-                    }
-                    PageStreamError::Read(_)
-                    | PageStreamError::LsnTimeout(_)
-                    | PageStreamError::NotFound(_)
-                    | PageStreamError::BadRequest(_) => {
-                        // print the all details to the log with {:#}, but for the client the
-                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                        // here includes cancellation which is not an error.
-                        let full = utils::error::report_compact_sources(&e.err);
-                        span.in_scope(|| {
-                            error!("error reading relation or page version: {full:#}")
-                        });
-
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            req: e.req,
-                            message: e.err.to_string(),
-                        })
-                    }
-                },
-                Ok((response_msg, _op_timer_already_observed)) => response_msg,
-            };
-
-            //
-            // marshal & transmit response message
-            //
-
-            pgb_writer.write_message_noflush(&BeMessage::CopyData(
-                &response_msg.serialize(protocol_version),
-            ))?;
-
-            failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel);
-
-            // what we want to do
-            let socket_fd = pgb_writer.socket_fd;
-            let flush_fut = pgb_writer.flush();
-            // metric for how long flushing takes
-            let flush_fut = match flushing_timer {
-                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
-                    Instant::now(),
-                    flush_fut,
-                    socket_fd,
-                )),
-                None => futures::future::Either::Right(flush_fut),
-            };
-            // do it while respecting cancellation
-            let _: () = async move {
-                tokio::select! {
-                    biased;
-                    _ = cancel.cancelled() => {
-                        // We were requested to shut down.
-                        info!("shutdown request received in page handler");
-                        return Err(QueryError::Shutdown)
-                    }
-                    res = flush_fut => {
-                        res?;
-                    }
-                }
-                Ok(())
-            }
-            .await?;
-        }
-        Ok(())
+        })
    }

    /// Pagestream sub-protocol handler.
@@ -1476,19 +1495,16 @@ impl PageServerHandler {
                }
            };

-            let result = warn_slow(
-                msg.as_static_str(),
-                WARN_SLOW_GETPAGE_THRESHOLD,
-                self.pagesteam_handle_batched_message(
+            let result = self
+                .pagesteam_handle_batched_message(
                    pgb_writer,
                    msg,
                    io_concurrency.clone(),
                    &cancel,
                    protocol_version,
                    ctx,
-                ),
-            )
-            .await;
+                )
+                .await;
            match result {
                Ok(()) => {}
                Err(e) => break e,
@@ -1652,17 +1668,13 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    warn_slow(
-                        batch.as_static_str(),
-                        WARN_SLOW_GETPAGE_THRESHOLD,
-                        self.pagesteam_handle_batched_message(
-                            pgb_writer,
-                            batch,
-                            io_concurrency.clone(),
-                            &cancel,
-                            protocol_version,
-                            &ctx,
-                        ),
+                    self.pagesteam_handle_batched_message(
+                        pgb_writer,
+                        batch,
+                        io_concurrency.clone(),
+                        &cancel,
+                        protocol_version,
+                        &ctx,
                    )
                    .await?;
                }