From 6cd3b501ec8f9100e9eeb092f64f3de18c47c5ea Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Sat, 8 Feb 2025 10:28:09 +0100
Subject: [PATCH] fix(page_service / batching): smgr op latency metrics
 includes the flush time of preceding requests (#10728)

Before this PR, if a batch contains N responses, the smgr op latency
reported for response (N-i) would include the time we spent flushing the
preceding requests.

refs:

- fixup of https://github.com/neondatabase/neon/pull/10042
- fixes https://github.com/neondatabase/neon/issues/10674
---
 pageserver/src/page_service.rs | 37 ++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index db8c428795..69f1f1c051 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1201,6 +1201,29 @@ impl PageServerHandler {
             }
         };
 
+        // We purposefully don't count flush time into the smgr operation timer.
+        //
+        // The reason is that current compute client will not perform protocol processing
+        // if the postgres backend process is doing things other than `->smgr_read()`.
+        // This is especially the case for prefetch.
+        //
+        // If the compute doesn't read from the connection, eventually TCP will backpressure
+        // all the way into our flush call below.
+        //
+        // The timer's underlying metric is used for a storage-internal latency SLO and
+        // we don't want to include latency in it that we can't control.
+        // And as pointed out above, in this case, we don't control the time that flush will take.
+        //
+        // We put each response in the batch onto the wire in a separate pgb_writer.flush()
+        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
+        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
+        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
+        //
+        // Since we're flushing multiple times in the loop, but only have access to the per-op
+        // timers inside the loop, we capture the flush start time here and reuse it to finish
+        // each op timer.
+        let flushing_start_time = Instant::now();
+
         // Map handler result to protocol behavior.
         // Some handler errors cause exit from pagestream protocol.
         // Other handler errors are sent back as an error message and we stay in pagestream protocol.
@@ -1249,21 +1272,9 @@ impl PageServerHandler {
                 &response_msg.serialize(protocol_version),
             ))?;
 
-            // We purposefully don't count flush time into the timer.
-            //
-            // The reason is that current compute client will not perform protocol processing
-            // if the postgres backend process is doing things other than `->smgr_read()`.
-            // This is especially the case for prefetch.
-            //
-            // If the compute doesn't read from the connection, eventually TCP will backpressure
-            // all the way into our flush call below.
-            //
-            // The timer's underlying metric is used for a storage-internal latency SLO and
-            // we don't want to include latency in it that we can't control.
-            // And as pointed out above, in this case, we don't control the time that flush will take.
             let flushing_timer = timer.map(|mut timer| {
                 timer
-                    .observe_execution_end_flush_start(Instant::now())
+                    .observe_execution_end_flush_start(flushing_start_time)
                     .expect("we are the first caller")
             });
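
To see the effect of the change in isolation, here is a minimal, self-contained Rust sketch. It is not the neon code: Op, flush_one_response, and the 10 ms sleep are illustrative stand-ins, and the real SmgrOpTimer::observe_execution_end_flush_start is not reproduced. It contrasts closing each per-response timer with a fresh Instant::now() taken inside the flush loop (old behavior) against the shared flushing_start_time captured before the loop (the fix).

use std::thread::sleep;
use std::time::{Duration, Instant};

// Hypothetical stand-in for one batched smgr response and the start of its op timer.
struct Op {
    started_at: Instant,
}

// Stand-in for pgb_writer.flush(): pretend each flush takes 10 ms.
fn flush_one_response() {
    sleep(Duration::from_millis(10));
}

fn main() {
    // Three responses produced by one batch.
    let batch: Vec<Op> = (0..3)
        .map(|_| Op { started_at: Instant::now() })
        .collect();

    // The fix: capture the flush start once, before any response is flushed.
    let flushing_start_time = Instant::now();

    for (i, op) in batch.iter().enumerate() {
        // Old behavior: Instant::now() inside the loop folds the flush time of
        // responses 0..i into response i's execution latency.
        let old_execution = op.started_at.elapsed();

        // New behavior: every response's execution phase ends at the shared
        // flush start, so earlier flushes no longer inflate later responses.
        let new_execution = flushing_start_time.duration_since(op.started_at);

        println!("response {i}: old={old_execution:?} new={new_execution:?}");

        flush_one_response();
    }
}

Running the sketch shows old= growing by roughly 10 ms per response while new= stays flat, which is exactly the skew the patch removes from the per-op latency metric.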