impr(perf tracing): ability to correlate with page_service logs (#11398)

# Problem Current perf tracing fields do not allow answering the question what a specific Postgres backend was waiting for. # Background For Pageserver logs, we set the backend PID as the libpq `application_name` on the compute side, and funnel that into the a tracing field for the spans that emit to the global tracing subscriber. # Solution Funnel `application_name`, and the other fields that we use in the logging spans, into the root span for perf tracing. # Refs - fixes https://github.com/neondatabase/neon/issues/11393 - stacked atop https://github.com/neondatabase/neon/pull/11433 - epic: https://github.com/neondatabase/neon/issues/9873
2026-06-05 06:20:37 +00:00 · 2025-04-04 17:13:54 +02:00
parent 295be03a33
commit 6ee84d985a
1 changed files with 31 additions and 0 deletions
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -247,6 +247,15 @@ pub async fn libpq_listener_main(

 type ConnectionHandlerResult = anyhow::Result<()>;

+/// Perf root spans start at the per-request level, after shard routing.
+/// This struct carries connection-level information to the root perf span definition.
+#[derive(Clone)]
+struct ConnectionPerfSpanFields {
+    peer_addr: String,
+    application_name: Option<String>,
+    compute_mode: Option<String>,
+}
+
 #[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
 #[allow(clippy::too_many_arguments)]
 async fn page_service_conn_main(
@@ -271,6 +280,12 @@ async fn page_service_conn_main(
    let socket_fd = socket.as_raw_fd();

    let peer_addr = socket.peer_addr().context("get peer address")?;
+
+    let perf_span_fields = ConnectionPerfSpanFields {
+        peer_addr: peer_addr.to_string(),
+        application_name: None, // filled in later
+        compute_mode: None,     // filled in later
+    };
    tracing::Span::current().record("peer_addr", field::display(peer_addr));

    // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
@@ -314,6 +329,7 @@ async fn page_service_conn_main(
        tenant_manager,
        auth,
        pipelining_config,
+        perf_span_fields,
        connection_ctx,
        cancel.clone(),
        gate_guard,
@@ -358,6 +374,8 @@ struct PageServerHandler {
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,

+    perf_span_fields: ConnectionPerfSpanFields,
+
    cancel: CancellationToken,

    /// None only while pagestream protocol is being processed.
@@ -703,11 +721,13 @@ impl BatchedFeMessage {
 }

 impl PageServerHandler {
+    #[allow(clippy::too_many_arguments)]
    pub fn new(
        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        pipelining_config: PageServicePipeliningConfig,
+        perf_span_fields: ConnectionPerfSpanFields,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
@@ -717,6 +737,7 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            perf_span_fields,
            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
            cancel,
            pipelining_config,
@@ -754,6 +775,7 @@ impl PageServerHandler {
        tenant_id: TenantId,
        timeline_id: TimelineId,
        timeline_handles: &mut TimelineHandles,
+        conn_perf_span_fields: &ConnectionPerfSpanFields,
        cancel: &CancellationToken,
        ctx: &RequestContext,
        protocol_version: PagestreamProtocolVersion,
@@ -952,6 +974,9 @@ impl PageServerHandler {
                            info_span!(
                            target: PERF_TRACE_TARGET,
                            "GET_PAGE",
+                            peer_addr = conn_perf_span_fields.peer_addr,
+                            application_name = conn_perf_span_fields.application_name,
+                            compute_mode = conn_perf_span_fields.compute_mode,
                            tenant_id = %tenant_id,
                            shard_id = %shard.get_shard_identity().shard_slug(),
                            timeline_id = %timeline_id,
@@ -1581,6 +1606,7 @@ impl PageServerHandler {
                tenant_id,
                timeline_id,
                &mut timeline_handles,
+                &self.perf_span_fields,
                &cancel,
                ctx,
                protocol_version,
@@ -1714,6 +1740,8 @@ impl PageServerHandler {
        // Batcher
        //

+        let perf_span_fields = self.perf_span_fields.clone();
+
        let cancel_batcher = self.cancel.child_token();
        let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
        let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
@@ -1727,6 +1755,7 @@ impl PageServerHandler {
                        tenant_id,
                        timeline_id,
                        &mut timeline_handles,
+                        &perf_span_fields,
                        &cancel_batcher,
                        &ctx,
                        protocol_version,
@@ -2669,12 +2698,14 @@ where

        if let FeStartupPacket::StartupMessage { params, .. } = sm {
            if let Some(app_name) = params.get("application_name") {
+                self.perf_span_fields.application_name = Some(app_name.to_string());
                Span::current().record("application_name", field::display(app_name));
            }
            if let Some(options) = params.get("options") {
                let (config, _) = parse_options(options);
                for (key, value) in config {
                    if key == "neon.compute_mode" {
+                        self.perf_span_fields.compute_mode = Some(value.clone());
                        Span::current().record("compute_mode", field::display(value));
                    }
                }