impr(perf tracing): ability to correlate with page_service logs (#11398)

# Problem

Current perf tracing fields do not allow answering the question what a
specific Postgres backend was waiting for.

# Background

For Pageserver logs, we set the backend PID as the libpq
`application_name` on the compute side, and funnel that into the a
tracing field for the spans that emit to the global tracing subscriber.

# Solution

Funnel `application_name`, and the other fields that we use in the
logging spans, into the root span for perf tracing.

# Refs

- fixes https://github.com/neondatabase/neon/issues/11393
- stacked atop https://github.com/neondatabase/neon/pull/11433
- epic: https://github.com/neondatabase/neon/issues/9873
This commit is contained in:
Christian Schwarz
2025-04-04 17:13:54 +02:00
committed by GitHub
parent 295be03a33
commit 6ee84d985a

View File

@@ -247,6 +247,15 @@ pub async fn libpq_listener_main(
type ConnectionHandlerResult = anyhow::Result<()>;
/// Perf root spans start at the per-request level, after shard routing.
/// This struct carries connection-level information to the root perf span definition.
#[derive(Clone)]
struct ConnectionPerfSpanFields {
peer_addr: String,
application_name: Option<String>,
compute_mode: Option<String>,
}
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
#[allow(clippy::too_many_arguments)]
async fn page_service_conn_main(
@@ -271,6 +280,12 @@ async fn page_service_conn_main(
let socket_fd = socket.as_raw_fd();
let peer_addr = socket.peer_addr().context("get peer address")?;
let perf_span_fields = ConnectionPerfSpanFields {
peer_addr: peer_addr.to_string(),
application_name: None, // filled in later
compute_mode: None, // filled in later
};
tracing::Span::current().record("peer_addr", field::display(peer_addr));
// setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
@@ -314,6 +329,7 @@ async fn page_service_conn_main(
tenant_manager,
auth,
pipelining_config,
perf_span_fields,
connection_ctx,
cancel.clone(),
gate_guard,
@@ -358,6 +374,8 @@ struct PageServerHandler {
/// `process_query` creates a child context from this one.
connection_ctx: RequestContext,
perf_span_fields: ConnectionPerfSpanFields,
cancel: CancellationToken,
/// None only while pagestream protocol is being processed.
@@ -703,11 +721,13 @@ impl BatchedFeMessage {
}
impl PageServerHandler {
#[allow(clippy::too_many_arguments)]
pub fn new(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
pipelining_config: PageServicePipeliningConfig,
perf_span_fields: ConnectionPerfSpanFields,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
@@ -717,6 +737,7 @@ impl PageServerHandler {
auth,
claims: None,
connection_ctx,
perf_span_fields,
timeline_handles: Some(TimelineHandles::new(tenant_manager)),
cancel,
pipelining_config,
@@ -754,6 +775,7 @@ impl PageServerHandler {
tenant_id: TenantId,
timeline_id: TimelineId,
timeline_handles: &mut TimelineHandles,
conn_perf_span_fields: &ConnectionPerfSpanFields,
cancel: &CancellationToken,
ctx: &RequestContext,
protocol_version: PagestreamProtocolVersion,
@@ -952,6 +974,9 @@ impl PageServerHandler {
info_span!(
target: PERF_TRACE_TARGET,
"GET_PAGE",
peer_addr = conn_perf_span_fields.peer_addr,
application_name = conn_perf_span_fields.application_name,
compute_mode = conn_perf_span_fields.compute_mode,
tenant_id = %tenant_id,
shard_id = %shard.get_shard_identity().shard_slug(),
timeline_id = %timeline_id,
@@ -1581,6 +1606,7 @@ impl PageServerHandler {
tenant_id,
timeline_id,
&mut timeline_handles,
&self.perf_span_fields,
&cancel,
ctx,
protocol_version,
@@ -1714,6 +1740,8 @@ impl PageServerHandler {
// Batcher
//
let perf_span_fields = self.perf_span_fields.clone();
let cancel_batcher = self.cancel.child_token();
let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
@@ -1727,6 +1755,7 @@ impl PageServerHandler {
tenant_id,
timeline_id,
&mut timeline_handles,
&perf_span_fields,
&cancel_batcher,
&ctx,
protocol_version,
@@ -2669,12 +2698,14 @@ where
if let FeStartupPacket::StartupMessage { params, .. } = sm {
if let Some(app_name) = params.get("application_name") {
self.perf_span_fields.application_name = Some(app_name.to_string());
Span::current().record("application_name", field::display(app_name));
}
if let Some(options) = params.get("options") {
let (config, _) = parse_options(options);
for (key, value) in config {
if key == "neon.compute_mode" {
self.perf_span_fields.compute_mode = Some(value.clone());
Span::current().record("compute_mode", field::display(value));
}
}