mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
impr(perf tracing): ability to correlate with page_service logs (#11398)
# Problem Current perf tracing fields do not allow answering the question what a specific Postgres backend was waiting for. # Background For Pageserver logs, we set the backend PID as the libpq `application_name` on the compute side, and funnel that into the a tracing field for the spans that emit to the global tracing subscriber. # Solution Funnel `application_name`, and the other fields that we use in the logging spans, into the root span for perf tracing. # Refs - fixes https://github.com/neondatabase/neon/issues/11393 - stacked atop https://github.com/neondatabase/neon/pull/11433 - epic: https://github.com/neondatabase/neon/issues/9873
This commit is contained in:
committed by
GitHub
parent
295be03a33
commit
6ee84d985a
@@ -247,6 +247,15 @@ pub async fn libpq_listener_main(
|
||||
|
||||
type ConnectionHandlerResult = anyhow::Result<()>;
|
||||
|
||||
/// Perf root spans start at the per-request level, after shard routing.
|
||||
/// This struct carries connection-level information to the root perf span definition.
|
||||
#[derive(Clone)]
|
||||
struct ConnectionPerfSpanFields {
|
||||
peer_addr: String,
|
||||
application_name: Option<String>,
|
||||
compute_mode: Option<String>,
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn page_service_conn_main(
|
||||
@@ -271,6 +280,12 @@ async fn page_service_conn_main(
|
||||
let socket_fd = socket.as_raw_fd();
|
||||
|
||||
let peer_addr = socket.peer_addr().context("get peer address")?;
|
||||
|
||||
let perf_span_fields = ConnectionPerfSpanFields {
|
||||
peer_addr: peer_addr.to_string(),
|
||||
application_name: None, // filled in later
|
||||
compute_mode: None, // filled in later
|
||||
};
|
||||
tracing::Span::current().record("peer_addr", field::display(peer_addr));
|
||||
|
||||
// setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
|
||||
@@ -314,6 +329,7 @@ async fn page_service_conn_main(
|
||||
tenant_manager,
|
||||
auth,
|
||||
pipelining_config,
|
||||
perf_span_fields,
|
||||
connection_ctx,
|
||||
cancel.clone(),
|
||||
gate_guard,
|
||||
@@ -358,6 +374,8 @@ struct PageServerHandler {
|
||||
/// `process_query` creates a child context from this one.
|
||||
connection_ctx: RequestContext,
|
||||
|
||||
perf_span_fields: ConnectionPerfSpanFields,
|
||||
|
||||
cancel: CancellationToken,
|
||||
|
||||
/// None only while pagestream protocol is being processed.
|
||||
@@ -703,11 +721,13 @@ impl BatchedFeMessage {
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
perf_span_fields: ConnectionPerfSpanFields,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
@@ -717,6 +737,7 @@ impl PageServerHandler {
|
||||
auth,
|
||||
claims: None,
|
||||
connection_ctx,
|
||||
perf_span_fields,
|
||||
timeline_handles: Some(TimelineHandles::new(tenant_manager)),
|
||||
cancel,
|
||||
pipelining_config,
|
||||
@@ -754,6 +775,7 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
timeline_handles: &mut TimelineHandles,
|
||||
conn_perf_span_fields: &ConnectionPerfSpanFields,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
@@ -952,6 +974,9 @@ impl PageServerHandler {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"GET_PAGE",
|
||||
peer_addr = conn_perf_span_fields.peer_addr,
|
||||
application_name = conn_perf_span_fields.application_name,
|
||||
compute_mode = conn_perf_span_fields.compute_mode,
|
||||
tenant_id = %tenant_id,
|
||||
shard_id = %shard.get_shard_identity().shard_slug(),
|
||||
timeline_id = %timeline_id,
|
||||
@@ -1581,6 +1606,7 @@ impl PageServerHandler {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&mut timeline_handles,
|
||||
&self.perf_span_fields,
|
||||
&cancel,
|
||||
ctx,
|
||||
protocol_version,
|
||||
@@ -1714,6 +1740,8 @@ impl PageServerHandler {
|
||||
// Batcher
|
||||
//
|
||||
|
||||
let perf_span_fields = self.perf_span_fields.clone();
|
||||
|
||||
let cancel_batcher = self.cancel.child_token();
|
||||
let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
|
||||
let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
|
||||
@@ -1727,6 +1755,7 @@ impl PageServerHandler {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&mut timeline_handles,
|
||||
&perf_span_fields,
|
||||
&cancel_batcher,
|
||||
&ctx,
|
||||
protocol_version,
|
||||
@@ -2669,12 +2698,14 @@ where
|
||||
|
||||
if let FeStartupPacket::StartupMessage { params, .. } = sm {
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
self.perf_span_fields.application_name = Some(app_name.to_string());
|
||||
Span::current().record("application_name", field::display(app_name));
|
||||
}
|
||||
if let Some(options) = params.get("options") {
|
||||
let (config, _) = parse_options(options);
|
||||
for (key, value) in config {
|
||||
if key == "neon.compute_mode" {
|
||||
self.perf_span_fields.compute_mode = Some(value.clone());
|
||||
Span::current().record("compute_mode", field::display(value));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user