mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 05:20:38 +00:00
compute_ctl: Add OTEL tracing to incoming HTTP requests and startup (#10971)
We lost this with the switch to axum for the HTTP server. Add it back. In addition to just resurrecting the functionality we had before, pass the tracing context of the /configure HTTP request to the start_postgres operation that runs in the main thread. This way, the 'start_postgres' and all its sub-spans like getting the basebackup become children of the HTTP request span. This allows end-to-end tracing of a compute start, all the way from the proxy to the SQL queries executed by compute_ctl as part of compute startup.
This commit is contained in:
committed by
GitHub
parent
643a48210f
commit
5cfdb1244f
@@ -406,6 +406,21 @@ fn start_postgres(
|
||||
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
|
||||
// We got all we need, update the state.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
|
||||
// Create a tracing span for the startup operation.
|
||||
//
|
||||
// We could otherwise just annotate the function with #[instrument], but if
|
||||
// we're being configured from a /configure HTTP request, we want the
|
||||
// startup to be considered part of the /configure request.
|
||||
let _this_entered = {
|
||||
// Temporarily enter the /configure request's span, so that the new span
|
||||
// becomes its child.
|
||||
let _parent_entered = state.startup_span.take().map(|p| p.entered());
|
||||
|
||||
tracing::info_span!("start_postgres")
|
||||
}
|
||||
.entered();
|
||||
|
||||
state.set_status(ComputeStatus::Init, &compute.state_changed);
|
||||
|
||||
info!(
|
||||
|
||||
@@ -110,7 +110,23 @@ pub struct ComputeState {
|
||||
/// compute wasn't used since start.
|
||||
pub last_active: Option<DateTime<Utc>>,
|
||||
pub error: Option<String>,
|
||||
|
||||
/// Compute spec. This can be received from the CLI or - more likely -
|
||||
/// passed by the control plane with a /configure HTTP request.
|
||||
pub pspec: Option<ParsedSpec>,
|
||||
|
||||
/// If the spec is passed by a /configure request, 'startup_span' is the
|
||||
/// /configure request's tracing span. The main thread enters it when it
|
||||
/// processes the compute startup, so that the compute startup is considered
|
||||
/// to be part of the /configure request for tracing purposes.
|
||||
///
|
||||
/// If the request handling thread/task called startup_compute() directly,
|
||||
/// it would automatically be a child of the request handling span, and we
|
||||
/// wouldn't need this. But because we use the main thread to perform the
|
||||
/// startup, and the /configure task just waits for it to finish, we need to
|
||||
/// set up the span relationship ourselves.
|
||||
pub startup_span: Option<tracing::span::Span>,
|
||||
|
||||
pub metrics: ComputeMetrics,
|
||||
}
|
||||
|
||||
@@ -122,6 +138,7 @@ impl ComputeState {
|
||||
last_active: None,
|
||||
error: None,
|
||||
pspec: None,
|
||||
startup_span: None,
|
||||
metrics: ComputeMetrics::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,13 +45,18 @@ pub(in crate::http) async fn configure(
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
|
||||
// Pass the tracing span to the main thread that performs the startup,
|
||||
// so that the start_compute operation is considered a child of this
|
||||
// configure request for tracing purposes.
|
||||
state.startup_span = Some(tracing::Span::current());
|
||||
|
||||
state.pspec = Some(pspec);
|
||||
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Running. This is
|
||||
// needed to do not block the main pool of workers and be able to serve
|
||||
// needed to not block the main pool of workers and to be able to serve
|
||||
// other requests while some particular request is waiting for compute to
|
||||
// finish configuration.
|
||||
let c = compute.clone();
|
||||
|
||||
@@ -121,6 +121,7 @@ impl From<Server> for Router<Arc<ComputeNode>> {
|
||||
)
|
||||
.layer(PropagateRequestIdLayer::x_request_id()),
|
||||
)
|
||||
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user