From 5cfdb1244f7ebdf2844e9b5f8e15af01389e653b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 26 Feb 2025 21:27:16 +0200 Subject: [PATCH] compute_ctl: Add OTEL tracing to incoming HTTP requests and startup (#10971) We lost this with the switch to axum for the HTTP server. Add it back. In addition to just resurrecting the functionality we had before, pass the tracing context of the /configure HTTP request to the start_postgres operation that runs in the main thread. This way, the 'start_postgres' and all its sub-spans like getting the basebackup become children of the HTTP request span. This allows end-to-end tracing of a compute start, all the way from the proxy to the SQL queries executed by compute_ctl as part of compute startup. --- Cargo.lock | 24 ++++++++++++++++++---- Cargo.toml | 4 ++++ compute_tools/Cargo.toml | 2 ++ compute_tools/src/bin/compute_ctl.rs | 15 ++++++++++++++ compute_tools/src/compute.rs | 17 +++++++++++++++ compute_tools/src/http/routes/configure.rs | 7 ++++++- compute_tools/src/http/server.rs | 1 + 7 files changed, 65 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 47552174d2..7d11f2b7fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1342,7 +1342,9 @@ dependencies = [ "tokio-util", "tower 0.5.2", "tower-http", + "tower-otel", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -4484,18 +4486,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" +checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", @@ -7294,6 +7296,20 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" +[[package]] +name = "tower-otel" +version = "0.2.0" +source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd" +dependencies = [ + "http 1.1.0", + "opentelemetry", + "pin-project", + "tower-layer", + "tower-service", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "tower-service" version = "0.3.3" diff --git a/Cargo.toml b/Cargo.toml index e6ca3c982c..223ff4249e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -193,6 +193,10 @@ toml_edit = "0.22" tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["request-id", "trace"] } + +# This revision uses opentelemetry 0.27. There's no tag for it. +tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } + tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index ba2c304141..8f3bcbeef8 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -46,7 +46,9 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true +tower-otel.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index efe707cb7c..6dae1a2753 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -406,6 +406,21 @@ fn start_postgres( ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); + + // Create a tracing span for the startup operation. + // + // We could otherwise just annotate the function with #[instrument], but if + // we're being configured from a /configure HTTP request, we want the + // startup to be considered part of the /configure request. + let _this_entered = { + // Temporarily enter the /configure request's span, so that the new span + // becomes its child. + let _parent_entered = state.startup_span.take().map(|p| p.entered()); + + tracing::info_span!("start_postgres") + } + .entered(); + state.set_status(ComputeStatus::Init, &compute.state_changed); info!( diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index e3c70ba622..27dc05e71f 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -110,7 +110,23 @@ pub struct ComputeState { /// compute wasn't used since start. pub last_active: Option>, pub error: Option, + + /// Compute spec. This can be received from the CLI or - more likely - + /// passed by the control plane with a /configure HTTP request. pub pspec: Option, + + /// If the spec is passed by a /configure request, 'startup_span' is the + /// /configure request's tracing span. The main thread enters it when it + /// processes the compute startup, so that the compute startup is considered + /// to be part of the /configure request for tracing purposes. + /// + /// If the request handling thread/task called startup_compute() directly, + /// it would automatically be a child of the request handling span, and we + /// wouldn't need this. But because we use the main thread to perform the + /// startup, and the /configure task just waits for it to finish, we need to + /// set up the span relationship ourselves. + pub startup_span: Option, + pub metrics: ComputeMetrics, } @@ -122,6 +138,7 @@ impl ComputeState { last_active: None, error: None, pspec: None, + startup_span: None, metrics: ComputeMetrics::default(), } } diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index a2892196b7..63d428fff4 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -45,13 +45,18 @@ pub(in crate::http) async fn configure( return JsonResponse::invalid_status(state.status); } + // Pass the tracing span to the main thread that performs the startup, + // so that the start_compute operation is considered a child of this + // configure request for tracing purposes. + state.startup_span = Some(tracing::Span::current()); + state.pspec = Some(pspec); state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); drop(state); } // Spawn a blocking thread to wait for compute to become Running. This is - // needed to do not block the main pool of workers and be able to serve + // needed to not block the main pool of workers and to be able to serve // other requests while some particular request is waiting for compute to // finish configuration. let c = compute.clone(); diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index efd18afc78..7283401bb5 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -121,6 +121,7 @@ impl From for Router> { ) .layer(PropagateRequestIdLayer::x_request_id()), ) + .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) } }