From 66b06e416a72971e87480e1d4dd90e9b217b352d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 9 May 2023 17:08:02 +0300 Subject: [PATCH] Pass tracing context in env variables instead of the spec file. (#4174) If compute_ctl is launched without a spec file, it fetches it from the control plane with an HTTP request. We cannot get the startup tracing context from the compute spec in that case, because we don't have it available on start. We could still read the tracing context from the compute spec after we have fetched it, but that would leave the fetch itself out of the context. Pass the tracing context in environment variables instead. --- compute_tools/src/bin/compute_ctl.rs | 74 ++++++++++++++++++---------- libs/compute_api/src/spec.rs | 3 -- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 36dbc382b5..2f515c9bf1 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -30,6 +30,7 @@ //! -b /usr/local/bin/postgres //! ``` //! +use std::collections::HashMap; use std::fs::File; use std::panic; use std::path::Path; @@ -67,6 +68,54 @@ fn main() -> Result<()> { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + // Extract OpenTelemetry context for the startup actions from the + // TRACEPARENT and TRACESTATE env variables, and attach it to the current + // tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // There is no standard for passing context in env variables, but a lot of + // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See + // https://github.com/open-telemetry/opentelemetry-specification/issues/740 + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // If this pod is pre-created without binding it to any particular endpoint + // yet, this isn't the right place to enter the startup context. In that + // case, the control plane should pass the tracing context as part of the + // /configure API call. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + // + // XXX: If the pod is restarted, we perform the startup actions in the same + // context as the original startup actions, which probably doesn't make + // sense. + let mut startup_tracing_carrier: HashMap = HashMap::new(); + if let Ok(val) = std::env::var("TRACEPARENT") { + startup_tracing_carrier.insert("traceparent".to_string(), val); + } + if let Ok(val) = std::env::var("TRACESTATE") { + startup_tracing_carrier.insert("tracestate".to_string(), val); + } + let startup_context_guard = if !startup_tracing_carrier.is_empty() { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + let guard = TraceContextPropagator::new() + .extract(&startup_tracing_carrier) + .attach(); + info!("startup tracing context attached"); + Some(guard) + } else { + None + }; + let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); @@ -148,8 +197,6 @@ fn main() -> Result<()> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - let pspec = state.pspec.as_ref().expect("spec must be set"); - let startup_tracing_context = pspec.spec.startup_tracing_context.clone(); // Record for how long we slept waiting for the spec. state.metrics.wait_for_spec_ms = Utc::now() @@ -165,29 +212,6 @@ fn main() -> Result<()> { compute.state_changed.notify_all(); drop(state); - // Extract OpenTelemetry context for the startup actions from the spec, and - // attach it to the current tracing context. - // - // This is used to propagate the context for the 'start_compute' operation - // from the neon control plane. This allows linking together the wider - // 'start_compute' operation that creates the compute container, with the - // startup actions here within the container. - // - // Switch to the startup context here, and exit it once the startup has - // completed and Postgres is up and running. - // - // NOTE: This is supposed to only cover the *startup* actions. Once - // postgres is configured and up-and-running, we exit this span. Any other - // actions that are performed on incoming HTTP requests, for example, are - // performed in separate spans. - let startup_context_guard = if let Some(ref carrier) = startup_tracing_context { - use opentelemetry::propagation::TextMapPropagator; - use opentelemetry::sdk::propagation::TraceContextPropagator; - Some(TraceContextPropagator::new().extract(carrier).attach()) - } else { - None - }; - // Launch remaining service threads let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); let _configurator_handle = diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 87c0edd687..6072980ed8 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,7 +5,6 @@ //! and connect it to the storage nodes. use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use std::collections::HashMap; use utils::lsn::Lsn; /// String type alias representing Postgres identifier and @@ -31,8 +30,6 @@ pub struct ComputeSpec { pub mode: ComputeMode, pub storage_auth_token: Option, - - pub startup_tracing_context: Option>, } #[serde_as]