From 3e94fd5af31930eac0a1e5ec6f599d4fa68e4515 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 26 Jan 2023 14:15:49 +0200 Subject: [PATCH] Inherit OpenTelemetry context for compute startup from cloud console. This allows fine-grained distributed tracing of the 'start_compute' operation from the cloud console. The startup actions performed by 'compute_ctl' are now performed in a child of the 'start_compute' context, so you can trace through the whole compute start operation. This needs a corresponding change in the cloud console to fill in the 'startup_tracing_context' field in the json spec. If it's missing, the startup operations are simply traced as a separate trace, without a parent. --- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 26 ++++++++++++++++++++++++++ compute_tools/src/spec.rs | 3 +++ 4 files changed, 31 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 91515df5bb..f28eedfa56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -831,6 +831,7 @@ dependencies = [ "futures", "hyper", "notify", + "opentelemetry", "postgres", "regex", "serde", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 0fabd23965..f8c3481f57 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,6 +11,7 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +opentelemetry.workspace = true postgres.workspace = true regex.workspace = true serde.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 0e7f38bf84..2c42662020 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -84,6 +84,29 @@ fn main() -> Result<()> { } }; + // Extract OpenTelemetry context for the startup actions from the spec, and + // attach it to the current tracing context. 
+ // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + Some(TraceContextPropagator::new().extract(carrier).attach()) + } else { + None + }; + let pageserver_connstr = spec .cluster .settings @@ -140,6 +163,9 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. if let Some(mut pg) = pg { + // Startup is finished, exit the startup tracing span + drop(startup_context_guard); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 4c7cca545c..8e249f3722 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; @@ -22,6 +23,8 @@ pub struct ComputeSpec { /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option<Vec<DeltaOp>>, + + pub startup_tracing_context: Option<HashMap<String, String>>, } /// Cluster state seen from the perspective of the external tools