Configure 'compute_ctl' to use OpenTelemetry exporter.

This allows tracing the startup actions e.g. with Jaeger
(https://www.jaegertracing.io/). We use the "tracing-opentelemetry"
crate, which turns tracing spans into OpenTelemetry spans, so you can
use the usual "#[instrument]" directives to add tracing.

I put the tracing initialization code to a separate crate,
`tracing-utils`, so that we can reuse it in other programs. We
probably want to set up tracing in the same way in all our programs.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
This commit is contained in:
Heikki Linnakangas
2023-01-26 14:13:20 +02:00
committed by Heikki Linnakangas
parent 4bcbb7793d
commit 006ee5f94a
8 changed files with 367 additions and 5 deletions

152
Cargo.lock generated
View File

@@ -839,7 +839,9 @@ dependencies = [
"tokio",
"tokio-postgres",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"url",
"workspace_hack",
]
@@ -1140,6 +1142,19 @@ dependencies = [
"syn",
]
[[package]]
name = "dashmap"
version = "5.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
dependencies = [
"cfg-if",
"hashbrown 0.12.3",
"lock_api",
"once_cell",
"parking_lot_core",
]
[[package]]
name = "data-encoding"
version = "2.3.3"
@@ -2162,6 +2177,108 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "opentelemetry"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
]
[[package]]
name = "opentelemetry-http"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
dependencies = [
"async-trait",
"bytes",
"http",
"opentelemetry_api",
"reqwest",
]
[[package]]
name = "opentelemetry-otlp"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
dependencies = [
"async-trait",
"futures",
"futures-util",
"http",
"opentelemetry",
"opentelemetry-http",
"opentelemetry-proto",
"prost",
"reqwest",
"thiserror",
]
[[package]]
name = "opentelemetry-proto"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
dependencies = [
"futures",
"futures-util",
"opentelemetry",
"prost",
"tonic",
"tonic-build",
]
[[package]]
name = "opentelemetry-semantic-conventions"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
dependencies = [
"opentelemetry",
]
[[package]]
name = "opentelemetry_api"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
dependencies = [
"fnv",
"futures-channel",
"futures-util",
"indexmap",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
dependencies = [
"async-trait",
"crossbeam-channel",
"dashmap",
"fnv",
"futures-channel",
"futures-executor",
"futures-util",
"once_cell",
"opentelemetry_api",
"percent-encoding",
"rand",
"thiserror",
"tokio",
"tokio-stream",
]
[[package]]
name = "os_info"
version = "3.5.1"
@@ -4002,6 +4119,20 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-opentelemetry"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
dependencies = [
"once_cell",
"opentelemetry",
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
]
[[package]]
name = "tracing-serde"
version = "0.1.3"
@@ -4033,6 +4164,21 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "tracing-utils"
version = "0.1.0"
dependencies = [
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
"reqwest",
"tokio",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"workspace_hack",
]
[[package]]
name = "try-lock"
version = "0.2.4"
@@ -4470,7 +4616,9 @@ dependencies = [
"crossbeam-utils",
"either",
"fail",
"futures",
"futures-channel",
"futures-executor",
"futures-task",
"futures-util",
"indexmap",
@@ -4486,6 +4634,9 @@ dependencies = [
"rand",
"regex",
"regex-syntax",
"reqwest",
"ring",
"rustls",
"scopeguard",
"serde",
"serde_json",
@@ -4493,6 +4644,7 @@ dependencies = [
"syn",
"tokio",
"tokio-util",
"tonic",
"tower",
"tracing",
"tracing-core",

View File

@@ -61,6 +61,10 @@ nix = "0.26"
notify = "5.0.0"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.18.0"
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.10.0"
tracing-opentelemetry = "0.18.0"
parking_lot = "0.12"
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
@@ -125,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
## Common library dependency

View File

@@ -19,7 +19,9 @@ tar.workspace = true
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
url.workspace = true
workspace_hack.workspace = true

View File

@@ -53,7 +53,7 @@ use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
init_logger(DEFAULT_LOG_LEVEL)?;
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let matches = cli().get_matches();
@@ -159,6 +159,10 @@ fn main() -> Result<()> {
info!("shutting down");
}
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit.
tracing_utils::shutdown_tracing();
exit(exit_code.unwrap_or(1))
}

View File

@@ -1,21 +1,37 @@
use anyhow::Result;
use tracing_opentelemetry::OpenTelemetryLayer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
/// Initialize `env_logger` using either `default_level` or
/// Initialize logging to stderr, and OpenTelemetry tracing and exporter.
///
/// Logging is configured using either `default_log_level` or
/// `RUST_LOG` environment variable as default log level.
pub fn init_logger(default_level: &str) -> Result<()> {
///
/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
/// configuration from environment variables. For example, to change the destination,
/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
/// `tracing-utils` package description.
///
pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
// Initialize Logging
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level));
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
let fmt_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_writer(std::io::stderr);
// Initialize OpenTelemetry
let otlp_layer =
tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
// Put it all together
tracing_subscriber::registry()
.with(env_filter)
.with(otlp_layer)
.with(fmt_layer)
.init();
tracing::info!("logging and tracing started");
Ok(())
}

View File

@@ -0,0 +1,16 @@
[package]
name = "tracing-utils"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
opentelemetry = { workspace = true, features=["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions.workspace = true
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,161 @@
//! Helper functions to set up OpenTelemetry tracing.
//!
//! This comes in two variants, depending on whether you have a Tokio runtime available.
//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use
//! the current tokio runtime. If you don't have a runtime available, or you don't want
//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()`
//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks.
//!
//! Example:
//!
//! ```rust,no_run
//! use tracing_subscriber::prelude::*;
//! use tracing_opentelemetry::OpenTelemetryLayer;
//!
//! #[tokio::main]
//! async fn main() {
//! // Set up logging to stderr
//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
//! let fmt_layer = tracing_subscriber::fmt::layer()
//! .with_target(false)
//! .with_writer(std::io::stderr);
//!
//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new);
//!
//! // Put it all together
//! tracing_subscriber::registry()
//! .with(env_filter)
//! .with(otlp_layer)
//! .with(fmt_layer)
//! .init();
//! }
//! ```
use opentelemetry::sdk::Resource;
use opentelemetry::KeyValue;
use opentelemetry_otlp::WithExportConfig;
use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
pub use tracing_opentelemetry::OpenTelemetryLayer;
/// Set up OpenTelemetry exporter, using configuration from environment variables.
///
/// `service_name` is set as the OpenTelemetry 'service.name' resource (see
/// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/resource/semantic_conventions/README.md#service>)
///
/// We try to follow the conventions for the environment variables specified in
/// <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables/>
///
/// However, we only support a subset of those options:
///
/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing
/// is enabled by default. Set it to "true" to disable.
///
/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_*
/// settings specified in
/// <https://opentelemetry.io/docs/reference/specification/protocol/exporter/>
/// are supported, as they are handled by the `opentelemetry-otlp` crate.
/// Settings related to other exporters have no effect.
///
/// - Some other settings are supported by the `opentelemetry` crate.
///
/// If you need some other setting, please test if it works first. And perhaps
/// add a comment in the list above to save the effort of testing for the next
/// person.
///
/// This doesn't block, but is marked as 'async' to hint that this must be called in
/// asynchronous execution context.
pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
return None;
};
Some(init_tracing_internal(service_name.to_string()))
}
/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
/// tasks.
pub fn init_tracing_without_runtime(
service_name: &str,
) -> Option<opentelemetry::sdk::trace::Tracer> {
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
return None;
};
// The opentelemetry batch processor and the OTLP exporter needs a Tokio
// runtime. Create a dedicated runtime for them. One thread should be
// enough.
//
// (Alternatively, instead of batching, we could use the "simple
// processor", which doesn't need Tokio, and use "reqwest-blocking"
// feature for the OTLP exporter, which also doesn't need Tokio. However,
// batching is considered best practice, and also I have the feeling that
// the non-Tokio codepaths in the opentelemetry crate are less used and
// might be more buggy, so better to stay on the well-beaten path.)
//
// We leak the runtime so that it keeps running after we exit the
// function.
let runtime = Box::leak(Box::new(
tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("otlp runtime thread")
.worker_threads(1)
.build()
.unwrap(),
));
let _guard = runtime.enter();
Some(init_tracing_internal(service_name.to_string()))
}
fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
// Set up exporter from the OTEL_EXPORTER_* environment variables
let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();
// XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
// OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
// OpenTelemetry spec at
// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#endpoint-urls-for-otlphttp>,
// the full exporter URL is formed by appending "/v1/traces" to the value
// of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does
// that with the grpc-tonic exporter. Other exporters, like the HTTP
// exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without
// appending "/v1/traces".
//
// See https://github.com/open-telemetry/opentelemetry-rust/pull/950
//
// Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting
// the endpoint url with the "/v1/traces" path ourselves. If the bug is
// fixed in a later version, we can remove this code. But if we don't
// remember to remove this, it won't do any harm either, as the crate will
// just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint
// is set directly with `with_endpoint`.
if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() {
if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) {
if !endpoint.ends_with('/') {
endpoint.push('/');
}
endpoint.push_str("v1/traces");
exporter = exporter.with_endpoint(endpoint);
}
}
opentelemetry_otlp::new_pipeline()
.tracing()
.with_exporter(exporter)
.with_trace_config(
opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)])),
)
.install_batch(opentelemetry::runtime::Tokio)
.expect("could not initialize opentelemetry exporter")
}
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit.
pub fn shutdown_tracing() {
opentelemetry::global::shutdown_tracer_provider();
}

View File

@@ -20,7 +20,9 @@ clap = { version = "4", features = ["derive", "string"] }
crossbeam-utils = { version = "0.8" }
either = { version = "1" }
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
futures = { version = "0.3" }
futures-channel = { version = "0.3", features = ["sink"] }
futures-executor = { version = "0.3" }
futures-task = { version = "0.3", default-features = false, features = ["std"] }
futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
indexmap = { version = "1", default-features = false, features = ["std"] }
@@ -36,12 +38,16 @@ prost = { version = "0.11" }
rand = { version = "0.8", features = ["small_rng"] }
regex = { version = "1" }
regex-syntax = { version = "0.6" }
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
ring = { version = "0.16", features = ["std"] }
rustls = { version = "0.20", features = ["dangerous_configuration"] }
scopeguard = { version = "1" }
serde = { version = "1", features = ["alloc", "derive"] }
serde_json = { version = "1", features = ["raw_value"] }
socket2 = { version = "0.4", default-features = false, features = ["all"] }
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] }
tokio-util = { version = "0.7", features = ["codec", "io"] }
tonic = { version = "0.8", features = ["tls-roots"] }
tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }