diff --git a/Cargo.lock b/Cargo.lock index c76861267f..91515df5bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -839,7 +839,9 @@ dependencies = [ "tokio", "tokio-postgres", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "workspace_hack", ] @@ -1140,6 +1142,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" +dependencies = [ + "cfg-if", + "hashbrown 0.12.3", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.3.3" @@ -2162,6 +2177,108 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry_api", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde" +dependencies = [ + "async-trait", + "futures", + "futures-util", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "prost", + "reqwest", + "thiserror", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28" +dependencies = [ + "futures", + "futures-util", + "opentelemetry", + "prost", + "tonic", + "tonic-build", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "percent-encoding", + "rand", + "thiserror", + "tokio", + "tokio-stream", +] + [[package]] name = "os_info" version = "3.5.1" @@ -4002,6 +4119,20 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +dependencies = [ + "once_cell", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -4033,6 +4164,21 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-utils" +version = "0.1.0" +dependencies = [ + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", + "reqwest", + "tokio", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", + "workspace_hack", +] + [[package]] name = "try-lock" version = "0.2.4" @@ -4470,7 +4616,9 @@ dependencies = [ "crossbeam-utils", "either", "fail", + "futures", "futures-channel", + "futures-executor", "futures-task", "futures-util", "indexmap", @@ -4486,6 +4634,9 @@ dependencies = [ "rand", "regex", "regex-syntax", + "reqwest", + "ring", + "rustls", "scopeguard", "serde", "serde_json", @@ -4493,6 +4644,7 @@ dependencies = [ "syn", "tokio", "tokio-util", + "tonic", "tower", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 57f4b1d981..e6695c4246 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,10 @@ nix = "0.26" notify = "5.0.0" num-traits = "0.2.15" once_cell = "1.13" +opentelemetry = "0.18.0" +opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.10.0" +tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -125,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } +tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } ## Common library dependency diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 4536604bdf..0fabd23965 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -19,7 +19,9 @@ tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e5ab8eb153..0e7f38bf84 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -53,7 +53,7 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - init_logger(DEFAULT_LOG_LEVEL)?; + init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -159,6 +159,10 @@ fn main() -> Result<()> { info!("shutting down"); } + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing_utils::shutdown_tracing(); + exit(exit_code.unwrap_or(1)) } diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 57e5496e86..1b5cf647b0 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,21 +1,37 @@ -use anyhow::Result; +use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; -/// Initialize `env_logger` using either `default_level` or +/// Initialize logging to stderr, and OpenTelemetry tracing and exporter. +/// +/// Logging is configured using either `default_log_level` or /// `RUST_LOG` environment variable as default log level. -pub fn init_logger(default_level: &str) -> Result<()> { +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the destination, +/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See +/// `tracing-utils` package description. +/// +pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { + // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_writer(std::io::stderr); + // Initialize OpenTelemetry + let otlp_layer = + tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + + // Put it all together tracing_subscriber::registry() .with(env_filter) + .with(otlp_layer) .with(fmt_layer) .init(); + tracing::info!("logging and tracing started"); Ok(()) } diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml new file mode 100644 index 0000000000..9d5cfb0d56 --- /dev/null +++ b/libs/tracing-utils/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "tracing-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +opentelemetry = { workspace = true, features=["rt-tokio"] } +opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions.workspace = true +reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } +tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } +tracing.workspace = true +tracing-opentelemetry.workspace = true +tracing-subscriber.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs new file mode 100644 index 0000000000..179f54c659 --- /dev/null +++ b/libs/tracing-utils/src/lib.rs @@ -0,0 +1,161 @@ +//! Helper functions to set up OpenTelemetry tracing. +//! +//! This comes in two variants, depending on whether you have a Tokio runtime available. +//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use +//! the current tokio runtime. If you don't have a runtime available, or you don't want +//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()` +//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks. +//! +//! Example: +//! +//! ```rust,no_run +//! use tracing_subscriber::prelude::*; +//! use tracing_opentelemetry::OpenTelemetryLayer; +//! +//! #[tokio::main] +//! async fn main() { +//! // Set up logging to stderr +//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() +//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); +//! let fmt_layer = tracing_subscriber::fmt::layer() +//! .with_target(false) +//! .with_writer(std::io::stderr); +//! +//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces +//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! +//! // Put it all together +//! tracing_subscriber::registry() +//! .with(env_filter) +//! .with(otlp_layer) +//! .with(fmt_layer) +//! .init(); +//! } +//! ``` + +use opentelemetry::sdk::Resource; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; + +pub use tracing_opentelemetry::OpenTelemetryLayer; + +/// Set up OpenTelemetry exporter, using configuration from environment variables. +/// +/// `service_name` is set as the OpenTelemetry 'service.name' resource (see +/// ) +/// +/// We try to follow the conventions for the environment variables specified in +/// +/// +/// However, we only support a subset of those options: +/// +/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing +/// is enabled by default. Set it to "true" to disable. +/// +/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_* +/// settings specified in +/// +/// are supported, as they are handled by the `opentelemetry-otlp` crate. +/// Settings related to other exporters have no effect. +/// +/// - Some other settings are supported by the `opentelemetry` crate. +/// +/// If you need some other setting, please test if it works first. And perhaps +/// add a comment in the list above to save the effort of testing for the next +/// person. +/// +/// This doesn't block, but is marked as 'async' to hint that this must be called in +/// asynchronous execution context. +pub async fn init_tracing(service_name: &str) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + Some(init_tracing_internal(service_name.to_string())) +} + +/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing +/// tasks. +pub fn init_tracing_without_runtime( + service_name: &str, +) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + + // The opentelemetry batch processor and the OTLP exporter needs a Tokio + // runtime. Create a dedicated runtime for them. One thread should be + // enough. + // + // (Alternatively, instead of batching, we could use the "simple + // processor", which doesn't need Tokio, and use "reqwest-blocking" + // feature for the OTLP exporter, which also doesn't need Tokio. However, + // batching is considered best practice, and also I have the feeling that + // the non-Tokio codepaths in the opentelemetry crate are less used and + // might be more buggy, so better to stay on the well-beaten path.) + // + // We leak the runtime so that it keeps running after we exit the + // function. + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("otlp runtime thread") + .worker_threads(1) + .build() + .unwrap(), + )); + let _guard = runtime.enter(); + + Some(init_tracing_internal(service_name.to_string())) +} + +fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { + // Set up exporter from the OTEL_EXPORTER_* environment variables + let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); + + // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the + // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the + // OpenTelemetry spec at + // , + // the full exporter URL is formed by appending "/v1/traces" to the value + // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does + // that with the grpc-tonic exporter. Other exporters, like the HTTP + // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without + // appending "/v1/traces". + // + // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 + // + // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting + // the endpoint url with the "/v1/traces" path ourselves. If the bug is + // fixed in a later version, we can remove this code. But if we don't + // remember to remove this, it won't do any harm either, as the crate will + // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint + // is set directly with `with_endpoint`. + if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { + if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { + if !endpoint.ends_with('/') { + endpoint.push('/'); + } + endpoint.push_str("v1/traces"); + exporter = exporter.with_endpoint(endpoint); + } + } + + opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(exporter) + .with_trace_config( + opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])), + ) + .install_batch(opentelemetry::runtime::Tokio) + .expect("could not initialize opentelemetry exporter") +} + +// Shutdown trace pipeline gracefully, so that it has a chance to send any +// pending traces before we exit. +pub fn shutdown_tracing() { + opentelemetry::global::shutdown_tracer_provider(); +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index acc38cfb9d..3a852b2207 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -20,7 +20,9 @@ clap = { version = "4", features = ["derive", "string"] } crossbeam-utils = { version = "0.8" } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } +futures-executor = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -36,12 +38,16 @@ prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +ring = { version = "0.16", features = ["std"] } +rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } +tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" }