From ac1159cb33533d6f1b91adda21dea285a70c5af9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 10 Feb 2025 23:26:27 +0100 Subject: [PATCH] utils: allow for setting up OTEL tracing subscriber --- Cargo.lock | 2 + compute_tools/src/bin/fast_import.rs | 1 + libs/desim/tests/reliable_copy_test.rs | 1 + libs/remote_storage/tests/common/mod.rs | 1 + libs/tracing-utils/src/perf_span.rs | 2 +- libs/utils/Cargo.toml | 1 + libs/utils/src/logging.rs | 48 +++++++++++++++++++++++- pageserver/Cargo.toml | 1 + pageserver/compaction/tests/tests.rs | 3 +- pageserver/ctl/src/main.rs | 1 + pageserver/pagebench/src/main.rs | 1 + pageserver/src/bin/pageserver.rs | 22 +++++++++-- pageserver/src/tenant.rs | 3 +- safekeeper/src/bin/safekeeper.rs | 1 + storage_broker/src/bin/storage_broker.rs | 1 + storage_controller/src/main.rs | 1 + 16 files changed, 82 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c621c07f35..c6592f6bd6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4301,6 +4301,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "tracing-utils", "url", "utils", "wal_decoder", @@ -7808,6 +7809,7 @@ dependencies = [ "tracing", "tracing-error", "tracing-subscriber", + "tracing-utils", "walkdir", ] diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 47558be7a0..265bf8112b 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -592,6 +592,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { utils::logging::init( utils::logging::LogFormat::Json, utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::OtelEnablement::Disabled, utils::logging::Output::Stdout, )?; diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs index 1ddf9844de..aa73b276af 100644 --- a/libs/desim/tests/reliable_copy_test.rs +++ b/libs/desim/tests/reliable_copy_test.rs @@ -158,6 +158,7 @@ mod reliable_copy_test { utils::logging::init( utils::logging::LogFormat::Test, utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::OtelEnablement::Disabled, utils::logging::Output::Stdout, ) .expect("logging init failed"); diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index daab05d91a..d720d61dae 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -208,6 +208,7 @@ pub(crate) fn ensure_logging_ready() { utils::logging::init( utils::logging::LogFormat::Test, utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::OtelEnablement::Disabled, utils::logging::Output::Stdout, ) .expect("logging init failed"); diff --git a/libs/tracing-utils/src/perf_span.rs b/libs/tracing-utils/src/perf_span.rs index 77458c4f3a..f4278507e9 100644 --- a/libs/tracing-utils/src/perf_span.rs +++ b/libs/tracing-utils/src/perf_span.rs @@ -28,7 +28,7 @@ use core::{ task::{Context, Poll}, }; use pin_project_lite::pin_project; -use tracing::{field, span::Span, Dispatch}; +use tracing::{Dispatch, field, span::Span}; #[derive(Debug, Clone)] pub struct PerfSpan { diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ac44300a51..4180602ac7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -42,6 +42,7 @@ toml_edit = { workspace = true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } +tracing-utils.workspace = true rand.workspace = true scopeguard.workspace = true strum.workspace = true diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 881f1e765d..518b39dc37 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -7,7 +7,9 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; use tokio::time::Instant; +use tracing::Dispatch; use tracing::info; +use tracing::level_filters::LevelFilter; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -125,6 +127,15 @@ pub enum TracingErrorLayerEnablement { EnableWithRustLogFilter, } +pub enum OtelEnablement { + Disabled, + Enabled { + service_name: String, + export_config: tracing_utils::ExportConfig, + runtime: &'static tokio::runtime::Runtime, + }, +} + /// Where the logging should output to. #[derive(Clone, Copy)] pub enum Output { @@ -132,11 +143,24 @@ pub enum Output { Stderr, } +pub struct OtelGuard { + pub dispatch: Dispatch, +} + +impl Drop for OtelGuard { + fn drop(&mut self) { + tracing_utils::shutdown_tracing(); + } +} + +pub const PERF_TRACE_TARGET: &str = "P"; + pub fn init( log_format: LogFormat, tracing_error_layer_enablement: TracingErrorLayerEnablement, + otel_enablement: OtelEnablement, output: Output, -) -> anyhow::Result<()> { +) -> anyhow::Result> { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. let rust_log_env_filter = || { @@ -165,6 +189,7 @@ pub fn init( }; log_layer.with_filter(rust_log_env_filter()) }); + let r = r.with( TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()), ); @@ -175,7 +200,26 @@ pub fn init( TracingErrorLayerEnablement::Disabled => r.init(), } - Ok(()) + let otel_subscriber = match otel_enablement { + OtelEnablement::Disabled => None, + OtelEnablement::Enabled { + service_name, + export_config, + runtime, + } => { + let otel_layer = runtime + .block_on(tracing_utils::init_tracing(&service_name, export_config)) + .with_filter(LevelFilter::INFO); + let otel_subscriber = tracing_subscriber::registry().with(otel_layer); + let otel_dispatch = Dispatch::new(otel_subscriber); + + Some(otel_dispatch) + } + }; + + let otel_guard = otel_subscriber.map(|dispatch| OtelGuard { dispatch }); + + Ok(otel_guard) } /// Disable the default rust panic hook by using `set_hook`. diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7330856be4..5754759c81 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -66,6 +66,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +tracing-utils.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index bd8b54a286..7db1e0e2d6 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -10,9 +10,10 @@ pub(crate) fn setup_logging() { logging::init( logging::LogFormat::Test, logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::OtelEnablement::Disabled, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 72a120a69b..957537cc8e 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -117,6 +117,7 @@ async fn main() -> anyhow::Result<()> { logging::init( LogFormat::Plain, TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::OtelEnablement::Disabled, logging::Output::Stdout, )?; diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 5527557450..fb017de119 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -35,6 +35,7 @@ fn main() { logging::init( logging::LogFormat::Plain, logging::TracingErrorLayerEnablement::Disabled, + utils::logging::OtelEnablement::Disabled, logging::Output::Stderr, ) .unwrap(); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 703629aed5..66384ca8b4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -21,7 +21,8 @@ use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::{ - BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, OTEL_RUNTIME, + WALRECEIVER_RUNTIME, }; use pageserver::tenant::{TenantSharedResources, mgr, secondary}; use pageserver::{ @@ -36,7 +37,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; -use utils::logging::TracingErrorLayerEnablement; +use utils::logging::{OtelGuard, TracingErrorLayerEnablement}; use utils::sentry_init::init_sentry; use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; @@ -110,12 +111,27 @@ fn main() -> anyhow::Result<()> { } else { TracingErrorLayerEnablement::Disabled }; - logging::init( + + let otel_enablement = match &conf.tracing { + Some(cfg) => utils::logging::OtelEnablement::Enabled { + service_name: "pageserver".to_string(), + export_config: (&cfg.export_config).into(), + runtime: *OTEL_RUNTIME, + }, + None => utils::logging::OtelEnablement::Disabled, + }; + + let otel_guard = logging::init( conf.log_format, tracing_error_layer_enablement, + otel_enablement, logging::Output::Stdout, )?; + if otel_guard.is_some() { + info!(?conf.tracing, "starting with OTEL tracing enabled"); + } + // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. // disarming this hook on pageserver, because we never tear down tracing. logging::replace_panic_hook_with_tracing_panic_hook().forget(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c78d15c9b5..d877d2eb1c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5718,9 +5718,10 @@ pub(crate) mod harness { // enable it in case the tests exercise code paths that use // debug_assert_current_span_has_tenant_and_timeline_id logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::OtelEnablement::Disabled, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 10fc4a4b59..73e8eae57e 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -255,6 +255,7 @@ async fn main() -> anyhow::Result<()> { logging::init( LogFormat::from_config(&args.log_format)?, logging::TracingErrorLayerEnablement::Disabled, + utils::logging::OtelEnablement::Disabled, logging::Output::Stdout, )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index cc33ec20ff..61962e1e94 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -643,6 +643,7 @@ async fn main() -> Result<(), Box> { logging::init( LogFormat::from_config(&args.log_format)?, logging::TracingErrorLayerEnablement::Disabled, + utils::logging::OtelEnablement::Disabled, logging::Output::Stdout, )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 967fb2996f..7ebdb4347f 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -232,6 +232,7 @@ fn main() -> anyhow::Result<()> { logging::init( LogFormat::Plain, logging::TracingErrorLayerEnablement::Disabled, + utils::logging::OtelEnablement::Disabled, logging::Output::Stdout, )?;