From bcd4fb7db293ca9e72dadc2b0170630f06af6593 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 6 Dec 2023 17:48:50 +0000 Subject: [PATCH] utils::logging: implement tracing_chrome & tracing_flame support --- Cargo.lock | 42 ++++++++ control_plane/src/background_process.rs | 14 ++- control_plane/src/bin/attachment_service.rs | 2 +- libs/remote_storage/tests/test_real_azure.rs | 2 +- libs/remote_storage/tests/test_real_s3.rs | 2 +- libs/utils/Cargo.toml | 2 + libs/utils/src/logging.rs | 108 +++++++++++++++++-- pageserver/Cargo.toml | 1 + pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/tenant.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 2 +- storage_broker/src/bin/storage_broker.rs | 2 +- 12 files changed, 163 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1b6b423444..ed714f2ed6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2780,6 +2780,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -3040,6 +3050,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -3139,6 +3155,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", @@ -5481,6 +5498,17 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "tracing-chrome" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "496b3cd5447f7ff527bbbf19b071ad542a000adf297d4127078b4dfdb931f41a" +dependencies = [ + "serde_json", + "tracing-core", + "tracing-subscriber", +] + [[package]] name = "tracing-core" version = "0.1.31" @@ -5501,6 +5529,17 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-flame" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bae117ee14789185e129aaee5d93750abe67fdc5a9a62650452bfe4e122a3a9" +dependencies = [ + "lazy_static", + "tracing", + "tracing-subscriber", +] + [[package]] name = "tracing-futures" version = "0.2.5" @@ -5553,6 +5592,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -5773,7 +5813,9 @@ dependencies = [ "tokio-stream", "tokio-util", "tracing", + "tracing-chrome", "tracing-error", + "tracing-flame", "tracing-subscriber", "url", "uuid", diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 26fc08fc8f..6c18330be3 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,10 @@ where .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match initial_pid_file { @@ -253,6 +256,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index e50c8fbba0..0362fe0c04 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -299,7 +299,7 @@ fn make_router(persistent_state: PersistentState) -> RouterBuilder anyhow::Result<()> { - logging::init( + let _guard = logging::init( LogFormat::Plain, logging::TracingErrorLayerEnablement::Disabled, logging::Output::Stdout, diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 7327803198..7bcf29606f 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -274,7 +274,7 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res fn ensure_logging_ready() { LOGGING_DONE.get_or_init(|| { - utils::logging::init( + let _ = utils::logging::init( utils::logging::LogFormat::Test, utils::logging::TracingErrorLayerEnablement::Disabled, utils::logging::Output::Stdout, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index ecd834e61c..d7c11d7276 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -200,7 +200,7 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> fn ensure_logging_ready() { LOGGING_DONE.get_or_init(|| { - utils::logging::init( + let _ = utils::logging::init( utils::logging::LogFormat::Test, utils::logging::TracingErrorLayerEnablement::Disabled, utils::logging::Output::Stdout, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ccf6f4f2d7..3756a436fa 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -49,6 +49,8 @@ const_format.workspace = true # to use tokio channels as streams, this is faster to compile than async_stream # why is it only here? no other crate should use it, streams are rarely needed. tokio-stream = { version = "0.1.14" } +tracing-chrome = "0.7.1" +tracing-flame = "0.2.0" [dev-dependencies] byteorder.workspace = true diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 2f09c2f3ea..844c526c96 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,4 +1,4 @@ -use std::str::FromStr; +use std::{io::BufWriter, str::FromStr, sync::{Arc, Mutex}}; use anyhow::Context; use once_cell::sync::Lazy; @@ -73,11 +73,26 @@ pub enum Output { Stderr, } +/// Keep alive and drop it before the program terminates. +#[must_use] +pub struct FlushGuard(Arc>); + +struct FlushGuardInner { + _tracing_chrome_layer: Option, + _tracing_flame_layer: Option>>, +} + +impl From for FlushGuard { + fn from(value: FlushGuardInner) -> Self { + Self(Arc::new(Mutex::new(value))) + } +} + pub fn init( log_format: LogFormat, tracing_error_layer_enablement: TracingErrorLayerEnablement, output: Output, -) -> anyhow::Result<()> { +) -> anyhow::Result { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. let rust_log_env_filter = || { @@ -85,11 +100,51 @@ pub fn init( .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) }; + // WIP: lift it up as an argument + let enable_tracing_chrome = match std::env::var("NEON_PAGESERVER_ENABLE_TRACING_CHROME") { + Ok(s) if s != "0" => true, + Ok(_s) => false, + Err(std::env::VarError::NotPresent) => false, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var NEON_PAGESERVER_ENABLE_TRACING_CHROME not unicode") + } + }; + + // WIP: lift it up as an argument + let enable_tracing_flame = match std::env::var("NEON_PAGESERVER_ENABLE_TRACING_FLAME") { + Ok(s) if s != "0" => true, + Ok(_s) => false, + Err(std::env::VarError::NotPresent) => false, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var NEON_PAGESERVER_ENABLE_TRACING_FLAME not unicode") + } + }; + // NB: the order of the with() calls does not matter. // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering use tracing_subscriber::prelude::*; - let r = tracing_subscriber::registry(); - let r = r.with({ + + // https://users.rust-lang.org/t/how-can-i-init-tracing-registry-dynamically-with-multiple-outputs/94307/6 + #[derive(Default)] + struct LayerStack { + layers: + Option + Sync + Send>>, + } + impl LayerStack { + fn add_layer(&mut self, new_layer: L) + where + L: tracing_subscriber::Layer + Send + Sync, + { + let new = match self.layers.take() { + Some(layers) => Some(layers.and_then(new_layer).boxed()), + None => Some(new_layer.boxed()), + }; + self.layers = new; + } + } + let mut layers = LayerStack::default(); + + layers.add_layer({ let log_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_ansi(false) @@ -106,15 +161,48 @@ pub fn init( }; log_layer.with_filter(rust_log_env_filter()) }); - let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + + layers + .add_layer(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + + let tracing_chrome_layer_flush_guard = if enable_tracing_chrome { + let (layer, guard) = tracing_chrome::ChromeLayerBuilder::new() + .trace_style(tracing_chrome::TraceStyle::Async) + .build(); + layers.add_layer(layer.with_filter(rust_log_env_filter())); + Some(guard) + } else { + None + }; + + let tracing_flame_flush_guard = if enable_tracing_flame { + let (layer, guard) = tracing_flame::FlameLayer::with_file("./tracing.folded").unwrap(); + let layer = layer + .with_empty_samples(false) + .with_module_path(false) + .with_file_and_line(false) + .with_threads_collapsed(true); + layers.add_layer(layer.with_filter(rust_log_env_filter())); + Some(guard) + } else { + None + }; + match tracing_error_layer_enablement { - TracingErrorLayerEnablement::EnableWithRustLogFilter => r - .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())) - .init(), - TracingErrorLayerEnablement::Disabled => r.init(), + TracingErrorLayerEnablement::EnableWithRustLogFilter => layers + .add_layer(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())), + TracingErrorLayerEnablement::Disabled => (), } - Ok(()) + let r = tracing_subscriber::registry(); + r.with(layers.layers.expect("we add at least one layer")) + .init(); + + Ok(FlushGuardInner { + _tracing_chrome_layer: tracing_chrome_layer_flush_guard, + _tracing_flame_layer: tracing_flame_flush_guard, + } + .into()) } /// Disable the default rust panic hook by using `set_hook`. diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 35c260740c..e0cbff3123 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -62,6 +62,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7607119dda..f14884070b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -103,7 +103,7 @@ fn main() -> anyhow::Result<()> { } else { TracingErrorLayerEnablement::Disabled }; - logging::init( + let _guard = logging::init( conf.log_format, tracing_error_layer_enablement, logging::Output::Stdout, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a8e8b4cbfa..4d0586e492 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3716,7 +3716,7 @@ pub(crate) mod harness { pub deletion_queue: MockDeletionQueue, } - static LOG_HANDLE: OnceCell<()> = OnceCell::new(); + static LOG_HANDLE: OnceCell = OnceCell::new(); pub(crate) fn setup_logging() { LOG_HANDLE.get_or_init(|| { diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e59deb9fda..beb26b3615 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -200,7 +200,7 @@ async fn main() -> anyhow::Result<()> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init( + let _guard = logging::init( LogFormat::from_config(&args.log_format)?, logging::TracingErrorLayerEnablement::Disabled, logging::Output::Stdout, diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9f81ac6cac..74f8ee2225 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -431,7 +431,7 @@ async fn main() -> Result<(), Box> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init( + let _guard = logging::init( LogFormat::from_config(&args.log_format)?, logging::TracingErrorLayerEnablement::Disabled, logging::Output::Stdout,