From b6cb717a763a247ff3e6068f531ec5860571fac2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 7 Dec 2023 18:07:45 +0000 Subject: [PATCH] WIP: add tracing_flame stuff, many more spans, looked at spans with Joonas --- Cargo.lock | 12 +++++++ libs/utils/Cargo.toml | 1 + libs/utils/src/logging.rs | 33 ++++++++++++++++++-- pageserver/src/tenant/block_io.rs | 1 + pageserver/src/tenant/layer_map.rs | 1 + pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/timeline.rs | 4 ++- 7 files changed, 49 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd6eaae4a3..f6f9335c3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5302,6 +5302,17 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-flame" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bae117ee14789185e129aaee5d93750abe67fdc5a9a62650452bfe4e122a3a9" +dependencies = [ + "lazy_static", + "tracing", + "tracing-subscriber", +] + [[package]] name = "tracing-futures" version = "0.2.5" @@ -5571,6 +5582,7 @@ dependencies = [ "tracing", "tracing-chrome", "tracing-error", + "tracing-flame", "tracing-subscriber", "url", "uuid", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ffff9e90a4..3756a436fa 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -50,6 +50,7 @@ const_format.workspace = true # why is it only here? no other crate should use it, streams are rarely needed. tokio-stream = { version = "0.1.14" } tracing-chrome = "0.7.1" +tracing-flame = "0.2.0" [dev-dependencies] byteorder.workspace = true diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 7d2c1f9c5d..a3f58ddbc8 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,4 +1,4 @@ -use std::str::FromStr; +use std::{io::BufWriter, str::FromStr}; use anyhow::Context; use once_cell::sync::Lazy; @@ -77,6 +77,7 @@ pub enum Output { #[must_use] pub struct FlushGuard { _tracing_chrome_layer: Option, + _tracing_flame_layer: Option>>, } pub fn init( @@ -101,6 +102,16 @@ pub fn init( } }; + // WIP: lift it up as an argument + let enable_tracing_flame = match std::env::var("NEON_PAGESERVER_ENABLE_TRACING_FLAME") { + Ok(s) if s != "0" => true, + Ok(_s) => false, + Err(std::env::VarError::NotPresent) => false, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var NEON_PAGESERVER_ENABLE_TRACING_FLAME not unicode") + } + }; + // NB: the order of the with() calls does not matter. // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering use tracing_subscriber::prelude::*; @@ -147,8 +158,23 @@ pub fn init( .add_layer(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); let tracing_chrome_layer_flush_guard = if enable_tracing_chrome { - let (layer, guard) = tracing_chrome::ChromeLayerBuilder::new().build(); - layers.add_layer(layer); + let (layer, guard) = tracing_chrome::ChromeLayerBuilder::new() + .trace_style(tracing_chrome::TraceStyle::Async) + .build(); + layers.add_layer(layer.with_filter(rust_log_env_filter())); + Some(guard) + } else { + None + }; + + let tracing_flame_flush_guard = if enable_tracing_flame { + let (layer, guard) = tracing_flame::FlameLayer::with_file("./tracing.folded").unwrap(); + let layer = layer + .with_empty_samples(false) + .with_module_path(false) + .with_file_and_line(false) + .with_threads_collapsed(true); + layers.add_layer(layer.with_filter(rust_log_env_filter())); Some(guard) } else { None @@ -166,6 +192,7 @@ pub fn init( Ok(FlushGuard { _tracing_chrome_layer: tracing_chrome_layer_flush_guard, + _tracing_flame_layer: tracing_flame_flush_guard, }) } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 0617017528..e2e027e564 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -141,6 +141,7 @@ impl<'a> BlockCursor<'a> { /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) #[inline(always)] + #[tracing::instrument(skip_all, level = tracing::Level::DEBUG)] pub async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 9b6225501f..b33bb62ff5 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -181,6 +181,7 @@ impl LayerMap { /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! /// + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; let latest_delta = version.delta_coverage.query(key.to_i128()); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 18b15d54c0..e349a24039 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -227,6 +227,7 @@ impl Layer { /// /// It is up to the caller to collect more data from the previous layer and /// perform WAL redo, if necessary. + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] pub(crate) async fn get_value_reconstruct_data( &self, key: Key, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 297f4720f0..ce901c70c8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2045,6 +2045,7 @@ impl Timeline { /// /// This function takes the current timeline's locked LayerMap as an argument, /// so callers can avoid potential race conditions. + #[instrument(level = tracing::Level::DEBUG, skip_all)] async fn get_reconstruct_data( &self, key: Key, @@ -2079,7 +2080,8 @@ impl Timeline { let mut cont_lsn = Lsn(request_lsn.0 + 1); 'outer: loop { - if self.cancel.is_cancelled() { + + if self.cancel.is_cancelled() { return Err(PageReconstructError::Cancelled); }