From 9ea7b5dd38cd1fc89311eba3fcb6e8987d51e787 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Apr 2023 11:54:48 +0200 Subject: [PATCH] clean up logging around on-demand downloads (#4030) - Remove repeated tenant & timeline from span - Demote logging of the path to debug level - Log completion at info level, in the same function where we log errors - distinguish between layer file download success & on-demand download succeeding as a whole in the log message wording - Assert that the span contains a tenant id and a timeline id fixes https://github.com/neondatabase/neon/issues/3945 Before: ``` INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{tenant_id=$TENANT_ID timeline_id=$TIMELINE_ID layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: download complete: /storage/pageserver/data/tenants/$TENANT_ID/timelines/$TIMELINE_ID/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91 INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{tenant_id=$TENANT_ID timeline_id=$TIMELINE_ID layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: Rebuilt layer map. Did 9 insertions to process a batch of 1 updates. ``` After: ``` INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: layer file download finished INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: Rebuilt layer map. Did 9 insertions to process a batch of 1 updates. INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: on-demand download successful ``` --- Cargo.lock | 11 + Cargo.toml | 1 + libs/remote_storage/tests/pagination_tests.rs | 6 +- libs/utils/Cargo.toml | 3 +- libs/utils/src/lib.rs | 2 + libs/utils/src/logging.rs | 50 ++- libs/utils/src/tracing_span_assert.rs | 287 ++++++++++++++++++ pageserver/src/bin/pageserver.rs | 16 +- pageserver/src/tenant.rs | 8 +- .../tenant/remote_timeline_client/download.rs | 5 +- pageserver/src/tenant/timeline.rs | 39 ++- .../walreceiver/connection_manager.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 5 +- storage_broker/src/bin/storage_broker.rs | 5 +- 14 files changed, 413 insertions(+), 27 deletions(-) create mode 100644 libs/utils/src/tracing_span_assert.rs diff --git a/Cargo.lock b/Cargo.lock index ce24bbcee8..08b24d263c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4629,6 +4629,16 @@ dependencies = [ "valuable", ] +[[package]] +name = "tracing-error" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" +dependencies = [ + "tracing", + "tracing-subscriber", +] + [[package]] name = "tracing-futures" version = "0.2.5" @@ -4879,6 +4889,7 @@ dependencies = [ "thiserror", "tokio", "tracing", + "tracing-error", "tracing-subscriber", "url", "uuid", diff --git a/Cargo.toml b/Cargo.toml index 0b545e6190..f4872433cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,6 +110,7 @@ toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" +tracing-error = "0.2.0" tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs index 048e99d841..86a6888f98 100644 --- a/libs/remote_storage/tests/pagination_tests.rs +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -99,7 +99,11 @@ struct S3WithTestBlobs { #[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledS3 { async fn setup() -> Self { - utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed"); + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + ) + .expect("logging init failed"); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index dc6326e73e..2b04dfdef6 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -27,7 +27,8 @@ signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true -tracing-subscriber = { workspace = true, features = ["json"] } +tracing-error.workspace = true +tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true serde_with.workspace = true strum.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index d4176911ac..9b52aa75b7 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -54,6 +54,8 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod tracing_span_assert; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index ed856b6804..2b8c852d86 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -56,7 +56,20 @@ where } } -pub fn init(log_format: LogFormat) -> anyhow::Result<()> { +/// Whether to add the `tracing_error` crate's `ErrorLayer` +/// to the global tracing subscriber. +/// +pub enum TracingErrorLayerEnablement { + /// Do not add the `ErrorLayer`. + Disabled, + /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset. + EnableWithRustLogFilter, +} + +pub fn init( + log_format: LogFormat, + tracing_error_layer_enablement: TracingErrorLayerEnablement, +) -> anyhow::Result<()> { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. let rust_log_env_filter = || { @@ -67,21 +80,26 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { // NB: the order of the with() calls does not matter. // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering use tracing_subscriber::prelude::*; - tracing_subscriber::registry() - .with({ - let log_layer = tracing_subscriber::fmt::layer() - .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_writer(std::io::stdout); - let log_layer = match log_format { - LogFormat::Json => log_layer.json().boxed(), - LogFormat::Plain => log_layer.boxed(), - LogFormat::Test => log_layer.with_test_writer().boxed(), - }; - log_layer.with_filter(rust_log_env_filter()) - }) - .with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())) - .init(); + let r = tracing_subscriber::registry(); + let r = r.with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_writer(std::io::stdout); + let log_layer = match log_format { + LogFormat::Json => log_layer.json().boxed(), + LogFormat::Plain => log_layer.boxed(), + LogFormat::Test => log_layer.with_test_writer().boxed(), + }; + log_layer.with_filter(rust_log_env_filter()) + }); + let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + match tracing_error_layer_enablement { + TracingErrorLayerEnablement::EnableWithRustLogFilter => r + .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())) + .init(), + TracingErrorLayerEnablement::Disabled => r.init(), + } Ok(()) } diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs new file mode 100644 index 0000000000..b9f7986442 --- /dev/null +++ b/libs/utils/src/tracing_span_assert.rs @@ -0,0 +1,287 @@ +//! Assert that the current [`tracing::Span`] has a given set of fields. +//! +//! # Usage +//! +//! ``` +//! use tracing_subscriber::prelude::*; +//! let registry = tracing_subscriber::registry() +//! .with(tracing_error::ErrorLayer::default()); +//! +//! // Register the registry as the global subscriber. +//! // In this example, we'll only use it as a thread-local subscriber. +//! let _guard = tracing::subscriber::set_default(registry); +//! +//! // Then, in the main code: +//! +//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let _guard = span.enter(); +//! +//! // ... down the call stack +//! +//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; +//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! match check_fields_present([&extractor]) { +//! Ok(()) => {}, +//! Err(missing) => { +//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::>()); +//! } +//! } +//! ``` +//! +//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering +//! + +use std::{ + collections::HashSet, + fmt::{self}, + hash::{Hash, Hasher}, +}; + +pub enum ExtractionResult { + Present, + Absent, +} + +pub trait Extractor: Send + Sync + std::fmt::Debug { + fn name(&self) -> &str; + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; +} + +#[derive(Debug)] +pub struct MultiNameExtractor { + name: &'static str, + field_names: [&'static str; L], +} + +impl MultiNameExtractor { + pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { + MultiNameExtractor { name, field_names } + } +} +impl Extractor for MultiNameExtractor { + fn name(&self) -> &str { + self.name + } + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { + if fields.iter().any(|f| self.field_names.contains(&f.name())) { + ExtractionResult::Present + } else { + ExtractionResult::Absent + } + } +} + +struct MemoryIdentity<'a>(&'a dyn Extractor); + +impl<'a> MemoryIdentity<'a> { + fn as_ptr(&self) -> *const () { + self.0 as *const _ as *const () + } +} +impl<'a> PartialEq for MemoryIdentity<'a> { + fn eq(&self, other: &Self) -> bool { + self.as_ptr() == other.as_ptr() + } +} +impl<'a> Eq for MemoryIdentity<'a> {} +impl<'a> Hash for MemoryIdentity<'a> { + fn hash(&self, state: &mut H) { + self.as_ptr().hash(state); + } +} +impl<'a> fmt::Debug for MemoryIdentity<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + } +} + +/// The extractor names passed as keys to [`new`]. +pub fn check_fields_present( + must_be_present: [&dyn Extractor; L], +) -> Result<(), Vec<&dyn Extractor>> { + let mut missing: HashSet = + HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r))); + let trace = tracing_error::SpanTrace::capture(); + trace.with_spans(|md, _formatted_fields| { + missing.retain(|extractor| match extractor.0.extract(md.fields()) { + ExtractionResult::Present => false, + ExtractionResult::Absent => true, + }); + !missing.is_empty() // continue walking up until we've found all missing + }); + if missing.is_empty() { + Ok(()) + } else { + Err(missing.into_iter().map(|mi| mi.0).collect()) + } +} + +#[cfg(test)] +mod tests { + + use tracing_subscriber::prelude::*; + + use super::*; + + struct Setup { + _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, + tenant_extractor: MultiNameExtractor<2>, + timeline_extractor: MultiNameExtractor<2>, + } + + fn setup_current_thread() -> Setup { + let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); + let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + + let registry = tracing_subscriber::registry() + .with(tracing_subscriber::fmt::layer()) + .with(tracing_error::ErrorLayer::default()); + + let guard = tracing::subscriber::set_default(registry); + + Setup { + _current_thread_subscriber_guard: guard, + tenant_extractor, + timeline_extractor, + } + } + + fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) { + let missing: HashSet = + HashSet::from_iter(missing.into_iter().map(MemoryIdentity)); + let expected: HashSet = + HashSet::from_iter(expected.into_iter().map(MemoryIdentity)); + assert_eq!(missing, expected); + } + + #[test] + fn positive_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_subset_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn positive_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn negative_subset_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn negative_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn tracing_error_subscriber_not_set_up() { + // no setup + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } + + #[test] + #[should_panic] + fn panics_if_tracing_error_subscriber_has_wrong_filter() { + let r = tracing_subscriber::registry().with({ + tracing_error::ErrorLayer::default().with_filter( + tracing_subscriber::filter::dynamic_filter_fn(|md, _| { + if md.is_span() && *md.level() == tracing::Level::INFO { + return false; + } + true + }), + ) + }); + + let _guard = tracing::subscriber::set_default(r); + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 8e4897c09c..d843b01ed7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -25,6 +25,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, @@ -86,8 +87,19 @@ fn main() -> anyhow::Result<()> { } }; - // Initialize logging, which must be initialized before the custom panic hook is installed. - logging::init(conf.log_format)?; + // Initialize logging. + // + // It must be initialized before the custom panic hook is installed below. + // + // Regarding tracing_error enablement: at this time, we only use the + // tracing_error crate to debug_assert that log spans contain tenant and timeline ids. + // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module + let tracing_error_layer_enablement = if cfg!(debug_assertions) { + TracingErrorLayerEnablement::EnableWithRustLogFilter + } else { + TracingErrorLayerEnablement::Disabled + }; + logging::init(conf.log_format, tracing_error_layer_enablement)?; // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. // disarming this hook on pageserver, because we never tear down tracing. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b5966b4618..d69d5e4b45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2886,7 +2886,13 @@ pub mod harness { }; LOG_HANDLE.get_or_init(|| { - logging::init(logging::LogFormat::Test).expect("Failed to init test logging") + logging::init( + logging::LogFormat::Test, + // enable it in case in case the tests exercise code paths that use + // debug_assert_current_span_has_tenant_and_timeline_id + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + ) + .expect("Failed to init test logging") }); let repo_dir = PageServerConf::test_repo_dir(test_name); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index bda095d850..a0d8c0193a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -16,6 +16,7 @@ use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; @@ -43,6 +44,8 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); let local_path = timeline_path.join(layer_file_name.file_name()); @@ -154,7 +157,7 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("Could not fsync layer file {}", local_path.display(),)) .map_err(DownloadError::Other)?; - tracing::info!("download complete: {}", local_path.display()); + tracing::debug!("download complete: {}", local_path.display()); Ok(bytes_amount) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6c34f5a5b5..87f03f30b6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,6 +19,7 @@ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; +use utils::tracing_span_assert; use std::cmp::{max, min, Ordering}; use std::collections::BinaryHeap; @@ -936,6 +937,7 @@ impl Timeline { } } + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) }; @@ -3819,11 +3821,13 @@ impl Timeline { /// If the caller has a deadline or needs a timeout, they can simply stop polling: /// we're **cancellation-safe** because the download happens in a separate task_mgr task. /// So, the current download attempt will run to completion even if we stop polling. - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + #[instrument(skip_all, fields(layer=%remote_layer.short_id()))] pub async fn download_remote_layer( &self, remote_layer: Arc, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); + use std::sync::atomic::Ordering::Relaxed; let permit = match Arc::clone(&remote_layer.ongoing_download) @@ -3867,6 +3871,8 @@ impl Timeline { .await; if let Ok(size) = &result { + info!("layer file download finished"); + // XXX the temp file is still around in Err() case // and consumes space until we clean up upon pageserver restart. self_clone.metrics.resident_physical_size_gauge.add(*size); @@ -3938,6 +3944,8 @@ impl Timeline { updates.flush(); drop(layers); + info!("on-demand download successful"); + // Now that we've inserted the download into the layer map, // close the semaphore. This will make other waiters for // this download return Ok(()). @@ -3945,7 +3953,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. - error!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); + error!("layer file download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download @@ -4256,3 +4264,30 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } + +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< + tracing_span_assert::MultiNameExtractor<2>, + > = once_cell::sync::Lazy::new(|| { + tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]) + }); + + pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy< + tracing_span_assert::MultiNameExtractor<2>, + > = once_cell::sync::Lazy::new(|| { + tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]) + }); + + #[cfg(debug_assertions)] + match tracing_span_assert::check_fields_present([ + &*TENANT_ID_EXTRACTOR, + &*TIMELINE_ID_EXTRACTOR, + ]) { + Ok(()) => (), + Err(missing) => panic!( + "missing extractors: {:?}", + missing.into_iter().map(|e| e.name()).collect::>() + ), + } +} diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index efcbfbce3d..731c5c4644 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -348,7 +348,7 @@ impl ConnectionManagerState { .context("walreceiver connection handling failure") } .instrument( - info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id), + info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id), ) }); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index ace921a26d..3699a2a74c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -134,7 +134,10 @@ fn main() -> anyhow::Result<()> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::init( + LogFormat::from_config(&args.log_format)?, + logging::TracingErrorLayerEnablement::Disabled, + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); info!("version: {GIT_VERSION}"); diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index de7b634ba0..597d9860d8 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -430,7 +430,10 @@ async fn main() -> Result<(), Box> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::init( + LogFormat::from_config(&args.log_format)?, + logging::TracingErrorLayerEnablement::Disabled, + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);