From 53a3ed0a7e26ddba5a6a70b2a5176ee7d5491283 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Tue, 6 Feb 2024 15:43:33 +0100
Subject: [PATCH] debug_assert presence of `shard_id` tracing field (#6572)

also: fixes https://github.com/neondatabase/neon/issues/6638
---
 libs/utils/src/tracing_span_assert.rs | 51 +++++++-------
 pageserver/src/http/routes.rs | 10 +--
 pageserver/src/lib.rs | 1 +
 pageserver/src/page_service.rs | 68 +++++++++++++------
 pageserver/src/pgdatadir_mapping.rs | 3 +-
 pageserver/src/span.rs | 43 ++++++++++++
 pageserver/src/tenant.rs | 16 +++--
 pageserver/src/tenant/mgr.rs | 22 +++---
 .../src/tenant/remote_timeline_client.rs | 1 +
 .../tenant/remote_timeline_client/download.rs | 2 +-
 pageserver/src/tenant/span.rs | 17 -----
 pageserver/src/tenant/storage_layer/layer.rs | 3 +
 pageserver/src/tenant/timeline.rs | 8 +--
 pageserver/src/tenant/timeline/span.rs | 19 ------
 pageserver/src/walredo.rs | 9 +++
 pageserver/src/walredo/process.rs | 4 +-
 16 files changed, 165 insertions(+), 112 deletions(-)
 create mode 100644 pageserver/src/span.rs
 delete mode 100644 pageserver/src/tenant/span.rs

diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs
index db17f7d8cd..d24c81ad0b 100644
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -20,13 +20,13 @@
 //!
 //! // Then, in the main code:
 //!
-//! let span = tracing::info_span!("TestSpan", test_id = 1);
+//! let span = tracing::info_span!("TestSpan", tenant_id = 1);
 //! let _guard = span.enter();
 //!
 //! // ... down the call stack
 //!
-//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
-//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
+//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor};
+//! let extractor = ConstExtractor::new("tenant_id");
 //! if let Err(missing) = check_fields_present!([&extractor]) {
 //! // if you copypaste this to a custom assert method, remember to add #[track_caller]
 //! // to get the "user" code location for the panic.
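
The doc-comment hunk above shows the shape of the new API: one ConstExtractor per required field, checked with check_fields_present!. As a minimal, self-contained sketch (not part of the patch; it assumes only the utils::tracing_span_assert API shown above plus the tracing and tracing-subscriber crates), a debug-only assertion helper built on it could look like the following, which is essentially what the pageserver/src/span.rs module added further down in this patch does:

use tracing_subscriber::prelude::*;
use utils::tracing_span_assert::{check_fields_present, ConstExtractor};

// One extractor per tracing field that must be present on the current span
// or one of its ancestors.
const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id");
const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id");

// Debug-build-only: panic if the calling code is not running inside a span
// that carries both `tenant_id` and `shard_id`.
#[track_caller] // so the panic points at the caller, as the doc comment above advises
fn debug_assert_current_span_has_tenant_id() {
    if cfg!(debug_assertions) {
        if let Err(missing) = check_fields_present!([&TENANT_ID, &SHARD_ID]) {
            panic!("missing extractors: {missing:?}");
        }
    }
}

fn main() {
    // A span-recording subscriber must be installed; without one the check
    // reports Summary::Unconfigured and passes trivially (see the tests below).
    tracing_subscriber::registry()
        .with(tracing_subscriber::fmt::layer())
        .init();

    let span = tracing::info_span!("attach", tenant_id = %"a1b2c3d4", shard_id = %"0004");
    let _guard = span.enter();
    debug_assert_current_span_has_tenant_id(); // passes: both fields are present
}
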
@@ -45,27 +45,26 @@ pub enum ExtractionResult { } pub trait Extractor: Send + Sync + std::fmt::Debug { - fn name(&self) -> &str; + fn id(&self) -> &str; fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; } #[derive(Debug)] -pub struct MultiNameExtractor { - name: &'static str, - field_names: [&'static str; L], +pub struct ConstExtractor { + field_name: &'static str, } -impl MultiNameExtractor { - pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { - MultiNameExtractor { name, field_names } +impl ConstExtractor { + pub const fn new(field_name: &'static str) -> ConstExtractor { + ConstExtractor { field_name } } } -impl Extractor for MultiNameExtractor { - fn name(&self) -> &str { - self.name +impl Extractor for ConstExtractor { + fn id(&self) -> &str { + self.field_name } fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { - if fields.iter().any(|f| self.field_names.contains(&f.name())) { + if fields.iter().any(|f| f.name() == self.field_name) { ExtractionResult::Present } else { ExtractionResult::Absent @@ -203,19 +202,19 @@ mod tests { } impl<'a> fmt::Debug for MemoryIdentity<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } } struct Setup { _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, - tenant_extractor: MultiNameExtractor<2>, - timeline_extractor: MultiNameExtractor<2>, + tenant_extractor: ConstExtractor, + timeline_extractor: ConstExtractor, } fn setup_current_thread() -> Setup { - let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); - let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + let tenant_extractor = ConstExtractor::new("tenant_id"); + let timeline_extractor = ConstExtractor::new("timeline_id"); let registry = tracing_subscriber::registry() .with(tracing_subscriber::fmt::layer()) @@ -343,12 +342,12 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractor = MultiNameExtractor::new("E", ["e"]); + let extractor = ConstExtractor::new("e"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("F", ["foobar"]); + let extractor = ConstExtractor::new("foobar"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -368,16 +367,14 @@ mod tests { // normally this would work, but without any tracing-subscriber configured, both // check_field_present find nothing let _guard = subspan.enter(); - let extractors: [&dyn Extractor; 2] = [ - &MultiNameExtractor::new("E", ["e"]), - &MultiNameExtractor::new("F", ["f"]), - ]; + let extractors: [&dyn Extractor; 2] = + [&ConstExtractor::new("e"), &ConstExtractor::new("f")]; let res = check_fields_present0(extractors); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("G", ["g"]); + let extractor = ConstExtractor::new("g"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -410,7 +407,7 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", 
["e"])]; + let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")]; if span.is_disabled() { // the tests are running single threaded, or we got lucky and no other tests subscriber diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b97e272c86..792089ebe7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -535,7 +535,7 @@ async fn timeline_create_handler( } .instrument(info_span!("timeline_create", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug(), + shard_id = %tenant_shard_id.shard_slug(), timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await } @@ -831,7 +831,7 @@ async fn timeline_delete_handler( } })?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -856,7 +856,7 @@ async fn tenant_detach_handler( detach_ignored.unwrap_or(false), &state.deletion_queue_client, ) - .instrument(info_span!("tenant_detach", %tenant_id)) + .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, ()) @@ -1007,7 +1007,7 @@ async fn tenant_delete_handler( .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await?; @@ -1363,7 +1363,7 @@ async fn put_tenant_location_config_handler( mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index bcde1166b7..c3f35142ec 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -17,6 +17,7 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 754c021c88..6fc38a76d4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -63,9 +63,10 @@ use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::pgdatadir_mapping::Version; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; @@ -549,7 +550,7 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); let tenant = mgr::get_active_tenant_with_timeout( tenant_id, @@ -631,6 +632,7 @@ 
impl PageServerHandler { ) } PagestreamFeMessage::GetPage(req) => { + // shard_id is filled in by the handler let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) @@ -719,7 +721,7 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Create empty timeline info!("creating new timeline"); @@ -772,7 +774,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip_all, fields(%start_lsn, %end_lsn))] + #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -785,8 +787,6 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - let timeline = self .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) .await?; @@ -893,6 +893,7 @@ impl PageServerHandler { Ok(lsn) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, tenant_id: TenantId, @@ -919,6 +920,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( &mut self, tenant_id: TenantId, @@ -946,6 +948,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( &mut self, tenant_id: TenantId, @@ -1096,6 +1099,7 @@ impl PageServerHandler { } } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, tenant_id: TenantId, @@ -1129,6 +1133,9 @@ impl PageServerHandler { } }; + // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't + set_tracing_field_shard_id(timeline); + let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetPageAtLsn); @@ -1147,6 +1154,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_slru_segment_request( &mut self, tenant_id: TenantId, @@ -1175,7 +1183,7 @@ impl PageServerHandler { } #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] + #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( &mut self, pgb: &mut PostgresBackend, @@ -1190,8 +1198,6 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - let started = std::time::Instant::now(); // check that the timeline exists @@ -1313,6 +1319,7 @@ impl PageServerHandler { .await .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant.get_timeline(timeline_id, true)?; + set_tracing_field_shard_id(&timeline); Ok(timeline) } } @@ -1477,21 +1484,29 @@ where .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; + async { + let timeline = self + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .await?; - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - 
]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"prev_lsn"), + RowDescriptor::text_col(b"last_lsn"), + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(end_of_timeline.prev.to_string().as_bytes()), + Some(end_of_timeline.last.to_string().as_bytes()), + ]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + anyhow::Ok(()) + } + .instrument(info_span!( + "handle_get_last_record_lsn", + shard_id = tracing::field::Empty + )) + .await?; } // same as basebackup, but result includes relational data as well else if query_string.starts_with("fullbackup ") { @@ -1748,3 +1763,12 @@ impl From for QueryError { } } } + +fn set_tracing_field_shard_id(timeline: &Timeline) { + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + tracing::Span::current().record( + "shard_id", + tracing::field::display(timeline.tenant_shard_id.shard_slug()), + ); + debug_assert_current_span_has_tenant_and_timeline_id(); +} diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a36785a69f..f1d18c0146 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,6 +10,7 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; @@ -699,7 +700,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { - crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; diff --git a/pageserver/src/span.rs b/pageserver/src/span.rs new file mode 100644 index 0000000000..91fee50514 --- /dev/null +++ b/pageserver/src/span.rs @@ -0,0 +1,43 @@ +use utils::tracing_span_assert::check_fields_present; + +mod extractors { + use utils::tracing_span_assert::ConstExtractor; + + pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id"); + pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id"); + pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id"); +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID]) + { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([ + &extractors::TENANT_ID, + &extractors::SHARD_ID, + &extractors::TIMELINE_ID, + ]) { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() { + if cfg!(debug_assertions) { + if let Err(missing) = + check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,]) + { + panic!("missing extractors: {missing:?}") + } + } +} diff --git 
a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 624c3e365f..fe85cf9753 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -100,6 +100,7 @@ use std::sync::Arc; use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; +use crate::span; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; @@ -150,7 +151,6 @@ pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; -mod span; pub mod metadata; mod par_fsync; @@ -168,7 +168,7 @@ pub(crate) mod timeline; pub mod size; -pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; // re-export for use in remote_timeline_client.rs @@ -3998,6 +3998,10 @@ pub(crate) mod harness { }) } + pub fn span(&self) -> tracing::Span { + info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } + pub async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( @@ -4602,7 +4606,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4643,7 +4647,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4705,7 +4709,7 @@ mod tests { // so that all uploads finish & we can call harness.try_load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -5238,7 +5242,7 @@ mod tests { let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID)) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index de0b636d47..5ec910ca3e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -684,7 +684,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // going to log too many lines debug!("tenant successfully stopped"); } - .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())), + .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), ); total_attached += 1; @@ -1720,6 +1720,7 @@ pub(crate) async fn ignore_tenant( ignore_tenant0(conf, &TENANTS, tenant_id).await } +#[instrument(skip_all, fields(shard_id))] async fn ignore_tenant0( conf: &'static PageServerConf, tenants: &std::sync::RwLock, @@ -1727,6 +1728,10 @@ async fn ignore_tenant0( ) -> Result<(), TenantStateError> { // This is a legacy API (replaced by `/location_conf`). 
It does not support sharding let tenant_shard_id = TenantShardId::unsharded(tenant_id); + tracing::Span::current().record( + "shard_id", + tracing::field::display(tenant_shard_id.shard_slug()), + ); remove_tenant_from_memory(tenants, tenant_shard_id, async { let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); @@ -2122,7 +2127,7 @@ fn tenant_map_acquire_slot_impl( METRICS.tenant_slot_writes.inc(); let mut locked = tenants.write().unwrap(); - let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug()); + let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); let _guard = span.enter(); let m = match &mut *locked { @@ -2358,7 +2363,7 @@ pub(crate) async fn immediate_gc( mod tests { use std::collections::BTreeMap; use std::sync::Arc; - use tracing::{info_span, Instrument}; + use tracing::Instrument; use crate::tenant::mgr::TenantSlot; @@ -2369,17 +2374,16 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. - let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant") - .unwrap() - .load() - .await; + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant let id = t.tenant_shard_id(); // tenant harness configures the logging and we cannot escape it - let _e = info_span!("testing", tenant_id = %id).entered(); + let span = h.span(); + let _e = span.enter(); let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]); let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants))); @@ -2400,7 +2404,7 @@ mod tests { }; super::remove_tenant_from_memory(&tenants, id, cleanup).await } - .instrument(info_span!("foobar", tenant_id = %id)) + .instrument(h.span()) }); // now the long cleanup should be in place, with the stopping state diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 831a073d17..152c9a2b7d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1952,6 +1952,7 @@ mod tests { tracing::info_span!( "test", tenant_id = %self.harness.tenant_shard_id.tenant_id, + shard_id = %self.harness.tenant_shard_id.shard_slug(), timeline_id = %TIMELINE_ID ) } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 2c50726b43..6c1125746b 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -17,11 +17,11 @@ use utils::timeout::timeout_cancellable; use utils::{backoff, crashsafe}; use crate::config::PageServerConf; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{ download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, }; use crate::tenant::storage_layer::LayerFileName; -use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; use crate::virtual_file::on_fatal_io_error; use crate::TEMP_FILE_SUFFIX; diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs deleted file mode 100644 index 04e92f4096..0000000000 --- 
a/pageserver/src/tenant/span.rs +++ /dev/null @@ -1,17 +0,0 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; - -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_id() {} - -#[cfg(debug_assertions)] -pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"])); - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_id() { - if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 1f337adf53..52c0f8abdc 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -15,6 +15,7 @@ use utils::sync::heavier_once_cell; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; @@ -836,6 +837,8 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let task_name = format!("download layer {}", self); let (tx, rx) = tokio::sync::oneshot::channel(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 50ffc4d265..43aa178ab5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1138,7 +1138,7 @@ impl Timeline { /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of /// the graceful [`Timeline::flush_and_shutdown`] function. pub(crate) async fn shutdown(&self) { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id(); // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); @@ -1964,7 +1964,7 @@ impl Timeline { .await; Ok(()) } - .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)), + .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)), ); } @@ -2151,7 +2151,7 @@ impl Timeline { cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> Result { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. debug_assert!(self.tenant_shard_id.is_zero()); @@ -2849,7 +2849,7 @@ impl Timeline { frozen_layer: Arc, ctx: &RequestContext, ) -> Result<(), FlushLayerError> { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. 
This is possible as long as *all* the data imported into the diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs index 3b580c9d1b..8b13789179 100644 --- a/pageserver/src/tenant/timeline/span.rs +++ b/pageserver/src/tenant/timeline/span.rs @@ -1,20 +1 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor}; -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { - static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"])); - - let fields: [&dyn Extractor; 2] = [ - &*crate::tenant::span::TENANT_ID_EXTRACTOR, - &*TIMELINE_ID_EXTRACTOR, - ]; - if let Err(missing) = check_fields_present!(fields) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 773e5fc051..98a6a0bb6c 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -373,6 +373,7 @@ mod tests { use bytes::Bytes; use pageserver_api::shard::TenantShardId; use std::str::FromStr; + use tracing::Instrument; use utils::{id::TenantId, lsn::Lsn}; #[tokio::test] @@ -397,6 +398,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -424,6 +426,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -444,6 +447,7 @@ mod tests { short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ ) + .instrument(h.span()) .await .unwrap_err(); } @@ -472,6 +476,7 @@ mod tests { // underscored because unused, except for removal at drop _repo_dir: camino_tempfile::Utf8TempDir, manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, } impl RedoHarness { @@ -488,7 +493,11 @@ mod tests { Ok(RedoHarness { _repo_dir: repo_dir, manager, + tenant_shard_id, }) } + fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } } } diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 85db3b4a4a..bcbb263663 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -54,12 +54,14 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. // - #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] + #[instrument(skip_all,fields(pg_version=pg_version))] pub(crate) fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
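
The page_service.rs and mgr.rs changes above all follow the same two-step pattern: declare shard_id as an empty field when the span is created, then back-fill it once the timeline (and therefore the shard) has been resolved, at which point the debug assertions may require it. Below is a condensed, hypothetical sketch of that flow; the Timeline struct and resolve_timeline function are stand-ins invented for illustration, while the real code is the #[instrument(skip_all, fields(shard_id))] attributes and set_tracing_field_shard_id in pageserver/src/page_service.rs.

use tracing::instrument;

// Stand-in for the pageserver Timeline; only the tracing flow matters here.
struct Timeline {
    shard_slug: String,
}

fn resolve_timeline() -> Timeline {
    Timeline { shard_slug: "0008".to_string() }
}

// `shard_id` is declared but left empty: when the span is created we only know
// the tenant and timeline, not which shard will end up serving the request.
#[instrument(skip_all, fields(tenant_id = %tenant_id, shard_id, timeline_id = %timeline_id))]
fn handle_request(tenant_id: &str, timeline_id: &str) {
    let timeline = resolve_timeline();
    // Back-fill the field on the current span so every event emitted from here
    // on carries it; this is the point where the patch's
    // debug_assert_current_span_has_tenant_and_timeline_id() would be satisfied.
    tracing::Span::current().record("shard_id", tracing::field::display(&timeline.shard_slug));
    tracing::info!("serving request");
}

fn main() {
    tracing_subscriber::fmt::init();
    handle_request("a1b2c3d4", "e5f6a7b8");
}
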