From 70f646ffe2fe9829316f1ed02a5a1529bc296fd6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Feb 2024 09:34:03 +0200 Subject: [PATCH] More logging fixes (#6584) I was on-call this week, these would had made me understand more/faster of the system: - move stray attaching start logging inside the span it starts, add generation - log ancestor timeline_id or bootstrapping in the beginning of timeline creation --- pageserver/src/http/routes.rs | 6 +++++ pageserver/src/tenant.rs | 28 ++++++++++------------- pageserver/src/tenant/config.rs | 4 ++-- pageserver/src/tenant/mgr.rs | 7 ------ test_runner/regress/test_timeline_size.py | 2 +- 5 files changed, 21 insertions(+), 26 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5735489742..b97e272c86 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -489,6 +489,12 @@ async fn timeline_create_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() { + tracing::info!(%ancestor_id, "starting to branch"); + } else { + tracing::info!("bootstrapping"); + } + match tenant.create_timeline( new_timeline_id, request_data.ancestor_timeline_id.map(TimelineId::from), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 58af80238d..dd4f9107f9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -205,7 +205,7 @@ impl AttachedTenantConf { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { tenant_conf: location_conf.tenant_conf, - location: attach_conf.clone(), + location: *attach_conf, }), LocationMode::Secondary(_) => { anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") @@ -625,6 +625,9 @@ impl Tenant { deletion_queue_client, } = resources; + let attach_mode = attached_conf.location.attach_mode; + let generation = attached_conf.location.generation; + let tenant = Arc::new(Tenant::new( TenantState::Attaching, conf, @@ -654,6 +657,12 @@ impl Tenant { "attach tenant", false, async move { + + info!( + ?attach_mode, + "Attaching tenant" + ); + let _gate_guard = attach_gate_guard; // Is this tenant being spawned as part of process startup? @@ -865,7 +874,7 @@ impl Tenant { Ok(()) } .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); + let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation); span.follows_from(Span::current()); span }), @@ -2354,12 +2363,7 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf - .read() - .unwrap() - .location - .attach_mode - .clone() + self.tenant_conf.read().unwrap().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to @@ -3225,8 +3229,6 @@ impl Tenant { .context("branch initial metadata upload")?; } - info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); - Ok(new_timeline) } @@ -3444,12 +3446,6 @@ impl Tenant { // All done! let timeline = raw_timeline.finish_creation()?; - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - Ok(timeline) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 63bd56cf5f..563887088d 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -51,7 +51,7 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached /// pageserver. This is the "normal" attachment mode. @@ -66,7 +66,7 @@ pub(crate) enum AttachmentMode { Stale, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct AttachedLocationConfig { pub(crate) generation: Generation, pub(crate) attach_mode: AttachmentMode, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 64fd709386..de0b636d47 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -607,13 +607,6 @@ pub(crate) fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - info!( - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug(), - generation = ?location_conf.location.generation, - attach_mode = ?location_conf.location.attach_mode, - "Attaching tenant" - ); let tenant = match Tenant::spawn( conf, tenant_shard_id, diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 303aabb58d..cd7203bba6 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -883,7 +883,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating # logical size is paused in a failpoint. So instead we will use a log observation to check that # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" def activated_on_demand(): assert env.pageserver.log_contains(log_match) is not None