From 7d7cd8375c188bbade6f6761666cd17e2f16bdc4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 5 Jul 2023 14:04:05 +0200
Subject: [PATCH] callers of task_mgr::spawn: some top-level async blocks were
 missing tenant/timeline id (#4283)

Looking at logs from staging and prod, I found there are a bunch of log
lines without tenant / timeline context.

Manually walk through all task_mgr::spawn lines and fix that using the
least amount of work required.

While doing it, remove some redundant `shutting down` messages.

refs https://github.com/neondatabase/neon/issues/4222
---
 pageserver/src/disk_usage_eviction_task.rs                | 7 ++++---
 pageserver/src/tenant/timeline/eviction_task.rs           | 6 +++---
 .../tenant/timeline/walreceiver/walreceiver_connection.rs | 5 +++++
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 61cbd5066f..d09b661da5 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -110,7 +110,6 @@ pub fn launch_disk_usage_global_eviction_task(
             disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
                 .await;
-            info!("disk usage based eviction task finishing");
             Ok(())
         },
     );
@@ -126,13 +125,16 @@ async fn disk_usage_eviction_task(
     tenants_dir: &Path,
     cancel: CancellationToken,
 ) {
+    scopeguard::defer! {
+        info!("disk usage based eviction task finishing");
+    };
+
     use crate::tenant::tasks::random_init_delay;
     {
         if random_init_delay(task_config.period, &cancel)
             .await
             .is_err()
         {
-            info!("shutting down");
             return;
         }
     }
@@ -167,7 +169,6 @@ async fn disk_usage_eviction_task(
         tokio::select! {
             _ = tokio::time::sleep_until(sleep_until) => {},
             _ = cancel.cancelled() => {
-                info!("shutting down");
                 break
             }
         }
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 03cf2d89ad..f8b10d906f 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -70,7 +70,6 @@ impl Timeline {
             };
             self_clone.eviction_task(cancel).await;
-            info!("eviction task finishing");
             Ok(())
         },
     );
@@ -78,6 +77,9 @@ impl Timeline {
     #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
     async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
+        scopeguard::defer! {
+            info!("eviction task finishing");
+        }
         use crate::tenant::tasks::random_init_delay;
         {
             let policy = self.get_eviction_policy();
@@ -86,7 +88,6 @@ impl Timeline {
                 EvictionPolicy::NoEviction => Duration::from_secs(10),
             };
             if random_init_delay(period, &cancel).await.is_err() {
-                info!("shutting down");
                 return;
             }
         }
@@ -101,7 +102,6 @@ impl Timeline {
                 ControlFlow::Continue(sleep_until) => {
                     tokio::select! {
                         _ = cancel.cancelled() => {
-                            info!("shutting down");
                             break;
                         }
                         _ = tokio::time::sleep_until(sleep_until) => { }
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 1c1fe87305..817a30247e 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -71,6 +71,8 @@ pub(super) async fn handle_walreceiver_connection(
     ctx: RequestContext,
     node: NodeId,
 ) -> anyhow::Result<()> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
     WALRECEIVER_STARTED_CONNECTIONS.inc();

     // Connect to the database in replication mode.
@@ -140,6 +142,9 @@ pub(super) async fn handle_walreceiver_connection(
             }
             Ok(())
         }
+        // Enrich the log lines emitted by this closure with meaningful context.
+        // TODO: technically, this task outlives the surrounding function, so, the
+        // spans won't be properly nested.
         .instrument(tracing::info_span!("poller")),
     );
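
Note (illustrative, not part of the patch): the commit leans on two existing
patterns rather than new machinery. An #[instrument(...)] span carries the
tenant_id/timeline_id fields onto every log line emitted inside the task, and
scopeguard::defer! replaces per-branch "shutting down" / "... finishing" log
lines with a single message that runs on every exit path. A minimal standalone
sketch of that combination follows; the crates are real (tracing,
tracing-subscriber, scopeguard, tokio), but the function name, task body, and
field values are hypothetical.

// Sketch under assumed dependencies: tokio (rt, macros, time), tracing,
// tracing-subscriber, scopeguard.
use std::time::Duration;

use scopeguard::defer;
use tracing::{info, instrument};

#[instrument(skip_all, fields(tenant_id = %tenant_id, timeline_id = %timeline_id))]
async fn background_task(tenant_id: u64, timeline_id: u64) {
    // Runs on every exit path (early return, break, normal completion),
    // so no per-branch "shutting down" logging is needed.
    defer! {
        info!("task finishing");
    }

    for iteration in 0..3 {
        // The surrounding span attaches tenant_id/timeline_id to this line.
        info!(iteration, "doing one round of work");
        tokio::time::sleep(Duration::from_millis(10)).await;
    }
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();
    background_task(1, 42).await;
}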