From 877fd144012d43b078772f90610b77eb80723c89 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 18 Mar 2024 16:27:53 +0200 Subject: [PATCH] fix: spanless log message (#7155) with `immediate_gc` the span only covered the `gc_iteration`, make it cover the whole needless spawned task, which also does waiting for layer drops and stray logging in tests. also clarify some comments while we are here. Fixes: #6910 --- pageserver/src/http/routes.rs | 3 +-- pageserver/src/tenant/mgr.rs | 12 +++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 97ffb99465..229f3ae98f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1653,8 +1653,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = - mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; let gc_result = wait_task_done .await .context("wait for gc task") diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index facaaa2ad7..f456ca3006 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2730,7 +2730,7 @@ use { utils::http::error::ApiError, }; -pub(crate) async fn immediate_gc( +pub(crate) fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, @@ -2752,6 +2752,8 @@ pub(crate) async fn immediate_gc( // Run in task_mgr to avoid race with tenant_detach operation let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + // TODO: spawning is redundant now, need to hold the gate task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -2766,16 +2768,15 @@ pub(crate) async fn immediate_gc( #[allow(unused_mut)] let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. #[cfg(feature = "testing")] { + // we need to synchronize with drop completion for python tests without polling for + // log messages if let Ok(result) = result.as_mut() { - // why not futures unordered? it seems it needs very much the same task structure - // but would only run on single task. let mut js = tokio::task::JoinSet::new(); for layer in std::mem::take(&mut result.doomed_layers) { js.spawn(layer.wait_drop()); @@ -2791,7 +2792,7 @@ pub(crate) async fn immediate_gc( if let Some(rtc) = rtc { // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care just exit fast about the shutdown error + // deletions; don't care about the shutdown error, just exit fast drop(rtc.wait_completion().await); } } @@ -2802,6 +2803,7 @@ pub(crate) async fn immediate_gc( } Ok(()) } + .instrument(span) ); // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task