From 5f3551e40545b144e0b74ef88be9f83993814c3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 24 Mar 2025 18:29:44 +0100 Subject: [PATCH] Add "still waiting for task" for slow shutdowns (#11351) To help with narrowing down https://github.com/neondatabase/cloud/issues/26362, we make the case more noisy where we are waiting for the shutdown of a specific task (in the case of that issue, the `gc_loop`). --- pageserver/src/task_mgr.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0b71b2cf5b..9cc604f86d 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -38,6 +38,7 @@ use std::panic::AssertUnwindSafe; use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use futures::FutureExt; use once_cell::sync::Lazy; @@ -584,18 +585,25 @@ pub async fn shutdown_tasks( // warn to catch these in tests; there shouldn't be any warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } - if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) + const INITIAL_COMPLAIN_TIMEOUT: Duration = Duration::from_secs(1); + const PERIODIC_COMPLAIN_TIMEOUT: Duration = Duration::from_secs(60); + if tokio::time::timeout(INITIAL_COMPLAIN_TIMEOUT, &mut join_handle) .await .is_err() { // allow some time to elapse before logging to cut down the number of log // lines. info!("waiting for task {} to shut down", task.name); - // we never handled this return value, but: - // - we don't deschedule which would lead to is_cancelled - // - panics are already logged (is_panicked) - // - task errors are already logged in the wrapper - let _ = join_handle.await; + loop { + tokio::select! 
{ + // we never handled this return value, but: + // - we don't deschedule which would lead to is_cancelled + // - panics are already logged (is_panicked) + // - task errors are already logged in the wrapper + _ = &mut join_handle => break, + _ = tokio::time::sleep(PERIODIC_COMPLAIN_TIMEOUT) => info!("still waiting for task {} to shut down", task.name), + } + } info!("task {} completed", task.name); } } else {