From 20e9cf7d3131357b2ae9fa702795a6981e1347a9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 12 Dec 2023 07:19:59 +0000 Subject: [PATCH] pageserver: tweaks to slow/hung task logging (#6098) ## Problem - `shutdown_tasks` would log when a particular task was taking a long time to shut down, but not when it eventually completed. That left one uncertain as to whether the slow task was the source of a hang, or just a precursor. ## Summary of changes - Add a log line after a slow task shutdown - Add an equivalent in Gate's `warn_if_stuck`, in case we ever need it. This isn't related to the original issue but was noticed when checking through these logging paths. --- libs/utils/src/sync/gate.rs | 18 ++++++++++++++++-- pageserver/src/task_mgr.rs | 3 ++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 9aad0af22d..31c76d2f74 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -30,18 +30,32 @@ async fn warn_if_stuck( let mut fut = std::pin::pin!(fut); - loop { + let mut warned = false; + let ret = loop { match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => return ret, + Ok(ret) => break ret, Err(_) => { tracing::warn!( gate = name, elapsed_ms = started.elapsed().as_millis(), "still waiting, taking longer than expected..." ); + warned = true; } } + }; + + // If we emitted a warning for slowness, also emit a message when we complete, so that + // someone debugging a shutdown can know for sure whether we have moved past this operation. + if warned { + tracing::info!( + gate = name, + elapsed_ms = started.elapsed().as_millis(), + "completed, after taking longer than expected" + ) } + + ret } #[derive(Debug)] diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5786356720..8747d9ad50 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -518,12 +518,13 @@ pub async fn shutdown_tasks( { // allow some time to elapse before logging to cut down the number of log // lines. - info!("waiting for {} to shut down", task.name); + info!("waiting for task {} to shut down", task.name); // we never handled this return value, but: // - we don't deschedule which would lead to is_cancelled // - panics are already logged (is_panicked) // - task errors are already logged in the wrapper let _ = join_handle.await; + info!("task {} completed", task.name); } } else { // Possibly one of: