From 9c35e1e6e5609780a5bc55b5e98a4a711f6e6465 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 30 Oct 2023 18:32:30 +0000 Subject: [PATCH] pageserver: downgrade slow task warnings from warn to info (#5724) ## Problem In #5658 we suppressed the first-iteration output from these logs, but the volume of warnings is still problematic. ## Summary of changes - Downgrade all slow task warnings to INFO. The information is still there if we actively want to know about which tasks are running slowly, without polluting the overall stream of warnings with situations that are unsurprising to us. - Revert the previous change so that we output on the first iteration as we used to do. There is no reason to suppress these, now that the severity is just info. --- pageserver/src/tenant/tasks.rs | 38 ++++++++++++---------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index f419d2e73a..eb77f7c83a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -162,8 +162,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // TODO: we shouldn't need to await to find tenant and this could be moved outside of // loop, #3501. There are also additional "allowed_errors" in tests. - if first && random_init_delay(period, &cancel).await.is_err() { - break; + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } } let started_at = Instant::now(); @@ -193,16 +196,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - if !first { - // The first iteration is typically much slower, because all tenants compete for the - // compaction sempahore to run, and because of concurrent startup work like initializing - // logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration. - warn_when_period_overrun( - started_at.elapsed(), - period, - BackgroundLoopKind::Compaction, - ); - } + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -211,8 +205,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { { break; } - - first = false; } } .await; @@ -247,8 +239,11 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let period = tenant.get_gc_period(); - if first && random_init_delay(period, &cancel).await.is_err() { - break; + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } } let started_at = Instant::now(); @@ -282,12 +277,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } }; - if !first { - // The first iteration is typically much slower, because all tenants compete for the - // compaction sempahore to run, and because of concurrent startup work like initializing - // logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration. - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - } + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -296,8 +286,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { { break; } - - first = false; } } .await; @@ -373,7 +361,7 @@ pub(crate) fn warn_when_period_overrun( // humantime does no significant digits clamping whereas Duration's debug is a bit more // intelligent. however it makes sense to keep the "configuration format" for period, even // though there's no way to output the actual config value. - warn!( + info!( ?elapsed, period = %humantime::format_duration(period), ?task,