pageserver: downgrade slow task warnings from warn to info (#5724)

## Problem

In #5658 we suppressed the first-iteration output from these logs, but
the volume of warnings is still problematic.

## Summary of changes

- Downgrade all slow task warnings to INFO. The information is still
there if we actively want to know about which tasks are running slowly,
without polluting the overall stream of warnings with situations that
are unsurprising to us.
- Revert the previous change so that we output on the first iteration as
we used to do. There is no reason to suppress these, now that the
severity is just info.
This commit is contained in:
John Spray
2023-10-30 18:32:30 +00:00
committed by GitHub
parent d8c21ec70d
commit 9c35e1e6e5

View File

@@ -162,8 +162,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
// loop, #3501. There are also additional "allowed_errors" in tests.
if first && random_init_delay(period, &cancel).await.is_err() {
break;
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
}
let started_at = Instant::now();
@@ -193,16 +196,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction sempahore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
warn_when_period_overrun(
started_at.elapsed(),
period,
BackgroundLoopKind::Compaction,
);
}
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -211,8 +205,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;
@@ -247,8 +239,11 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let period = tenant.get_gc_period();
if first && random_init_delay(period, &cancel).await.is_err() {
break;
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
}
let started_at = Instant::now();
@@ -282,12 +277,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction sempahore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
}
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -296,8 +286,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;
@@ -373,7 +361,7 @@ pub(crate) fn warn_when_period_overrun(
// humantime does no significant digits clamping whereas Duration's debug is a bit more
// intelligent. however it makes sense to keep the "configuration format" for period, even
// though there's no way to output the actual config value.
warn!(
info!(
?elapsed,
period = %humantime::format_duration(period),
?task,