Warn when background tasks exceed their configured period (#3654)

Fixes #3648.
This commit is contained in:
Joonas Koivunen
2023-02-21 13:02:19 +02:00
committed by GitHub
parent 5c5b03ce08
commit 7de373210d
3 changed files with 47 additions and 27 deletions

View File

@@ -3,7 +3,7 @@
use std::ops::ControlFlow;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
@@ -60,26 +60,32 @@ async fn compaction_loop(tenant_id: TenantId) {
let tenant = tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("received cancellation request");
return;
return;
},
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(tenant) => tenant,
},
};
};
let mut sleep_duration = tenant.get_compaction_period();
if sleep_duration == Duration::ZERO {
let started_at = Instant::now();
let period = tenant.get_compaction_period();
let sleep_duration = if period == Duration::ZERO {
info!("automatic compaction is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10);
Duration::from_secs(10)
} else {
// Run compaction
if let Err(e) = tenant.compaction_iteration(&ctx).await {
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
} else {
period
}
}
};
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
// Sleep
tokio::select! {
@@ -122,23 +128,26 @@ async fn gc_loop(tenant_id: TenantId) {
},
};
let gc_period = tenant.get_gc_period();
let started_at = Instant::now();
let period = tenant.get_gc_period();
let gc_horizon = tenant.get_gc_horizon();
let mut sleep_duration = gc_period;
if sleep_duration == Duration::ZERO {
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10);
Duration::from_secs(10)
} else {
// Run gc
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
}
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
if let Err(e) = res {
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
} else {
period
}
}
};
warn_when_period_overrun(started_at.elapsed(), period, "gc");
// Sleep
tokio::select! {
@@ -197,3 +206,18 @@ async fn wait_for_active_tenant(
}
}
}
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
if elapsed >= period && period != Duration::ZERO {
// humantime does no significant digits clamping whereas Duration's debug is a bit more
// intelligent. however it makes sense to keep the "configuration format" for period, even
// though there's no way to output the actual config value.
warn!(
?elapsed,
period = %humantime::format_duration(period),
task,
"task iteration took longer than the configured period"
);
}
}

View File

@@ -44,6 +44,7 @@ impl Timeline {
loop {
let policy = self.get_eviction_policy();
let cf = self.eviction_iteration(&policy, cancel.clone()).await;
match cf {
ControlFlow::Break(()) => break,
ControlFlow::Continue(sleep_until) => {
@@ -78,13 +79,7 @@ impl Timeline {
ControlFlow::Continue(()) => (),
}
let elapsed = start.elapsed();
if elapsed > p.period {
warn!(
configured_period = %humantime::format_duration(p.period),
last_period = %humantime::format_duration(elapsed),
"this eviction period took longer than the configured period"
);
}
crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
ControlFlow::Continue(start + p.period)
}
}

View File

@@ -2079,6 +2079,7 @@ class NeonPageserver(PgProtocol):
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
]
def start(