From 7de373210df708d874d36bf335c669b12f685da9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 13:02:19 +0200 Subject: [PATCH] Warn when background tasks exceed their configured period (#3654) Fixes #3648. --- pageserver/src/tenant/tasks.rs | 64 +++++++++++++------ .../src/tenant/timeline/eviction_task.rs | 9 +-- test_runner/fixtures/neon_fixtures.py | 1 + 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b126545ee4..db269a1745 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; @@ -60,26 +60,32 @@ async fn compaction_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; - let mut sleep_duration = tenant.get_compaction_period(); - if sleep_duration == Duration::ZERO { + let started_at = Instant::now(); + + let period = tenant.get_compaction_period(); + let sleep_duration = if period == Duration::ZERO { info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run compaction if let Err(e) = tenant.compaction_iteration(&ctx).await { - sleep_duration = wait_duration; - error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); + error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "compaction"); // Sleep tokio::select! { @@ -122,23 +128,26 @@ async fn gc_loop(tenant_id: TenantId) { }, }; - let gc_period = tenant.get_gc_period(); + let started_at = Instant::now(); + + let period = tenant.get_gc_period(); let gc_horizon = tenant.get_gc_horizon(); - let mut sleep_duration = gc_period; - if sleep_duration == Duration::ZERO { + let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run gc - if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await - { - sleep_duration = wait_duration; - error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); - } + let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + if let Err(e) = res { + error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "gc"); // Sleep tokio::select! { @@ -197,3 +206,18 @@ async fn wait_for_active_tenant( } } } + +pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { + // Duration::ZERO will happen because it's the "disable [bgtask]" value. + if elapsed >= period && period != Duration::ZERO { + // humantime does no significant digits clamping whereas Duration's debug is a bit more + // intelligent. however it makes sense to keep the "configuration format" for period, even + // though there's no way to output the actual config value. + warn!( + ?elapsed, + period = %humantime::format_duration(period), + task, + "task iteration took longer than the configured period" + ); + } +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index fe7e7a1654..0dd169363e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -44,6 +44,7 @@ impl Timeline { loop { let policy = self.get_eviction_policy(); let cf = self.eviction_iteration(&policy, cancel.clone()).await; + match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { @@ -78,13 +79,7 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - if elapsed > p.period { - warn!( - configured_period = %humantime::format_duration(p.period), - last_period = %humantime::format_duration(elapsed), - "this eviction period took longer than the configured period" - ); - } + crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); ControlFlow::Continue(start + p.period) } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 59d616eb6f..63196609cc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2079,6 +2079,7 @@ class NeonPageserver(PgProtocol): ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress + ".*task iteration took longer than the configured period.*", ] def start(