diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 186d49fe8b..b93bf30e62 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead}; // it's waiting. If the process hasn't started/stopped after 5 seconds, // it prints a notice that it's taking long, but keeps waiting. // -const RETRY_UNTIL_SECS: u64 = 10; +const RETRY_UNTIL_SECS: u64 = 40; const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS; const RETRY_INTERVAL_MILLIS: u64 = 100; const DOT_EVERY_RETRIES: u64 = 10; diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 72a2099d92..060c6c35fa 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -2,6 +2,7 @@ //! and push them to a HTTP endpoint. use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr, LogicalSizeCalculationCause}; use camino::Utf8PathBuf; use consumption_metrics::EventType; @@ -143,7 +144,7 @@ pub async fn collect_metrics( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), metric_collection_interval, - "consumption_metrics_collect_metrics", + BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); } } @@ -268,6 +269,11 @@ async fn calculate_synthetic_size_worker( } if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { + // TODO should we just use concurrent_background_tasks_rate_limit(). + // We can put in some prioritization for consumption metrics. + // Same for the loop that fetches computed metrics. + // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, + // which turns out is really handy to understand the system. if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await { error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } @@ -277,7 +283,7 @@ async fn calculate_synthetic_size_worker( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), synthetic_size_calculation_interval, - "consumption_metrics_synthetic_size_worker", + BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker, ); } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1938f1b154..46a1efa0ad 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1060,6 +1060,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + &["task"], + ) + .unwrap() + }); + +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap() + }); + pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_background_loop_period_overrun_count", diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index ae7b3c233c..00c8ced68a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -36,15 +36,38 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { + let s: &'static str = self.into(); + s + } +} + pub(crate) enum RateLimitError { Cancelled, } pub(crate) async fn concurrent_background_tasks_rate_limit( + loop_kind: BackgroundLoopKind, _ctx: &RequestContext, cancel: &CancellationToken, ) -> Result { - // TODO: use request context TaskKind to get statistics on how many tasks of what kind are waiting for background task semaphore + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT + .with_label_values(&[loop_kind.as_static_str()]) + .inc(); + scopeguard::defer!( + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc(); + ); tokio::select! { permit = CONCURRENT_BACKGROUND_TASKS.acquire() => { match permit { @@ -160,7 +183,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "compaction"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -228,7 +251,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "gc"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -302,7 +325,11 @@ pub(crate) async fn random_init_delay( } /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. -pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { +pub(crate) fn warn_when_period_overrun( + elapsed: Duration, + period: Duration, + task: BackgroundLoopKind, +) { // Duration::ZERO will happen because it's the "disable [bgtask]" value. if elapsed >= period && period != Duration::ZERO { // humantime does no significant digits clamping whereas Duration's debug is a bit more @@ -311,11 +338,11 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task warn!( ?elapsed, period = %humantime::format_duration(period), - task, + ?task, "task iteration took longer than the configured period" ); crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task, &format!("{}", period.as_secs())]) + .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) .inc(); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9e55117424..b5d4a7128a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -44,7 +44,7 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer, }; -use crate::tenant::tasks::RateLimitError; +use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -687,7 +687,12 @@ impl Timeline { // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. - let _permit = match super::tasks::concurrent_background_tasks_rate_limit(ctx, cancel).await + let _permit = match super::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Compaction, + ctx, + cancel, + ) + .await { Ok(permit) => permit, Err(RateLimitError::Cancelled) => return Ok(()), diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index f9b35ca78b..657bbb2966 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,8 +30,9 @@ use crate::{ tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, storage_layer::PersistentLayer, + tasks::{BackgroundLoopKind, RateLimitError}, timeline::EvictionError, - LogicalSizeCalculationCause, Tenant, tasks::RateLimitError, + LogicalSizeCalculationCause, Tenant, }, }; @@ -129,7 +130,11 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); + crate::tenant::tasks::warn_when_period_overrun( + elapsed, + p.period, + BackgroundLoopKind::Eviction, + ); crate::metrics::EVICTION_ITERATION_DURATION .get_metric_with_label_values(&[ &format!("{}", p.period.as_secs()), @@ -150,7 +155,13 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(ctx, cancel).await { + let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Eviction, + ctx, + cancel, + ) + .await + { Ok(permit) => permit, Err(RateLimitError::Cancelled) => return ControlFlow::Break(()), };