diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 72a2099d92..13f7977946 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,6 +2,7 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
@@ -143,7 +144,7 @@ pub async fn collect_metrics(
             crate::tenant::tasks::warn_when_period_overrun(
                 tick_at.elapsed(),
                 metric_collection_interval,
-                "consumption_metrics_collect_metrics",
+                BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
             );
         }
     }
@@ -268,6 +269,11 @@ async fn calculate_synthetic_size_worker(
         }
 
         if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+            // TODO: should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+            // We could put in some prioritization for consumption metrics.
+            // The same goes for the loop that fetches computed metrics.
+            // Using the same limiter would centralize the "start" and "finished" counters,
+            // which turns out to be really handy for understanding the system.
             if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
                 error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
             }
@@ -277,7 +283,7 @@
             crate::tenant::tasks::warn_when_period_overrun(
                 tick_at.elapsed(),
                 synthetic_size_calculation_interval,
-                "consumption_metrics_synthetic_size_worker",
+                BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
             );
         }
     }
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c154b4a4ca..eea3de0583 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1067,6 +1067,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("Failed to register tenant_task_events metric")
 });
 
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_start_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls started",
+            &["task"],
+        )
+        .unwrap()
+    });
+
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_finish_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+            &["task"],
+        )
+        .unwrap()
+    });
+
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_background_loop_period_overrun_count",
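Note on the two new counters: they are meant to be read as a pair. At any instant, `pageserver_background_loop_semaphore_wait_start_count` minus `pageserver_background_loop_semaphore_wait_finish_count` (per `task` label) is the number of background loops currently queued on the semaphore introduced below. A minimal standalone sketch of why that difference works as a queue-depth gauge, using plain atomics as hypothetical stand-ins for the prometheus counters (the patch itself increments the finish counter via `scopeguard`, so cancelled waits are counted too):

    use std::sync::atomic::{AtomicU64, Ordering};
    use std::sync::Arc;

    use tokio::sync::Semaphore;

    #[tokio::main]
    async fn main() {
        // Hypothetical stand-ins for the start/finish counter pair.
        let started = Arc::new(AtomicU64::new(0));
        let finished = Arc::new(AtomicU64::new(0));
        let semaphore = Arc::new(Semaphore::new(2));

        let mut handles = Vec::new();
        for _ in 0..8 {
            let (started, finished, semaphore) =
                (started.clone(), finished.clone(), semaphore.clone());
            handles.push(tokio::spawn(async move {
                started.fetch_add(1, Ordering::SeqCst); // "wait start"
                let _permit = semaphore.acquire().await.unwrap();
                finished.fetch_add(1, Ordering::SeqCst); // "wait finish"
                tokio::time::sleep(std::time::Duration::from_millis(50)).await;
            }));
        }

        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
        // With 2 permits and 8 tasks, roughly 6 tasks are still queued here.
        let queued = started.load(Ordering::SeqCst) - finished.load(Ordering::SeqCst);
        println!("currently queued on the semaphore: {queued}");

        for handle in handles {
            handle.await.unwrap();
        }
    }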
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index df3ffd08d3..00c8ced68a 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -14,6 +14,73 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;
 
+static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
+    once_cell::sync::Lazy::new(|| {
+        let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+        let permits = usize::max(
+            1,
+            // while a lot of the work is done on spawn_blocking, we still do
+            // repartitioning in the async context. this should leave us some workers
+            // unblocked to be blocked on other work, hopefully easing any
+            // outside-visible effects of restarts.
+            //
+            // 6/8 is a guess; previously we ran with unlimited 8 and more from
+            // spawn_blocking.
+            (total_threads * 3).checked_div(4).unwrap_or(0),
+        );
+        assert_ne!(permits, 0, "we will not be adding in permits later");
+        assert!(
+            permits < total_threads,
+            "need threads avail for shorter work"
+        );
+        tokio::sync::Semaphore::new(permits)
+    });
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
+#[strum(serialize_all = "snake_case")]
+pub(crate) enum BackgroundLoopKind {
+    Compaction,
+    Gc,
+    Eviction,
+    ConsumptionMetricsCollectMetrics,
+    ConsumptionMetricsSyntheticSizeWorker,
+}
+
+impl BackgroundLoopKind {
+    fn as_static_str(&self) -> &'static str {
+        let s: &'static str = self.into();
+        s
+    }
+}
+
+pub(crate) enum RateLimitError {
+    Cancelled,
+}
+
+pub(crate) async fn concurrent_background_tasks_rate_limit(
+    loop_kind: BackgroundLoopKind,
+    _ctx: &RequestContext,
+    cancel: &CancellationToken,
+) -> Result<tokio::sync::SemaphorePermit<'static>, RateLimitError> {
+    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
+        .with_label_values(&[loop_kind.as_static_str()])
+        .inc();
+    scopeguard::defer!(
+        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
+    );
+    tokio::select! {
+        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
+            match permit {
+                Ok(permit) => Ok(permit),
+                Err(_closed) => unreachable!("we never close the semaphore"),
+            }
+        },
+        _ = cancel.cancelled() => {
+            Err(RateLimitError::Cancelled)
+        }
+    }
+}
+
 /// Start per tenant background loops: compaction and gc.
 pub fn start_background_loops(
     tenant: &Arc<Tenant>,
@@ -116,7 +183,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             }
         };
 
-        warn_when_period_overrun(started_at.elapsed(), period, "compaction");
+        warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
 
         // Sleep
         if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -184,7 +251,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             }
         };
 
-        warn_when_period_overrun(started_at.elapsed(), period, "gc");
+        warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
 
         // Sleep
         if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -258,7 +325,11 @@ pub(crate) async fn random_init_delay(
 }
 
 /// Attention: the `task` and `period` become labels of a pageserver-wide prometheus metric.
-pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
+pub(crate) fn warn_when_period_overrun(
+    elapsed: Duration,
+    period: Duration,
+    task: BackgroundLoopKind,
+) {
     // Duration::ZERO will happen because it's the "disable [bgtask]" value.
     if elapsed >= period && period != Duration::ZERO {
         // humantime does no significant digits clamping whereas Duration's debug is a bit more
@@ -267,11 +338,11 @@
         warn!(
             ?elapsed,
             period = %humantime::format_duration(period),
-            task,
+            ?task,
            "task iteration took longer than the configured period"
         );
         crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
-            .with_label_values(&[task, &format!("{}", period.as_secs())])
+            .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
             .inc();
     }
 }
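For new call sites, the calling convention established by this patch looks like the following. This is a sketch of a hypothetical crate-internal caller: `one_iteration` and its body are illustrative, while the limiter signature and the `RateLimitError` handling come from the hunk above.

    use std::ops::ControlFlow;

    use tokio_util::sync::CancellationToken;

    use crate::context::RequestContext;
    use crate::tenant::tasks::{
        concurrent_background_tasks_rate_limit, BackgroundLoopKind, RateLimitError,
    };

    /// Hypothetical single iteration of a background loop; only the permit
    /// handling mirrors the patch.
    async fn one_iteration(ctx: &RequestContext, cancel: &CancellationToken) -> ControlFlow<()> {
        // Queue for one of the shared permits; bail out if shutdown is
        // requested while we are still waiting.
        let _permit =
            match concurrent_background_tasks_rate_limit(BackgroundLoopKind::Gc, ctx, cancel).await {
                Ok(permit) => permit,
                Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
            };

        // ... one iteration of actual work goes here, while the permit is held ...

        // Dropping `_permit` at the end of the scope hands the slot to the
        // next queued background loop.
        ControlFlow::Continue(())
    }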
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6776d16d60..250047823e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -44,6 +44,7 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
     DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
 };
+use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
@@ -684,37 +685,17 @@ impl Timeline {
     ) -> anyhow::Result<()> {
         const ROUNDS: usize = 2;
 
-        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-            once_cell::sync::Lazy::new(|| {
-                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
-                let permits = usize::max(
-                    1,
-                    // while a lot of the work is done on spawn_blocking, we still do
-                    // repartitioning in the async context. this should give leave us some workers
-                    // unblocked to be blocked on other work, hopefully easing any outside visible
-                    // effects of restarts.
-                    //
-                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
-                    // spawn_blocking.
-                    (total_threads * 3).checked_div(4).unwrap_or(0),
-                );
-                assert_ne!(permits, 0, "we will not be adding in permits later");
-                assert!(
-                    permits < total_threads,
-                    "need threads avail for shorter work"
-                );
-                tokio::sync::Semaphore::new(permits)
-            });
-
         // this wait probably never needs any "long time spent" logging, because we already nag if
         // compaction task goes over its period (20s) which is quite often in production.
-        let _permit = tokio::select! {
-            permit = CONCURRENT_COMPACTIONS.acquire() => {
-                permit
-            },
-            _ = cancel.cancelled() => {
-                return Ok(());
-            }
+        let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
+            BackgroundLoopKind::Compaction,
+            ctx,
+            cancel,
+        )
+        .await
+        {
+            Ok(permit) => permit,
+            Err(RateLimitError::Cancelled) => return Ok(()),
         };
 
         let last_record_lsn = self.get_last_record_lsn();
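One thing worth double-checking in the label change: the metric label values are now derived by strum's `snake_case` rendering instead of being written out by hand, so the derived strings must match the old literals exactly (e.g. "compaction", "consumption_metrics_collect_metrics"). A standalone sketch of that check, as a hypothetical test that is not part of the patch (it re-declares a trimmed-down enum and needs the `strum_macros` crate, which the patch already depends on):

    #[derive(Debug, Clone, Copy, strum_macros::IntoStaticStr)]
    #[strum(serialize_all = "snake_case")]
    enum BackgroundLoopKind {
        Compaction,
        Gc,
        ConsumptionMetricsCollectMetrics,
    }

    fn main() {
        // The same conversion `as_static_str()` performs via `self.into()`.
        let s: &'static str = BackgroundLoopKind::Compaction.into();
        assert_eq!(s, "compaction");
        let s: &'static str = BackgroundLoopKind::Gc.into();
        assert_eq!(s, "gc");
        let s: &'static str = BackgroundLoopKind::ConsumptionMetricsCollectMetrics.into();
        assert_eq!(s, "consumption_metrics_collect_metrics");
    }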
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 9bf31d85d4..38da993deb 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,6 +30,7 @@ use crate::{
     tenant::{
         config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
         storage_layer::PersistentLayer,
+        tasks::{BackgroundLoopKind, RateLimitError},
         timeline::EvictionError,
         LogicalSizeCalculationCause, Tenant,
     },
@@ -129,7 +130,11 @@ impl Timeline {
             ControlFlow::Continue(()) => (),
         }
         let elapsed = start.elapsed();
-        crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
+        crate::tenant::tasks::warn_when_period_overrun(
+            elapsed,
+            p.period,
+            BackgroundLoopKind::Eviction,
+        );
         crate::metrics::EVICTION_ITERATION_DURATION
             .get_metric_with_label_values(&[
                 &format!("{}", p.period.as_secs()),
@@ -150,6 +155,17 @@ impl Timeline {
     ) -> ControlFlow<()> {
         let now = SystemTime::now();
 
+        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
+            BackgroundLoopKind::Eviction,
+            ctx,
+            cancel,
+        )
+        .await
+        {
+            Ok(permit) => permit,
+            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
+        };
+
         // If we evict layers but keep cached values derived from those layers, then
         // we face a storm of on-demand downloads after pageserver restart.
         // The reason is that the restart empties the caches, and so, the values
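A final note on the permit budget in `CONCURRENT_BACKGROUND_TASKS`: `(total_threads * 3).checked_div(4)` can never actually return `None`, since the divisor is a non-zero constant, so the expression is simply three quarters of the background runtime's worker threads, clamped to at least one permit by `usize::max`. A sketch of the arithmetic as a pure function (hypothetical; the patch computes this inline in the `Lazy` initializer):

    fn background_task_permits(total_threads: usize) -> usize {
        // Three quarters of the runtime's workers, but always at least one
        // permit so the background loops can make progress at all.
        usize::max(1, total_threads * 3 / 4)
    }

    fn main() {
        // With 8 background workers (the value the in-code comment refers
        // to), 2 workers stay free for short-lived async work.
        assert_eq!(background_task_permits(8), 6);
        assert_eq!(background_task_permits(4), 3);
        // The clamp only kicks in for a single worker; note that in that case
        // the patch's `permits < total_threads` assertion would fail, so the
        // limiter effectively assumes at least two background workers.
        assert_eq!(background_task_permits(1), 1);
        println!("ok");
    }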