mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 05:22:56 +00:00
feat(pageserver): num of background job metrics (#10690)
## Problem We need a metrics to know what's going on in pageserver's background jobs. ## Summary of changes * Waiting tasks: task still waiting for the semaphore. * Running tasks: tasks doing their actual jobs. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Erik Grinaker <erik@neon.tech>
This commit is contained in:
@@ -2214,6 +2214,8 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
|||||||
pub struct BackgroundLoopSemaphoreMetrics {
|
pub struct BackgroundLoopSemaphoreMetrics {
|
||||||
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
|
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
|
||||||
durations: EnumMap<BackgroundLoopKind, Counter>,
|
durations: EnumMap<BackgroundLoopKind, Counter>,
|
||||||
|
waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
|
||||||
|
running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|
||||||
@@ -2234,6 +2236,20 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics
|
|||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
let waiting_tasks = register_int_gauge_vec!(
|
||||||
|
"pageserver_background_loop_semaphore_waiting_tasks",
|
||||||
|
"Number of background loop tasks waiting for semaphore",
|
||||||
|
&["task"],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let running_tasks = register_int_gauge_vec!(
|
||||||
|
"pageserver_background_loop_semaphore_running_tasks",
|
||||||
|
"Number of background loop tasks running concurrently",
|
||||||
|
&["task"],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
BackgroundLoopSemaphoreMetrics {
|
BackgroundLoopSemaphoreMetrics {
|
||||||
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||||
@@ -2243,29 +2259,69 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics
|
|||||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||||
durations.with_label_values(&[kind.into()])
|
durations.with_label_values(&[kind.into()])
|
||||||
})),
|
})),
|
||||||
|
waiting_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||||
|
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||||
|
waiting_tasks.with_label_values(&[kind.into()])
|
||||||
|
})),
|
||||||
|
running_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||||
|
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||||
|
running_tasks.with_label_values(&[kind.into()])
|
||||||
|
})),
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
impl BackgroundLoopSemaphoreMetrics {
|
impl BackgroundLoopSemaphoreMetrics {
|
||||||
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
|
/// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
|
||||||
struct Record<'a> {
|
/// semaphore is acquired, and drop it when the task completes or is cancelled.
|
||||||
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
pub(crate) fn record(
|
||||||
task: BackgroundLoopKind,
|
&self,
|
||||||
_counter_guard: metrics::IntCounterPairGuard,
|
task: BackgroundLoopKind,
|
||||||
start: Instant,
|
) -> BackgroundLoopSemaphoreMetricsRecorder {
|
||||||
}
|
BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
|
||||||
impl Drop for Record<'_> {
|
}
|
||||||
fn drop(&mut self) {
|
}
|
||||||
let elapsed = self.start.elapsed().as_secs_f64();
|
|
||||||
self.metrics.durations[self.task].inc_by(elapsed);
|
/// Records metrics for a background task.
|
||||||
}
|
pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
|
||||||
}
|
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
||||||
Record {
|
task: BackgroundLoopKind,
|
||||||
metrics: self,
|
start: Instant,
|
||||||
|
wait_counter_guard: Option<metrics::IntCounterPairGuard>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
|
||||||
|
/// Starts recording semaphore metrics, by recording wait time and incrementing
|
||||||
|
/// `wait_start_count` and `waiting_tasks`.
|
||||||
|
fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
|
||||||
|
metrics.waiting_tasks[task].inc();
|
||||||
|
Self {
|
||||||
|
metrics,
|
||||||
task,
|
task,
|
||||||
_counter_guard: self.counters[task].guard(),
|
|
||||||
start: Instant::now(),
|
start: Instant::now(),
|
||||||
|
wait_counter_guard: Some(metrics.counters[task].guard()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Signals that the semaphore has been acquired, and updates relevant metrics.
|
||||||
|
pub fn acquired(&mut self) {
|
||||||
|
self.wait_counter_guard.take().expect("already acquired");
|
||||||
|
self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64());
|
||||||
|
self.metrics.waiting_tasks[self.task].dec();
|
||||||
|
self.metrics.running_tasks[self.task].inc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
|
||||||
|
/// The task either completed or was cancelled.
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if self.wait_counter_guard.take().is_some() {
|
||||||
|
// Waiting.
|
||||||
|
self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64());
|
||||||
|
self.metrics.waiting_tasks[self.task].dec();
|
||||||
|
} else {
|
||||||
|
// Running.
|
||||||
|
self.metrics.running_tasks[self.task].dec();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ use std::sync::Arc;
|
|||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::metrics::TENANT_TASK_EVENTS;
|
use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||||
use crate::task_mgr;
|
use crate::task_mgr;
|
||||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||||
use crate::tenant::throttle::Stats;
|
use crate::tenant::throttle::Stats;
|
||||||
@@ -61,21 +61,32 @@ impl BackgroundLoopKind {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct BackgroundLoopSemaphorePermit<'a> {
|
||||||
|
_permit: tokio::sync::SemaphorePermit<'static>,
|
||||||
|
_recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Cancellation safe.
|
/// Cancellation safe.
|
||||||
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||||
loop_kind: BackgroundLoopKind,
|
loop_kind: BackgroundLoopKind,
|
||||||
_ctx: &RequestContext,
|
_ctx: &RequestContext,
|
||||||
) -> tokio::sync::SemaphorePermit<'static> {
|
) -> BackgroundLoopSemaphorePermit<'static> {
|
||||||
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
|
let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
|
||||||
|
|
||||||
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
|
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
|
||||||
pausable_failpoint!("initial-size-calculation-permit-pause");
|
pausable_failpoint!("initial-size-calculation-permit-pause");
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
|
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
|
||||||
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
|
let permit = CONCURRENT_BACKGROUND_TASKS
|
||||||
Ok(permit) => permit,
|
.acquire()
|
||||||
Err(_closed) => unreachable!("we never close the semaphore"),
|
.await
|
||||||
|
.expect("should never close");
|
||||||
|
recorder.acquired();
|
||||||
|
|
||||||
|
BackgroundLoopSemaphorePermit {
|
||||||
|
_permit: permit,
|
||||||
|
_recorder: recorder,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -30,8 +30,11 @@ use crate::{
|
|||||||
pgdatadir_mapping::CollectKeySpaceError,
|
pgdatadir_mapping::CollectKeySpaceError,
|
||||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||||
tenant::{
|
tenant::{
|
||||||
size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
|
size::CalculateSyntheticSizeError,
|
||||||
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
|
storage_layer::LayerVisibilityHint,
|
||||||
|
tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit},
|
||||||
|
timeline::EvictionError,
|
||||||
|
LogicalSizeCalculationCause, Tenant,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -330,7 +333,7 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
|
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
|
||||||
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
|
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||||
BackgroundLoopKind::Eviction,
|
BackgroundLoopKind::Eviction,
|
||||||
ctx,
|
ctx,
|
||||||
@@ -374,7 +377,7 @@ impl Timeline {
|
|||||||
p: &EvictionPolicyLayerAccessThreshold,
|
p: &EvictionPolicyLayerAccessThreshold,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
gate: &GateGuard,
|
gate: &GateGuard,
|
||||||
permit: tokio::sync::SemaphorePermit<'static>,
|
permit: BackgroundLoopSemaphorePermit<'static>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> ControlFlow<()> {
|
) -> ControlFlow<()> {
|
||||||
if !self.tenant_shard_id.is_shard_zero() {
|
if !self.tenant_shard_id.is_shard_zero() {
|
||||||
|
|||||||
Reference in New Issue
Block a user