pageserver: add separate, disabled compaction semaphore (#10716)

## Problem

L0 compaction can get starved by other background tasks. It needs to be
responsive to avoid read amp blowing up during heavy write workloads.

Touches #10694.

## Summary of changes

Add a separate semaphore for compaction, configurable via
`use_compaction_semaphore` (disabled by default). This is primarily for
testing in staging; it needs further work (in particular to split
image/L0 compaction jobs) before it can be enabled.
This commit is contained in:
Erik Grinaker
2025-02-07 16:11:31 +01:00
committed by GitHub
parent f5243992fa
commit d6e87a3a9c
5 changed files with 59 additions and 35 deletions

View File

@@ -94,6 +94,7 @@ pub struct ConfigToml {
pub ondemand_download_behavior_treat_error_as_warn: bool,
#[serde(with = "humantime_serde")]
pub background_task_maximum_delay: Duration,
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<reqwest::Url>,
pub control_plane_api_token: Option<String>,
pub control_plane_emergency_mode: bool,
@@ -470,6 +471,7 @@ impl Default for ConfigToml {
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
)
.unwrap()),
use_compaction_semaphore: false,
control_plane_api: (None),
control_plane_api_token: (None),

View File

@@ -140,6 +140,10 @@ pub struct PageServerConf {
/// not terrible.
pub background_task_maximum_delay: Duration,
/// If true, use a separate semaphore for compaction tasks instead of the common background task
/// semaphore. Defaults to false.
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<Url>,
/// JWT token for use with the control plane API.
@@ -332,6 +336,7 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_api_token,
control_plane_emergency_mode,
@@ -385,6 +390,7 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_emergency_mode,
heatmap_upload_concurrency,

View File

@@ -1,47 +1,56 @@
//! This module contains functions to serve per-tenant background processes,
//! such as compaction and GC
use std::cmp::max;
use std::ops::ControlFlow;
use std::str::FromStr;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use once_cell::sync::Lazy;
use rand::Rng;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use tracing::*;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::compaction::CompactionOutcome;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use once_cell::sync::Lazy;
use rand::Rng;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::rate_limit::RateLimit;
use utils::{backoff, completion, pausable_failpoint};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
// repartitioning in the async context. this should give leave us some workers
// unblocked to be blocked on other work, hopefully easing any outside visible
// effects of restarts.
//
// 6/8 is a guess; previously we ran with unlimited 8 and more from
// spawn_blocking.
(total_threads * 3).checked_div(4).unwrap_or(0),
);
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(
permits < total_threads,
"need threads avail for shorter work"
);
tokio::sync::Semaphore::new(permits)
});
/// Semaphore limiting concurrent background tasks (across all tenants).
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(permits < total_threads, "need threads for other work");
Semaphore::new(permits)
});
/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by
/// default, see `use_compaction_semaphore`.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
///
/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
/// to avoid high read amp during heavy write workloads.
///
/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks.
/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction.
static CONCURRENT_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(permits < total_threads, "need threads for other work");
Semaphore::new(permits)
});
#[derive(
Debug,
@@ -73,8 +82,9 @@ pub struct BackgroundLoopSemaphorePermit<'a> {
/// Cancellation safe.
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
loop_kind: BackgroundLoopKind,
use_compaction_semaphore: bool,
) -> BackgroundLoopSemaphorePermit<'static> {
// TODO: use a lower threshold and remove the pacer once we resolve some blockage.
const WARN_THRESHOLD: Duration = Duration::from_secs(600);
@@ -88,10 +98,13 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
}
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
let permit = CONCURRENT_BACKGROUND_TASKS
.acquire()
.await
.expect("should never close");
let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore {
CONCURRENT_COMPACTION_TASKS.acquire().await
} else {
assert!(!use_compaction_semaphore);
CONCURRENT_BACKGROUND_TASKS.acquire().await
}
.expect("should never close");
let waited = recorder.acquired();
if waited >= WARN_THRESHOLD {

View File

@@ -1719,8 +1719,9 @@ impl Timeline {
let guard = self.compaction_lock.lock().await;
let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Compaction,
ctx,
BackgroundLoopKind::Compaction,
self.conf.use_compaction_semaphore,
)
.await;
@@ -3057,8 +3058,9 @@ impl Timeline {
let skip_concurrency_limiter = &skip_concurrency_limiter;
async move {
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::InitialLogicalSizeCalculation,
background_ctx,
BackgroundLoopKind::InitialLogicalSizeCalculation,
false,
);
use crate::metrics::initial_logical_size::StartCircumstances;

View File

@@ -335,8 +335,9 @@ impl Timeline {
ctx: &RequestContext,
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
BackgroundLoopKind::Eviction,
false,
);
tokio::select! {