mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 02:20:42 +00:00
## Problem L0 compaction frequently gets starved out by other background tasks and image/GC compaction. L0 compaction must be responsive to keep read amplification under control. Touches #10694. Resolves #10689. ## Summary of changes Use a separate semaphore for the L0-only compaction pass. * Add a `CONCURRENT_L0_COMPACTION_TASKS` semaphore and `BackgroundLoopKind::L0Compaction`. * Add a setting `compaction_l0_semaphore` (default off via `compaction_l0_first`). * Use the L0 semaphore when doing an `OnlyL0Compaction` pass. * Use the background semaphore when doing a regular compaction pass (which includes an initial L0 pass). * While waiting for the background semaphore, yield for L0 compaction if triggered. * Add `CompactFlags::NoYield` to disable L0 yielding, and set it for the HTTP API route. * Remove the old `use_compaction_semaphore` setting and compaction-scoped semaphore. * Remove the warning when waiting for a semaphore; it's noisy and we have metrics.
572 lines
23 KiB
Rust
572 lines
23 KiB
Rust
//! The per-timeline layer eviction task, which evicts data which has not been accessed for more
|
|
//! than a given threshold.
|
|
//!
|
|
//! Data includes all kinds of caches, namely:
|
|
//! - (in-memory layers)
|
|
//! - on-demand downloaded layer files on disk
|
|
//! - (cached layer file pages)
|
|
//! - derived data from layer file contents, namely:
|
|
//! - initial logical size
|
|
//! - partitioning
|
|
//! - (other currently missing unknowns)
|
|
//!
|
|
//! Items with parentheses are not (yet) touched by this task.
|
|
//!
|
|
//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
|
|
use std::{
|
|
collections::HashMap,
|
|
ops::ControlFlow,
|
|
sync::Arc,
|
|
time::{Duration, SystemTime},
|
|
};
|
|
|
|
use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
|
|
use tokio::time::Instant;
|
|
use tokio_util::sync::CancellationToken;
|
|
use tracing::{debug, info, info_span, instrument, warn, Instrument};
|
|
|
|
use crate::{
|
|
context::{DownloadBehavior, RequestContext},
|
|
pgdatadir_mapping::CollectKeySpaceError,
|
|
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
|
tenant::{
|
|
size::CalculateSyntheticSizeError,
|
|
storage_layer::LayerVisibilityHint,
|
|
tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit},
|
|
timeline::EvictionError,
|
|
LogicalSizeCalculationCause, Tenant,
|
|
},
|
|
};
|
|
|
|
use utils::{completion, sync::gate::GateGuard};
|
|
|
|
use super::Timeline;
|
|
|
|
#[derive(Default)]
|
|
pub struct EvictionTaskTimelineState {
|
|
last_layer_access_imitation: Option<tokio::time::Instant>,
|
|
}
|
|
|
|
#[derive(Default)]
|
|
pub struct EvictionTaskTenantState {
|
|
last_layer_access_imitation: Option<Instant>,
|
|
}
|
|
|
|
impl Timeline {
|
|
pub(super) fn launch_eviction_task(
|
|
self: &Arc<Self>,
|
|
parent: Arc<Tenant>,
|
|
background_tasks_can_start: Option<&completion::Barrier>,
|
|
) {
|
|
let self_clone = Arc::clone(self);
|
|
let background_tasks_can_start = background_tasks_can_start.cloned();
|
|
task_mgr::spawn(
|
|
BACKGROUND_RUNTIME.handle(),
|
|
TaskKind::Eviction,
|
|
self.tenant_shard_id,
|
|
Some(self.timeline_id),
|
|
&format!(
|
|
"layer eviction for {}/{}",
|
|
self.tenant_shard_id, self.timeline_id
|
|
),
|
|
async move {
|
|
tokio::select! {
|
|
_ = self_clone.cancel.cancelled() => { return Ok(()); }
|
|
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
|
};
|
|
|
|
self_clone.eviction_task(parent).await;
|
|
Ok(())
|
|
},
|
|
);
|
|
}
|
|
|
|
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
|
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
|
|
// acquire the gate guard only once within a useful span
|
|
let Ok(guard) = self.gate.enter() else {
|
|
return;
|
|
};
|
|
|
|
{
|
|
let policy = self.get_eviction_policy();
|
|
let period = match policy {
|
|
EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
|
|
EvictionPolicy::OnlyImitiate(lat) => lat.period,
|
|
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
|
};
|
|
if sleep_random(period, &self.cancel).await.is_err() {
|
|
return;
|
|
}
|
|
}
|
|
|
|
let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
|
|
loop {
|
|
let policy = self.get_eviction_policy();
|
|
let cf = self
|
|
.eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
|
|
.await;
|
|
|
|
match cf {
|
|
ControlFlow::Break(()) => break,
|
|
ControlFlow::Continue(sleep_until) => {
|
|
if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
|
|
.await
|
|
.is_ok()
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
|
|
async fn eviction_iteration(
|
|
self: &Arc<Self>,
|
|
tenant: &Tenant,
|
|
policy: &EvictionPolicy,
|
|
cancel: &CancellationToken,
|
|
gate: &GateGuard,
|
|
ctx: &RequestContext,
|
|
) -> ControlFlow<(), Instant> {
|
|
debug!("eviction iteration: {policy:?}");
|
|
let start = Instant::now();
|
|
let (period, threshold) = match policy {
|
|
EvictionPolicy::NoEviction => {
|
|
// check again in 10 seconds; XXX config watch mechanism
|
|
return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
|
|
}
|
|
EvictionPolicy::LayerAccessThreshold(p) => {
|
|
match self
|
|
.eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
|
|
.await
|
|
{
|
|
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
|
ControlFlow::Continue(()) => (),
|
|
}
|
|
(p.period, p.threshold)
|
|
}
|
|
EvictionPolicy::OnlyImitiate(p) => {
|
|
if self
|
|
.imitiate_only(tenant, p, cancel, gate, ctx)
|
|
.await
|
|
.is_break()
|
|
{
|
|
return ControlFlow::Break(());
|
|
}
|
|
(p.period, p.threshold)
|
|
}
|
|
};
|
|
|
|
let elapsed = start.elapsed();
|
|
crate::tenant::tasks::warn_when_period_overrun(
|
|
elapsed,
|
|
period,
|
|
BackgroundLoopKind::Eviction,
|
|
);
|
|
// FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
|
|
// don't think that is a relevant fear however, and regardless the imitation should be the
|
|
// most costly part.
|
|
crate::metrics::EVICTION_ITERATION_DURATION
|
|
.get_metric_with_label_values(&[
|
|
&format!("{}", period.as_secs()),
|
|
&format!("{}", threshold.as_secs()),
|
|
])
|
|
.unwrap()
|
|
.observe(elapsed.as_secs_f64());
|
|
|
|
ControlFlow::Continue(start + period)
|
|
}
|
|
|
|
async fn eviction_iteration_threshold(
|
|
self: &Arc<Self>,
|
|
tenant: &Tenant,
|
|
p: &EvictionPolicyLayerAccessThreshold,
|
|
cancel: &CancellationToken,
|
|
gate: &GateGuard,
|
|
ctx: &RequestContext,
|
|
) -> ControlFlow<()> {
|
|
let now = SystemTime::now();
|
|
|
|
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
|
|
|
|
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
|
|
.await?;
|
|
|
|
#[derive(Debug, Default)]
|
|
struct EvictionStats {
|
|
candidates: usize,
|
|
evicted: usize,
|
|
errors: usize,
|
|
not_evictable: usize,
|
|
timeouts: usize,
|
|
#[allow(dead_code)]
|
|
skipped_for_shutdown: usize,
|
|
}
|
|
|
|
let mut stats = EvictionStats::default();
|
|
// Gather layers for eviction.
|
|
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
|
// We don't want to hold the layer map lock during eviction.
|
|
|
|
// So, we just need to deal with this.
|
|
|
|
let mut js = tokio::task::JoinSet::new();
|
|
{
|
|
let guard = self.layers.read().await;
|
|
|
|
guard
|
|
.likely_resident_layers()
|
|
.filter(|layer| {
|
|
let last_activity_ts = layer.latest_activity();
|
|
|
|
let no_activity_for = match now.duration_since(last_activity_ts) {
|
|
Ok(d) => d,
|
|
Err(_e) => {
|
|
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
|
// happen if there is an access between us getting `now`, and us getting
|
|
// the access stats from the layer.
|
|
//
|
|
// The other reason why it can happen is system clock skew because
|
|
// SystemTime::now() is not monotonic, so, even if there is no access
|
|
// to the layer after we get `now` at the beginning of this function,
|
|
// it could be that `now` < `last_activity_ts`.
|
|
//
|
|
// To distinguish the cases, we would need to record `Instant`s in the
|
|
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
|
// values in the access stats would need to be `Instant`'s, and hence
|
|
// they would be meaningless outside of the pageserver process.
|
|
// At the time of writing, the trade-off is that access stats are more
|
|
// valuable than detecting clock skew.
|
|
return false;
|
|
}
|
|
};
|
|
|
|
match layer.visibility() {
|
|
LayerVisibilityHint::Visible => {
|
|
// Usual case: a visible layer might be read any time, and we will keep it
|
|
// resident until it hits our configured TTL threshold.
|
|
no_activity_for > p.threshold
|
|
}
|
|
LayerVisibilityHint::Covered => {
|
|
// Covered layers: this is probably a layer that was recently covered by
|
|
// an image layer during compaction. We don't evict it immediately, but
|
|
// it doesn't stay resident for the full `threshold`: we just keep it
|
|
// for a shorter time in case
|
|
// - it is used for Timestamp->LSN lookups
|
|
// - a new branch is created in recent history which will read this layer
|
|
no_activity_for > p.period
|
|
}
|
|
}
|
|
})
|
|
.cloned()
|
|
.for_each(|layer| {
|
|
js.spawn(async move {
|
|
layer
|
|
.evict_and_wait(std::time::Duration::from_secs(5))
|
|
.await
|
|
});
|
|
stats.candidates += 1;
|
|
});
|
|
};
|
|
|
|
let join_all = async move {
|
|
while let Some(next) = js.join_next().await {
|
|
match next {
|
|
Ok(Ok(())) => stats.evicted += 1,
|
|
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
|
stats.not_evictable += 1;
|
|
}
|
|
Ok(Err(EvictionError::Timeout)) => {
|
|
stats.timeouts += 1;
|
|
}
|
|
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
|
Err(je) if je.is_panic() => {
|
|
/* already logged */
|
|
stats.errors += 1;
|
|
}
|
|
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
|
|
}
|
|
}
|
|
stats
|
|
};
|
|
|
|
tokio::select! {
|
|
stats = join_all => {
|
|
if stats.candidates == stats.not_evictable {
|
|
debug!(stats=?stats, "eviction iteration complete");
|
|
} else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
|
|
// reminder: timeouts are not eviction cancellations
|
|
warn!(stats=?stats, "eviction iteration complete");
|
|
} else {
|
|
info!(stats=?stats, "eviction iteration complete");
|
|
}
|
|
}
|
|
_ = cancel.cancelled() => {
|
|
// just drop the joinset to "abort"
|
|
}
|
|
}
|
|
|
|
ControlFlow::Continue(())
|
|
}
|
|
|
|
/// Like `eviction_iteration_threshold`, but without any eviction. Eviction will be done by
|
|
/// disk usage based eviction task.
|
|
async fn imitiate_only(
|
|
self: &Arc<Self>,
|
|
tenant: &Tenant,
|
|
p: &EvictionPolicyLayerAccessThreshold,
|
|
cancel: &CancellationToken,
|
|
gate: &GateGuard,
|
|
ctx: &RequestContext,
|
|
) -> ControlFlow<()> {
|
|
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
|
|
|
|
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
|
|
.await
|
|
}
|
|
|
|
async fn acquire_imitation_permit(
|
|
&self,
|
|
cancel: &CancellationToken,
|
|
ctx: &RequestContext,
|
|
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
|
|
let acquire_permit =
|
|
crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx);
|
|
|
|
tokio::select! {
|
|
permit = acquire_permit => ControlFlow::Continue(permit),
|
|
_ = cancel.cancelled() => ControlFlow::Break(()),
|
|
_ = self.cancel.cancelled() => ControlFlow::Break(()),
|
|
}
|
|
}
|
|
|
|
/// If we evict layers but keep cached values derived from those layers, then
|
|
/// we face a storm of on-demand downloads after pageserver restart.
|
|
/// The reason is that the restart empties the caches, and so, the values
|
|
/// need to be re-computed by accessing layers, which we evicted while the
|
|
/// caches were filled.
|
|
///
|
|
/// Solutions here would be one of the following:
|
|
/// 1. Have a persistent cache.
|
|
/// 2. Count every access to a cached value to the access stats of all layers
|
|
/// that were accessed to compute the value in the first place.
|
|
/// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
|
|
/// get re-computed from layers, thereby counting towards layer access stats.
|
|
/// 4. Make the eviction task imitate the layer accesses that typically hit caches.
|
|
///
|
|
/// We follow approach (4) here because in Neon prod deployment:
|
|
/// - page cache is quite small => high churn => low hit rate
|
|
/// => eviction gets correct access stats
|
|
/// - value-level caches such as logical size & repatition have a high hit rate,
|
|
/// especially for inactive tenants
|
|
/// => eviction sees zero accesses for these
|
|
/// => they cause the on-demand download storm on pageserver restart
|
|
///
|
|
/// We should probably move to persistent caches in the future, or avoid
|
|
/// having inactive tenants attached to pageserver in the first place.
|
|
#[instrument(skip_all)]
|
|
async fn imitate_layer_accesses(
|
|
&self,
|
|
tenant: &Tenant,
|
|
p: &EvictionPolicyLayerAccessThreshold,
|
|
cancel: &CancellationToken,
|
|
gate: &GateGuard,
|
|
permit: BackgroundLoopSemaphorePermit<'static>,
|
|
ctx: &RequestContext,
|
|
) -> ControlFlow<()> {
|
|
if !self.tenant_shard_id.is_shard_zero() {
|
|
// Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
|
|
// for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
|
|
// skip imitating logical size accesses for eviction purposes.
|
|
return ControlFlow::Continue(());
|
|
}
|
|
|
|
let mut state = self.eviction_task_timeline_state.lock().await;
|
|
|
|
// Only do the imitate_layer accesses approximately as often as the threshold. A little
|
|
// more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
|
|
let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
|
|
|
|
match state.last_layer_access_imitation {
|
|
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
|
_ => {
|
|
self.imitate_timeline_cached_layer_accesses(gate, ctx).await;
|
|
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
|
|
}
|
|
}
|
|
drop(state);
|
|
|
|
if cancel.is_cancelled() {
|
|
return ControlFlow::Break(());
|
|
}
|
|
|
|
// This task is timeline-scoped, but the synthetic size calculation is tenant-scoped.
|
|
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
|
// The others wait until the calculation is done so that they take into account the
|
|
// imitated accesses that the winner made.
|
|
let (mut state, _permit) = {
|
|
if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
|
|
(locked, permit)
|
|
} else {
|
|
// we might need to wait for a long time here in case of pathological synthetic
|
|
// size calculation performance
|
|
drop(permit);
|
|
let locked = tokio::select! {
|
|
locked = tenant.eviction_task_tenant_state.lock() => locked,
|
|
_ = self.cancel.cancelled() => {
|
|
return ControlFlow::Break(())
|
|
},
|
|
_ = cancel.cancelled() => {
|
|
return ControlFlow::Break(())
|
|
}
|
|
};
|
|
// then reacquire -- this will be bad if there is a lot of traffic, but because we
|
|
// released the permit, the overall latency will be much better.
|
|
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
|
|
(locked, permit)
|
|
}
|
|
};
|
|
match state.last_layer_access_imitation {
|
|
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
|
_ => {
|
|
self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
|
|
.await;
|
|
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
|
}
|
|
}
|
|
drop(state);
|
|
|
|
if cancel.is_cancelled() {
|
|
return ControlFlow::Break(());
|
|
}
|
|
|
|
ControlFlow::Continue(())
|
|
}
|
|
|
|
/// Recompute the values which would cause on-demand downloads during restart.
|
|
#[instrument(skip_all)]
|
|
async fn imitate_timeline_cached_layer_accesses(
|
|
&self,
|
|
guard: &GateGuard,
|
|
ctx: &RequestContext,
|
|
) {
|
|
let lsn = self.get_last_record_lsn();
|
|
|
|
// imitiate on-restart initial logical size
|
|
let size = self
|
|
.calculate_logical_size(
|
|
lsn,
|
|
LogicalSizeCalculationCause::EvictionTaskImitation,
|
|
guard,
|
|
ctx,
|
|
)
|
|
.instrument(info_span!("calculate_logical_size"))
|
|
.await;
|
|
|
|
match &size {
|
|
Ok(_size) => {
|
|
// good, don't log it to avoid confusion
|
|
}
|
|
Err(_) => {
|
|
// we have known issues for which we already log this on consumption metrics,
|
|
// gc, and compaction. leave logging out for now.
|
|
//
|
|
// https://github.com/neondatabase/neon/issues/2539
|
|
}
|
|
}
|
|
|
|
// imitiate repartiting on first compactation
|
|
if let Err(e) = self
|
|
.collect_keyspace(lsn, ctx)
|
|
.instrument(info_span!("collect_keyspace"))
|
|
.await
|
|
{
|
|
// if this failed, we probably failed logical size because these use the same keys
|
|
if size.is_err() {
|
|
// ignore, see above comment
|
|
} else {
|
|
match e {
|
|
CollectKeySpaceError::Cancelled => {
|
|
// Shutting down, ignore
|
|
}
|
|
err => {
|
|
warn!(
|
|
"failed to collect keyspace but succeeded in calculating logical size: {err:#}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Imitate the synthetic size calculation done by the consumption_metrics module.
|
|
#[instrument(skip_all)]
|
|
async fn imitate_synthetic_size_calculation_worker(
|
|
&self,
|
|
tenant: &Tenant,
|
|
cancel: &CancellationToken,
|
|
ctx: &RequestContext,
|
|
) {
|
|
if self.conf.metric_collection_endpoint.is_none() {
|
|
// We don't start the consumption metrics task if this is not set in the config.
|
|
// So, no need to imitate the accesses in that case.
|
|
return;
|
|
}
|
|
|
|
// The consumption metrics are collected on a per-tenant basis, by a single
|
|
// global background loop.
|
|
// It limits the number of synthetic size calculations using the global
|
|
// `concurrent_tenant_size_logical_size_queries` semaphore to not overload
|
|
// the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs).
|
|
//
|
|
// If we used that same semaphore here, then we'd compete for the
|
|
// same permits, which may impact timeliness of consumption metrics.
|
|
// That is a no-go, as consumption metrics are much more important
|
|
// than what we do here.
|
|
//
|
|
// So, we have a separate semaphore, initialized to the same
|
|
// number of permits as the `concurrent_tenant_size_logical_size_queries`.
|
|
// In the worst, we would have twice the amount of concurrenct size calculations.
|
|
// But in practice, the `p.threshold` >> `consumption metric interval`, and
|
|
// we spread out the eviction task using `random_init_delay`.
|
|
// So, the chance of the worst case is quite low in practice.
|
|
// It runs as a per-tenant task, but the eviction_task.rs is per-timeline.
|
|
// So, we must coordinate with other with other eviction tasks of this tenant.
|
|
let limit = self
|
|
.conf
|
|
.eviction_task_immitated_concurrent_logical_size_queries
|
|
.inner();
|
|
|
|
let mut throwaway_cache = HashMap::new();
|
|
let gather = crate::tenant::size::gather_inputs(
|
|
tenant,
|
|
limit,
|
|
None,
|
|
&mut throwaway_cache,
|
|
LogicalSizeCalculationCause::EvictionTaskImitation,
|
|
cancel,
|
|
ctx,
|
|
)
|
|
.instrument(info_span!("gather_inputs"));
|
|
|
|
tokio::select! {
|
|
_ = cancel.cancelled() => {}
|
|
gather_result = gather => {
|
|
match gather_result {
|
|
Ok(_) => {},
|
|
// It can happen sometimes that we hit this instead of the cancellation token firing above
|
|
Err(CalculateSyntheticSizeError::Cancelled) => {}
|
|
Err(e) => {
|
|
// We don't care about the result, but, if it failed, we should log it,
|
|
// since consumption metric might be hitting the cached value and
|
|
// thus not encountering this error.
|
|
warn!("failed to imitate synthetic size calculation accesses: {e:#}")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|