diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 2aad0ef0f3..666768ff87 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -1,5 +1,18 @@ -//! The per-timeline layer eviction task. - +//! The per-timeline layer eviction task, which evicts data which has not been accessed for more +//! than a given threshold. +//! +//! Data includes all kinds of caches, namely: +//! - (in-memory layers) +//! - on-demand downloaded layer files on disk +//! - (cached layer file pages) +//! - derived data from layer file contents, namely: +//! - initial logical size +//! - partitioning +//! - (other currently missing unknowns) +//! +//! Items with parentheses are not (yet) touched by this task. +//! +//! See write-up on restart on-demand download spike: use std::{ ops::ControlFlow, sync::Arc, @@ -12,6 +25,7 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; use crate::{ + context::{DownloadBehavior, RequestContext}, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, @@ -54,9 +68,10 @@ impl Timeline { } } + let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); loop { let policy = self.get_eviction_policy(); - let cf = self.eviction_iteration(&policy, cancel.clone()).await; + let cf = self.eviction_iteration(&policy, &cancel, &ctx).await; match cf { ControlFlow::Break(()) => break, @@ -77,7 +92,8 @@ impl Timeline { async fn eviction_iteration( self: &Arc, policy: &EvictionPolicy, - cancel: CancellationToken, + cancel: &CancellationToken, + ctx: &RequestContext, ) -> ControlFlow<(), Instant> { debug!("eviction iteration: {policy:?}"); match policy { @@ -87,7 +103,7 @@ impl Timeline { } EvictionPolicy::LayerAccessThreshold(p) => { let start = Instant::now(); - match self.eviction_iteration_threshold(p, cancel).await { + match self.eviction_iteration_threshold(p, cancel, ctx).await { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } @@ -101,7 +117,8 @@ impl Timeline { async fn eviction_iteration_threshold( self: &Arc, p: &EvictionPolicyLayerAccessThreshold, - cancel: CancellationToken, + cancel: &CancellationToken, + ctx: &RequestContext, ) -> ControlFlow<()> { let now = SystemTime::now(); @@ -114,6 +131,20 @@ impl Timeline { not_evictable: usize, skipped_for_shutdown: usize, } + + // what we want is to invalidate any caches which haven't been accessed for `p.threshold`, + // but we cannot actually do it for current limitations except by restarting pageserver. we + // just recompute the values which would be recomputed on startup. + // + // for active tenants this will likely materialized page cache or in-memory layers. for + // inactive tenants it will refresh the last_access timestamps so that we will not evict + // and re-download on restart these layers. + self.refresh_layers_required_in_restart(cancel, ctx).await; + + if cancel.is_cancelled() { + return ControlFlow::Break(()); + } + let mut stats = EvictionStats::default(); // Gather layers for eviction. // NB: all the checks can be invalidated as soon as we release the layer map lock. @@ -174,7 +205,7 @@ impl Timeline { }; let results = match self - .evict_layer_batch(remote_client, &candidates[..], cancel) + .evict_layer_batch(remote_client, &candidates[..], cancel.clone()) .await { Err(pre_err) => { @@ -216,4 +247,40 @@ impl Timeline { } ControlFlow::Continue(()) } + + /// Recompute the values which would cause on-demand downloads during restart. + async fn refresh_layers_required_in_restart( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) { + let lsn = self.get_last_record_lsn(); + + // imitiate on-restart initial logical size + let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await; + + match &size { + Ok(_size) => { + // good, don't log it to avoid confusion + } + Err(_) => { + // we have known issues for which we already log this on consumption metrics, + // gc, and compaction. leave logging out for now. + // + // https://github.com/neondatabase/neon/issues/2539 + } + } + + // imitiate repartiting on first compactation + if let Err(e) = self.collect_keyspace(lsn, ctx).await { + // if this failed, we probably failed logical size because these use the same keys + if size.is_err() { + // ignore, see above comment + } else { + warn!( + "failed to collect keyspace but succeeded in calculating logical size: {e:#}" + ); + } + } + } }