Re-access layers before threshold eviction (#3867)

To avoid re-downloading evicted files on restart, re-compute logical size and partitioning before each threshold based eviction run. Cc: #3802 Co-authored-by: Christian Schwarz <christian@neon.tech>
2026-05-30 03:20:36 +00:00 · 2023-03-22 16:26:27 +02:00
parent 14a40c9ca6
commit 6033dfdf4a
1 changed files with 74 additions and 7 deletions
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -1,5 +1,18 @@
-//! The per-timeline layer eviction task.
-
+//! The per-timeline layer eviction task, which evicts data which has not been accessed for more
+//! than a given threshold.
+//!
+//! Data includes all kinds of caches, namely:
+//! - (in-memory layers)
+//! - on-demand downloaded layer files on disk
+//! - (cached layer file pages)
+//! - derived data from layer file contents, namely:
+//!     - initial logical size
+//!     - partitioning
+//!     - (other currently missing unknowns)
+//!
+//! Items with parentheses are not (yet) touched by this task.
+//!
+//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
 use std::{
    ops::ControlFlow,
    sync::Arc,
@@ -12,6 +25,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn};

 use crate::{
+    context::{DownloadBehavior, RequestContext},
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -54,9 +68,10 @@ impl Timeline {
            }
        }

+        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
        loop {
            let policy = self.get_eviction_policy();
-            let cf = self.eviction_iteration(&policy, cancel.clone()).await;
+            let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;

            match cf {
                ControlFlow::Break(()) => break,
@@ -77,7 +92,8 @@ impl Timeline {
    async fn eviction_iteration(
        self: &Arc<Self>,
        policy: &EvictionPolicy,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
    ) -> ControlFlow<(), Instant> {
        debug!("eviction iteration: {policy:?}");
        match policy {
@@ -87,7 +103,7 @@ impl Timeline {
            }
            EvictionPolicy::LayerAccessThreshold(p) => {
                let start = Instant::now();
-                match self.eviction_iteration_threshold(p, cancel).await {
+                match self.eviction_iteration_threshold(p, cancel, ctx).await {
                    ControlFlow::Break(()) => return ControlFlow::Break(()),
                    ControlFlow::Continue(()) => (),
                }
@@ -101,7 +117,8 @@ impl Timeline {
    async fn eviction_iteration_threshold(
        self: &Arc<Self>,
        p: &EvictionPolicyLayerAccessThreshold,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

@@ -114,6 +131,20 @@ impl Timeline {
            not_evictable: usize,
            skipped_for_shutdown: usize,
        }
+
+        // what we want is to invalidate any caches which haven't been accessed for `p.threshold`,
+        // but we cannot actually do it for current limitations except by restarting pageserver. we
+        // just recompute the values which would be recomputed on startup.
+        //
+        // for active tenants this will likely materialized page cache or in-memory layers. for
+        // inactive tenants it will refresh the last_access timestamps so that we will not evict
+        // and re-download on restart these layers.
+        self.refresh_layers_required_in_restart(cancel, ctx).await;
+
+        if cancel.is_cancelled() {
+            return ControlFlow::Break(());
+        }
+
        let mut stats = EvictionStats::default();
        // Gather layers for eviction.
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
@@ -174,7 +205,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates[..], cancel)
+            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
            .await
        {
            Err(pre_err) => {
@@ -216,4 +247,40 @@ impl Timeline {
        }
        ControlFlow::Continue(())
    }
+
+    /// Recompute the values which would cause on-demand downloads during restart.
+    async fn refresh_layers_required_in_restart(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) {
+        let lsn = self.get_last_record_lsn();
+
+        // imitiate on-restart initial logical size
+        let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await;
+
+        match &size {
+            Ok(_size) => {
+                // good, don't log it to avoid confusion
+            }
+            Err(_) => {
+                // we have known issues for which we already log this on consumption metrics,
+                // gc, and compaction. leave logging out for now.
+                //
+                // https://github.com/neondatabase/neon/issues/2539
+            }
+        }
+
+        // imitiate repartiting on first compactation
+        if let Err(e) = self.collect_keyspace(lsn, ctx).await {
+            // if this failed, we probably failed logical size because these use the same keys
+            if size.is_err() {
+                // ignore, see above comment
+            } else {
+                warn!(
+                    "failed to collect keyspace but succeeded in calculating logical size: {e:#}"
+                );
+            }
+        }
+    }
 }