fix(pageserver): ensure GC computes time cutoff using the same start time (#10193)

## Problem close https://github.com/neondatabase/neon/issues/10192 ## Summary of changes * `find_gc_time_cutoff` takes `now` parameter so that all branches compute the cutoff based on the same start time, avoiding races. * gc-compaction uses a single `get_gc_compaction_watermark` function to get the safe LSN to compact. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2025-12-22 21:59:59 +00:00 · 2025-01-06 14:29:18 -05:00
parent 95f1920231
commit 4a6556e269
3 changed files with 28 additions and 5 deletions
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4488,13 +4488,17 @@ impl Tenant {
        let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
            HashMap::with_capacity(timelines.len());

+        // Ensures all timelines use the same start time when computing the time cutoff.
+        let now_ts_for_pitr_calc = SystemTime::now();
        for timeline in timelines.iter() {
            let cutoff = timeline
                .get_last_record_lsn()
                .checked_sub(horizon)
                .unwrap_or(Lsn(0));

-            let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
+            let cutoffs = timeline
+                .find_gc_cutoffs(now_ts_for_pitr_calc, cutoff, pitr, cancel, ctx)
+                .await?;
            let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
            assert!(old.is_none());
        }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4859,6 +4859,7 @@ impl Timeline {

    async fn find_gc_time_cutoff(
        &self,
+        now: SystemTime,
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
@@ -4866,7 +4867,6 @@ impl Timeline {
        debug_assert_current_span_has_tenant_and_timeline_id();
        if self.shard_identity.is_shard_zero() {
            // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself
-            let now = SystemTime::now();
            let time_range = if pitr == Duration::ZERO {
                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
            } else {
@@ -4952,6 +4952,7 @@ impl Timeline {
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
    pub(super) async fn find_gc_cutoffs(
        &self,
+        now: SystemTime,
        space_cutoff: Lsn,
        pitr: Duration,
        cancel: &CancellationToken,
@@ -4979,7 +4980,7 @@ impl Timeline {
        // - if PITR interval is set, then this is our cutoff.
        // - if PITR interval is not set, then we do a lookup
        //   based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
-        let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?;
+        let time_cutoff = self.find_gc_time_cutoff(now, pitr, cancel, ctx).await?;

        Ok(match (pitr, time_cutoff) {
            (Duration::ZERO, Some(time_cutoff)) => {
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1799,6 +1799,24 @@ impl Timeline {
        Ok(())
    }

+    /// Get a watermark for gc-compaction, that is the lowest LSN that we can use as the `gc_horizon` for
+    /// the compaction algorithm. It is min(space_cutoff, time_cutoff, latest_gc_cutoff, standby_horizon).
+    /// Leases and retain_lsns are considered in the gc-compaction job itself so we don't need to account for them
+    /// here.
+    pub(crate) fn get_gc_compaction_watermark(self: &Arc<Self>) -> Lsn {
+        let gc_cutoff_lsn = {
+            let gc_info = self.gc_info.read().unwrap();
+            gc_info.min_cutoff()
+        };
+
+        // TODO: standby horizon should use leases so we don't really need to consider it here.
+        // let watermark = watermark.min(self.standby_horizon.load());
+
+        // TODO: ensure the child branches will not use anything below the watermark, or consider
+        // them when computing the watermark.
+        gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn())
+    }
+
    /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
    /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN
    /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts
@@ -1811,7 +1829,7 @@ impl Timeline {
        let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
            job.compact_lsn_range.end
        } else {
-            *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
+            self.get_gc_compaction_watermark()
        };

        // Split compaction job to about 4GB each
@@ -2006,7 +2024,7 @@ impl Timeline {
                // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
                // cleaning everything that theoritically it could. In the future, it should use `self.gc_info`
                // to get the truth data.
-                let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
+                let real_gc_cutoff = self.get_gc_compaction_watermark();
                // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
                // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use
                // the real cutoff.