pageserver: properly yield for L0 compaction (#10769)

## Problem When image compaction yields for L0 compaction, it may not immediately schedule L0 compaction, because it just goes on to compact the next pending timeline. Touches #10694. Requires #10744. ## Summary of changes Extend `CompactionOutcome` with `YieldForL0` and `Skipped` variants, and immediately schedule an L0 compaction pass in the `YieldForL0` case.
2026-01-08 14:02:55 +00:00 · 2025-02-12 00:43:58 +01:00
parent 6c83ac3fd2
commit 71c30e52fa
4 changed files with 44 additions and 28 deletions
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2901,7 +2901,8 @@ impl Tenant {
    }

    /// Performs one compaction iteration. Called periodically from the compaction loop. Returns
-    /// whether another compaction iteration is needed (if we yield), or
+    /// whether another compaction is needed, if we still have pending work or if we yield for
+    /// immediate L0 compaction.
    ///
    /// Compaction can also be explicitly requested for a timeline via the HTTP API.
    async fn compaction_iteration(
@@ -2911,7 +2912,7 @@ impl Tenant {
    ) -> Result<CompactionOutcome, CompactionError> {
        // Don't compact inactive tenants.
        if !self.is_active() {
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
        }

        // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`,
@@ -2919,13 +2920,13 @@ impl Tenant {
        let location = self.tenant_conf.load().location;
        if !location.may_upload_layers_hint() {
            info!("skipping compaction in location state {location:?}");
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
        }

        // Don't compact if the circuit breaker is tripped.
        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
            info!("skipping compaction due to previous failures");
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
        }

        // Collect all timelines to compact, along with offload instructions and L0 counts.
@@ -2988,10 +2989,15 @@ impl Tenant {
                    .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
                    .await
                    .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
-                has_pending_l0 |= outcome == CompactionOutcome::Pending;
+                match outcome {
+                    CompactionOutcome::Done => {}
+                    CompactionOutcome::Skipped => {}
+                    CompactionOutcome::Pending => has_pending_l0 = true,
+                    CompactionOutcome::YieldForL0 => has_pending_l0 = true,
+                }
            }
            if has_pending_l0 {
-                return Ok(CompactionOutcome::Pending); // do another pass
+                return Ok(CompactionOutcome::YieldForL0); // do another pass
            }
        }

@@ -3046,7 +3052,14 @@ impl Tenant {
                    })?;
            }

-            has_pending |= outcome == CompactionOutcome::Pending;
+            match outcome {
+                CompactionOutcome::Done => {}
+                CompactionOutcome::Skipped => {}
+                CompactionOutcome::Pending => has_pending = true,
+                // This mostly makes sense when the L0-only pass above is enabled, since there's
+                // otherwise no guarantee that we'll start with the timeline that has high L0.
+                CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0),
+            }
        }

        // Success! Untrip the breaker if necessary.
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -268,13 +268,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
        match output {
            Ok(outcome) => {
                error_run = 0;
-                // If there's more compaction work pending, reschedule immediately. This isn't
-                // necessarily L0 compaction, but that's fine for now.
-                //
-                // TODO: differentiate between L0 compaction and other compaction. The former needs
-                // to be responsive, the latter doesn't.
-                if outcome == CompactionOutcome::Pending {
-                    tenant.l0_compaction_trigger.notify_one();
+                // If there's more compaction work, L0 or not, schedule an immediate run.
+                match outcome {
+                    CompactionOutcome::Done => {}
+                    CompactionOutcome::Skipped => {}
+                    CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(),
+                    CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(),
                }
            }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1815,8 +1815,8 @@ impl Timeline {
        // compaction task goes over it's period (20s) which is quite often in production.
        let (_guard, _permit) = tokio::select! {
            tuple = prepare => { tuple },
-            _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done),
-            _ = cancel.cancelled() => return Ok(CompactionOutcome::Done),
+            _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
+            _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -1824,7 +1824,7 @@ impl Timeline {
        // Last record Lsn could be zero in case the timeline was just created
        if !last_record_lsn.is_valid() {
            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
        }

        let result = match self.get_compaction_algorithm_settings().kind {
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -609,8 +609,11 @@ pub enum CompactionOutcome {
    /// Still has pending layers to be compacted after this round. Ideally, the scheduler
    /// should immediately schedule another compaction.
    Pending,
-    // TODO: add a skipped variant for cases where we didn't attempt compaction. These currently
-    // return Done, which can lead the caller to believe there is no compaction debt.
+    /// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only
+    /// guaranteed when `compaction_l0_first` is enabled).
+    YieldForL0,
+    /// Compaction was skipped, because the timeline is ineligible for compaction.
+    Skipped,
 }

 impl Timeline {
@@ -703,10 +706,11 @@ impl Timeline {
                .unwrap_or(self.get_disk_consistent_lsn());
            l0_min_lsn.max(self.get_ancestor_lsn())
        };
+
        // 1. L0 Compact
-        let l0_compaction_outcome = {
+        let l0_outcome = {
            let timer = self.metrics.compact_time_histo.start_timer();
-            let l0_compaction_outcome = self
+            let l0_outcome = self
                .compact_level0(
                    target_file_size,
                    options.flags.contains(CompactFlags::ForceL0Compaction),
@@ -714,17 +718,17 @@ impl Timeline {
                )
                .await?;
            timer.stop_and_record();
-            l0_compaction_outcome
+            l0_outcome
        };

        if options.flags.contains(CompactFlags::OnlyL0Compaction) {
-            return Ok(l0_compaction_outcome);
+            return Ok(l0_outcome);
        }

-        if l0_compaction_outcome == CompactionOutcome::Pending {
-            // Yield if we have pending L0 compaction. The scheduler will do another pass.
-            info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers.");
-            return Ok(CompactionOutcome::Pending);
+        // Yield if we have pending L0 compaction. The scheduler will do another pass.
+        if l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0 {
+            info!("image/ancestor compaction yielding for L0 compaction");
+            return Ok(CompactionOutcome::YieldForL0);
        }

        if l0_l1_boundary_lsn < self.partitioning.read().1 {
@@ -788,7 +792,7 @@ impl Timeline {
                    if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
                        // Yield and do not do any other kind of compaction.
                        info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
-                        return Ok(CompactionOutcome::Pending);
+                        return Ok(CompactionOutcome::YieldForL0);
                    }
                }
                Err(err) => {