From 71c30e52faa761a306d3bfac7f24a8d76b944955 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Wed, 12 Feb 2025 00:43:58 +0100
Subject: [PATCH] pageserver: properly yield for L0 compaction (#10769)

## Problem

When image compaction yields for L0 compaction, it may not immediately
schedule L0 compaction; instead, it just goes on to compact the next
pending timeline.

Touches #10694.
Requires #10744.

## Summary of changes

Extend `CompactionOutcome` with `YieldForL0` and `Skipped` variants, and
immediately schedule an L0 compaction pass in the `YieldForL0` case.
---
 pageserver/src/tenant.rs                     | 27 +++++++++++++++-----
 pageserver/src/tenant/tasks.rs               | 13 +++++-----
 pageserver/src/tenant/timeline.rs            |  6 ++---
 pageserver/src/tenant/timeline/compaction.rs | 26 +++++++++++--------
 4 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 5f17e8cb60..8520ae62e8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2901,7 +2901,8 @@ impl Tenant {
     }
 
     /// Performs one compaction iteration. Called periodically from the compaction loop. Returns
-    /// whether another compaction iteration is needed (if we yield), or
+    /// whether another compaction is needed, if we still have pending work or if we yield for
+    /// immediate L0 compaction.
     ///
     /// Compaction can also be explicitly requested for a timeline via the HTTP API.
     async fn compaction_iteration(
@@ -2911,7 +2912,7 @@ impl Tenant {
     ) -> Result<CompactionOutcome, CompactionError> {
         // Don't compact inactive tenants.
         if !self.is_active() {
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
         }
 
         // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`,
         let location = self.tenant_conf.load().location;
         if !location.may_upload_layers_hint() {
             info!("skipping compaction in location state {location:?}");
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
         }
 
         // Don't compact if the circuit breaker is tripped.
         if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
             info!("skipping compaction due to previous failures");
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
         }
 
         // Collect all timelines to compact, along with offload instructions and L0 counts.
@@ -2988,10 +2989,15 @@ impl Tenant {
                     .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
                     .await
                     .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
-                has_pending_l0 |= outcome == CompactionOutcome::Pending;
+                match outcome {
+                    CompactionOutcome::Done => {}
+                    CompactionOutcome::Skipped => {}
+                    CompactionOutcome::Pending => has_pending_l0 = true,
+                    CompactionOutcome::YieldForL0 => has_pending_l0 = true,
+                }
             }
             if has_pending_l0 {
-                return Ok(CompactionOutcome::Pending); // do another pass
+                return Ok(CompactionOutcome::YieldForL0); // do another pass
             }
         }
@@ -3046,7 +3052,14 @@ impl Tenant {
                 })?;
             }
 
-            has_pending |= outcome == CompactionOutcome::Pending;
+            match outcome {
+                CompactionOutcome::Done => {}
+                CompactionOutcome::Skipped => {}
+                CompactionOutcome::Pending => has_pending = true,
+                // This mostly makes sense when the L0-only pass above is enabled, since there's
+                // otherwise no guarantee that we'll start with the timeline that has high L0.
+                CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0),
+            }
         }
 
         // Success! Untrip the breaker if necessary.
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 5df7351216..1fa01e4229 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -268,13 +268,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             match output {
                 Ok(outcome) => {
                     error_run = 0;
-                    // If there's more compaction work pending, reschedule immediately. This isn't
-                    // necessarily L0 compaction, but that's fine for now.
-                    //
-                    // TODO: differentiate between L0 compaction and other compaction. The former needs
-                    // to be responsive, the latter doesn't.
-                    if outcome == CompactionOutcome::Pending {
-                        tenant.l0_compaction_trigger.notify_one();
+                    // If there's more compaction work, L0 or not, schedule an immediate run.
+                    match outcome {
+                        CompactionOutcome::Done => {}
+                        CompactionOutcome::Skipped => {}
+                        CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(),
+                        CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(),
                     }
                 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 43811b77f8..afa8efa453 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1815,8 +1815,8 @@ impl Timeline {
         // compaction task goes over its period (20s), which is quite often in production.
         let (_guard, _permit) = tokio::select! {
             tuple = prepare => { tuple },
-            _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done),
-            _ = cancel.cancelled() => return Ok(CompactionOutcome::Done),
+            _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
+            _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
         };
 
         let last_record_lsn = self.get_last_record_lsn();
@@ -1824,7 +1824,7 @@ impl Timeline {
         // Last record Lsn could be zero in case the timeline was just created
         if !last_record_lsn.is_valid() {
             warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
         }
 
         let result = match self.get_compaction_algorithm_settings().kind {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 18b5afd04b..aea92d34e0 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -609,8 +609,11 @@ pub enum CompactionOutcome {
    /// Still has pending layers to be compacted after this round. Ideally, the scheduler
    /// should immediately schedule another compaction.
    Pending,
-   // TODO: add a skipped variant for cases where we didn't attempt compaction. These currently
-   // return Done, which can lead the caller to believe there is no compaction debt.
+   /// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only
+   /// guaranteed when `compaction_l0_first` is enabled).
+   YieldForL0,
+   /// Compaction was skipped because the timeline is ineligible for compaction.
+   Skipped,
 }
 
 impl Timeline {
@@ -703,10 +706,11 @@ impl Timeline {
                 .unwrap_or(self.get_disk_consistent_lsn());
             l0_min_lsn.max(self.get_ancestor_lsn())
         };
 
-        let l0_compaction_outcome = {
+        // 1. L0 Compact
+        let l0_outcome = {
             let timer = self.metrics.compact_time_histo.start_timer();
-            let l0_compaction_outcome = self
+            let l0_outcome = self
                 .compact_level0(
                     target_file_size,
                     options.flags.contains(CompactFlags::ForceL0Compaction),
                     ctx,
                 )
                 .await?;
             timer.stop_and_record();
-            l0_compaction_outcome
+            l0_outcome
         };
 
         if options.flags.contains(CompactFlags::OnlyL0Compaction) {
-            return Ok(l0_compaction_outcome);
+            return Ok(l0_outcome);
         }
 
-        if l0_compaction_outcome == CompactionOutcome::Pending {
-            // Yield if we have pending L0 compaction. The scheduler will do another pass.
-            info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers.");
-            return Ok(CompactionOutcome::Pending);
+        // Yield if we have pending L0 compaction. The scheduler will do another pass.
+        if l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0 {
+            info!("image/ancestor compaction yielding for L0 compaction");
+            return Ok(CompactionOutcome::YieldForL0);
         }
 
         if l0_l1_boundary_lsn < self.partitioning.read().1 {
@@ -788,7 +792,7 @@ impl Timeline {
                 if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
                     // Yield and do not do any other kind of compaction.
                     info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
-                    return Ok(CompactionOutcome::Pending);
+                    return Ok(CompactionOutcome::YieldForL0);
                 }
             }
             Err(err) => {
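
---

For reviewers unfamiliar with the scheduler, the contract this patch establishes between `compaction_iteration` and the compaction loop is: `Done` and `Skipped` let the loop sleep until the next period, while `Pending` and `YieldForL0` request an immediate re-run through the tenant's `l0_compaction_trigger`. The sketch below is a minimal model of that control flow, not the pageserver's actual loop: `run_compaction_pass` is a hypothetical stand-in for `compaction_iteration`, and a plain `tokio::sync::Notify` stands in for the real trigger.

```rust
use std::{sync::Arc, time::Duration};

use tokio::sync::Notify;

/// Mirrors the patch's `CompactionOutcome` variants (sketch only).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CompactionOutcome {
    Done,       // no further work
    Skipped,    // didn't attempt compaction; implies no known debt
    Pending,    // more work remains; re-run soon
    YieldForL0, // yielded mid-pass; re-run immediately, L0 first
}

/// Hypothetical stand-in for `Tenant::compaction_iteration`.
async fn run_compaction_pass() -> CompactionOutcome {
    // A real pass would compact L0 first, then images/ancestors,
    // returning YieldForL0 if L0 debt builds up in the meantime.
    CompactionOutcome::Done
}

/// Simplified compaction loop: sleep for the period unless an
/// immediate re-run was requested via the trigger.
async fn compaction_loop(trigger: Arc<Notify>, period: Duration) {
    loop {
        match run_compaction_pass().await {
            CompactionOutcome::Done | CompactionOutcome::Skipped => {}
            // Both pending work and an L0 yield schedule an immediate run.
            CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => {
                trigger.notify_one();
            }
        }
        // `notify_one` stores a permit, so a trigger pulled above (or by
        // another task) wakes this select immediately instead of sleeping.
        tokio::select! {
            _ = trigger.notified() => {}
            _ = tokio::time::sleep(period) => {}
        }
    }
}

#[tokio::main]
async fn main() {
    let trigger = Arc::new(Notify::new());
    // The pageserver's compaction period is on the order of 20s.
    tokio::spawn(compaction_loop(trigger, Duration::from_secs(20)));
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```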