From 3815e3b2b5809612ce335333134a3fd32317a0e4 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 1 Jul 2025 09:58:41 -0700 Subject: [PATCH] feat(pageserver): reduce lock contention in l0 compaction (#12360) ## Problem L0 compaction currently holds the read lock for a long region while it doesn't need to. ## Summary of changes This patch reduces the one long contention region into 2 short ones: gather the layers to compact at the beginning, and several short read locks when querying the image coverage. Co-Authored-By: Chen Luo --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 59 ++++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 04852fb721..13a4f82607 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,7 +9,7 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use std::time::{Duration, Instant}; -use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard}; +use super::layer_manager::LayerManagerLockHolder; use super::{ CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, @@ -1779,20 +1779,14 @@ impl Timeline { } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { + let stats = CompactLevel0Phase1StatsBuilder { version: Some(2), tenant_id: Some(self.tenant_shard_id), timeline_id: Some(self.timeline_id), ..Default::default() }; - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); self.compact_level0_phase1( - phase1_layers_locked, stats, target_file_size, force_compaction_ignore_threshold, @@ -1813,16 +1807,19 @@ impl Timeline { } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1<'a>( - self: &'a Arc, - guard: LayerManagerReadGuard<'a>, + async fn compact_level0_phase1( + self: &Arc, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller + let begin = tokio::time::Instant::now(); + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + let layers = guard.layer_map()?; let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); @@ -1857,6 +1854,12 @@ impl Timeline { .map(|x| guard.get_from_desc(x)) .collect::>(); + drop_layer_manager_rlock(guard); + + // The is the last LSN that we have seen for L0 compaction in the timeline. This LSN might be updated + // by the time we finish the compaction. So we need to get it here. + let l0_last_record_lsn = self.get_last_record_lsn(); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -1944,9 +1947,7 @@ impl Timeline { // we don't accidentally use it later in the function. drop(level0_deltas); - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); + stats.compaction_prerequisites_micros = stats.read_lock_acquisition_micros.till_now(); // TODO: replace with streaming k-merge let all_keys = { @@ -1968,7 +1969,7 @@ impl Timeline { all_keys }; - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + stats.read_lock_held_key_sort_micros = stats.compaction_prerequisites_micros.till_now(); // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start. // @@ -2002,7 +2003,6 @@ impl Timeline { } } let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? // min-heap (reserve space for one more element added before eviction) @@ -2021,8 +2021,12 @@ impl Timeline { // has not so much sense, because largest holes will corresponds field1/field2 changes. // But we are mostly interested to eliminate holes which cause generation of excessive image layers. // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = - layers.image_coverage(&key_range, last_record_lsn).len(); + let coverage_size = { + // TODO: optimize this with copy-on-write layer map. + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; + let layers = guard.layer_map()?; + layers.image_coverage(&key_range, l0_last_record_lsn).len() + }; if coverage_size >= min_hole_coverage_size { heap.push(Hole { key_range, @@ -2041,7 +2045,6 @@ impl Timeline { holes }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_layer_manager_rlock(guard); if self.cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); @@ -2382,9 +2385,8 @@ struct CompactLevel0Phase1StatsBuilder { tenant_id: Option, timeline_id: Option, read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, + compaction_prerequisites_micros: DurationRecorder, read_lock_held_compute_holes_micros: DurationRecorder, read_lock_drop_micros: DurationRecorder, write_layer_files_micros: DurationRecorder, @@ -2399,9 +2401,8 @@ struct CompactLevel0Phase1Stats { tenant_id: TenantShardId, timeline_id: TimelineId, read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, + compaction_prerequisites_micros: RecordedDuration, read_lock_held_compute_holes_micros: RecordedDuration, read_lock_drop_micros: RecordedDuration, write_layer_files_micros: RecordedDuration, @@ -2426,16 +2427,12 @@ impl TryFrom for CompactLevel0Phase1Stats { .read_lock_acquisition_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, read_lock_held_key_sort_micros: value .read_lock_held_key_sort_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros + compaction_prerequisites_micros: value + .compaction_prerequisites_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, read_lock_held_compute_holes_micros: value