From 11334a2cdb8cf7e5c1c0245bbd61b45c54ac7d69 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 6 Mar 2025 11:44:00 -0500 Subject: [PATCH] feat(pageserver): more statistics for gc-compaction (#11103) ## Problem part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes * Add timers for each phase of the gc-compaction. * Add a final ratio computation to directly show the garbage collection ratio in the logs. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 107 +++++++++++++++++-- 1 file changed, 99 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 17f7d96e5e..8fa79ddb22 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,6 +7,7 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; +use std::time::Instant; use super::layer_manager::LayerManager; use super::{ @@ -724,17 +725,41 @@ struct CompactionStatisticsNumSize { #[derive(Debug, Serialize, Default)] pub struct CompactionStatistics { + /// Delta layer visited (maybe compressed, physical size) delta_layer_visited: CompactionStatisticsNumSize, + /// Image layer visited (maybe compressed, physical size) image_layer_visited: CompactionStatisticsNumSize, + /// Delta layer produced (maybe compressed, physical size) delta_layer_produced: CompactionStatisticsNumSize, + /// Image layer produced (maybe compressed, physical size) image_layer_produced: CompactionStatisticsNumSize, - num_delta_layer_discarded: usize, - num_image_layer_discarded: usize, + /// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + delta_layer_discarded: CompactionStatisticsNumSize, + /// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + image_layer_discarded: CompactionStatisticsNumSize, num_unique_keys_visited: usize, + /// Delta visited (uncompressed, original size) wal_keys_visited: CompactionStatisticsNumSize, + /// Image visited (uncompressed, original size) image_keys_visited: CompactionStatisticsNumSize, + /// Delta produced (uncompressed, original size) wal_produced: CompactionStatisticsNumSize, + /// Image produced (uncompressed, original size) image_produced: CompactionStatisticsNumSize, + + // Time spent in each phase + time_acquire_lock_secs: f64, + time_analyze_secs: f64, + time_download_layer_secs: f64, + time_main_loop_secs: f64, + time_final_phase_secs: f64, + time_total_secs: f64, + + // Summary + /// Ratio of the key-value size before/after gc-compaction. + uncompressed_size_ratio: f64, + /// Ratio of the physical size before/after gc-compaction. + physical_size_ratio: f64, } impl CompactionStatistics { @@ -784,11 +809,13 @@ impl CompactionStatistics { self.image_produced.num += 1; self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; } - fn discard_delta_layer(&mut self) { - self.num_delta_layer_discarded += 1; + fn discard_delta_layer(&mut self, original_size: u64) { + self.delta_layer_discarded.num += 1; + self.delta_layer_discarded.size += original_size; } - fn discard_image_layer(&mut self) { - self.num_image_layer_discarded += 1; + fn discard_image_layer(&mut self, original_size: u64) { + self.image_layer_discarded.num += 1; + self.image_layer_discarded.size += original_size; } fn produce_delta_layer(&mut self, size: u64) { self.delta_layer_produced.num += 1; @@ -798,6 +825,19 @@ impl CompactionStatistics { self.image_layer_produced.num += 1; self.image_layer_produced.size += size; } + fn finalize(&mut self) { + let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; + let produced_key_value_size = self.image_produced.size + self.wal_produced.size; + self.uncompressed_size_ratio = + original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0 + let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; + let produced_physical_size = self.image_layer_produced.size + + self.delta_layer_produced.size + + self.image_layer_discarded.size + + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate + self.physical_size_ratio = + original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0 + } } #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] @@ -2595,6 +2635,9 @@ impl Timeline { // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. + let timer = Instant::now(); + let begin_timer = timer; + let gc_lock = async { tokio::select! { guard = self.gc_lock.lock() => Ok(guard), @@ -2602,6 +2645,9 @@ impl Timeline { } }; + let time_acquire_lock = timer.elapsed(); + let timer = Instant::now(); + let gc_lock = crate::timed( gc_lock, "acquires gc lock", @@ -2791,6 +2837,9 @@ impl Timeline { has_data_below, ); + let time_analyze = timer.elapsed(); + let timer = Instant::now(); + for layer in &job_desc.selected_layers { debug!("read layer: {}", layer.layer_desc().key()); } @@ -2893,6 +2942,9 @@ impl Timeline { .context("failed to create filter iterator") .map_err(CompactionError::Other)?; + let time_download_layer = timer.elapsed(); + let timer = Instant::now(); + // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); let mut last_key: Option = None; @@ -3114,6 +3166,9 @@ impl Timeline { .map_err(CompactionError::Other)?; // end: move the above part to the loop body + let time_main_loop = timer.elapsed(); + let timer = Instant::now(); + let mut rewrote_delta_layers = Vec::new(); for (key, writers) in delta_layer_rewriters { if let Some(delta_writer_before) = writers.before { @@ -3178,6 +3233,13 @@ impl Timeline { let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); let produced_image_layers_len = produced_image_layers.len(); + + let layer_selection_by_key = job_desc + .selected_layers + .iter() + .map(|l| (l.layer_desc().key(), l.layer_desc().clone())) + .collect::>(); + for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { @@ -3191,8 +3253,16 @@ impl Timeline { if cfg!(debug_assertions) { info!("discarded delta layer: {}", l); } + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_delta_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_delta_layer(0); + } keep_layers.insert(l); - stat.discard_delta_layer(); } } } @@ -3201,6 +3271,9 @@ impl Timeline { "produced rewritten delta layer: {}", layer.layer_desc().key() ); + // For now, we include rewritten delta layer size in the "produce_delta_layer". We could + // make it a separate statistics in the future. + stat.produce_delta_layer(layer.layer_desc().file_size()); } compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { @@ -3212,8 +3285,16 @@ impl Timeline { } BatchWriterResult::Discarded(l) => { debug!("discarded image layer: {}", l); + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_image_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_image_layer(0); + } keep_layers.insert(l); - stat.discard_image_layer(); } } } @@ -3261,6 +3342,16 @@ impl Timeline { layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + let time_final_phase = timer.elapsed(); + + stat.time_final_phase_secs = time_final_phase.as_secs_f64(); + stat.time_main_loop_secs = time_main_loop.as_secs_f64(); + stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); + stat.time_download_layer_secs = time_download_layer.as_secs_f64(); + stat.time_analyze_secs = time_analyze.as_secs_f64(); + stat.time_total_secs = begin_timer.elapsed().as_secs_f64(); + stat.finalize(); + info!( "gc-compaction statistics: {}", serde_json::to_string(&stat)