From 4c2100794b97c1f635bf9dcc9013d2b2c8733de6 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 11 Jun 2024 10:14:51 -0400 Subject: [PATCH] feat(pageserver): initial code sketch & test case for combined gc+compaction at gc_horizon (#7948) A demo for a building block for compaction. The GC-compaction operation iterates all layers below/intersect with the GC horizon, and do a full layer rewrite of all of them. The end result will be image layer covering the full keyspace at GC-horizon, and a bunch of delta layers above the GC-horizon. This helps us collect the garbages of the test_gc_feedback test case to reduce space amplification. This operation can be manually triggered using an HTTP API or be triggered based on some metrics. Actual method TBD. The test is very basic and it's very likely that most part of the algorithm will be rewritten. I would like to get this merged so that I can have a basic skeleton for the algorithm and then make incremental changes. image --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 160 ++++++++++++++++ .../src/tenant/storage_layer/delta_layer.rs | 39 ++++ .../src/tenant/storage_layer/image_layer.rs | 28 +++ pageserver/src/tenant/storage_layer/layer.rs | 31 ++++ pageserver/src/tenant/timeline.rs | 13 ++ pageserver/src/tenant/timeline/compaction.rs | 172 ++++++++++++++++++ .../src/tenant/timeline/layer_manager.rs | 12 ++ 7 files changed, 455 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 10842c1504..f9ed6d3071 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4044,10 +4044,12 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; + use itertools::Itertools; use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; + use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use utils::bin_ser::BeSer; @@ -6697,4 +6699,162 @@ mod tests { .collect::>(); assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created } + + #[tokio::test] + async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction")?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // + // | D1 | | D3 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expact from this compaction is: + // | Part of D1 | | D3 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), test_img(&format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose. + ( + get_key(1), + Lsn(0x20), + Value::Image(test_img("value 1@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::Image(test_img("value 2@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::Image(test_img("value 3@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::Image(test_img("value 5@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::Image(test_img("value 6@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x40), + Value::Image(test_img("value 8@0x40")), + ), + ( + get_key(9), + Lsn(0x40), + Value::Image(test_img("value 9@0x40")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![delta1, delta2, delta3], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.pitr = Lsn(0x30); + guard.cutoffs.horizon = Lsn(0x30); + } + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + // Check if the image layer at the GC horizon contains exactly what we want + let image_at_gc_horizon = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + + assert_eq!(image_at_gc_horizon.len(), 10); + let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10]; + for idx in 0..10 { + assert_eq!( + image_at_gc_horizon[idx], + ( + get_key(idx as u32), + test_img(&format!("value {idx}@{:#x}", expected_lsn[idx])) + ) + ); + } + + // Check if old layers are removed / new layers have the expected LSN + let mut all_layers = tline.inspect_historic_layers().await.unwrap(); + all_layers.sort_by(|k1, k2| { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + }); + assert_eq!( + all_layers, + vec![ + // Image layer at GC horizon + PersistentLayerKey { + key_range: Key::MIN..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x31), + is_delta: false + }, + // The delta layer that is cut in the middle + PersistentLayerKey { + key_range: Key::MIN..get_key(9), + lsn_range: Lsn(0x30)..Lsn(0x41), + is_delta: true + }, + // The delta layer we created and should not be picked for the compaction + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x40)..Lsn(0x41), + is_delta: true + } + ] + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 999e2e8679..eb7cf81643 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -929,6 +929,45 @@ impl DeltaLayerInner { Ok(()) } + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. + #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + let mut result = Vec::new(); + let mut stream = + Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + let mut buf = Vec::new(); + while let Some(item) = stream.next().await { + let (key, lsn, pos) = item?; + // TODO: dedup code with get_reconstruct_value + // TODO: ctx handling and sharding + cursor + .read_blob_into_buf(pos.pos(), &mut buf, ctx) + .await + .with_context(|| { + format!("Failed to read blob from virtual file {}", self.file.path) + })?; + let val = Value::des(&buf).with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + self.file.path + ) + })?; + result.push((key, lsn, val)); + } + Ok(result) + } + async fn plan_reads( keyspace: &KeySpace, lsn_range: Range, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 285618b146..06e2f09384 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -485,6 +485,34 @@ impl ImageLayerInner { Ok(()) } + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. + #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); + let mut result = Vec::new(); + let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + while let Some(item) = stream.next().await { + // TODO: dedup code with get_reconstruct_value + let (raw_key, offset) = item?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + // TODO: ctx handling and sharding + let blob = cursor + .read_blob(offset, ctx) + .await + .with_context(|| format!("failed to read value from offset {}", offset))?; + let value = Bytes::from(blob); + result.push((key, self.lsn, Value::Image(value))); + } + Ok(result) + } + /// Traverse the layer's index to build read operations on the overlap of the input keyspace /// and the keys in this layer. /// diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 18f9ba4ef8..32acb3f0cd 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -388,6 +388,23 @@ impl Layer { }) } + /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. + #[cfg(test)] + pub(crate) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let layer = self + .0 + .get_or_maybe_download(true, Some(ctx)) + .await + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; + layer.load_key_values(&self.0, ctx).await + } + /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -1757,6 +1774,20 @@ impl DownloadedLayer { } } + #[cfg(test)] + async fn load_key_values( + &self, + owner: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + use LayerKind::*; + + match self.get(owner, ctx).await? { + Delta(d) => d.load_key_values(ctx).await, + Image(i) => i.load_key_values(ctx).await, + } + } + async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6da0f9d91c..54a4ceeaf3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5549,6 +5549,19 @@ impl Timeline { all_data.sort(); Ok(all_data) } + + /// Get all historic layer descriptors in the layer map + #[cfg(test)] + pub(crate) async fn inspect_historic_layers( + self: &Arc, + ) -> anyhow::Result> { + let mut layers = Vec::new(); + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + layers.push(layer.key()); + } + Ok(layers) + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d8de6aee7c..8a95029f33 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -952,6 +952,178 @@ impl Timeline { adaptor.flush_updates().await?; Ok(()) } + + /// An experimental compaction building block that combines compaction with garbage collection. + /// + /// The current implementation picks all delta + image layers that are below or intersecting with + /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta + /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, + /// and create delta layers with all deltas >= gc horizon. + #[cfg(test)] + pub(crate) async fn compact_with_gc( + self: &Arc, + _cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + use crate::tenant::storage_layer::ValueReconstructState; + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. + // The layer selection has the following properties: + // 1. If a layer is in the selection, all layers below it are in the selection. + // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. + let (layer_selection, gc_cutoff) = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + let gc_info = self.gc_info.read().unwrap(); + let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let mut selected_layers = Vec::new(); + // TODO: consider retain_lsns + drop(gc_info); + for desc in layers.iter_historic_layers() { + if desc.get_lsn_range().start <= gc_cutoff { + selected_layers.push(guard.get_from_desc(&desc)); + } + } + (selected_layers, gc_cutoff) + }; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. + let mut all_key_values = Vec::new(); + for layer in &layer_selection { + all_key_values.extend(layer.load_key_values(ctx).await?); + } + // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and + // image layers, make image appear later than delta. + struct ValueWrapper<'a>(&'a crate::repository::Value); + impl Ord for ValueWrapper<'_> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use crate::repository::Value; + use std::cmp::Ordering; + match (self.0, other.0) { + (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater, + (Value::WalRecord(_), Value::Image(_)) => Ordering::Less, + _ => Ordering::Equal, + } + } + } + impl PartialOrd for ValueWrapper<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + impl PartialEq for ValueWrapper<'_> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } + } + impl Eq for ValueWrapper<'_> {} + all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { + (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) + }); + let max_lsn = all_key_values + .iter() + .map(|(_, lsn, _)| lsn) + .max() + .copied() + .unwrap() + + 1; + // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. + // Data of the same key. + let mut accumulated_values = Vec::new(); + let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + + /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. + async fn flush_accumulated_states( + tline: &Arc, + key: Key, + accumulated_values: &[&(Key, Lsn, crate::repository::Value)], + horizon: Lsn, + ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { + let mut base_image = None; + let mut keys_above_horizon = Vec::new(); + let mut delta_above_base_image = Vec::new(); + // We have a list of deltas/images. We want to create image layers while collect garbages. + for (key, lsn, val) in accumulated_values.iter().rev() { + if *lsn > horizon { + keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both + } else if *lsn <= horizon { + match val { + crate::repository::Value::Image(image) => { + if lsn <= &horizon { + base_image = Some((*lsn, image.clone())); + break; + } + } + crate::repository::Value::WalRecord(wal) => { + delta_above_base_image.push((*lsn, wal.clone())); + } + } + } + } + delta_above_base_image.reverse(); + keys_above_horizon.reverse(); + let state = ValueReconstructState { + img: base_image, + records: delta_above_base_image, + }; + let img = tline.reconstruct_value(key, horizon, state).await?; + Ok((keys_above_horizon, img)) + } + + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + all_key_values.first().unwrap().0, + gc_cutoff..max_lsn, // TODO: off by one? + ctx, + ) + .await?; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), + gc_cutoff, + ctx, + ) + .await?; + + for item @ (key, _, _) in &all_key_values { + if &last_key == key { + accumulated_values.push(item); + } else { + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) + .await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + accumulated_values.push(item); + last_key = *key; + } + } + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + // TODO: split layers + let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; + // Step 3: Place back to the layer map. + { + let mut guard = self.layers.write().await; + guard.finish_gc_compaction( + &layer_selection, + &[delta_layer.clone(), image_layer.clone()], + &self.metrics, + ) + }; + Ok(()) + } } struct TimelineAdaptor { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 21e64d562a..550a9a567a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -226,6 +226,18 @@ impl LayerManager { updates.flush(); } + /// Called when a GC-compaction is completed. + #[cfg(test)] + pub(crate) fn finish_gc_compaction( + &mut self, + compact_from: &[Layer], + compact_to: &[ResidentLayer], + metrics: &TimelineMetrics, + ) { + // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification. + self.finish_compact_l0(compact_from, compact_to, metrics) + } + /// Called when compaction is completed. pub(crate) fn rewrite_layers( &mut self,