diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 10842c1504..f9ed6d3071 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4044,10 +4044,12 @@ mod tests {
     use crate::DEFAULT_PG_VERSION;
     use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
+    use itertools::Itertools;
     use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
     use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     use rand::{thread_rng, Rng};
+    use storage_layer::PersistentLayerKey;
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
     use utils::bin_ser::BeSer;
@@ -6697,4 +6699,162 @@ mod tests {
             .collect::<Vec<_>>();
         assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
     }
+
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction")?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
+        //
+        //  | D1 |                       | D3 |
+        // -|    |-- gc horizon -----------------
+        //  |    |                | D2 |
+        // --------- img layer ------------------
+        //
+        // What we should expect from this compaction is:
+        //  | Part of D1 |               | D3 |
+        // --------- img layer with D1+D2 at GC horizon------------------
+
+        // img layer at 0x10
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::Image(test_img("value 1@0x20")),
+            ),
+            (
+                get_key(2),
+                Lsn(0x30),
+                Value::Image(test_img("value 2@0x30")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x40),
+                Value::Image(test_img("value 3@0x40")),
+            ),
+        ];
+        let delta2 = vec![
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::Image(test_img("value 5@0x20")),
+            ),
+            (
+                get_key(6),
+                Lsn(0x20),
+                Value::Image(test_img("value 6@0x20")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x40),
+                Value::Image(test_img("value 8@0x40")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x40),
+                Value::Image(test_img("value 9@0x40")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![delta1, delta2, delta3], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            // Update GC info: both cutoffs at 0x30, so the compaction cutoff (their minimum) is 0x30
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.pitr = Lsn(0x30);
+            guard.cutoffs.horizon = Lsn(0x30);
+        }
+
+        let cancel = CancellationToken::new();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
+        // Check that the image layer at the GC horizon contains exactly what we want
+        let image_at_gc_horizon = tline
+            .inspect_image_layers(Lsn(0x30), &ctx)
+            .await
+            .unwrap()
+            .into_iter()
+            .filter(|(k, _)| k.is_metadata_key())
+            .collect::<Vec<_>>();
+
+        assert_eq!(image_at_gc_horizon.len(), 10);
+        let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
+        for idx in 0..10 {
+            assert_eq!(
+                image_at_gc_horizon[idx],
+                (
+                    get_key(idx as u32),
+                    test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
+                )
+            );
+        }
+
+        // Check that old layers are removed and new layers have the expected LSN ranges
+        let mut all_layers = tline.inspect_historic_layers().await.unwrap();
+        all_layers.sort_by(|k1, k2| {
+            (
+                k1.is_delta,
+                k1.key_range.start,
+                k1.key_range.end,
+                k1.lsn_range.start,
+                k1.lsn_range.end,
+            )
+                .cmp(&(
+                    k2.is_delta,
+                    k2.key_range.start,
+                    k2.key_range.end,
+                    k2.lsn_range.start,
+                    k2.lsn_range.end,
+                ))
+        });
+        assert_eq!(
+            all_layers,
+            vec![
+                // Image layer at GC horizon
+                PersistentLayerKey {
+                    key_range: Key::MIN..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x31),
+                    is_delta: false
+                },
+                // The delta layer that is cut in the middle
+                PersistentLayerKey {
+                    key_range: Key::MIN..get_key(9),
+                    lsn_range: Lsn(0x30)..Lsn(0x41),
+                    is_delta: true
+                },
+                // The delta layer we created and should not be picked for the compaction
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x40)..Lsn(0x41),
+                    is_delta: true
+                }
+            ]
+        );
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 999e2e8679..eb7cf81643 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -929,6 +929,45 @@ impl DeltaLayerInner {
         Ok(())
     }
 
+    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
+    #[cfg(test)]
+    pub(super) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+        let mut result = Vec::new();
+        let mut stream =
+            Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let cursor = block_reader.block_cursor();
+        let mut buf = Vec::new();
+        while let Some(item) = stream.next().await {
+            let (key, lsn, pos) = item?;
+            // TODO: dedup code with get_reconstruct_value
+            // TODO: ctx handling and sharding
+            cursor
+                .read_blob_into_buf(pos.pos(), &mut buf, ctx)
+                .await
+                .with_context(|| {
+                    format!("Failed to read blob from virtual file {}", self.file.path)
+                })?;
+            let val = Value::des(&buf).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    self.file.path
+                )
+            })?;
+            result.push((key, lsn, val));
+        }
+        Ok(result)
+    }
+
     async fn plan_reads(
         keyspace: &KeySpace,
         lsn_range: Range<Lsn>,
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 285618b146..06e2f09384 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -485,6 +485,34 @@ impl ImageLayerInner {
         Ok(())
     }
 
+    /// Load all key-values in the image layer, should be replaced by an iterator-based interface in the future.
+    #[cfg(test)]
+    pub(super) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
+        let mut result = Vec::new();
+        let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let cursor = block_reader.block_cursor();
+        while let Some(item) = stream.next().await {
+            // TODO: dedup code with get_reconstruct_value
+            let (raw_key, offset) = item?;
+            let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+            // TODO: ctx handling and sharding
+            let blob = cursor
+                .read_blob(offset, ctx)
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+            result.push((key, self.lsn, Value::Image(value))); // every entry carries the layer's own LSN
+        }
+        Ok(result)
+    }
+
     /// Traverse the layer's index to build read operations on the overlap of the input keyspace
     /// and the keys in this layer.
     ///
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 18f9ba4ef8..32acb3f0cd 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -388,6 +388,23 @@ impl Layer {
         })
     }
 
+    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
+    #[cfg(test)]
+    pub(crate) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
+        let layer = self
+            .0
+            .get_or_maybe_download(true, Some(ctx))
+            .await
+            .map_err(|err| match err {
+                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
+                other => GetVectoredError::Other(anyhow::anyhow!(other)),
+            })?;
+        layer.load_key_values(&self.0, ctx).await
+    }
+
     /// Download the layer if evicted.
     ///
     /// Will not error when the layer is already downloaded.
@@ -1757,6 +1774,20 @@ impl DownloadedLayer {
         }
     }
 
+    /// Load all key-values of the downloaded layer, dispatching on whether it is a delta or an image layer.
+    #[cfg(test)]
+    async fn load_key_values(
+        &self,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
+        use LayerKind::*;
+        match self.get(owner, ctx).await? {
+            Delta(d) => d.load_key_values(ctx).await,
+            Image(i) => i.load_key_values(ctx).await,
+        }
+    }
+
     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
         use LayerKind::*;
         match self.get(owner, ctx).await? {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6da0f9d91c..54a4ceeaf3 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5549,6 +5549,19 @@ impl Timeline {
         all_data.sort();
         Ok(all_data)
     }
+
+    /// Get the keys of all historic layer descriptors in the layer map.
+    ///
+    /// The result is in whatever order `iter_historic_layers` yields; callers that compare it
+    /// against an expected list should sort it first.
+    #[cfg(test)]
+    pub(crate) async fn inspect_historic_layers(
+        self: &Arc<Self>,
+    ) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
+        let guard = self.layers.read().await;
+        let layers = guard.layer_map();
+        Ok(layers.iter_historic_layers().map(|l| l.key()).collect())
+    }
 }
 
 type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index d8de6aee7c..8a95029f33 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -952,6 +952,178 @@ impl Timeline {
         adaptor.flush_updates().await?;
         Ok(())
     }
+
+    /// An experimental compaction building block that combines compaction with garbage collection.
+    ///
+    /// The current implementation picks all delta + image layers that are below or intersecting with
+    /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
+    /// layers and image layers, which generates an image layer at the gc horizon, folds everything at or below
+    /// the gc horizon into that image, and creates a delta layer holding the values above the gc horizon.
+    #[cfg(test)]
+    pub(crate) async fn compact_with_gc(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        use crate::tenant::storage_layer::ValueReconstructState;
+        // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
+        // The layer selection has the following properties:
+        // 1. If a layer is in the selection, all layers below it are in the selection.
+        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
+        let (layer_selection, gc_cutoff) = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            let gc_info = self.gc_info.read().unwrap();
+            let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
+            let mut selected_layers = Vec::new();
+            // TODO: consider retain_lsns
+            drop(gc_info);
+            for desc in layers.iter_historic_layers() {
+                if desc.get_lsn_range().start <= gc_cutoff {
+                    selected_layers.push(guard.get_from_desc(&desc));
+                }
+            }
+            (selected_layers, gc_cutoff)
+        };
+        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
+        let mut all_key_values = Vec::new();
+        for layer in &layer_selection {
+            all_key_values.extend(layer.load_key_values(ctx).await?);
+        }
+        if all_key_values.is_empty() {
+            return Ok(()); // nothing to compact; the `unwrap()`s on first/last/max below rely on this guard
+        }
+        // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
+        // image layers, make image appear later than delta.
+        struct ValueWrapper<'a>(&'a crate::repository::Value);
+        impl Ord for ValueWrapper<'_> {
+            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+                use crate::repository::Value;
+                use std::cmp::Ordering;
+                match (self.0, other.0) {
+                    (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
+                    (Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
+                    _ => Ordering::Equal,
+                }
+            }
+        }
+        impl PartialOrd for ValueWrapper<'_> {
+            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+                Some(self.cmp(other))
+            }
+        }
+        impl PartialEq for ValueWrapper<'_> {
+            fn eq(&self, other: &Self) -> bool {
+                self.cmp(other) == std::cmp::Ordering::Equal
+            }
+        }
+        impl Eq for ValueWrapper<'_> {}
+        all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
+            (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
+        });
+        let max_lsn = all_key_values
+            .iter()
+            .map(|(_, lsn, _)| lsn)
+            .max()
+            .copied()
+            .unwrap()
+            + 1;
+        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
+        // Data of the same key.
+        let mut accumulated_values = Vec::new();
+        let mut last_key = all_key_values.first().unwrap().0; // non-empty: checked right after step 1
+
+        /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
+        async fn flush_accumulated_states(
+            tline: &Arc<Timeline>,
+            key: Key,
+            accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
+            horizon: Lsn,
+        ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
+            let mut base_image = None;
+            let mut keys_above_horizon = Vec::new();
+            let mut delta_above_base_image = Vec::new();
+            // Walk from the newest value to the oldest: keep everything above the horizon as-is, and stop at the first image at or below it.
+            for (key, lsn, val) in accumulated_values.iter().rev() {
+                if *lsn > horizon {
+                    keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both
+                } else {
+                    match val {
+                        crate::repository::Value::Image(image) => {
+                            base_image = Some((*lsn, image.clone()));
+                            break;
+                        }
+                        crate::repository::Value::WalRecord(wal) => {
+                            delta_above_base_image.push((*lsn, wal.clone()));
+                        }
+                    }
+                }
+            }
+            delta_above_base_image.reverse();
+            keys_above_horizon.reverse();
+            let state = ValueReconstructState {
+                img: base_image,
+                records: delta_above_base_image,
+            };
+            let img = tline.reconstruct_value(key, horizon, state).await?;
+            Ok((keys_above_horizon, img))
+        }
+
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            all_key_values.first().unwrap().0,
+            gc_cutoff..max_lsn, // TODO: off by one?
+            ctx,
+        )
+        .await?;
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
+            gc_cutoff,
+            ctx,
+        )
+        .await?;
+
+        for item @ (key, _, _) in &all_key_values {
+            if &last_key == key {
+                accumulated_values.push(item);
+            } else {
+                let (deltas, image) =
+                    flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
+                        .await?;
+                image_layer_writer.put_image(last_key, image, ctx).await?;
+                for (key, lsn, val) in deltas {
+                    delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+                }
+                accumulated_values.clear();
+                accumulated_values.push(item);
+                last_key = *key;
+            }
+        }
+        let (deltas, image) =
+            flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
+        image_layer_writer.put_image(last_key, image, ctx).await?;
+        for (key, lsn, val) in deltas {
+            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+        }
+        // TODO: split layers
+        let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
+        let image_layer = image_layer_writer.finish(self, ctx).await?;
+        // Step 3: Place back to the layer map.
+        {
+            let mut guard = self.layers.write().await;
+            guard.finish_gc_compaction(
+                &layer_selection,
+                &[delta_layer, image_layer],
+                &self.metrics,
+            )
+        };
+        Ok(())
+    }
 }
 
 struct TimelineAdaptor {
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 21e64d562a..550a9a567a 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -226,6 +226,18 @@ impl LayerManager {
         updates.flush();
     }
 
+    /// Called when a GC-compaction is completed.
+    #[cfg(test)]
+    pub(crate) fn finish_gc_compaction(
+        &mut self,
+        compact_from: &[Layer],
+        compact_to: &[ResidentLayer],
+        metrics: &TimelineMetrics,
+    ) {
+        // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
+        self.finish_compact_l0(compact_from, compact_to, metrics)
+    }
+
     /// Called when compaction is completed.
     pub(crate) fn rewrite_layers(
         &mut self,