diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 3b6b56b3e4..6f68ebd662 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -263,6 +263,7 @@ fn bench_from_real_project(c: &mut Criterion) { kr.start.to_i128()..kr.end.to_i128(), lr.start.0..lr.end.0, format!("Layer {}", lr.start.0), + !layer.is_incremental(), ); } bstlm.rebuild(); @@ -323,6 +324,7 @@ fn bench_sequential(c: &mut Criterion) { kr.start.to_i128()..kr.end.to_i128(), lr.start.0..lr.end.0, format!("Layer {}", lr.start.0), + !layer.is_incremental(), ); } bstlm.rebuild(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 530cbbd7b3..be478600be 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -74,12 +74,14 @@ use utils::{ mod blob_io; pub mod block_io; pub mod bst_layer_map; +pub mod coverage; mod delta_layer; mod disk_btree; pub(crate) mod ephemeral_file; pub mod filename; mod image_layer; mod inmemory_layer; +pub mod latest_layer_map; pub mod layer_map; pub mod metadata; diff --git a/pageserver/src/tenant/bst_layer_map.rs b/pageserver/src/tenant/bst_layer_map.rs index 64cd71f3cc..fbef929226 100644 --- a/pageserver/src/tenant/bst_layer_map.rs +++ b/pageserver/src/tenant/bst_layer_map.rs @@ -1,38 +1,14 @@ use std::collections::BTreeMap; use std::ops::Range; -use std::sync::Arc; -// TODO the `im` crate has 20x more downloads and also has -// persistent/immutable BTree. See if it's better. -use rpds::RedBlackTreeMapSync; +use super::latest_layer_map::LatestLayerMap; -/// Layer map implemented using persistent/immutable binary search tree. -/// It supports historical queries, but no retroactive inserts. For that -/// see RetroactiveLayerMap. -/// -/// Layer type is abstracted as Value to make unit testing easier. pub struct PersistentLayerMap { - /// Mapping key to the latest layer (if any) until the next key. - /// We use the Sync version of the map because we want Self to - /// be Sync. - /// - /// TODO Separate Head into its own struct LatestLayerMap - /// TODO Merge historic with retroactive, into HistoricLayerMap - /// TODO Maintain a pair of heads, one for images, one for deltas. - /// This way we can query both of them with one BTreeMap query. - head: RedBlackTreeMapSync>, + /// The latest-only solution + head: LatestLayerMap, - /// All previous states of `self.head` - /// - /// TODO: Sorted Vec + binary search could be slightly faster. - historic: BTreeMap>>, -} - -impl std::fmt::Debug for PersistentLayerMap { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let head_vec: Vec<_> = self.head.iter().collect(); - write!(f, "PersistentLayerMap: head: {:?}", head_vec) - } + /// All previous states + historic: BTreeMap>, } impl Default for PersistentLayerMap { @@ -44,22 +20,18 @@ impl Default for PersistentLayerMap { impl PersistentLayerMap { pub fn new() -> Self { Self { - head: RedBlackTreeMapSync::default(), + head: LatestLayerMap::default(), historic: BTreeMap::default(), } } - /// Helper function to subdivide the key range without changing any values - fn add_node(self: &mut Self, key: i128) { - let value = match self.head.range(..=key).last() { - Some((_, Some(v))) => Some(v.clone()), - Some((_, None)) => None, - None => None, - }; - self.head.insert_mut(key, value); - } - - pub fn insert(self: &mut Self, key: Range, lsn: Range, value: Value) { + pub fn insert( + self: &mut Self, + key: Range, + lsn: Range, + value: Value, + is_image: bool, + ) { // It's only a persistent map, not a retroactive one if let Some(last_entry) = self.historic.iter().rev().next() { let last_lsn = last_entry.0; @@ -71,65 +43,25 @@ impl PersistentLayerMap { } } - // NOTE The order of the following lines is important!! - - // Add nodes at endpoints - self.add_node(key.start); - self.add_node(key.end); - - // Raise the height where necessary - // - // NOTE This loop is worst case O(N), but amortized O(log N) in the special - // case when rectangles have no height. In practice I don't think we'll see - // the kind of layer intersections needed to trigger O(N) behavior. If we - // do it can be fixed using lazy propagation. - let mut to_update = Vec::new(); - let mut to_remove = Vec::new(); - let mut prev_covered = false; - for (k, node) in self.head.range(key.clone()) { - let needs_cover = match node { - None => true, - Some((h, _)) => h < &lsn.end, - }; - if needs_cover { - match prev_covered { - true => to_remove.push(k.clone()), - false => to_update.push(k.clone()), - } - } - prev_covered = needs_cover; - } - if !prev_covered { - to_remove.push(key.end); - } - for k in to_update { - self.head - .insert_mut(k.clone(), Some((lsn.end.clone(), value.clone()))); - } - for k in to_remove { - self.head.remove_mut(&k); - } + self.head.insert(key, lsn.clone(), value, is_image); // Remember history. Clone is O(1) self.historic.insert(lsn.start, self.head.clone()); } - pub fn query(self: &Self, key: i128, lsn: u64) -> Option { - let version = self.historic.range(..=lsn).rev().next()?.1; - version - .range(..=key) - .rev() - .next()? - .1 - .as_ref() - .map(|(_, v)| v.clone()) + pub fn query(self: &Self, key: i128, lsn: u64) -> (Option, Option) { + let version = match self.historic.range(..=lsn).rev().next() { + Some((_, v)) => v, + None => return (None, None), + }; + version.query(key) } - pub fn get_coverage( - self: &Self, - lsn: u64, - ) -> Option<&RedBlackTreeMapSync>> { - Some(self.historic.range(..=lsn).rev().next()?.1) + pub fn get_version(self: &Self, lsn: u64) -> Option<&LatestLayerMap> { + match self.historic.range(..=lsn).rev().next() { + Some((_, v)) => Some(v), + None => None, + } } pub fn trim(self: &mut Self, begin: &u64) { @@ -144,64 +76,46 @@ impl PersistentLayerMap { } } -/// Basic test for the immutable bst library, just to show usage. -#[test] -fn test_immutable_bst_dependency() { - let map = RedBlackTreeMapSync::::default(); - - let mut v1 = map.clone(); - let v2 = map.insert(1, 5); - - // We can query current and past versions of key 1 - assert_eq!(v1.get(&1), None); - assert_eq!(v2.get(&1), Some(&5)); - - // We can mutate old state, but it creates a branch. - // It doesn't retroactively change future versions. - v1.insert_mut(2, 6); - assert_eq!(v1.get(&2), Some(&6)); - assert_eq!(v2.get(&2), None); -} - /// This is the most basic test that demonstrates intended usage. /// All layers in this test have height 1. #[test] fn test_persistent_simple() { let mut map = PersistentLayerMap::::new(); - map.insert(0..5, 100..101, "Layer 1".to_string()); - map.insert(3..9, 110..111, "Layer 2".to_string()); - map.insert(5..6, 120..121, "Layer 3".to_string()); + map.insert(0..5, 100..101, "Layer 1".to_string(), true); + map.insert(3..9, 110..111, "Layer 2".to_string(), true); + map.insert(5..6, 120..121, "Layer 3".to_string(), true); // After Layer 1 insertion - assert_eq!(map.query(1, 105), Some("Layer 1".to_string())); - assert_eq!(map.query(4, 105), Some("Layer 1".to_string())); + assert_eq!(map.query(1, 105).1, Some("Layer 1".to_string())); + assert_eq!(map.query(4, 105).1, Some("Layer 1".to_string())); // After Layer 2 insertion - assert_eq!(map.query(4, 115), Some("Layer 2".to_string())); - assert_eq!(map.query(8, 115), Some("Layer 2".to_string())); - assert_eq!(map.query(11, 115), None); + assert_eq!(map.query(4, 115).1, Some("Layer 2".to_string())); + assert_eq!(map.query(8, 115).1, Some("Layer 2".to_string())); + assert_eq!(map.query(11, 115).1, None); // After Layer 3 insertion - assert_eq!(map.query(4, 125), Some("Layer 2".to_string())); - assert_eq!(map.query(5, 125), Some("Layer 3".to_string())); - assert_eq!(map.query(7, 125), Some("Layer 2".to_string())); + assert_eq!(map.query(4, 125).1, Some("Layer 2".to_string())); + assert_eq!(map.query(5, 125).1, Some("Layer 3".to_string())); + assert_eq!(map.query(7, 125).1, Some("Layer 2".to_string())); } /// Cover simple off-by-one edge cases #[test] fn test_off_by_one() { let mut map = PersistentLayerMap::::new(); - map.insert(3..5, 100..110, "Layer 1".to_string()); + map.insert(3..5, 100..110, "Layer 1".to_string(), true); // Check different LSNs - assert_eq!(map.query(4, 99), None); - assert_eq!(map.query(4, 100), Some("Layer 1".to_string())); + assert_eq!(map.query(4, 99).1, None); + assert_eq!(map.query(4, 100).1, Some("Layer 1".to_string())); + assert_eq!(map.query(4, 110).1, Some("Layer 1".to_string())); // Check different keys - assert_eq!(map.query(2, 105), None); - assert_eq!(map.query(3, 105), Some("Layer 1".to_string())); - assert_eq!(map.query(4, 105), Some("Layer 1".to_string())); - assert_eq!(map.query(5, 105), None); + assert_eq!(map.query(2, 105).1, None); + assert_eq!(map.query(3, 105).1, Some("Layer 1".to_string())); + assert_eq!(map.query(4, 105).1, Some("Layer 1".to_string())); + assert_eq!(map.query(5, 105).1, None); } /// Cover edge cases where layers begin or end on the same key @@ -209,24 +123,24 @@ fn test_off_by_one() { fn test_key_collision() { let mut map = PersistentLayerMap::::new(); - map.insert(3..5, 100..110, "Layer 10".to_string()); - map.insert(5..8, 100..110, "Layer 11".to_string()); + map.insert(3..5, 100..110, "Layer 10".to_string(), true); + map.insert(5..8, 100..110, "Layer 11".to_string(), true); - map.insert(3..4, 200..210, "Layer 20".to_string()); + map.insert(3..4, 200..210, "Layer 20".to_string(), true); // Check after layer 11 - assert_eq!(map.query(2, 105), None); - assert_eq!(map.query(3, 105), Some("Layer 10".to_string())); - assert_eq!(map.query(5, 105), Some("Layer 11".to_string())); - assert_eq!(map.query(7, 105), Some("Layer 11".to_string())); - assert_eq!(map.query(8, 105), None); + assert_eq!(map.query(2, 105).1, None); + assert_eq!(map.query(3, 105).1, Some("Layer 10".to_string())); + assert_eq!(map.query(5, 105).1, Some("Layer 11".to_string())); + assert_eq!(map.query(7, 105).1, Some("Layer 11".to_string())); + assert_eq!(map.query(8, 105).1, None); // Check after layer 20 - assert_eq!(map.query(2, 205), None); - assert_eq!(map.query(3, 205), Some("Layer 20".to_string())); - assert_eq!(map.query(5, 205), Some("Layer 11".to_string())); - assert_eq!(map.query(7, 205), Some("Layer 11".to_string())); - assert_eq!(map.query(8, 205), None); + assert_eq!(map.query(2, 205).1, None); + assert_eq!(map.query(3, 205).1, Some("Layer 20".to_string())); + assert_eq!(map.query(5, 205).1, Some("Layer 11".to_string())); + assert_eq!(map.query(7, 205).1, Some("Layer 11".to_string())); + assert_eq!(map.query(8, 205).1, None); } /// Test when rectangles have nontrivial height and possibly overlap @@ -235,45 +149,45 @@ fn test_persistent_overlapping() { let mut map = PersistentLayerMap::::new(); // Add 3 key-disjoint layers with varying LSN ranges - map.insert(1..2, 100..200, "Layer 1".to_string()); - map.insert(4..5, 110..200, "Layer 2".to_string()); - map.insert(7..8, 120..300, "Layer 3".to_string()); + map.insert(1..2, 100..200, "Layer 1".to_string(), true); + map.insert(4..5, 110..200, "Layer 2".to_string(), true); + map.insert(7..8, 120..300, "Layer 3".to_string(), true); // Add wide and short layer - map.insert(0..9, 130..199, "Layer 4".to_string()); + map.insert(0..9, 130..199, "Layer 4".to_string(), true); // Add wide layer taller than some - map.insert(0..9, 140..201, "Layer 5".to_string()); + map.insert(0..9, 140..201, "Layer 5".to_string(), true); // Add wide layer taller than all - map.insert(0..9, 150..301, "Layer 6".to_string()); + map.insert(0..9, 150..301, "Layer 6".to_string(), true); // After layer 4 insertion - assert_eq!(map.query(0, 135), Some("Layer 4".to_string())); - assert_eq!(map.query(1, 135), Some("Layer 1".to_string())); - assert_eq!(map.query(2, 135), Some("Layer 4".to_string())); - assert_eq!(map.query(4, 135), Some("Layer 2".to_string())); - assert_eq!(map.query(5, 135), Some("Layer 4".to_string())); - assert_eq!(map.query(7, 135), Some("Layer 3".to_string())); - assert_eq!(map.query(8, 135), Some("Layer 4".to_string())); + assert_eq!(map.query(0, 135).1, Some("Layer 4".to_string())); + assert_eq!(map.query(1, 135).1, Some("Layer 1".to_string())); + assert_eq!(map.query(2, 135).1, Some("Layer 4".to_string())); + assert_eq!(map.query(4, 135).1, Some("Layer 2".to_string())); + assert_eq!(map.query(5, 135).1, Some("Layer 4".to_string())); + assert_eq!(map.query(7, 135).1, Some("Layer 3".to_string())); + assert_eq!(map.query(8, 135).1, Some("Layer 4".to_string())); // After layer 5 insertion - assert_eq!(map.query(0, 145), Some("Layer 5".to_string())); - assert_eq!(map.query(1, 145), Some("Layer 5".to_string())); - assert_eq!(map.query(2, 145), Some("Layer 5".to_string())); - assert_eq!(map.query(4, 145), Some("Layer 5".to_string())); - assert_eq!(map.query(5, 145), Some("Layer 5".to_string())); - assert_eq!(map.query(7, 145), Some("Layer 3".to_string())); - assert_eq!(map.query(8, 145), Some("Layer 5".to_string())); + assert_eq!(map.query(0, 145).1, Some("Layer 5".to_string())); + assert_eq!(map.query(1, 145).1, Some("Layer 5".to_string())); + assert_eq!(map.query(2, 145).1, Some("Layer 5".to_string())); + assert_eq!(map.query(4, 145).1, Some("Layer 5".to_string())); + assert_eq!(map.query(5, 145).1, Some("Layer 5".to_string())); + assert_eq!(map.query(7, 145).1, Some("Layer 3".to_string())); + assert_eq!(map.query(8, 145).1, Some("Layer 5".to_string())); // After layer 6 insertion - assert_eq!(map.query(0, 155), Some("Layer 6".to_string())); - assert_eq!(map.query(1, 155), Some("Layer 6".to_string())); - assert_eq!(map.query(2, 155), Some("Layer 6".to_string())); - assert_eq!(map.query(4, 155), Some("Layer 6".to_string())); - assert_eq!(map.query(5, 155), Some("Layer 6".to_string())); - assert_eq!(map.query(7, 155), Some("Layer 6".to_string())); - assert_eq!(map.query(8, 155), Some("Layer 6".to_string())); + assert_eq!(map.query(0, 155).1, Some("Layer 6".to_string())); + assert_eq!(map.query(1, 155).1, Some("Layer 6".to_string())); + assert_eq!(map.query(2, 155).1, Some("Layer 6".to_string())); + assert_eq!(map.query(4, 155).1, Some("Layer 6".to_string())); + assert_eq!(map.query(5, 155).1, Some("Layer 6".to_string())); + assert_eq!(map.query(7, 155).1, Some("Layer 6".to_string())); + assert_eq!(map.query(8, 155).1, Some("Layer 6".to_string())); } /// Layer map that supports: @@ -292,15 +206,18 @@ pub struct RetroactiveLayerMap { /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds. /// A value of None means we want to delete this item. - buffer: BTreeMap<(u64, u64, i128, i128), Option>, + buffer: BTreeMap<(u64, u64, i128, i128, bool), Option>, /// All current layers. This is not used for search. Only to make rebuilds easier. - layers: BTreeMap<(u64, u64, i128, i128), Value>, + layers: BTreeMap<(u64, u64, i128, i128, bool), Value>, } -impl std::fmt::Debug for RetroactiveLayerMap { +impl std::fmt::Debug for RetroactiveLayerMap { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "RetroactiveLayerMap: head: {:?}", self.map) + f.debug_struct("RetroactiveLayerMap") + .field("buffer", &self.buffer) + .field("layers", &self.layers) + .finish() } } @@ -319,22 +236,28 @@ impl RetroactiveLayerMap { } } - pub fn insert(self: &mut Self, key: Range, lsn: Range, value: Value) { + pub fn insert( + self: &mut Self, + key: Range, + lsn: Range, + value: Value, + is_image: bool, + ) { self.buffer.insert( - (lsn.start, lsn.end, key.start, key.end), + (lsn.start, lsn.end, key.start, key.end, is_image), Some(value.clone()), ); } - pub fn remove(self: &mut Self, key: Range, lsn: Range) { + pub fn remove(self: &mut Self, key: Range, lsn: Range, is_image: bool) { self.buffer - .insert((lsn.start, lsn.end, key.start, key.end), None); + .insert((lsn.start, lsn.end, key.start, key.end, is_image), None); } pub fn rebuild(self: &mut Self) { // Find the first LSN that needs to be rebuilt let rebuild_since: u64 = match self.buffer.iter().next() { - Some(((lsn_start, _, _, _), _)) => lsn_start.clone(), + Some(((lsn_start, _, _, _, _), _)) => lsn_start.clone(), None => return, // No need to rebuild if buffer is empty }; @@ -359,11 +282,15 @@ impl RetroactiveLayerMap { // Rebuild self.map.trim(&rebuild_since); - for ((lsn_start, lsn_end, key_start, key_end), layer) in - self.layers.range((rebuild_since, 0, 0, 0)..) + for ((lsn_start, lsn_end, key_start, key_end, is_image), layer) in + self.layers.range((rebuild_since, 0, 0, 0, false)..) { - self.map - .insert(*key_start..*key_end, *lsn_start..*lsn_end, layer.clone()); + self.map.insert( + *key_start..*key_end, + *lsn_start..*lsn_end, + layer.clone(), + *is_image, + ); } } @@ -371,7 +298,7 @@ impl RetroactiveLayerMap { self.map.trim(&0); } - pub fn query(self: &Self, key: i128, lsn: u64) -> Option { + pub fn query(self: &Self, key: i128, lsn: u64) -> (Option, Option) { if !self.buffer.is_empty() { panic!("rebuild pls") } @@ -379,16 +306,37 @@ impl RetroactiveLayerMap { self.map.query(key, lsn) } - pub fn get_coverage( - self: &Self, - lsn: u64, - ) -> Option<&RedBlackTreeMapSync>> { + pub fn get_version(self: &Self, lsn: u64) -> Option<&LatestLayerMap> { if !self.buffer.is_empty() { panic!("rebuild pls") } - self.map.get_coverage(lsn) + self.map.get_version(lsn) } + + pub fn iter(self: &Self) -> impl '_ + Iterator { + if !self.buffer.is_empty() { + panic!("rebuild pls") + } + + self.layers.iter().map(|(_, v)| v.clone()) + } +} + +#[test] +fn test_retroactive_regression_1() { + let mut map = RetroactiveLayerMap::new(); + + map.insert( + 0..21267647932558653966460912964485513215, + 23761336..23761457, + "sdfsdfs".to_string(), + false, + ); + + map.rebuild(); + + assert_eq!(map.query(100, 23761457).0, Some("sdfsdfs".to_string())); } #[test] @@ -396,29 +344,29 @@ fn test_retroactive_simple() { let mut map = RetroactiveLayerMap::new(); // Append some images in increasing LSN order - map.insert(0..5, 100..101, "Image 1".to_string()); - map.insert(3..9, 110..111, "Image 2".to_string()); - map.insert(4..6, 120..121, "Image 3".to_string()); - map.insert(8..9, 120..121, "Image 4".to_string()); + map.insert(0..5, 100..101, "Image 1".to_string(), true); + map.insert(3..9, 110..111, "Image 2".to_string(), true); + map.insert(4..6, 120..121, "Image 3".to_string(), true); + map.insert(8..9, 120..121, "Image 4".to_string(), true); // Add a delta layer out of order - map.insert(2..5, 105..106, "Delta 1".to_string()); + map.insert(2..5, 105..106, "Delta 1".to_string(), true); // Rebuild so we can start querying map.rebuild(); // Query key 4 - assert_eq!(map.query(4, 90), None); - assert_eq!(map.query(4, 102), Some("Image 1".to_string())); - assert_eq!(map.query(4, 107), Some("Delta 1".to_string())); - assert_eq!(map.query(4, 115), Some("Image 2".to_string())); - assert_eq!(map.query(4, 125), Some("Image 3".to_string())); + assert_eq!(map.query(4, 90).1, None); + assert_eq!(map.query(4, 102).1, Some("Image 1".to_string())); + assert_eq!(map.query(4, 107).1, Some("Delta 1".to_string())); + assert_eq!(map.query(4, 115).1, Some("Image 2".to_string())); + assert_eq!(map.query(4, 125).1, Some("Image 3".to_string())); // Remove Image 3 - map.remove(4..6, 120..121); + map.remove(4..6, 120..121, true); map.rebuild(); // Check deletion worked - assert_eq!(map.query(4, 125), Some("Image 2".to_string())); - assert_eq!(map.query(8, 125), Some("Image 4".to_string())); + assert_eq!(map.query(4, 125).1, Some("Image 2".to_string())); + assert_eq!(map.query(8, 125).1, Some("Image 4".to_string())); } diff --git a/pageserver/src/tenant/coverage.rs b/pageserver/src/tenant/coverage.rs new file mode 100644 index 0000000000..cc56b31bfe --- /dev/null +++ b/pageserver/src/tenant/coverage.rs @@ -0,0 +1,104 @@ +use std::ops::Range; + +// TODO the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. It also runs a bit faster but +// results are not the same on some tests. +use rpds::RedBlackTreeMapSync; + +pub struct Coverage { + /// Mapping key to the latest layer (if any) until the next key. + /// We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. + head: RedBlackTreeMapSync>, +} + +impl Default for Coverage { + fn default() -> Self { + Self::new() + } +} + +impl Coverage { + pub fn new() -> Self { + Self { + head: RedBlackTreeMapSync::default(), + } + } + + /// Helper function to subdivide the key range without changing any values + fn add_node(self: &mut Self, key: i128) { + let value = match self.head.range(..=key).last() { + Some((_, Some(v))) => Some(v.clone()), + Some((_, None)) => None, + None => None, + }; + self.head.insert_mut(key, value); + } + + pub fn insert(self: &mut Self, key: Range, lsn: Range, value: Value) { + // NOTE The order of the following lines is important!! + + // Add nodes at endpoints + self.add_node(key.start); + self.add_node(key.end); + + // Raise the height where necessary + // + // NOTE This loop is worst case O(N), but amortized O(log N) in the special + // case when rectangles have no height. In practice I don't think we'll see + // the kind of layer intersections needed to trigger O(N) behavior. If we + // do it can be fixed using lazy propagation. + let mut to_update = Vec::new(); + let mut to_remove = Vec::new(); + let mut prev_covered = false; + for (k, node) in self.head.range(key.clone()) { + let needs_cover = match node { + None => true, + Some((h, _)) => h < &lsn.end, + }; + if needs_cover { + match prev_covered { + true => to_remove.push(k.clone()), + false => to_update.push(k.clone()), + } + } + prev_covered = needs_cover; + } + if !prev_covered { + to_remove.push(key.end); + } + for k in to_update { + self.head + .insert_mut(k.clone(), Some((lsn.end.clone(), value.clone()))); + } + for k in to_remove { + self.head.remove_mut(&k); + } + } + + pub fn query(self: &Self, key: i128) -> Option { + self.head + .range(..=key) + .rev() + .next()? + .1 + .as_ref() + .map(|(_, v)| v.clone()) + } + + pub fn range( + self: &Self, + key: Range, + ) -> impl '_ + Iterator)> { + self.head + .range(key) + .map(|(k, v)| (k.clone(), v.as_ref().map(|x| x.1.clone()))) + } + + pub fn clone(self: &Self) -> Self { + Self { + head: self.head.clone(), + } + } +} diff --git a/pageserver/src/tenant/latest_layer_map.rs b/pageserver/src/tenant/latest_layer_map.rs new file mode 100644 index 0000000000..b67f7ea3bf --- /dev/null +++ b/pageserver/src/tenant/latest_layer_map.rs @@ -0,0 +1,60 @@ +use std::ops::Range; + +use super::coverage::Coverage; + +pub struct LatestLayerMap { + image_coverage: Coverage, + delta_coverage: Coverage, +} + +impl Default for LatestLayerMap { + fn default() -> Self { + Self { + image_coverage: Coverage::default(), + delta_coverage: Coverage::default(), + } + } +} + +impl LatestLayerMap { + pub fn insert( + self: &mut Self, + key: Range, + lsn: Range, + value: Value, + is_image: bool, + ) { + if is_image { + self.image_coverage.insert(key, lsn, value); + } else { + self.delta_coverage.insert(key.clone(), lsn.clone(), value); + } + } + + pub fn query(self: &Self, key: i128) -> (Option, Option) { + let delta = self.delta_coverage.query(key); + let image = self.image_coverage.query(key); + (delta, image) + } + + pub fn image_coverage( + self: &Self, + key: Range, + ) -> impl '_ + Iterator)> { + self.image_coverage.range(key) + } + + pub fn delta_coverage( + self: &Self, + key: Range, + ) -> impl '_ + Iterator)> { + self.delta_coverage.range(key) + } + + pub fn clone(self: &Self) -> Self { + Self { + image_coverage: self.image_coverage.clone(), + delta_coverage: self.delta_coverage.clone(), + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index fb1b67a27e..205d6751ec 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -54,181 +54,14 @@ pub struct LayerMap { /// pub frozen_layers: VecDeque>, - /// All the historic layers are kept here - historic_layers: RTree, - - /// HACK I'm experimenting with a new index to reaplace the RTree. If this - /// works out I'll clean up the struct later. + /// Index of the historic layers optimized for search index: RetroactiveLayerMap>, - images: RetroactiveLayerMap>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. l0_delta_layers: Vec>, } -struct LayerRTreeObject { - layer: Arc, - - envelope: AABB<[IntKey; 2]>, -} - -// Representation of Key as numeric type. -// We can not use native implementation of i128, because rstar::RTree -// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). -// Overflow will cause panic in debug mode and incorrect area calculation in release mode, -// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). -// By using i256 as the type, even though all the actual values would fit in i128, we can be -// sure that multiplication doesn't overflow. -// - -#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] -struct IntKey(i256); - -impl Copy for IntKey {} - -impl IntKey { - fn from(i: i128) -> Self { - IntKey(i256::from(i)) - } -} - -impl Bounded for IntKey { - fn min_value() -> Self { - IntKey(i256::MIN) - } - fn max_value() -> Self { - IntKey(i256::MAX) - } -} - -impl Signed for IntKey { - fn is_positive(&self) -> bool { - self.0 > i256::ZERO - } - fn is_negative(&self) -> bool { - self.0 < i256::ZERO - } - fn signum(&self) -> Self { - match self.0.cmp(&i256::ZERO) { - Ordering::Greater => IntKey(i256::ONE), - Ordering::Less => IntKey(-i256::ONE), - Ordering::Equal => IntKey(i256::ZERO), - } - } - fn abs(&self) -> Self { - IntKey(self.0.abs()) - } - fn abs_sub(&self, other: &Self) -> Self { - if self.0 <= other.0 { - IntKey(i256::ZERO) - } else { - IntKey(self.0 - other.0) - } - } -} - -impl Neg for IntKey { - type Output = Self; - fn neg(self) -> Self::Output { - IntKey(-self.0) - } -} - -impl Rem for IntKey { - type Output = Self; - fn rem(self, rhs: Self) -> Self::Output { - IntKey(self.0 % rhs.0) - } -} - -impl Div for IntKey { - type Output = Self; - fn div(self, rhs: Self) -> Self::Output { - IntKey(self.0 / rhs.0) - } -} - -impl Add for IntKey { - type Output = Self; - fn add(self, rhs: Self) -> Self::Output { - IntKey(self.0 + rhs.0) - } -} - -impl Sub for IntKey { - type Output = Self; - fn sub(self, rhs: Self) -> Self::Output { - IntKey(self.0 - rhs.0) - } -} - -impl Mul for IntKey { - type Output = Self; - fn mul(self, rhs: Self) -> Self::Output { - IntKey(self.0 * rhs.0) - } -} - -impl One for IntKey { - fn one() -> Self { - IntKey(i256::ONE) - } -} - -impl Zero for IntKey { - fn zero() -> Self { - IntKey(i256::ZERO) - } - fn is_zero(&self) -> bool { - self.0 == i256::ZERO - } -} - -impl Num for IntKey { - type FromStrRadixErr = ::FromStrRadixErr; - fn from_str_radix(str: &str, radix: u32) -> Result { - Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) - } -} - -impl PartialEq for LayerRTreeObject { - fn eq(&self, other: &Self) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(&self.layer, &other.layer) - } -} - -impl RTreeObject for LayerRTreeObject { - type Envelope = AABB<[IntKey; 2]>; - fn envelope(&self) -> Self::Envelope { - self.envelope - } -} - -impl LayerRTreeObject { - fn new(layer: Arc) -> Self { - let key_range = layer.get_key_range(); - let lsn_range = layer.get_lsn_range(); - - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive - ); - LayerRTreeObject { layer, envelope } - } -} - /// Return value of LayerMap::search pub struct SearchResult { pub layer: Arc, @@ -248,159 +81,40 @@ impl LayerMap { /// layer. /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Result> { - // let old = self.search_old(key, end_lsn)?; - let new = self.search_new(key, end_lsn)?; - // match (&old, &new) { - // (None, None) => {} - // (None, Some(_)) => panic!("returned Some, expected None"), - // (Some(_), None) => panic!("returned None, expected Some"), - // (Some(old), Some(new)) => { - // // TODO be more verbose and flexible - // let context = format!("query: key {}, end_lsn: {}", key, end_lsn); - // assert_eq!(old.layer.filename(), new.layer.filename(), "{}", context); - // assert_eq!(old.lsn_floor, new.lsn_floor, "{}", context); - // } - // } - return Ok(new); - } - - // HACK just testing correctness - fn search_new(&self, key: Key, end_lsn: Lsn) -> Result> { - // TODO I'm making two separate queries, which is 2x the cost, but that - // can be avoided in varous ways. Caching latest_image queries is - // probably the simplest, but combining the two data structures - // might be better. - let latest_layer = self.index.query(key.to_i128(), end_lsn.0 - 1); - let latest_image = self.images.query(key.to_i128(), end_lsn.0 - 1); - - // Check for exact match - let latest_image = if let Some(image) = latest_image { - let img_lsn = image.get_lsn_range().start; - if Lsn(img_lsn.0 + 1) == end_lsn { - return Ok(Some(SearchResult { + match self.index.query(key.to_i128(), end_lsn.0 - 1) { + (None, None) => Ok(None), + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; + Ok(Some(SearchResult { layer: image, - lsn_floor: img_lsn, - })); + lsn_floor, + })) } - - // HACK just to give back ownership of latest_image to parent scope. - // There's definitely a cleaner way to do it. - Some(image) - } else { - None - }; - - return Ok(latest_layer.map(|layer| { - // Compute lsn_floor - let mut lsn_floor = layer.get_lsn_range().start; - if let Some(image) = latest_image { - if layer.is_incremental() { - lsn_floor = std::cmp::max(lsn_floor, image.get_lsn_range().start + 1) + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + Ok(Some(SearchResult { + layer: delta, + lsn_floor, + })) + } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end > delta.get_lsn_range().end; + let image_exact_match = Lsn(img_lsn.0 + 1) == end_lsn; + if image_is_newer || image_exact_match { + Ok(Some(SearchResult { + layer: image, + lsn_floor: img_lsn, + })) + } else { + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Ok(Some(SearchResult { + layer: delta, + lsn_floor, + })) } } - SearchResult { layer, lsn_floor } - })); - } - - // HACK just testing correctness - fn search_old(&self, key: Key, end_lsn: Lsn) -> Result> { - // linear search - // Find the latest image layer that covers the given key - let mut latest_img: Option> = None; - let mut latest_img_lsn: Option = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0i128)], - [ - IntKey::from(key.to_i128()), - IntKey::from(end_lsn.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - assert!(l.get_key_range().contains(&key)); - let img_lsn = l.get_lsn_range().start; - assert!(img_lsn < end_lsn); - if Lsn(img_lsn.0 + 1) == end_lsn { - // found exact match - return Ok(Some(SearchResult { - layer: Arc::clone(l), - lsn_floor: img_lsn, - })); - } - if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { - latest_img = Some(Arc::clone(l)); - latest_img_lsn = Some(img_lsn); - } - } - - // Search the delta layers - let mut latest_delta: Option> = None; - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; - } - assert!(l.get_key_range().contains(&key)); - if l.get_lsn_range().start >= end_lsn { - info!( - "Candidate delta layer {}..{} is too new for lsn {}", - l.get_lsn_range().start, - l.get_lsn_range().end, - end_lsn - ); - } - assert!(l.get_lsn_range().start < end_lsn); - if l.get_lsn_range().end >= end_lsn { - // this layer contains the requested point in the key/lsn space. - // No need to search any further - trace!( - "found layer {} for request on {key} at {end_lsn}", - l.filename().display(), - ); - latest_delta.replace(Arc::clone(l)); - break; - } - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. - if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { - latest_delta.replace(Arc::clone(l)); - } - } else { - latest_delta.replace(Arc::clone(l)); - } - } - if let Some(l) = latest_delta { - trace!( - "found (old) layer {} for request on {key} at {end_lsn}", - l.filename().display(), - ); - let lsn_floor = std::cmp::max( - Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), - l.get_lsn_range().start, - ); - Ok(Some(SearchResult { - lsn_floor, - layer: l, - })) - } else if let Some(l) = latest_img { - trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Ok(Some(SearchResult { - lsn_floor: latest_img_lsn.unwrap(), - layer: l, - })) - } else { - trace!("no layer found for request on {key} at {end_lsn}"); - Ok(None) } } @@ -414,27 +128,19 @@ impl LayerMap { kr.start.to_i128()..kr.end.to_i128(), lr.start.0..lr.end.0, Arc::clone(&layer), + !layer.is_incremental(), ); - if !layer.is_incremental() { - self.images.insert( - kr.start.to_i128()..kr.end.to_i128(), - lr.start.0..lr.end.0, - Arc::clone(&layer), - ); - } if layer.get_key_range() == (Key::MIN..Key::MAX) { self.l0_delta_layers.push(layer.clone()); } - // TODO remove this so insert isn't slow. I need it for now for iter_historic() - self.historic_layers.insert(LayerRTreeObject::new(layer)); + NUM_ONDISK_LAYERS.inc(); } /// Must be called after a batch of insert_historic calls, before querying pub fn rebuild_index(&mut self) { self.index.rebuild(); - self.images.rebuild(); } /// @@ -445,12 +151,11 @@ impl LayerMap { pub fn remove_historic(&mut self, layer: Arc) { let kr = layer.get_key_range(); let lr = layer.get_lsn_range(); - self.index - .remove(kr.start.to_i128()..kr.end.to_i128(), lr.start.0..lr.end.0); - if !layer.is_incremental() { - self.images - .remove(kr.start.to_i128()..kr.end.to_i128(), lr.start.0..lr.end.0); - } + self.index.remove( + kr.start.to_i128()..kr.end.to_i128(), + lr.start.0..lr.end.0, + !layer.is_incremental(), + ); if layer.get_key_range() == (Key::MIN..Key::MAX) { let len_before = self.l0_delta_layers.len(); @@ -464,10 +169,7 @@ impl LayerMap { .retain(|other| !Arc::ptr_eq(other, &layer)); assert_eq!(self.l0_delta_layers.len(), len_before - 1); } - assert!(self - .historic_layers - .remove(&LayerRTreeObject::new(layer)) - .is_some()); + NUM_ONDISK_LAYERS.dec(); } @@ -480,87 +182,17 @@ impl LayerMap { key_range: &Range, lsn_range: &Range, ) -> Result { - // TODO implement using new index - let mut range_remain = key_range.clone(); - - loop { - let mut made_progress = false; - let envelope = AABB::from_corners( - [ - IntKey::from(range_remain.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(range_remain.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - let img_lsn = l.get_lsn_range().start; - if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { - made_progress = true; - let img_key_end = l.get_key_range().end; - - if img_key_end >= range_remain.end { - return Ok(true); - } - range_remain.start = img_key_end; - } - } - - if !made_progress { - return Ok(false); - } - } + todo!() } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - // TODO implement using new index - self.historic_layers.iter().map(|e| e.layer.clone()) + self.index.iter() } /// Find the last image layer that covers 'key', ignoring any image layers /// newer than 'lsn'. fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { - let use_new_method = true; - if use_new_method { - return self.images.query(key.to_i128(), lsn.0); - } - - let mut candidate_lsn = Lsn(0); - let mut candidate = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0)], - [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - - assert!(l.get_key_range().contains(&key)); - let this_lsn = l.get_lsn_range().start; - assert!(this_lsn <= lsn); - if this_lsn < candidate_lsn { - // our previous candidate was better - continue; - } - candidate_lsn = this_lsn; - candidate = Some(Arc::clone(l)); - } - - candidate + return self.index.query(key.to_i128(), lsn.0).1; } /// @@ -576,122 +208,44 @@ impl LayerMap { key_range: &Range, lsn: Lsn, ) -> Result, Option>)>> { - let use_new_method = true; - if use_new_method { - let bounds = match self.images.get_coverage(lsn.0) { - Some(x) => x, - None => return Ok(vec![]), - }; + let version = match self.index.get_version(lsn.0) { + Some(v) => v, + None => return Ok(vec![]), + }; - let start = key_range.start.to_i128(); - let end = key_range.end.to_i128(); + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); - // Initialize loop variables - let mut coverage: Vec<(Range, Option>)> = vec![]; - let mut current_key = start.clone(); - let mut current_val = match bounds.range(..=start).rev().next() { - Some((_, Some((_, v)))) => Some(v.clone()), - Some((_, None)) => None, - None => None, - }; + // Initialize loop variables + let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut current_key = start.clone(); + let mut current_val = version.query(start).1; - // Loop through the change events and push intervals - for (change_key, change_val) in bounds.range(start..end) { - let kr = Key::from_i128(current_key)..Key::from_i128(*change_key); - coverage.push((kr, current_val.take())); - current_key = change_key.clone(); - current_val = change_val.as_ref().map(|(_, v)| v.clone()); - } - - // Add the final interval - let kr = Key::from_i128(current_key)..Key::from_i128(end); + // Loop through the change events and push intervals + for (change_key, change_val) in version.image_coverage(start..end) { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); - - return Ok(coverage); + current_key = change_key.clone(); + current_val = change_val.clone(); } - let mut points = vec![key_range.start]; - let envelope = AABB::from_corners( - [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], - [ - IntKey::from(key_range.end.to_i128()), - IntKey::from(lsn.0 as i128), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - assert!(l.get_lsn_range().start <= lsn); - let range = l.get_key_range(); - if key_range.contains(&range.start) { - points.push(l.get_key_range().start); - } - if key_range.contains(&range.end) { - points.push(l.get_key_range().end); - } - } - points.push(key_range.end); + // Add the final interval + let kr = Key::from_i128(current_key)..Key::from_i128(end); + coverage.push((kr, current_val.take())); - points.sort(); - points.dedup(); - - // Ok, we now have a list of "interesting" points in the key space - - // For each range between the points, find the latest image - let mut start = *points.first().unwrap(); - let mut ranges = Vec::new(); - for end in points[1..].iter() { - let img = self.find_latest_image(start, lsn); - - ranges.push((start..*end, img)); - - start = *end; - } - Ok(ranges) + return Ok(coverage); } /// Count how many L1 delta layers there are that overlap with the /// given key and LSN range. pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { - // TODO implement using new index - let mut result = 0; - if lsn_range.start >= lsn_range.end { - return Ok(0); - } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; - } - assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); - assert!(range_overlaps(&l.get_key_range(), key_range)); - - // We ignore level0 delta layers. Unless the whole keyspace fits - // into one partition - if !range_eq(key_range, &(Key::MIN..Key::MAX)) - && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) - { - continue; - } - - result += 1; - } - Ok(result) + // TODO write recursive function: + // 1. Get delta coverage + // 2. Recurse on each part + // + // This will count some layers twice, but we're interested in the max number of + // stacked deltas anyway (parallel deltas don't matter) so that's fine. + todo!() } /// Return all L0 delta layers diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 8dafcab124..104e8e2ca5 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -151,3 +151,11 @@ pub trait Layer: Send + Sync { /// Dump summary of the contents of the layer to stdout fn dump(&self, verbose: bool) -> Result<()>; } + +impl std::fmt::Debug for dyn Layer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layer") + .field("filename", &self.filename()) + .finish() + } +}