diff --git a/Cargo.lock b/Cargo.lock index c9acb882eb..88b6ef93bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,6 +66,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "archery" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +dependencies = [ + "static_assertions", +] + [[package]] name = "arrayvec" version = "0.7.2" @@ -2345,6 +2354,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "rpds", "rstar", "scopeguard", "serde", @@ -3154,6 +3164,15 @@ dependencies = [ "regex", ] +[[package]] +name = "rpds" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" +dependencies = [ + "archery", +] + [[package]] name = "rstar" version = "0.9.3" @@ -3624,6 +3643,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "storage_broker" version = "0.1.0" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 61c7b8ae97..6ada2c5cb1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -69,6 +69,7 @@ remote_storage = { path = "../libs/remote_storage" } tenant_size_model = { path = "../libs/tenant_size_model" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +rpds = "0.12.0" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index a99580bc65..b7400eed8d 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,5 +1,7 @@ use anyhow::Result; +use num_traits::ToPrimitive; use pageserver::repository::{Key, Value}; +use pageserver::tenant::bst_layer_map::BSTLM; use pageserver::tenant::filename::{DeltaFileName, ImageFileName}; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::Layer; @@ -243,23 +245,67 @@ fn bench_from_captest_env(c: &mut Criterion) { // too long processing layer map queries. fn bench_from_real_project(c: &mut Criterion) { // TODO consider compressing this file + + // Init layer map + let now = Instant::now(); let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + println!("Finished layer map init in {:?}", now.elapsed()); + + // Init bst layer map with the same layers + let now = Instant::now(); + let mut bstlm = BSTLM::new(); + let mut sorted_layers: Vec<_> = layer_map.iter_historic_layers().collect(); + sorted_layers.sort_by(|a, b| a.get_lsn_range().start.cmp(&b.get_lsn_range().start)); + for layer in sorted_layers { + if layer.is_incremental() { + // TODO check if they're sorted + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + + bstlm.insert( + kr.start.to_i128(), + kr.end.to_i128(), + lr.start.0, + format!("Layer {}", lr.start.0), + ); + } else { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + + bstlm.insert( + kr.start.to_i128(), + kr.end.to_i128(), + lr.start.0, + format!("Layer {}", lr.start.0), + ); + } + } + println!("Finished bst init in {:?}", now.elapsed()); + + // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Test with uniform query pattern - c.bench_function("real_map_uniform_queries", |b| { + // Define and name the benchmark function + let mut group = c.benchmark_group("real_map_uniform_queries"); + group.bench_function("current_code", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1).unwrap(); } }); }); + group.bench_function("persistent_bst", |b| { + b.iter(|| { + for q in queries.clone().into_iter() { + bstlm.query(q.0.to_i128(), q.1 .0); + } + }); + }); + group.finish(); } // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { - let mut layer_map = LayerMap::default(); - // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // // TODO This code is pretty slow and runs even if we're only running other @@ -267,39 +313,62 @@ fn bench_sequential(c: &mut Criterion) { // Putting it inside the `bench_function` closure is not a solution // because then it runs multiple times during warmup. let now = Instant::now(); + let mut layer_map = LayerMap::default(); for i in 0..100_000 { - // TODO try inserting a super-wide layer in between every 10 to reflect - // what often happens with L1 layers that include non-rel changes. - // Maybe do that as a separate test. let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); let layer = DummyImage { key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(10 * i), + lsn: Lsn(i), }; layer_map.insert_historic(Arc::new(layer)); } + println!("Finished layer map init in {:?}", now.elapsed()); - // Manually measure runtime without criterion because criterion - // has a minimum sample size of 10 and I don't want to run it 10 times. - println!("Finished init in {:?}", now.elapsed()); + // Init bst layer map with the same layers + let now = Instant::now(); + let mut bstlm = BSTLM::new(); + for layer in layer_map.iter_historic_layers() { + if layer.is_incremental() { + panic!("AAA"); + } else { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + + bstlm.insert( + kr.start.to_i128(), + kr.end.to_i128(), + lr.start.0, + format!("Layer {}", lr.start.0), + ); + } + } + println!("Finished bst init in {:?}", now.elapsed()); // Choose 100 uniformly random queries let rng = &mut StdRng::seed_from_u64(1); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map) - .choose_multiple(rng, 1) + .choose_multiple(rng, 100) .copied() .collect(); // Define and name the benchmark function - c.bench_function("sequential_uniform_queries", |b| { - // Run the search queries + let mut group = c.benchmark_group("sequential_uniform_queries"); + group.bench_function("current_code", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1).unwrap(); } }); }); + group.bench_function("persistent_bst", |b| { + b.iter(|| { + for q in queries.clone().into_iter() { + bstlm.query(q.0.to_i128(), q.1 .0); + } + }); + }); + group.finish(); } criterion_group!(group_1, bench_from_captest_env); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 981c049111..530cbbd7b3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -73,6 +73,7 @@ use utils::{ mod blob_io; pub mod block_io; +pub mod bst_layer_map; mod delta_layer; mod disk_btree; pub(crate) mod ephemeral_file; diff --git a/pageserver/src/tenant/bst_layer_map.rs b/pageserver/src/tenant/bst_layer_map.rs new file mode 100644 index 0000000000..23c1a832aa --- /dev/null +++ b/pageserver/src/tenant/bst_layer_map.rs @@ -0,0 +1,111 @@ +use std::collections::BTreeMap; + +// TODO the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. See if it's better. +use rpds::RedBlackTreeMap; + +/// Layer map implemented using persistent/immutable binary search tree. +/// This implementation is only good enough to run benchmarks, +/// so it's missing unnecessary details. Values are String for now. +pub struct BSTLM { + /// Mapping key to the latest layer (if any) until the next key + head: RedBlackTreeMap>, + + /// All previous states of `self.head` + historic: BTreeMap>>, +} + +impl std::fmt::Debug for BSTLM { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let head_vec: Vec<_> = self.head.iter().collect(); + write!(f, "BSTLM: head: {:?}", head_vec) + } +} + +impl BSTLM { + pub fn new() -> Self { + BSTLM { + head: RedBlackTreeMap::default(), + historic: BTreeMap::default(), + } + } + + pub fn insert(self: &mut Self, key_begin: i128, key_end: i128, lsn: u64, value: String) { + // TODO check for off-by-one errors + + // It's only a persistent map, not a retroactive one + if let Some(last_entry) = self.historic.iter().rev().next() { + let last_lsn = last_entry.0; + if lsn == *last_lsn { + // TODO there are edge cases to take care of + } + if lsn < *last_lsn { + todo!("smaller lsn not implemented yet") + } + } + + // NOTE The order of the following lines is important!! + + // Preserve information after right endpoint + let value_at_end = match self.head.range(0..key_end).last() { + Some((_, Some(v))) => Some(v.clone()), + Some((_, None)) => None, + None => None, + }; + self.head.insert_mut(key_end, value_at_end); + + // Insert the left endpoint + self.head.insert_mut(key_begin, Some(value.clone())); + + // Cover the inside of the interval + let to_remove: Vec<_> = self + .head + .range((key_begin + 1)..key_end) + .map(|(k, _)| k.clone()) + .collect(); + for key in to_remove { + self.head.remove_mut(&key); + } + + // Remember history. Clone is O(1) + self.historic.insert(lsn, self.head.clone()); + } + + pub fn query(self: &Self, key: i128, lsn: u64) -> Option<&String> { + // TODO check for off-by-one errors + + let version = self.historic.range(0..=lsn).rev().next()?.1; + version.range(0..=key).rev().next()?.1.as_ref() + } + + // TODO Add API for delta layers with lsn range. + // The easy solution is to only store images, and then from every + // image point to deltas on top of it. There might be something + // nicer but we have this solution as backup. +} + +#[test] +fn test_bstlm() { + let mut bstlm = BSTLM::new(); + bstlm.insert(0, 5, 100, "Layer 1".to_string()); + dbg!(&bstlm); + bstlm.insert(3, 9, 110, "Layer 2".to_string()); + dbg!(&bstlm); + bstlm.insert(5, 6, 120, "Layer 3".to_string()); + dbg!(&bstlm); + + // After Layer 1 insertion + assert_eq!(bstlm.query(1, 105), Some(&"Layer 1".to_string())); + assert_eq!(bstlm.query(4, 105), Some(&"Layer 1".to_string())); + + // After Layer 2 insertion + assert_eq!(bstlm.query(4, 115), Some(&"Layer 2".to_string())); + assert_eq!(bstlm.query(8, 115), Some(&"Layer 2".to_string())); + assert_eq!(bstlm.query(11, 115), None); + + // After Layer 3 insertion + assert_eq!(bstlm.query(4, 125), Some(&"Layer 2".to_string())); + assert_eq!(bstlm.query(5, 125), Some(&"Layer 3".to_string())); + + assert_eq!(bstlm.query(7, 125), Some(&"Layer 2".to_string())); +}