From cd0fdada82de336e1118874710190add6c780ef1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 28 Mar 2022 16:15:28 +0300 Subject: [PATCH] Use R-Tree for layer map --- pageserver/Cargo.toml | 1 + pageserver/src/layered_repository.rs | 2 +- .../src/layered_repository/layer_map.rs | 193 +++++++++++------- pageserver/src/repository.rs | 13 ++ 4 files changed, 136 insertions(+), 73 deletions(-) diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index de22d0dd77..84f657b811 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -44,6 +44,7 @@ nix = "0.23" once_cell = "1.8.0" crossbeam-utils = "0.8.5" fail = "0.5.0" +rstar = "0.9.2" rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] } async-compression = {version = "0.3", features = ["zstd", "tokio"]} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 837298a10e..0be556ec28 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1928,7 +1928,7 @@ impl LayeredTimeline { l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(l)); + layers_to_remove.push(Arc::clone(&l)); } // Actually delete the layers from disk and remove them from the map. diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index c4929a6173..87da7d3170 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -16,6 +16,7 @@ use crate::layered_repository::InMemoryLayer; use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; +use rstar::{RTree, RTreeObject, AABB}; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; @@ -51,14 +52,42 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// All the historic layers are kept here + historic_layers: RTree, - /// TODO: This is a placeholder implementation of a data structure - /// to hold information about all the layer files on disk and in - /// S3. Currently, it's just a vector and all operations perform a - /// linear scan over it. That obviously becomes slow as the - /// number of layers grows. I'm imagining that an R-tree or some - /// other 2D data structure would be the long-term solution here. - historic_layers: Vec>, + /// L0 layers has key range: (Key::MIN..Key::MAX) and locating them using R-Tree search is very inefficient + l0_layers: Vec>, +} + +struct LayerEnvelope { + layer: Arc, +} + +impl PartialEq for LayerEnvelope { + fn eq(&self, other: &Self) -> bool { + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. + #[allow(clippy::vtable_address_comparisons)] + Arc::ptr_eq(&self.layer, &other.layer) + } + fn ne(&self, other: &Self) -> bool { + !self.eq(other) + } +} + +impl Eq for LayerEnvelope {} + +impl RTreeObject for LayerEnvelope { + type Envelope = AABB<[i128; 2]>; + fn envelope(&self) -> Self::Envelope { + let key_range = self.layer.get_key_range(); + let lsn_range = self.layer.get_lsn_range(); + AABB::from_corners( + [key_range.start.to_i128(), lsn_range.start.0 as i128], + [key_range.end.to_i128(), lsn_range.end.0 as i128 - 1], // end is exlusive + ) + } } /// Return value of LayerMap::search @@ -84,19 +113,21 @@ impl LayerMap { // Find the latest image layer that covers the given key let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [key.to_i128(), 0i128], + [key.to_i128(), end_lsn.0 as i128 - 1], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } + assert!(l.get_key_range().contains(&key)); let img_lsn = l.get_lsn_range().start; - - if img_lsn >= end_lsn { - // too new - continue; - } + assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match return Ok(Some(SearchResult { @@ -112,19 +143,16 @@ impl LayerMap { // Search the delta layers let mut latest_delta: Option> = None; - for l in self.historic_layers.iter() { + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - - if l.get_lsn_range().start >= end_lsn { - // too new - continue; - } - + assert!(l.get_key_range().contains(&key)); + assert!(l.get_lsn_range().start < end_lsn); if l.get_lsn_range().end >= end_lsn { // this layer contains the requested point in the key/lsn space. // No need to search any further @@ -182,7 +210,10 @@ impl LayerMap { /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - self.historic_layers.push(layer); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + self.l0_layers.push(layer.clone()); + } + self.historic_layers.insert(LayerEnvelope { layer }); NUM_ONDISK_LAYERS.inc(); } @@ -193,17 +224,21 @@ impl LayerMap { /// #[allow(dead_code)] pub fn remove_historic(&mut self, layer: Arc) { - let len_before = self.historic_layers.len(); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + let len_before = self.l0_layers.len(); - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - self.historic_layers - .retain(|other| !Arc::ptr_eq(other, &layer)); - - assert_eq!(self.historic_layers.len(), len_before - 1); + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. + #[allow(clippy::vtable_address_comparisons)] + self.l0_layers.retain(|other| !Arc::ptr_eq(other, &layer)); + assert_eq!(self.l0_layers.len(), len_before - 1); + } + assert!(self + .historic_layers + .remove(&LayerEnvelope { layer }) + .is_some()); NUM_ONDISK_LAYERS.dec(); } @@ -224,13 +259,20 @@ impl LayerMap { loop { let mut made_progress = false; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [range_remain.start.to_i128(), lsn.0 as i128], + [range_remain.end.to_i128(), disk_consistent_lsn.0 as i128], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } let img_lsn = l.get_lsn_range().start; - if !l.is_incremental() - && l.get_key_range().contains(&range_remain.start) + if l.get_key_range().contains(&range_remain.start) && img_lsn > lsn && img_lsn < disk_consistent_lsn { @@ -266,8 +308,8 @@ impl LayerMap { } */ - pub fn iter_historic_layers(&self) -> std::slice::Iter> { - self.historic_layers.iter() + pub fn iter_historic_layers(&self) -> Box> + '_> { + Box::new(self.historic_layers.iter().map(|e| e.layer.clone())) } /// Find the last image layer that covers 'key', ignoring any image layers @@ -275,19 +317,19 @@ impl LayerMap { fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { let mut candidate_lsn = Lsn(0); let mut candidate = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners([key.to_i128(), 0], [key.to_i128(), lsn.0 as i128]); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); let this_lsn = l.get_lsn_range().start; - if this_lsn > lsn { - continue; - } + assert!(this_lsn <= lsn); if this_lsn < candidate_lsn { // our previous candidate was better continue; @@ -315,10 +357,16 @@ impl LayerMap { let mut points: Vec; points = vec![key_range.start]; - for l in self.historic_layers.iter() { - if l.get_lsn_range().start > lsn { - continue; - } + let envelope = AABB::from_corners( + [key_range.start.to_i128(), 0], + [key_range.end.to_i128(), lsn.0 as i128], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + assert!(l.get_lsn_range().start <= lsn); let range = l.get_key_range(); if key_range.contains(&range.start) { points.push(l.get_key_range().start); @@ -351,16 +399,27 @@ impl LayerMap { /// given key and LSN range. pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [key_range.start.to_i128(), lsn_range.start.0 as i128], + [key_range.end.to_i128(), lsn_range.end.0 as i128 - 1], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } if !range_overlaps(&l.get_lsn_range(), lsn_range) { - continue; - } - if !range_overlaps(&l.get_key_range(), key_range) { - continue; + info!( + "l.lsn_range={:?}, lsn_range={:?}", + l.get_lsn_range(), + lsn_range + ); } + assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); + assert!(range_overlaps(&l.get_key_range(), key_range)); // We ignore level0 delta layers. Unless the whole keyspace fits // into one partition @@ -377,25 +436,15 @@ impl LayerMap { /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - let mut deltas = Vec::new(); - for l in self.historic_layers.iter() { - if !l.is_incremental() { - continue; - } - if l.get_key_range() != (Key::MIN..Key::MAX) { - continue; - } - deltas.push(Arc::clone(l)); - } - Ok(deltas) + Ok(self.l0_layers.clone()) } /// debugging function to print out the contents of the layer map #[allow(unused)] pub fn dump(&self) -> Result<()> { println!("Begin dump LayerMap"); - for layer in self.historic_layers.iter() { - layer.dump()?; + for e in self.historic_layers.iter() { + e.layer.dump()?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index b960e037be..e2808e23bd 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -29,6 +29,19 @@ pub struct Key { } impl Key { + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. + /// As far as Zenith is not supporting tablespace (because of lack of access to local file system), + /// we can assume that only some predefined namespace OIDs are used which can fit in u16 + pub fn to_i128(&self) -> i128 { + assert!(self.field2 < 0xFFFFF || self.field2 == !0u32); + (((self.field1 & 0x7f) as i128) << 120) + | ((self.field5 as i128) << 112) + | (((self.field2 & 0xFFFF) as i128) << 96) + | ((self.field3 as i128) << 64) + | ((self.field4 as i128) << 32) + | self.field6 as i128 + } + pub fn next(&self) -> Key { self.add(1) }