diff --git a/Cargo.lock b/Cargo.lock index 0579d381cc..ddb10352b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,6 +37,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "amplify_num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d" + [[package]] name = "ansi_term" version = "0.12.1" @@ -135,6 +141,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-polyfill" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +dependencies = [ + "critical-section", +] + [[package]] name = "atty" version = "0.2.14" @@ -212,6 +227,21 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bare-metal" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" +dependencies = [ + "rustc_version 0.2.3", +] + +[[package]] +name = "bare-metal" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" + [[package]] name = "base64" version = "0.13.0" @@ -250,6 +280,18 @@ dependencies = [ "which", ] +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + [[package]] name = "bitflags" version = "1.3.2" @@ -528,6 +570,18 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +[[package]] +name = 
"cortex-m" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" +dependencies = [ + "bare-metal 0.2.5", + "bitfield", + "embedded-hal", + "volatile-register", +] + [[package]] name = "cpp_demangle" version = "0.3.5" @@ -552,7 +606,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -600,6 +654,18 @@ dependencies = [ "itertools", ] +[[package]] +name = "critical-section" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" +dependencies = [ + "bare-metal 1.0.0", + "cfg-if", + "cortex-m", + "riscv", +] + [[package]] name = "crossbeam-channel" version = "0.5.6" @@ -844,6 +910,16 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" +[[package]] +name = "embedded-hal" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" +dependencies = [ + "nb 0.1.3", + "void", +] + [[package]] name = "encoding_rs" version = "0.8.31" @@ -1165,6 +1241,15 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1174,6 +1259,19 @@ dependencies = [ "ahash", ] +[[package]] +name = 
"heapless" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version 0.4.0", + "spin 0.9.4", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.3.3" @@ -1491,6 +1589,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + [[package]] name = "lock_api" version = "0.4.7" @@ -1649,6 +1753,21 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nb" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" +dependencies = [ + "nb 1.0.0", +] + +[[package]] +name = "nb" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" + [[package]] name = "nix" version = "0.23.1" @@ -1716,6 +1835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1828,6 +1948,7 @@ checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" name = "pageserver" version = "0.1.0" dependencies = [ + "amplify_num", "anyhow", "async-stream", "async-trait", @@ -1852,6 +1973,7 @@ dependencies = [ "itertools", "metrics", "nix", + "num-traits", "once_cell", "postgres", "postgres-protocol", @@ -1861,6 +1983,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "rstar", "scopeguard", "serde", "serde_json", @@ -2515,12 +2638,33 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", + "spin 0.5.2", "untrusted", "web-sys", "winapi", ] +[[package]] +name = "riscv" 
+version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" +dependencies = [ + "bare-metal 1.0.0", + "bit_field", + "riscv-target", +] + +[[package]] +name = "riscv-target" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "routerify" version = "3.0.0" @@ -2534,6 +2678,17 @@ dependencies = [ "regex", ] +[[package]] +name = "rstar" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rstest" version = "0.12.0" @@ -2543,7 +2698,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "rustc_version", + "rustc_version 0.4.0", "syn", ] @@ -2565,7 +2720,7 @@ dependencies = [ "log", "rusoto_credential", "rusoto_signature", - "rustc_version", + "rustc_version 0.4.0", "serde", "serde_json", "tokio", @@ -2623,7 +2778,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rusoto_credential", - "rustc_version", + "rustc_version 0.4.0", "serde", "sha2 0.9.9", "tokio", @@ -2641,13 +2796,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" 
dependencies = [ - "semver", + "semver 1.0.13", ] [[package]] @@ -2800,12 +2964,27 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + [[package]] name = "semver" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.142" @@ -2999,6 +3178,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -3675,6 +3863,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcell" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" + [[package]] name = "vcpkg" version = "0.2.15" @@ -3687,6 +3881,21 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "volatile-register" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" +dependencies = [ + "vcell", +] + [[package]] name = "wal_craft" version = "0.1.0" @@ -3952,6 +4161,7 @@ dependencies = [ "regex-syntax", "scopeguard", "serde", + "stable_deref_trait", "syn", "time 0.3.12", "tokio", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 85ece97d9b..1ec7ec4f98 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -54,6 +54,9 @@ once_cell = "1.13.0" crossbeam-utils = "0.8.5" fail = "0.5.0" git-version = "0.3.5" +rstar = "0.9.3" +num-traits = "0.2.15" +amplify_num = "0.4.1" postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index cfcc87a2ed..0c2fedd7d5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -24,6 +24,19 @@ pub struct Key { pub const KEY_SIZE: usize = 18; impl Key { + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. 
+ /// As long as Neon does not support tablespaces (because of lack of access to the local file system), + /// we can assume that only some predefined tablespace OIDs are used, which fit in a u16 + pub fn to_i128(&self) -> i128 { + assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + (((self.field1 & 0xf) as i128) << 120) + | (((self.field2 & 0xFFFF) as i128) << 104) + | ((self.field3 as i128) << 72) + | ((self.field4 as i128) << 40) + | ((self.field5 as i128) << 32) + | self.field6 as i128 + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index 892000c20b..57c5be91a4 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -713,7 +713,7 @@ impl DeltaLayerWriter { for buf in block_buf.blocks { file.write_all(buf.as_ref())?; } - + assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 let summary = Summary { magic: DELTA_FILE_MAGIC, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 8abeebf54c..495833e3ae 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -15,9 +15,15 @@ use crate::repository::Key; use crate::tenant::inmemory_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; +use amplify_num::i256; use anyhow::Result; +use num_traits::identities::{One, Zero}; +use num_traits::{Bounded, Num, Signed}; +use rstar::{RTree, RTreeObject, AABB}; +use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; +use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; @@ -47,14 +53,163 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// All the historic layers are kept here + historic_layers: RTree, - /// TODO: This is a placeholder implementation of a data structure - ///
to hold information about all the layer files on disk and in - /// S3. Currently, it's just a vector and all operations perform a - /// linear scan over it. That obviously becomes slow as the - /// number of layers grows. I'm imagining that an R-tree or some - /// other 2D data structure would be the long-term solution here. - historic_layers: Vec>, + /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. + /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + l0_delta_layers: Vec>, +} + +struct LayerRTreeObject { + layer: Arc, +} + +// Representation of Key as a numeric type. +// We cannot use the native i128 type, because rstar::RTree +// doesn't properly handle integer overflow during area calculation: sum(Xi*Yi). +// Overflow will cause a panic in debug mode and an incorrect area calculation in release mode, +// which leads to a non-optimally balanced R-Tree (but doesn't affect the correctness of the R-Tree). +// By using i256 as the type, even though all the actual values would fit in i128, we can be +// sure that multiplication doesn't overflow.
+// + +#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] +struct IntKey(i256); + +impl Copy for IntKey {} + +impl IntKey { + fn from(i: i128) -> Self { + IntKey(i256::from(i)) + } +} + +impl Bounded for IntKey { + fn min_value() -> Self { + IntKey(i256::MIN) + } + fn max_value() -> Self { + IntKey(i256::MAX) + } +} + +impl Signed for IntKey { + fn is_positive(&self) -> bool { + self.0 > i256::ZERO + } + fn is_negative(&self) -> bool { + self.0 < i256::ZERO + } + fn signum(&self) -> Self { + match self.0.cmp(&i256::ZERO) { + Ordering::Greater => IntKey(i256::ONE), + Ordering::Less => IntKey(-i256::ONE), + Ordering::Equal => IntKey(i256::ZERO), + } + } + fn abs(&self) -> Self { + IntKey(self.0.abs()) + } + fn abs_sub(&self, other: &Self) -> Self { + if self.0 <= other.0 { + IntKey(i256::ZERO) + } else { + IntKey(self.0 - other.0) + } + } +} + +impl Neg for IntKey { + type Output = Self; + fn neg(self) -> Self::Output { + IntKey(-self.0) + } +} + +impl Rem for IntKey { + type Output = Self; + fn rem(self, rhs: Self) -> Self::Output { + IntKey(self.0 % rhs.0) + } +} + +impl Div for IntKey { + type Output = Self; + fn div(self, rhs: Self) -> Self::Output { + IntKey(self.0 / rhs.0) + } +} + +impl Add for IntKey { + type Output = Self; + fn add(self, rhs: Self) -> Self::Output { + IntKey(self.0 + rhs.0) + } +} + +impl Sub for IntKey { + type Output = Self; + fn sub(self, rhs: Self) -> Self::Output { + IntKey(self.0 - rhs.0) + } +} + +impl Mul for IntKey { + type Output = Self; + fn mul(self, rhs: Self) -> Self::Output { + IntKey(self.0 * rhs.0) + } +} + +impl One for IntKey { + fn one() -> Self { + IntKey(i256::ONE) + } +} + +impl Zero for IntKey { + fn zero() -> Self { + IntKey(i256::ZERO) + } + fn is_zero(&self) -> bool { + self.0 == i256::ZERO + } +} + +impl Num for IntKey { + type FromStrRadixErr = ::FromStrRadixErr; + fn from_str_radix(str: &str, radix: u32) -> Result { + Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) + } +} + +impl PartialEq for 
LayerRTreeObject { + fn eq(&self, other: &Self) -> bool { + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. + #[allow(clippy::vtable_address_comparisons)] + Arc::ptr_eq(&self.layer, &other.layer) + } +} + +impl RTreeObject for LayerRTreeObject { + type Envelope = AABB<[IntKey; 2]>; + fn envelope(&self) -> Self::Envelope { + let key_range = self.layer.get_key_range(); + let lsn_range = self.layer.get_lsn_range(); + AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive + ) + } } /// Return value of LayerMap::search @@ -80,19 +235,24 @@ impl LayerMap { // Find the latest image layer that covers the given key let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0i128)], + [ + IntKey::from(key.to_i128()), + IntKey::from(end_lsn.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } + assert!(l.get_key_range().contains(&key)); let img_lsn = l.get_lsn_range().start; - - if img_lsn >= end_lsn { - // too new - continue; - } + assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match return Ok(Some(SearchResult { @@ -108,19 +268,24 @@ impl LayerMap { // Search the delta layers let mut latest_delta: Option> = None; - for l in self.historic_layers.iter() { + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + 
{ + let l = &e.layer; if !l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); if l.get_lsn_range().start >= end_lsn { - // too new - continue; + info!( + "Candidate delta layer {}..{} is too new for lsn {}", + l.get_lsn_range().start, + l.get_lsn_range().end, + end_lsn + ); } - + assert!(l.get_lsn_range().start < end_lsn); if l.get_lsn_range().end >= end_lsn { // this layer contains the requested point in the key/lsn space. // No need to search any further @@ -170,7 +335,10 @@ impl LayerMap { /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - self.historic_layers.push(layer); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + self.l0_delta_layers.push(layer.clone()); + } + self.historic_layers.insert(LayerRTreeObject { layer }); NUM_ONDISK_LAYERS.inc(); } @@ -180,17 +348,22 @@ impl LayerMap { /// This should be called when the corresponding file on disk has been deleted. /// pub fn remove_historic(&mut self, layer: Arc) { - let len_before = self.historic_layers.len(); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + let len_before = self.l0_delta_layers.len(); - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - self.historic_layers - .retain(|other| !Arc::ptr_eq(other, &layer)); - - assert_eq!(self.historic_layers.len(), len_before - 1); + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
+ #[allow(clippy::vtable_address_comparisons)] + self.l0_delta_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + assert_eq!(self.l0_delta_layers.len(), len_before - 1); + } + assert!(self + .historic_layers + .remove(&LayerRTreeObject { layer }) + .is_some()); NUM_ONDISK_LAYERS.dec(); } @@ -207,15 +380,26 @@ impl LayerMap { loop { let mut made_progress = false; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [ + IntKey::from(range_remain.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(range_remain.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } let img_lsn = l.get_lsn_range().start; - if !l.is_incremental() - && l.get_key_range().contains(&range_remain.start) - && lsn_range.contains(&img_lsn) - { + if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; @@ -232,8 +416,8 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> impl Iterator> { - self.historic_layers.iter() + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic_layers.iter().map(|e| e.layer.clone()) } /// Find the last image layer that covers 'key', ignoring any image layers @@ -241,19 +425,22 @@ impl LayerMap { fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { let mut candidate_lsn = Lsn(0); let mut candidate = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0)], + [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + 
assert!(l.get_key_range().contains(&key)); let this_lsn = l.get_lsn_range().start; - if this_lsn > lsn { - continue; - } + assert!(this_lsn <= lsn); if this_lsn < candidate_lsn { // our previous candidate was better continue; @@ -279,10 +466,19 @@ impl LayerMap { lsn: Lsn, ) -> Result, Option>)>> { let mut points = vec![key_range.start]; - for l in self.historic_layers.iter() { - if l.get_lsn_range().start > lsn { - continue; - } + let envelope = AABB::from_corners( + [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], + [ + IntKey::from(key_range.end.to_i128()), + IntKey::from(lsn.0 as i128), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + assert!(l.get_lsn_range().start <= lsn); let range = l.get_key_range(); if key_range.contains(&range.start) { points.push(l.get_key_range().start); @@ -315,16 +511,29 @@ impl LayerMap { /// given key and LSN range. pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; - for l in self.historic_layers.iter() { + if lsn_range.start >= lsn_range.end { + return Ok(0); + } + let envelope = AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !range_overlaps(&l.get_lsn_range(), lsn_range) { - continue; - } - if !range_overlaps(&l.get_key_range(), key_range) { - continue; - } + assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); + assert!(range_overlaps(&l.get_key_range(), key_range)); // We ignore level0 delta layers. 
Unless the whole keyspace fits // into one partition @@ -341,17 +550,7 @@ impl LayerMap { /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - let mut deltas = Vec::new(); - for l in self.historic_layers.iter() { - if !l.is_incremental() { - continue; - } - if l.get_key_range() != (Key::MIN..Key::MAX) { - continue; - } - deltas.push(Arc::clone(l)); - } - Ok(deltas) + Ok(self.l0_delta_layers.clone()) } /// debugging function to print out the contents of the layer map @@ -370,8 +569,8 @@ impl LayerMap { } println!("historic_layers:"); - for layer in self.historic_layers.iter() { - layer.dump(verbose)?; + for e in self.historic_layers.iter() { + e.layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b80d023c7f..6de1d44876 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2050,7 +2050,7 @@ impl Timeline { l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(l)); + layers_to_remove.push(Arc::clone(&l)); } // Actually delete the layers from disk and remove them from the map. diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py new file mode 100644 index 0000000000..d71fb6d12c --- /dev/null +++ b/test_runner/performance/test_layer_map.py @@ -0,0 +1,39 @@ +import time + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Benchmark searching the layer map, when there are a lot of small layer files. +# +def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): + + env = neon_env_builder.init_start() + n_iters = 10 + n_records = 100000 + + # We want to have a lot of layer files to exercise the layer map. Make + # gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files.
+ tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "100 m", + "gc_horizon": "1048576", + "checkpoint_distance": "8192", + "compaction_period": "1 s", + "compaction_threshold": "1", + "compaction_target_size": "8192", + } + ) + + env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) + pg = env.postgres.create_start("test_layer_map", tenant_id=tenant) + cur = pg.connect().cursor() + cur.execute("create table t(x integer)") + for i in range(n_iters): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") + time.sleep(1) + + cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (n_iters * n_records,) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3670ca5fea..f37a42945e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,7 +30,7 @@ memchr = { version = "2", features = ["std", "use_std"] } nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } -num-traits = { version = "0.2", features = ["i128", "std"] } +num-traits = { version = "0.2", features = ["i128", "libm", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } @@ -38,6 +38,7 @@ regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", 
"unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +stable_deref_trait = { version = "1", features = ["alloc", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }