Compare commits

...

8 Commits

Author SHA1 Message Date
Konstantin Knizhnik
2c94775ca8 Use big integer arithmetic to avoid multiplication overflow 2022-03-31 12:53:41 +03:00
Konstantin Knizhnik
a31bc88e46 Fix race condition in image layer
refer #1439
2022-03-30 18:18:33 +03:00
Konstantin Knizhnik
8b137dfcd2 Check for valid LSN range in count_deltas 2022-03-30 12:55:01 +03:00
Konstantin Knizhnik
258b1d4935 Aff num-traits crate 2022-03-30 11:35:30 +03:00
Konstantin Knizhnik
a1f6bbb076 Use wrapping multiplication in R-Tree 2022-03-30 08:16:20 +03:00
Konstantin Knizhnik
b1a9f292b2 Fix tests 2022-03-28 20:19:49 +03:00
Konstantin Knizhnik
22614f74b1 Remove redundant LayerEnveloper::ne method 2022-03-28 16:39:04 +03:00
Konstantin Knizhnik
cd0fdada82 Use R-Tree for layer map 2022-03-28 16:15:28 +03:00
6 changed files with 355 additions and 85 deletions

99
Cargo.lock generated
View File

@@ -61,6 +61,18 @@ dependencies = [
"backtrace",
]
[[package]]
name = "as-slice"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45403b49e3954a4b8428a0ac21a4b7afadccf92bfd96273f1a58cd4812496ae0"
dependencies = [
"generic-array 0.12.4",
"generic-array 0.13.3",
"generic-array 0.14.5",
"stable_deref_trait",
]
[[package]]
name = "async-compression"
version = "0.3.12"
@@ -240,7 +252,7 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
dependencies = [
"generic-array",
"generic-array 0.14.5",
]
[[package]]
@@ -555,7 +567,7 @@ version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a"
dependencies = [
"generic-array",
"generic-array 0.14.5",
"subtle",
]
@@ -565,7 +577,7 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1d1a86f49236c215f271d40892d5fc950490551400b02ef360692c29815c714"
dependencies = [
"generic-array",
"generic-array 0.14.5",
"subtle",
]
@@ -642,7 +654,7 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
dependencies = [
"generic-array",
"generic-array 0.14.5",
]
[[package]]
@@ -855,6 +867,24 @@ dependencies = [
"slab",
]
[[package]]
name = "generic-array"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd"
dependencies = [
"typenum",
]
[[package]]
name = "generic-array"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f797e67af32588215eaaab8327027ee8e71b9dd0b2b26996aedf20c030fce309"
dependencies = [
"typenum",
]
[[package]]
name = "generic-array"
version = "0.14.5"
@@ -935,6 +965,15 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "hash32"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4041af86e63ac4298ce40e5cca669066e75b6f1aa3390fe2561ffa5e1d9f4cc"
dependencies = [
"byteorder",
]
[[package]]
name = "hashbrown"
version = "0.9.1"
@@ -953,6 +992,18 @@ dependencies = [
"ahash 0.7.6",
]
[[package]]
name = "heapless"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "634bd4d29cbf24424d0a4bfcbf80c6960129dc24424752a7d1d1390607023422"
dependencies = [
"as-slice",
"generic-array 0.14.5",
"hash32",
"stable_deref_trait",
]
[[package]]
name = "hermit-abi"
version = "0.1.19"
@@ -1371,6 +1422,17 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-bigint"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.44"
@@ -1471,6 +1533,8 @@ dependencies = [
"lazy_static",
"log",
"nix",
"num-bigint 0.4.3",
"num-traits",
"once_cell",
"postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
@@ -1478,6 +1542,7 @@ dependencies = [
"postgres_ffi",
"rand",
"regex",
"rstar",
"rust-s3",
"scopeguard",
"serde",
@@ -1524,6 +1589,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "pdqselect"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec91767ecc0a0bbe558ce8c9da33c068066c57ecc8bb8477ef8c1ad3ef77c27"
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -2014,6 +2085,18 @@ dependencies = [
"regex",
]
[[package]]
name = "rstar"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc6fc513b8c3853e43a0c3f909ded14ffa82e5170c9c5f6fb175f9c85c8a433"
dependencies = [
"heapless",
"num-traits",
"pdqselect",
"smallvec",
]
[[package]]
name = "rust-ini"
version = "0.17.0"
@@ -2313,7 +2396,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "692ca13de57ce0613a363c8c2f1de925adebc81b04c923ac60c5488bb44abe4b"
dependencies = [
"chrono",
"num-bigint",
"num-bigint 0.2.6",
"num-traits",
]
@@ -2351,6 +2434,12 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "stringprep"
version = "0.1.2"

View File

@@ -44,6 +44,9 @@ nix = "0.23"
once_cell = "1.8.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
rstar = "0.9.2"
num-traits = "0.2.14"
num-bigint = "0.4.3"
rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
async-compression = {version = "0.3", features = ["zstd", "tokio"]}

View File

@@ -1928,7 +1928,7 @@ impl LayeredTimeline {
l.filename().display(),
l.is_incremental(),
);
layers_to_remove.push(Arc::clone(l));
layers_to_remove.push(Arc::clone(&l));
}
// Actually delete the layers from disk and remove them from the map.

View File

@@ -267,7 +267,7 @@ impl ImageLayer {
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let mut inner = self.inner.write().unwrap();
if inner.book.is_none() {
if !inner.loaded {
self.load_inner(&mut inner)?;
} else {
// Another thread loaded it while we were not holding the lock.
@@ -279,7 +279,6 @@ impl ImageLayer {
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
drop(inner);
}
}

View File

@@ -16,8 +16,14 @@ use crate::layered_repository::InMemoryLayer;
use crate::repository::Key;
use anyhow::Result;
use lazy_static::lazy_static;
use num_bigint::BigInt;
use num_traits::cast::ToPrimitive;
use num_traits::identities::{One, Zero};
use num_traits::{Bounded, Num, Signed};
use rstar::{RTree, RTreeObject, AABB};
use std::collections::VecDeque;
use std::ops::Range;
use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
use std::sync::Arc;
use tracing::*;
use zenith_metrics::{register_int_gauge, IntGauge};
@@ -51,14 +57,146 @@ pub struct LayerMap {
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
/// All the historic layers are kept here
historic_layers: RTree<LayerEnvelope>,
/// TODO: This is a placeholder implementation of a data structure
/// to hold information about all the layer files on disk and in
/// S3. Currently, it's just a vector and all operations perform a
/// linear scan over it. That obviously becomes slow as the
/// number of layers grows. I'm imagining that an R-tree or some
/// other 2D data structure would be the long-term solution here.
historic_layers: Vec<Arc<dyn Layer>>,
/// L0 layers has key range: (Key::MIN..Key::MAX) and locating them using R-Tree search is very inefficient.
/// So l) layers are also pushed in l0_layers vector.
l0_layers: Vec<Arc<dyn Layer>>,
}
struct LayerEnvelope {
layer: Arc<dyn Layer>,
}
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Debug)]
struct IntKey(i128);
impl Bounded for IntKey {
fn min_value() -> Self {
IntKey(i128::MIN)
}
fn max_value() -> Self {
IntKey(i128::MAX)
}
}
impl Signed for IntKey {
fn is_positive(&self) -> bool {
self.0 > 0
}
fn is_negative(&self) -> bool {
self.0 < 0
}
fn signum(&self) -> Self {
IntKey(self.0.signum())
}
fn abs(&self) -> Self {
IntKey(self.0.abs())
}
fn abs_sub(&self, other: &Self) -> Self {
IntKey(self.0.abs_sub(&other.0))
}
}
impl Neg for IntKey {
type Output = Self;
fn neg(self) -> Self::Output {
IntKey(-self.0)
}
}
impl Rem for IntKey {
type Output = Self;
fn rem(self, rhs: Self) -> Self::Output {
IntKey(self.0 % rhs.0)
}
}
impl Div for IntKey {
type Output = Self;
fn div(self, rhs: Self) -> Self::Output {
IntKey(self.0 / rhs.0)
}
}
impl Add for IntKey {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
IntKey(self.0.wrapping_add(rhs.0))
}
}
impl Sub for IntKey {
type Output = Self;
fn sub(self, rhs: Self) -> Self::Output {
IntKey(self.0.wrapping_sub(rhs.0))
}
}
impl Mul for IntKey {
type Output = Self;
fn mul(self, rhs: Self) -> Self::Output {
// Use big integer arithmetic to avoid overflow.
// We have to cacluate sqrt of the result to be able to store result of operation as i128.
// As far as multiplication is used by R-Tree to calculate area and distance, it should not be a problem.
IntKey(
(BigInt::from(self.0) * BigInt::from(rhs.0))
.sqrt()
.to_i128()
.unwrap(),
)
}
}
impl One for IntKey {
fn one() -> Self {
IntKey(1)
}
}
impl Zero for IntKey {
fn zero() -> Self {
IntKey(0)
}
fn is_zero(&self) -> bool {
self.0 == 0
}
}
impl Num for IntKey {
type FromStrRadixErr = <i128 as Num>::FromStrRadixErr;
fn from_str_radix(str: &str, radix: u32) -> Result<Self, Self::FromStrRadixErr> {
Ok(IntKey(i128::from_str_radix(str, radix)?))
}
}
impl PartialEq for LayerEnvelope {
fn eq(&self, other: &Self) -> bool {
// FIXME: ptr_eq might fail to return true for 'dyn'
// references. Clippy complains about this. In practice it
// seems to work, the assertion below would be triggered
// otherwise but this ought to be fixed.
#[allow(clippy::vtable_address_comparisons)]
Arc::ptr_eq(&self.layer, &other.layer)
}
}
impl RTreeObject for LayerEnvelope {
type Envelope = AABB<[IntKey; 2]>;
fn envelope(&self) -> Self::Envelope {
let key_range = self.layer.get_key_range();
let lsn_range = self.layer.get_lsn_range();
AABB::from_corners(
[
IntKey(key_range.start.to_i128()),
IntKey(lsn_range.start.0 as i128),
],
[
IntKey(key_range.end.to_i128() - 1),
IntKey(lsn_range.end.0 as i128 - 1),
], // end is exlusive
)
}
}
/// Return value of LayerMap::search
@@ -84,19 +222,21 @@ impl LayerMap {
// Find the latest image layer that covers the given key
let mut latest_img: Option<Arc<dyn Layer>> = None;
let mut latest_img_lsn: Option<Lsn> = None;
for l in self.historic_layers.iter() {
let envelope = AABB::from_corners(
[IntKey(key.to_i128()), IntKey(0i128)],
[IntKey(key.to_i128()), IntKey(end_lsn.0 as i128 - 1)],
);
for e in self
.historic_layers
.locate_in_envelope_intersecting(&envelope)
{
let l = &e.layer;
if l.is_incremental() {
continue;
}
if !l.get_key_range().contains(&key) {
continue;
}
assert!(l.get_key_range().contains(&key));
let img_lsn = l.get_lsn_range().start;
if img_lsn >= end_lsn {
// too new
continue;
}
assert!(img_lsn < end_lsn);
if Lsn(img_lsn.0 + 1) == end_lsn {
// found exact match
return Ok(Some(SearchResult {
@@ -112,19 +252,16 @@ impl LayerMap {
// Search the delta layers
let mut latest_delta: Option<Arc<dyn Layer>> = None;
for l in self.historic_layers.iter() {
for e in self
.historic_layers
.locate_in_envelope_intersecting(&envelope)
{
let l = &e.layer;
if !l.is_incremental() {
continue;
}
if !l.get_key_range().contains(&key) {
continue;
}
if l.get_lsn_range().start >= end_lsn {
// too new
continue;
}
assert!(l.get_key_range().contains(&key));
assert!(l.get_lsn_range().start < end_lsn);
if l.get_lsn_range().end >= end_lsn {
// this layer contains the requested point in the key/lsn space.
// No need to search any further
@@ -182,7 +319,10 @@ impl LayerMap {
/// Insert an on-disk layer
///
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
self.historic_layers.push(layer);
if layer.get_key_range() == (Key::MIN..Key::MAX) {
self.l0_layers.push(layer.clone());
}
self.historic_layers.insert(LayerEnvelope { layer });
NUM_ONDISK_LAYERS.inc();
}
@@ -193,17 +333,21 @@ impl LayerMap {
///
#[allow(dead_code)]
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
let len_before = self.historic_layers.len();
if layer.get_key_range() == (Key::MIN..Key::MAX) {
let len_before = self.l0_layers.len();
// FIXME: ptr_eq might fail to return true for 'dyn'
// references. Clippy complains about this. In practice it
// seems to work, the assertion below would be triggered
// otherwise but this ought to be fixed.
#[allow(clippy::vtable_address_comparisons)]
self.historic_layers
.retain(|other| !Arc::ptr_eq(other, &layer));
assert_eq!(self.historic_layers.len(), len_before - 1);
// FIXME: ptr_eq might fail to return true for 'dyn'
// references. Clippy complains about this. In practice it
// seems to work, the assertion below would be triggered
// otherwise but this ought to be fixed.
#[allow(clippy::vtable_address_comparisons)]
self.l0_layers.retain(|other| !Arc::ptr_eq(other, &layer));
assert_eq!(self.l0_layers.len(), len_before - 1);
}
assert!(self
.historic_layers
.remove(&LayerEnvelope { layer })
.is_some());
NUM_ONDISK_LAYERS.dec();
}
@@ -224,13 +368,23 @@ impl LayerMap {
loop {
let mut made_progress = false;
for l in self.historic_layers.iter() {
let envelope = AABB::from_corners(
[IntKey(range_remain.start.to_i128()), IntKey(lsn.0 as i128)],
[
IntKey(range_remain.end.to_i128() - 1),
IntKey(disk_consistent_lsn.0 as i128),
],
);
for e in self
.historic_layers
.locate_in_envelope_intersecting(&envelope)
{
let l = &e.layer;
if l.is_incremental() {
continue;
}
let img_lsn = l.get_lsn_range().start;
if !l.is_incremental()
&& l.get_key_range().contains(&range_remain.start)
if l.get_key_range().contains(&range_remain.start)
&& img_lsn > lsn
&& img_lsn < disk_consistent_lsn
{
@@ -266,8 +420,8 @@ impl LayerMap {
}
*/
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
self.historic_layers.iter()
pub fn iter_historic_layers(&self) -> Box<dyn Iterator<Item = Arc<dyn Layer>> + '_> {
Box::new(self.historic_layers.iter().map(|e| e.layer.clone()))
}
/// Find the last image layer that covers 'key', ignoring any image layers
@@ -275,19 +429,22 @@ impl LayerMap {
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
let mut candidate_lsn = Lsn(0);
let mut candidate = None;
for l in self.historic_layers.iter() {
let envelope = AABB::from_corners(
[IntKey(key.to_i128()), IntKey(0)],
[IntKey(key.to_i128()), IntKey(lsn.0 as i128)],
);
for e in self
.historic_layers
.locate_in_envelope_intersecting(&envelope)
{
let l = &e.layer;
if l.is_incremental() {
continue;
}
if !l.get_key_range().contains(&key) {
continue;
}
assert!(l.get_key_range().contains(&key));
let this_lsn = l.get_lsn_range().start;
if this_lsn > lsn {
continue;
}
assert!(this_lsn <= lsn);
if this_lsn < candidate_lsn {
// our previous candidate was better
continue;
@@ -315,10 +472,16 @@ impl LayerMap {
let mut points: Vec<Key>;
points = vec![key_range.start];
for l in self.historic_layers.iter() {
if l.get_lsn_range().start > lsn {
continue;
}
let envelope = AABB::from_corners(
[IntKey(key_range.start.to_i128()), IntKey(0)],
[IntKey(key_range.end.to_i128()), IntKey(lsn.0 as i128)],
);
for e in self
.historic_layers
.locate_in_envelope_intersecting(&envelope)
{
let l = &e.layer;
assert!(l.get_lsn_range().start <= lsn);
let range = l.get_key_range();
if key_range.contains(&range.start) {
points.push(l.get_key_range().start);
@@ -351,16 +514,29 @@ impl LayerMap {
/// given key and LSN range.
pub fn count_deltas(&self, key_range: &Range<Key>, lsn_range: &Range<Lsn>) -> Result<usize> {
let mut result = 0;
for l in self.historic_layers.iter() {
if lsn_range.start >= lsn_range.end {
return Ok(0);
}
let envelope = AABB::from_corners(
[
IntKey(key_range.start.to_i128()),
IntKey(lsn_range.start.0 as i128),
],
[
IntKey(key_range.end.to_i128() - 1),
IntKey(lsn_range.end.0 as i128 - 1),
],
);
for e in self
.historic_layers
.locate_in_envelope_intersecting(&envelope)
{
let l = &e.layer;
if !l.is_incremental() {
continue;
}
if !range_overlaps(&l.get_lsn_range(), lsn_range) {
continue;
}
if !range_overlaps(&l.get_key_range(), key_range) {
continue;
}
assert!(range_overlaps(&l.get_lsn_range(), lsn_range));
assert!(range_overlaps(&l.get_key_range(), key_range));
// We ignore level0 delta layers. Unless the whole keyspace fits
// into one partition
@@ -377,25 +553,15 @@ impl LayerMap {
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
let mut deltas = Vec::new();
for l in self.historic_layers.iter() {
if !l.is_incremental() {
continue;
}
if l.get_key_range() != (Key::MIN..Key::MAX) {
continue;
}
deltas.push(Arc::clone(l));
}
Ok(deltas)
Ok(self.l0_layers.clone())
}
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub fn dump(&self) -> Result<()> {
println!("Begin dump LayerMap");
for layer in self.historic_layers.iter() {
layer.dump()?;
for e in self.historic_layers.iter() {
e.layer.dump()?;
}
println!("End dump LayerMap");
Ok(())

View File

@@ -29,6 +29,19 @@ pub struct Key {
}
impl Key {
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As far as Zenith is not supporting tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 < 0xFFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0xf) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
| ((self.field4 as i128) << 40)
| ((self.field5 as i128) << 32)
| self.field6 as i128
}
pub fn next(&self) -> Key {
self.add(1)
}
@@ -583,7 +596,7 @@ mod tests {
use lazy_static::lazy_static;
lazy_static! {
static ref TEST_KEY: Key = Key::from_array(hex!("112222222233333333444444445500000001"));
static ref TEST_KEY: Key = Key::from_array(hex!("110000222233333333444444445500000001"));
}
#[test]
@@ -626,9 +639,9 @@ mod tests {
use std::str::from_utf8;
#[allow(non_snake_case)]
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
let TEST_KEY_A: Key = Key::from_hex("110000222233333333444444445500000001").unwrap();
#[allow(non_snake_case)]
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
let TEST_KEY_B: Key = Key::from_hex("110000222233333333444444445500000002").unwrap();
// Insert a value on the timeline
writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?;