Bunch of fixes, smarter iterator, metrics exporter

This commit is contained in:
Heikki Linnakangas
2025-05-06 14:53:36 +03:00
parent 44269fcd5e
commit 977bc09d2a
15 changed files with 401 additions and 59 deletions

View File

@@ -182,7 +182,7 @@ fn next_recurse<'e, V: Value>(
assert!(path.len() < min_key.len());
use std::cmp::Ordering;
let mut key_byte = match path.as_slice().cmp(&min_key[0..path.len()]) {
let mut min_key_byte = match path.as_slice().cmp(&min_key[0..path.len()]) {
Ordering::Less => {
rnode.read_unlock_or_restart()?;
return Ok(None);
@@ -191,17 +191,11 @@ fn next_recurse<'e, V: Value>(
Ordering::Greater => 0,
};
loop {
// TODO: This iterates through all possible byte values. That's pretty unoptimal.
// Implement a function to scan the node for next key value efficiently.
match rnode.find_child_or_value_or_restart(key_byte)? {
match rnode.find_next_child_or_value_or_restart(min_key_byte)? {
None => {
if key_byte == u8::MAX {
return Ok(None);
}
key_byte += 1;
continue;
}
Some(ChildOrValue::Child(child_ref)) => {
return Ok(None);
},
Some((key_byte, ChildOrValue::Child(child_ref))) => {
let path_len = path.len();
path.push(key_byte);
let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
@@ -212,9 +206,9 @@ fn next_recurse<'e, V: Value>(
return Ok(None);
}
path.truncate(path_len);
key_byte += 1;
}
Some(ChildOrValue::Value(vptr)) => {
min_key_byte = key_byte + 1;
},
Some((key_byte, ChildOrValue::Value(vptr))) => {
path.push(key_byte);
assert_eq!(path.len(), min_key.len());
// safety: It's OK to return a ref of the pointer because we checked the version
@@ -222,7 +216,7 @@ fn next_recurse<'e, V: Value>(
// as long as the epoch is pinned.
let v = unsafe { vptr.as_ref().unwrap() };
return Ok(Some(v))
}
},
}
}
}

View File

@@ -100,8 +100,6 @@ pub struct NodeInternal16<V> {
child_ptrs: [NodePtr<V>; 16],
}
const INVALID_CHILD_INDEX: u8 = u8::MAX;
#[repr(C)]
pub struct NodeInternal48<V> {
tag: NodeTag,
@@ -114,6 +112,7 @@ pub struct NodeInternal48<V> {
child_indexes: [u8; 256],
child_ptrs: [NodePtr<V>; 48],
}
const INVALID_CHILD_INDEX: u8 = u8::MAX;
#[repr(C)]
pub struct NodeInternal256<V> {
@@ -339,6 +338,35 @@ impl<V: Value> NodePtr<V> {
}
}
/// Looks up the entry with the smallest key byte that is `>= key_byte`
/// in the node this pointer refers to.
///
/// The lookup is delegated to the concrete node variant. Internal nodes
/// yield `ChildOrValuePtr::Child`, leaf nodes yield
/// `ChildOrValuePtr::Value`. Returns `None` when the variant's lookup finds
/// no entry at or after `key_byte`.
pub(crate) fn find_next_child_or_value(&self, key_byte: u8) -> Option<(u8, ChildOrValuePtr<V>)> {
    match self.variant() {
        // Internal nodes: the payload is a pointer to a child node.
        NodeVariant::Internal4(n) => {
            let (found_byte, child) = n.find_next_child(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Child(child)))
        }
        NodeVariant::Internal16(n) => {
            let (found_byte, child) = n.find_next_child(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Child(child)))
        }
        NodeVariant::Internal48(n) => {
            let (found_byte, child) = n.find_next_child(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Child(child)))
        }
        NodeVariant::Internal256(n) => {
            let (found_byte, child) = n.find_next_child(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Child(child)))
        }
        // Leaf nodes: the payload is the stored value itself.
        NodeVariant::Leaf4(n) => {
            let (found_byte, value) = n.find_next_leaf_value(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Value(value)))
        }
        NodeVariant::Leaf16(n) => {
            let (found_byte, value) = n.find_next_leaf_value(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Value(value)))
        }
        NodeVariant::Leaf48(n) => {
            let (found_byte, value) = n.find_next_leaf_value(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Value(value)))
        }
        NodeVariant::Leaf256(n) => {
            let (found_byte, value) = n.find_next_leaf_value(key_byte)?;
            Some((found_byte, ChildOrValuePtr::Value(value)))
        }
    }
}
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
match self.variant_mut() {
NodeVariantMut::Internal4(n) => n.truncate_prefix(new_prefix_len),
@@ -512,6 +540,27 @@ impl<V: Value> NodeInternal4<V> {
None
}
/// Returns the child with the smallest key byte that is `>= min_key`,
/// together with that key byte, or `None` if no populated slot qualifies.
///
/// Only the first `num_children` slots are examined. They are scanned in
/// full, so no ordering of `child_keys` is relied upon.
fn find_next_child(&self, min_key: u8) -> Option<(u8, NodePtr<V>)> {
    let populated = self.num_children as usize;
    (0..populated)
        .filter(|&slot| self.child_keys[slot] >= min_key)
        // `min_by_key` keeps the first of several equal minima, matching a
        // scan that only replaces the candidate on a strictly smaller key.
        .min_by_key(|&slot| self.child_keys[slot])
        .map(|slot| (self.child_keys[slot], self.child_ptrs[slot]))
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
@@ -584,6 +633,27 @@ impl<V: Value> NodeInternal16<V> {
None
}
/// Returns the child with the smallest key byte that is `>= min_key`,
/// together with that key byte, or `None` if no populated slot qualifies.
///
/// Only the first `num_children` slots are examined. They are scanned in
/// full, so no ordering of `child_keys` is relied upon.
fn find_next_child(&self, min_key: u8) -> Option<(u8, NodePtr<V>)> {
    let populated = self.num_children as usize;
    (0..populated)
        .filter(|&slot| self.child_keys[slot] >= min_key)
        // `min_by_key` keeps the first of several equal minima, matching a
        // scan that only replaces the candidate on a strictly smaller key.
        .min_by_key(|&slot| self.child_keys[slot])
        .map(|slot| (self.child_keys[slot], self.child_ptrs[slot]))
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
@@ -657,6 +727,16 @@ impl<V: Value> NodeInternal48<V> {
}
}
/// Returns the first child at a key byte `>= min_key`, together with that
/// key byte, or `None` if there is none.
///
/// Key bytes are probed in ascending order through `child_indexes`; entries
/// equal to `INVALID_CHILD_INDEX` are skipped.
fn find_next_child(&self, min_key: u8) -> Option<(u8, NodePtr<V>)> {
    (min_key..=u8::MAX).find_map(|key| {
        let slot = self.child_indexes[key as usize];
        (slot != INVALID_CHILD_INDEX).then(|| (key, self.child_ptrs[slot as usize]))
    })
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
let idx = self.child_indexes[key_byte as usize];
if idx != INVALID_CHILD_INDEX {
@@ -729,6 +809,15 @@ impl<V: Value> NodeInternal256<V> {
}
}
/// Returns the first non-null child at a key byte `>= min_key`, together
/// with that key byte, or `None` if every remaining slot is null.
///
/// `child_ptrs` is indexed directly by key byte, so slots are probed in
/// ascending key order.
fn find_next_child(&self, min_key: u8) -> Option<(u8, NodePtr<V>)> {
    (min_key..=u8::MAX)
        .map(|key| (key, self.child_ptrs[key as usize]))
        .find(|(_, child)| !child.is_null())
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
let idx = key_byte as usize;
if !self.child_ptrs[idx].is_null() {
@@ -774,6 +863,28 @@ impl<V: Value> NodeLeaf4<V> {
}
None
}
/// Returns a reference to the value with the smallest key byte that is
/// `>= min_key`, together with that key byte, or `None` if no populated
/// slot qualifies.
///
/// Only the first `num_values` slots are examined. They are scanned in
/// full, so no ordering of `child_keys` is relied upon.
///
/// # Panics
///
/// Panics if the selected slot of `child_values` is `None`.
fn find_next_leaf_value<'a: 'b, 'b>(&'a self, min_key: u8) -> Option<(u8, &'b V)> {
    let populated = self.num_values as usize;
    (0..populated)
        .filter(|&slot| self.child_keys[slot] >= min_key)
        // `min_by_key` keeps the first of several equal minima, matching a
        // scan that only replaces the candidate on a strictly smaller key.
        .min_by_key(|&slot| self.child_keys[slot])
        .map(|slot| {
            (
                self.child_keys[slot],
                self.child_values[slot].as_ref().unwrap(),
            )
        })
}
fn is_full(&self) -> bool {
self.num_values == 4
}
@@ -853,6 +964,28 @@ impl<V: Value> NodeLeaf16<V> {
}
None
}
/// Returns a reference to the value with the smallest key byte that is
/// `>= min_key`, together with that key byte, or `None` if no populated
/// slot qualifies.
///
/// Only the first `num_values` slots are examined. They are scanned in
/// full, so no ordering of `child_keys` is relied upon.
///
/// # Panics
///
/// Panics if the selected slot of `child_values` is `None`.
fn find_next_leaf_value<'a: 'b, 'b>(&'a self, min_key: u8) -> Option<(u8, &'b V)> {
    let populated = self.num_values as usize;
    (0..populated)
        .filter(|&slot| self.child_keys[slot] >= min_key)
        // `min_by_key` keeps the first of several equal minima, matching a
        // scan that only replaces the candidate on a strictly smaller key.
        .min_by_key(|&slot| self.child_keys[slot])
        .map(|slot| {
            (
                self.child_keys[slot],
                self.child_values[slot].as_ref().unwrap(),
            )
        })
}
fn is_full(&self) -> bool {
self.num_values == 16
}
@@ -932,6 +1065,17 @@ impl<V: Value> NodeLeaf48<V> {
None
}
}
/// Returns a reference to the first value stored at a key byte
/// `>= min_key`, together with that key byte, or `None` if there is none.
///
/// Key bytes are probed in ascending order through `child_indexes`; entries
/// equal to `INVALID_CHILD_INDEX` are skipped.
///
/// # Panics
///
/// Panics if `child_indexes` points at a `child_values` slot that is `None`.
fn find_next_leaf_value<'a: 'b, 'b>(&'a self, min_key: u8) -> Option<(u8, &'b V)> {
    (min_key..=u8::MAX).find_map(|key| {
        let slot = self.child_indexes[key as usize];
        if slot == INVALID_CHILD_INDEX {
            None
        } else {
            Some((key, self.child_values[slot as usize].as_ref().unwrap()))
        }
    })
}
fn is_full(&self) -> bool {
self.num_values == 48
}
@@ -1017,6 +1161,16 @@ impl<V: Value> NodeLeaf256<V> {
let idx = key as usize;
self.child_values[idx].as_ref()
}
/// Returns a reference to the first value stored at a key byte
/// `>= min_key`, together with that key byte, or `None` if every remaining
/// slot is empty.
///
/// `child_values` is indexed directly by key byte, so slots are probed in
/// ascending key order.
fn find_next_leaf_value<'a: 'b, 'b>(&'a self, min_key: u8) -> Option<(u8, &'b V)> {
    (min_key..=u8::MAX)
        .find_map(|key| self.child_values[key as usize].as_ref().map(|value| (key, value)))
}
fn is_full(&self) -> bool {
self.num_values == 256
}

View File

@@ -94,6 +94,23 @@ impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
}))),
}
}
/// Finds the entry with the smallest key byte `>= min_key_byte` in this
/// node, then re-checks the node's lock word against the version held by
/// this reference.
///
/// Returns `Ok(None)` when the node has no such entry, and
/// `Ok(Some((key_byte, entry)))` otherwise. A child entry is wrapped in a
/// `NodeRef` carrying this reference's `phantom` marker.
///
/// # Errors
///
/// Propagates the error from `check_or_restart` when the version check
/// fails; whatever was read from the node is discarded in that case.
pub(crate) fn find_next_child_or_value_or_restart(
    &self,
    min_key_byte: u8,
) -> Result<Option<(u8, ChildOrValue<'e, V>)>, ConcurrentUpdateError> {
    // Read first, validate second: the result is only used once the version
    // check has passed.
    let found = self.ptr.find_next_child_or_value(min_key_byte);
    self.ptr.lockword().check_or_restart(self.version)?;

    Ok(found.map(|(key_byte, entry)| {
        let entry = match entry {
            ChildOrValuePtr::Value(vptr) => ChildOrValue::Value(vptr),
            ChildOrValuePtr::Child(child_ptr) => ChildOrValue::Child(NodeRef {
                ptr: child_ptr,
                phantom: self.phantom,
            }),
        };
        (key_byte, entry)
    }))
}
pub(crate) fn upgrade_to_write_lock_or_restart(
self,

View File

@@ -116,6 +116,8 @@ impl<'t> BlockAllocator<'t> {
return INVALID_BLOCK;
}
// TODO: this is currently unused. The slab allocator never releases blocks
#[allow(dead_code)]
pub(crate) fn release_block(&self, block_ptr: *mut u8) {
let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
self.release_block_internal(blockno as u64);

View File

@@ -324,9 +324,8 @@ where
}
impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
pub fn get(&self, key: &K) -> Option<V> {
let vref = algorithm::search(key, self.tree.root, &self.epoch_pin);
vref.cloned()
pub fn get(&'e self, key: &K) -> Option<&'e V> {
algorithm::search(key, self.tree.root, &self.epoch_pin)
}
}
@@ -347,9 +346,8 @@ where
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'t, K, V, A> {
/// Get a value
pub fn get(&mut self, key: &K) -> Option<V> {
let v = algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin);
v.cloned()
pub fn get(&'t mut self, key: &K) -> Option<&'t V> {
algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
}
/// Insert a value
@@ -377,13 +375,11 @@ impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'t, K, V, A> {
where
F: FnOnce(Option<&V>) -> Option<V>,
{
let result = algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self);
algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self);
if self.created_garbage {
let n = self.collect_garbage();
eprintln!("collected {n} obsolete nodes");
let _ = self.collect_garbage();
}
result
}
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
@@ -415,7 +411,7 @@ pub struct TreeIterator<K>
where K: Key + for<'a> From<&'a [u8]>,
{
done: bool,
next_key: Vec<u8>,
pub next_key: Vec<u8>,
max_key: Option<Vec<u8>>,
phantom_key: PhantomData<K>,
@@ -436,12 +432,16 @@ impl<K> TreeIterator<K>
}
pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
TreeIterator {
let result = TreeIterator {
done: false,
next_key: Vec::from(range.start.as_bytes()),
max_key: Some(Vec::from(range.end.as_bytes())),
phantom_key: PhantomData,
}
};
assert_eq!(result.next_key.len(), K::KEY_LEN);
assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
result
}
@@ -451,27 +451,48 @@ impl<K> TreeIterator<K>
if self.done {
return None;
}
if let Some((k , v)) = algorithm::iter_next(&mut self.next_key, read_guard.tree.root, &read_guard.epoch_pin) {
assert_eq!(k.len(), self.next_key.len());
// Check if we reached the end of the range
if let Some(max_key) = &self.max_key {
assert_eq!(k.len(), max_key.len());
if k.as_slice() >= max_key.as_slice() {
self.done = true;
return None;
let mut wrapped_around = false;
loop {
assert_eq!(self.next_key.len(), K::KEY_LEN);
if let Some((k , v)) = algorithm::iter_next(&mut self.next_key, read_guard.tree.root, &read_guard.epoch_pin) {
assert_eq!(k.len(), K::KEY_LEN);
assert_eq!(self.next_key.len(), K::KEY_LEN);
// Check if we reached the end of the range
if let Some(max_key) = &self.max_key {
if k.as_slice() >= max_key.as_slice() {
self.done = true;
break None;
}
}
// increment the key
self.next_key = k.clone();
increment_key(self.next_key.as_mut_slice());
let k = k.as_slice().into();
break Some((k, v))
} else {
if self.max_key.is_some() {
self.done = true;
} else {
// Start from beginning
if !wrapped_around {
for i in 0..K::KEY_LEN {
self.next_key[i] = 0;
}
wrapped_around = true;
continue;
} else {
// The tree is completely empty
// FIXME: perhaps we should remember the starting point instead.
// Currently this will scan some ranges twice.
break None;
}
}
break None
}
// increment the key
self.next_key = k.clone();
increment_key(self.next_key.as_mut_slice());
let k = k.as_slice().into();
Some((k, v))
} else {
self.done = true;
None
}
}
}

View File

@@ -61,7 +61,7 @@ fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
for (idx, k) in keys.iter().enumerate() {
let r = tree_writer.start_read();
let value = r.get(&(*k).into());
assert_eq!(value, Some(idx));
assert_eq!(value, Some(idx).as_ref());
}
eprintln!("stats: {:?}", tree_writer.start_write().get_statistics());