mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-03 11:32:56 +00:00
Remove the half-baked Adaptive Radix Tree implementation
We are committed to using the resizeable hash table for now. ART is a great data structure, but it's too much for now. Maybe later.
This commit is contained in:
14
Cargo.lock
generated
14
Cargo.lock
generated
@@ -3942,17 +3942,6 @@ dependencies = [
|
||||
"xxhash-rust",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
"rand 0.9.1",
|
||||
"rand_distr",
|
||||
"spin",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "never-say-never"
|
||||
version = "6.6.666"
|
||||
@@ -6996,9 +6985,6 @@ name = "spin"
|
||||
version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spinning_top"
|
||||
|
||||
@@ -35,7 +35,6 @@ members = [
|
||||
"libs/pq_proto",
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/neonart",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
[package]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
crossbeam-utils.workspace = true
|
||||
spin.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
@@ -1,599 +0,0 @@
|
||||
mod lock_and_version;
|
||||
pub(crate) mod node_ptr;
|
||||
mod node_ref;
|
||||
|
||||
use std::vec::Vec;
|
||||
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
|
||||
use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
use crate::TreeWriteGuard;
|
||||
use crate::UpdateAction;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::epoch::EpochPin;
|
||||
use crate::{Key, Value};
|
||||
|
||||
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ArtError {
|
||||
ConcurrentUpdate, // need to retry
|
||||
OutOfMemory,
|
||||
}
|
||||
|
||||
impl From<ConcurrentUpdateError> for ArtError {
|
||||
fn from(_: ConcurrentUpdateError) -> ArtError {
|
||||
ArtError::ConcurrentUpdate
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OutOfMemoryError> for ArtError {
|
||||
fn from(_: OutOfMemoryError) -> ArtError {
|
||||
ArtError::OutOfMemory
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_root<V: Value>(
|
||||
allocator: &impl ArtAllocator<V>,
|
||||
) -> Result<RootPtr<V>, OutOfMemoryError> {
|
||||
node_ptr::new_root(allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn search<'e, K: Key, V: Value>(
|
||||
key: &K,
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<&'e V> {
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
|
||||
break result;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn iter_next<'e, V: Value>(
|
||||
key: &[u8],
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<(Vec<u8>, &'e V)> {
|
||||
loop {
|
||||
let mut path = Vec::new();
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
match next_recurse(key, &mut path, root_ref, epoch_pin) {
|
||||
Ok(Some(v)) => {
|
||||
assert_eq!(path.len(), key.len());
|
||||
break Some((path, v));
|
||||
}
|
||||
Ok(None) => break None,
|
||||
Err(ConcurrentUpdateError()) => {
|
||||
// retry
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &K,
|
||||
value_fn: F,
|
||||
root: RootPtr<V>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
|
||||
let key_bytes = key.as_bytes();
|
||||
|
||||
match update_recurse(
|
||||
key_bytes,
|
||||
this_value_fn,
|
||||
root_ref,
|
||||
None,
|
||||
None,
|
||||
guard,
|
||||
0,
|
||||
key_bytes,
|
||||
) {
|
||||
Ok(()) => break Ok(()),
|
||||
Err(ArtError::ConcurrentUpdate) => {
|
||||
continue; // retry
|
||||
}
|
||||
Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Error means you must retry.
|
||||
//
|
||||
// This corresponds to the 'lookupOpt' function in the paper
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn lookup_recurse<'e, V: Value>(
|
||||
key: &[u8],
|
||||
node: NodeRef<'e, V>,
|
||||
parent: Option<ReadLockedNodeRef<V>>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
if let Some(parent) = parent {
|
||||
parent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
// check if the prefix matches, may increment level
|
||||
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
|
||||
prefix_len
|
||||
} else {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), prefix_len);
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let key = &key[prefix_len..];
|
||||
|
||||
// find child (or leaf value)
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
match next_node {
|
||||
None => Ok(None), // key not found
|
||||
Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn next_recurse<'e, V: Value>(
|
||||
min_key: &[u8],
|
||||
path: &mut Vec<u8>,
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let prefix = rnode.get_prefix();
|
||||
if !prefix.is_empty() {
|
||||
path.extend_from_slice(prefix);
|
||||
}
|
||||
|
||||
use std::cmp::Ordering;
|
||||
let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
|
||||
if comparison == Ordering::Less {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(path.len(), min_key.len());
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let mut min_key_byte = match comparison {
|
||||
Ordering::Less => unreachable!(), // checked this above already
|
||||
Ordering::Equal => min_key[path.len()],
|
||||
Ordering::Greater => 0,
|
||||
};
|
||||
|
||||
loop {
|
||||
match rnode.find_next_child_or_restart(min_key_byte)? {
|
||||
None => {
|
||||
return Ok(None);
|
||||
}
|
||||
Some((key_byte, child_ref)) => {
|
||||
let path_len = path.len();
|
||||
path.push(key_byte);
|
||||
let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
|
||||
if result.is_some() {
|
||||
return Ok(result);
|
||||
}
|
||||
if key_byte == u8::MAX {
|
||||
return Ok(None);
|
||||
}
|
||||
path.truncate(path_len);
|
||||
min_key_byte = key_byte + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This corresponds to the 'insertOpt' function in the paper
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &[u8],
|
||||
value_fn: F,
|
||||
node: NodeRef<'e, V>,
|
||||
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
|
||||
level: usize,
|
||||
orig_key: &[u8],
|
||||
) -> Result<(), ArtError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
|
||||
let prefix_match_len = rnode.prefix_matches(key);
|
||||
if prefix_match_len.is_none() {
|
||||
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
}
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
return Ok(());
|
||||
}
|
||||
let prefix_match_len = prefix_match_len.unwrap();
|
||||
let key = &key[prefix_match_len..];
|
||||
let level = level + prefix_match_len;
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), 0);
|
||||
let (rparent, parent_key) = rparent.expect("root cannot be leaf");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// safety: Now that we have acquired the write lock, we have exclusive access to the
|
||||
// value. XXX: There might be concurrent reads though?
|
||||
let value_mut = wnode.get_leaf_value_mut();
|
||||
|
||||
match value_fn(Some(value_mut)) {
|
||||
UpdateAction::Nothing => {
|
||||
wparent.write_unlock();
|
||||
wnode.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
|
||||
UpdateAction::Remove => {
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wparent.delete_child(parent_key);
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
if let Some(rgrandparent) = rgrandparent {
|
||||
// FIXME: Ignore concurrency error. It doesn't lead to
|
||||
// corruption, but it means we might leak something. Until
|
||||
// another update cleans it up.
|
||||
let _ = cleanup_parent(wparent, rgrandparent, guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
if next_node.is_none() {
|
||||
if rnode.is_full() {
|
||||
let (rparent, parent_key) = rparent.expect("root node cannot become full");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
} else {
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_to_node(&mut wnode, key, new_value, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
wnode.write_unlock();
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
let next_child = next_node.unwrap(); // checked above it's not None
|
||||
if let Some((ref rparent, _)) = rparent {
|
||||
rparent.check_or_restart()?;
|
||||
}
|
||||
|
||||
// recurse to next level
|
||||
update_recurse(
|
||||
&key[1..],
|
||||
value_fn,
|
||||
next_child,
|
||||
Some((rnode, key[0])),
|
||||
rparent,
|
||||
guard,
|
||||
level + 1,
|
||||
orig_key,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum PathElement {
|
||||
Prefix(Vec<u8>),
|
||||
KeyByte(u8),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PathElement {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
PathElement::Prefix(prefix) => write!(fmt, "{prefix:?}"),
|
||||
PathElement::KeyByte(key_byte) => write!(fmt, "{key_byte}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'_ EpochPin,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
|
||||
}
|
||||
|
||||
// TODO: return an Err if writeln!() returns error, instead of unwrapping
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
|
||||
path: &[PathElement],
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
let indent = str::repeat(" ", level);
|
||||
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let mut path = Vec::from(path);
|
||||
let prefix = rnode.get_prefix();
|
||||
if !prefix.is_empty() {
|
||||
path.push(PathElement::Prefix(Vec::from(prefix)));
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let val = unsafe { vptr.as_ref().unwrap() };
|
||||
writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for key_byte in 0..=u8::MAX {
|
||||
match rnode.find_child_or_restart(key_byte)? {
|
||||
None => continue,
|
||||
Some(child_ref) => {
|
||||
let rchild = child_ref.read_lock_or_restart()?;
|
||||
writeln!(
|
||||
dst,
|
||||
"{} {:?}, {}: prefix {:?}",
|
||||
indent,
|
||||
&path,
|
||||
key_byte,
|
||||
rchild.get_prefix()
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut child_path = path.clone();
|
||||
child_path.push(PathElement::KeyByte(key_byte));
|
||||
|
||||
dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///```text
|
||||
/// [fooba]r -> value
|
||||
///
|
||||
/// [foo]b -> [a]r -> value
|
||||
/// e -> [ls]e -> value
|
||||
///```
|
||||
fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
node: &mut WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key: u8,
|
||||
guard: &'_ TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let old_node = node;
|
||||
let old_prefix = old_node.get_prefix();
|
||||
let common_prefix_len = common_prefix(key, old_prefix);
|
||||
|
||||
// Allocate a node for the new value.
|
||||
let new_value_node = allocate_node_for_value(
|
||||
&key[common_prefix_len + 1..],
|
||||
value,
|
||||
guard.tree_writer.allocator,
|
||||
)?;
|
||||
|
||||
// Allocate a new internal node with the common prefix
|
||||
// FIXME: deallocate 'new_value_node' on OOM
|
||||
let mut prefix_node =
|
||||
node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
|
||||
|
||||
// Add the old node and the new nodes to the new internal node
|
||||
prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
|
||||
prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
|
||||
|
||||
// Modify the prefix of the old child in place
|
||||
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
|
||||
|
||||
// replace the pointer in the parent
|
||||
parent.replace_child(parent_key, prefix_node.into_ptr());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wnode: &mut WriteLockedNodeRef<V>,
|
||||
key: &[u8],
|
||||
value: V,
|
||||
guard: &'_ TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
wnode.insert_child(key[0], value_child.into_ptr());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// On entry: 'parent' and 'node' are locked
|
||||
fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
wnode: WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key_byte: u8,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
|
||||
|
||||
// FIXME: deallocate 'bigger_node' on OOM
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
bigger_node.insert_new_child(key[0], value_child);
|
||||
|
||||
// Replace the pointer in the parent
|
||||
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wparent: WriteLockedNodeRef<V>,
|
||||
rgrandparent: (ReadLockedNodeRef<V>, u8),
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let (rgrandparent, grandparent_key_byte) = rgrandparent;
|
||||
|
||||
// If the parent becomes completely empty after the deletion, remove the parent from the
|
||||
// grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
|
||||
// TODO: not implemented.
|
||||
|
||||
// If the parent has only one child, replace the parent with the remaining child. (This is not
|
||||
// possible if the child's prefix field cannot absorb the parent's)
|
||||
if wparent.num_children() == 1 {
|
||||
// Try to lock the remaining child. This can fail if the child is updated
|
||||
// concurrently.
|
||||
let (key_byte, remaining_child) = wparent.find_remaining_child();
|
||||
|
||||
let mut wremaining_child = remaining_child.write_lock_or_restart()?;
|
||||
|
||||
if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
|
||||
// remaining leaf. Proceed with the updates.
|
||||
|
||||
// Update the prefix on the remaining leaf
|
||||
wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
|
||||
|
||||
// Replace the pointer in the grandparent to point directly to the remaining leaf
|
||||
wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
|
||||
|
||||
// Mark the parent as deleted.
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// If the parent's children would fit on a smaller node type after the deletion, replace it with
|
||||
// a smaller node.
|
||||
if wparent.can_shrink() {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
|
||||
|
||||
// Replace the pointer in the grandparent
|
||||
wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// nothing to do
|
||||
wparent.write_unlock();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Allocate a new leaf node to hold 'value'. If the key is long, we
|
||||
// may need to allocate new internal nodes to hold it too
|
||||
fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
|
||||
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
|
||||
|
||||
let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
|
||||
|
||||
let mut node = leaf_node;
|
||||
while prefix_off > 0 {
|
||||
// Need another internal node
|
||||
let remain_prefix = &key[0..prefix_off];
|
||||
|
||||
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
let mut internal_node = node_ref::new_internal(
|
||||
&remain_prefix[prefix_off..remain_prefix.len() - 1],
|
||||
allocator,
|
||||
)?;
|
||||
internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
|
||||
node = internal_node;
|
||||
}
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
|
||||
for i in 0..MAX_PREFIX_LEN {
|
||||
if a[i] != b[i] {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
panic!("prefixes are equal");
|
||||
}
|
||||
@@ -1,117 +0,0 @@
|
||||
//! Each node in the tree has contains one atomic word that stores three things:
|
||||
//!
|
||||
//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
|
||||
//! but might still be accessed by concurrent readers until the epoch expires.
|
||||
//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
|
||||
//! Bits 2-63: Version number, incremented every time the node is modified.
|
||||
//!
|
||||
//! AtomicLockAndVersion represents that.
|
||||
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
pub(crate) struct ConcurrentUpdateError();
|
||||
|
||||
pub(crate) struct AtomicLockAndVersion {
|
||||
inner: AtomicU64,
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn new() -> AtomicLockAndVersion {
|
||||
AtomicLockAndVersion {
|
||||
inner: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
|
||||
let version = self.await_node_unlocked();
|
||||
if is_obsolete(version) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
self.read_unlock_or_restart(version)
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
if self.inner.load(Ordering::Acquire) != version {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
&self,
|
||||
version: u64,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
version,
|
||||
set_locked_bit(version),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
let old = self.inner.load(Ordering::Relaxed);
|
||||
if is_obsolete(old) || is_locked(old) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
old,
|
||||
set_locked_bit(old),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(&self) {
|
||||
// reset locked bit and overflow into version
|
||||
self.inner.fetch_add(2, Ordering::Release);
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(&self) {
|
||||
// set obsolete, reset locked, overflow into version
|
||||
self.inner.fetch_add(3, Ordering::Release);
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
fn await_node_unlocked(&self) -> u64 {
|
||||
let mut version = self.inner.load(Ordering::Acquire);
|
||||
while is_locked(version) {
|
||||
// spinlock
|
||||
std::thread::yield_now();
|
||||
version = self.inner.load(Ordering::Acquire)
|
||||
}
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
fn set_locked_bit(version: u64) -> u64 {
|
||||
version + 2
|
||||
}
|
||||
|
||||
fn is_obsolete(version: u64) -> bool {
|
||||
(version & 1) == 1
|
||||
}
|
||||
|
||||
fn is_locked(version: u64) -> bool {
|
||||
(version & 2) == 2
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,349 +0,0 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::node_ptr;
|
||||
use super::node_ptr::NodePtr;
|
||||
use crate::EpochPin;
|
||||
use crate::Value;
|
||||
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
pub struct NodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V> Debug for NodeRef<'e, V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.ptr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V: Value> NodeRef<'e, V> {
|
||||
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
|
||||
NodeRef {
|
||||
ptr: root_ptr,
|
||||
phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
let version = self.lockword().read_lock_or_restart()?;
|
||||
Ok(ReadLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
version,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.lockword().write_lock_or_restart()?;
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
self.ptr.lockword()
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct ReadLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
version: u64,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
self.ptr.is_leaf()
|
||||
}
|
||||
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
self.ptr.is_full()
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
/// Note: because we're only holding a read lock, the prefix can change concurrently.
|
||||
/// You must be prepared to restart, if read_unlock() returns error later.
|
||||
///
|
||||
/// Returns the length of the prefix, or None if it's not a match
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
self.ptr.prefix_matches(key)
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_restart(
|
||||
&self,
|
||||
key_byte: u8,
|
||||
) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_child(key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some(child_ptr) => Ok(Some(NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_next_child_or_restart(
|
||||
&self,
|
||||
min_key_byte: u8,
|
||||
) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_next_child(min_key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some((k, child_ptr)) => Ok(Some((
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
|
||||
let result = self.ptr.get_leaf_value();
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
// Extend the lifetime.
|
||||
let result = std::ptr::from_ref(result);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.ptr
|
||||
.lockword()
|
||||
.upgrade_to_write_lock_or_restart(self.version)?;
|
||||
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct WriteLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
|
||||
pub(crate) fn can_shrink(&self) -> bool {
|
||||
self.ptr.can_shrink()
|
||||
}
|
||||
|
||||
pub(crate) fn num_children(&self) -> usize {
|
||||
self.ptr.num_children()
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(mut self) {
|
||||
self.ptr.lockword().write_unlock();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(mut self) {
|
||||
self.ptr.lockword().write_unlock_obsolete();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
self.ptr.truncate_prefix(new_prefix_len)
|
||||
}
|
||||
|
||||
pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
|
||||
self.ptr.prepend_prefix(prefix, prefix_byte)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
|
||||
self.ptr.get_leaf_value_mut()
|
||||
}
|
||||
|
||||
pub(crate) fn grow<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.grow(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn shrink<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.shrink(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
self.ptr.replace_child(key_byte, replacement);
|
||||
}
|
||||
|
||||
pub(crate) fn delete_child(&mut self, key_byte: u8) {
|
||||
self.ptr.delete_child(key_byte);
|
||||
}
|
||||
|
||||
pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
|
||||
assert_eq!(self.num_children(), 1);
|
||||
let child_or_value = self.ptr.find_next_child(0);
|
||||
|
||||
match child_or_value {
|
||||
None => panic!("could not find only child in node"),
|
||||
Some((k, child_ptr)) => (
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.lockword().write_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
ptr: NodePtr<V>,
|
||||
allocator: &'a A,
|
||||
|
||||
extra_nodes: Vec<NodePtr<V>>,
|
||||
}
|
||||
|
||||
impl<'a, V, A> NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
|
||||
self.ptr.insert_child(key_byte, child.as_ptr())
|
||||
}
|
||||
|
||||
pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
|
||||
let ptr = self.ptr;
|
||||
self.ptr = NodePtr::null();
|
||||
ptr
|
||||
}
|
||||
|
||||
pub(crate) fn insert_new_child(&mut self, key_byte: u8, child: NewNodeRef<'a, V, A>) {
|
||||
let child_ptr = child.into_ptr();
|
||||
self.ptr.insert_child(key_byte, child_ptr);
|
||||
self.extra_nodes.push(child_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
/// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.deallocate(self.allocator);
|
||||
for p in self.extra_nodes.iter() {
|
||||
p.deallocate(self.allocator);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_internal<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_internal(prefix, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn new_leaf<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_leaf(prefix, value, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
@@ -1,156 +0,0 @@
|
||||
pub mod block;
|
||||
mod multislab;
|
||||
mod slab;
|
||||
pub mod r#static;
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use crate::allocator::multislab::MultiSlabAllocator;
|
||||
use crate::allocator::r#static::alloc_from_slice;
|
||||
|
||||
use spin;
|
||||
|
||||
use crate::Tree;
|
||||
pub use crate::algorithm::node_ptr::{
|
||||
NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub trait ArtAllocator<V: crate::Value> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V>;
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
|
||||
}
|
||||
|
||||
pub struct ArtMultiSlabAllocator<'t, V>
|
||||
where
|
||||
V: crate::Value,
|
||||
{
|
||||
tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
|
||||
|
||||
pub(crate) inner: MultiSlabAllocator<'t, 5>,
|
||||
|
||||
phantom_val: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
const LAYOUTS: [Layout; 5] = [
|
||||
Layout::new::<NodeInternal4<V>>(),
|
||||
Layout::new::<NodeInternal16<V>>(),
|
||||
Layout::new::<NodeInternal48<V>>(),
|
||||
Layout::new::<NodeInternal256<V>>(),
|
||||
Layout::new::<NodeLeaf<V>>(),
|
||||
];
|
||||
|
||||
pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
|
||||
let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
|
||||
let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
|
||||
|
||||
allocator_area.write(ArtMultiSlabAllocator {
|
||||
tree_area: spin::Mutex::new(Some(tree_area)),
|
||||
inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
|
||||
phantom_val: PhantomData,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V> {
|
||||
let mut t = self.tree_area.lock();
|
||||
if let Some(tree_area) = t.take() {
|
||||
return tree_area.as_mut_ptr().cast();
|
||||
}
|
||||
panic!("cannot allocate more than one tree");
|
||||
}
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
|
||||
self.inner.alloc_slab(0).cast()
|
||||
}
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
|
||||
self.inner.alloc_slab(1).cast()
|
||||
}
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
|
||||
self.inner.alloc_slab(2).cast()
|
||||
}
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
|
||||
self.inner.alloc_slab(3).cast()
|
||||
}
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
|
||||
self.inner.alloc_slab(4).cast()
|
||||
}
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
|
||||
self.inner.dealloc_slab(0, ptr.cast())
|
||||
}
|
||||
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
|
||||
self.inner.dealloc_slab(1, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
|
||||
self.inner.dealloc_slab(2, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
|
||||
self.inner.dealloc_slab(3, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
|
||||
self.inner.dealloc_slab(4, ptr.cast())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
|
||||
ArtMultiSlabStats {
|
||||
num_internal4: self.inner.slab_descs[0]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal16: self.inner.slab_descs[1]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal48: self.inner.slab_descs[2]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal256: self.inner.slab_descs[3]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_leaf: self.inner.slab_descs[4]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
|
||||
num_blocks_internal4: self.inner.slab_descs[0].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal16: self.inner.slab_descs[1].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal48: self.inner.slab_descs[2].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal256: self.inner.slab_descs[3].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_leaf: self.inner.slab_descs[4].num_blocks.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtMultiSlabStats {
|
||||
pub num_internal4: u64,
|
||||
pub num_internal16: u64,
|
||||
pub num_internal48: u64,
|
||||
pub num_internal256: u64,
|
||||
pub num_leaf: u64,
|
||||
|
||||
pub num_blocks_internal4: u64,
|
||||
pub num_blocks_internal16: u64,
|
||||
pub num_blocks_internal48: u64,
|
||||
pub num_blocks_internal256: u64,
|
||||
pub num_blocks_leaf: u64,
|
||||
}
|
||||
@@ -1,191 +0,0 @@
|
||||
//! Simple allocator of fixed-size blocks
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
pub const BLOCK_SIZE: usize = 16 * 1024;
|
||||
|
||||
const INVALID_BLOCK: u64 = u64::MAX;
|
||||
|
||||
pub(crate) struct BlockAllocator<'t> {
|
||||
blocks_ptr: &'t [MaybeUninit<u8>],
|
||||
num_blocks: u64,
|
||||
num_initialized: AtomicU64,
|
||||
|
||||
freelist_head: spin::Mutex<u64>,
|
||||
}
|
||||
|
||||
struct FreeListBlock {
|
||||
inner: spin::Mutex<FreeListBlockInner>,
|
||||
}
|
||||
|
||||
struct FreeListBlockInner {
|
||||
next: u64,
|
||||
|
||||
num_free_blocks: u64,
|
||||
free_blocks: [u64; 100], // FIXME: fill the rest of the block
|
||||
}
|
||||
|
||||
impl<'t> BlockAllocator<'t> {
|
||||
pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
|
||||
// Use all the space for the blocks
|
||||
let padding = area.as_ptr().align_offset(BLOCK_SIZE);
|
||||
let remain = &mut area[padding..];
|
||||
|
||||
let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
|
||||
|
||||
BlockAllocator {
|
||||
blocks_ptr: remain,
|
||||
num_blocks,
|
||||
num_initialized: AtomicU64::new(0),
|
||||
freelist_head: spin::Mutex::new(INVALID_BLOCK),
|
||||
}
|
||||
}
|
||||
|
||||
/// safety: you must hold a lock on the pointer to this block, otherwise it might get
|
||||
/// reused for another kind of block
|
||||
fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
|
||||
let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
|
||||
unsafe { ptr.as_ref().unwrap() }
|
||||
}
|
||||
|
||||
fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
|
||||
assert!(blkno < self.num_blocks);
|
||||
unsafe {
|
||||
self.blocks_ptr
|
||||
.as_ptr()
|
||||
.byte_offset(blkno as isize * BLOCK_SIZE as isize)
|
||||
}
|
||||
.cast_mut()
|
||||
.cast()
|
||||
}
|
||||
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
|
||||
// FIXME: handle OOM
|
||||
let blkno = self.alloc_block_internal();
|
||||
if blkno == INVALID_BLOCK {
|
||||
panic!("out of memory");
|
||||
}
|
||||
|
||||
let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
|
||||
}
|
||||
|
||||
fn alloc_block_internal(&self) -> u64 {
|
||||
// check the free list.
|
||||
{
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
if g.num_free_blocks > 0 {
|
||||
g.num_free_blocks -= 1;
|
||||
let result = g.free_blocks[g.num_free_blocks as usize];
|
||||
return result;
|
||||
} else {
|
||||
// consume the freelist block itself
|
||||
let result = *freelist_head;
|
||||
*freelist_head = g.next;
|
||||
// This freelist block is now unlinked and can be repurposed
|
||||
drop(g);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there are some blocks left that we've never used, pick next such block
|
||||
let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
|
||||
while next_uninitialized < self.num_blocks {
|
||||
match self.num_initialized.compare_exchange(
|
||||
next_uninitialized,
|
||||
next_uninitialized + 1,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => {
|
||||
return next_uninitialized;
|
||||
}
|
||||
Err(old) => {
|
||||
next_uninitialized = old;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// out of blocks
|
||||
INVALID_BLOCK
|
||||
}
|
||||
|
||||
// TODO: this is currently unused. The slab allocator never releases blocks
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn release_block(&self, block_ptr: *mut u8) {
|
||||
let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
|
||||
self.release_block_internal(blockno as u64);
|
||||
}
|
||||
|
||||
fn release_block_internal(&self, blockno: u64) {
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
let num_free_blocks = g.num_free_blocks;
|
||||
if num_free_blocks < g.free_blocks.len() as u64 {
|
||||
g.free_blocks[num_free_blocks as usize] = blockno;
|
||||
g.num_free_blocks += 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the block into a new freelist block
|
||||
let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
|
||||
let init = FreeListBlock {
|
||||
inner: spin::Mutex::new(FreeListBlockInner {
|
||||
next: *freelist_head,
|
||||
num_free_blocks: 0,
|
||||
free_blocks: [INVALID_BLOCK; 100],
|
||||
}),
|
||||
};
|
||||
unsafe { (*block_ptr) = init };
|
||||
*freelist_head = blockno;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
|
||||
let mut num_free_blocks = 0;
|
||||
|
||||
let mut _prev_lock = None;
|
||||
let head_lock = self.freelist_head.lock();
|
||||
let mut next_blk = *head_lock;
|
||||
let mut _head_lock = Some(head_lock);
|
||||
while next_blk != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(next_blk);
|
||||
let lock = freelist_block.inner.lock();
|
||||
num_free_blocks += lock.num_free_blocks;
|
||||
next_blk = lock.next;
|
||||
_prev_lock = Some(lock); // hold the lock until we've read the next block
|
||||
_head_lock = None;
|
||||
}
|
||||
|
||||
BlockAllocatorStats {
|
||||
num_blocks: self.num_blocks,
|
||||
num_initialized: self.num_initialized.load(Ordering::Relaxed),
|
||||
num_free_blocks,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BlockAllocatorStats {
|
||||
pub num_blocks: u64,
|
||||
pub num_initialized: u64,
|
||||
pub num_free_blocks: u64,
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::allocator::block::BlockAllocator;
|
||||
use crate::allocator::slab::SlabDesc;
|
||||
|
||||
pub struct MultiSlabAllocator<'t, const N: usize> {
|
||||
pub(crate) block_allocator: BlockAllocator<'t>,
|
||||
|
||||
pub(crate) slab_descs: [SlabDesc; N],
|
||||
}
|
||||
|
||||
impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
|
||||
pub(crate) fn new(
|
||||
area: &'t mut [MaybeUninit<u8>],
|
||||
layouts: &[Layout; N],
|
||||
) -> MultiSlabAllocator<'t, N> {
|
||||
let block_allocator = BlockAllocator::new(area);
|
||||
MultiSlabAllocator {
|
||||
block_allocator,
|
||||
|
||||
slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
|
||||
self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
|
||||
self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
|
||||
}
|
||||
}
|
||||
@@ -1,433 +0,0 @@
|
||||
//! A slab allocator that carves out fixed-size chunks from larger blocks.
|
||||
//!
|
||||
//!
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
use super::alloc_from_slice;
|
||||
use super::block::BlockAllocator;
|
||||
|
||||
use crate::allocator::block::BLOCK_SIZE;
|
||||
|
||||
pub(crate) struct SlabDesc {
|
||||
pub(crate) layout: Layout,
|
||||
|
||||
block_lists: spin::RwLock<BlockLists>,
|
||||
|
||||
pub(crate) num_blocks: AtomicU64,
|
||||
pub(crate) num_allocated: AtomicU64,
|
||||
}
|
||||
|
||||
// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
|
||||
// 'block_lists' contains pointers when it's not empty. In the current use as part of the
|
||||
// the art tree, SlabDescs are only moved during initialization.
|
||||
unsafe impl Sync for SlabDesc {}
|
||||
unsafe impl Send for SlabDesc {}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
struct BlockLists {
|
||||
full_blocks: BlockList,
|
||||
nonfull_blocks: BlockList,
|
||||
}
|
||||
|
||||
impl BlockLists {
|
||||
// Unlink a node. It must be in either one of the two lists.
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
let list = unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
if self.full_blocks.tail == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else if (*elem).prev.is_null() {
|
||||
if self.full_blocks.head == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
unsafe { unlink_slab_block(list, elem) };
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().tail, elem);
|
||||
list.as_mut().unwrap().tail = (*elem).prev;
|
||||
} else {
|
||||
assert_eq!((*(*elem).next).prev, elem);
|
||||
(*(*elem).next).prev = (*elem).prev;
|
||||
}
|
||||
if (*elem).prev.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().head, elem);
|
||||
list.as_mut().unwrap().head = (*elem).next;
|
||||
} else {
|
||||
assert_eq!((*(*elem).prev).next, elem);
|
||||
(*(*elem).prev).next = (*elem).next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct BlockList {
|
||||
head: *mut SlabBlockHeader,
|
||||
tail: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
impl Default for BlockList {
|
||||
fn default() -> Self {
|
||||
BlockList {
|
||||
head: std::ptr::null_mut(),
|
||||
tail: std::ptr::null_mut(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockList {
|
||||
unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if self.is_empty() {
|
||||
self.tail = elem;
|
||||
(*elem).next = std::ptr::null_mut();
|
||||
} else {
|
||||
(*elem).next = self.head;
|
||||
(*self.head).prev = elem;
|
||||
}
|
||||
(*elem).prev = std::ptr::null_mut();
|
||||
self.head = elem;
|
||||
}
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.head.is_null()
|
||||
}
|
||||
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe { unlink_slab_block(Some(self), elem) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
let mut next = self.head;
|
||||
|
||||
while !next.is_null() {
|
||||
let n = unsafe { next.as_ref() }.unwrap();
|
||||
eprintln!(
|
||||
" blk {:?} (free {}/{})",
|
||||
next,
|
||||
n.num_free_chunks.load(Ordering::Relaxed),
|
||||
n.num_chunks
|
||||
);
|
||||
next = n.next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub(crate) fn new(layout: &Layout) -> SlabDesc {
|
||||
SlabDesc {
|
||||
layout: *layout,
|
||||
block_lists: spin::RwLock::new(BlockLists::default()),
|
||||
num_allocated: AtomicU64::new(0),
|
||||
num_blocks: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex<*mut FreeChunk>,
|
||||
num_free_chunks: AtomicU32,
|
||||
num_chunks: u32, // this is really a constant for a given Layout
|
||||
|
||||
// these fields are protected by the lock on the BlockLists
|
||||
prev: *mut SlabBlockHeader,
|
||||
next: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
struct FreeChunk {
|
||||
next: *mut FreeChunk,
|
||||
}
|
||||
|
||||
enum ReadOrWriteGuard<'a, T> {
|
||||
Read(spin::RwLockReadGuard<'a, T>),
|
||||
Write(spin::RwLockWriteGuard<'a, T>),
|
||||
}
|
||||
|
||||
impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &<Self as Deref>::Target {
|
||||
match self {
|
||||
ReadOrWriteGuard::Read(g) => g.deref(),
|
||||
ReadOrWriteGuard::Write(g) => g.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
|
||||
// Are there any free chunks?
|
||||
let mut acquire_write = false;
|
||||
'outer: loop {
|
||||
let mut block_lists_guard = if acquire_write {
|
||||
ReadOrWriteGuard::Write(self.block_lists.write())
|
||||
} else {
|
||||
ReadOrWriteGuard::Read(self.block_lists.read())
|
||||
};
|
||||
'inner: loop {
|
||||
let block_ptr = block_lists_guard.nonfull_blocks.head;
|
||||
if block_ptr.is_null() {
|
||||
break 'outer;
|
||||
}
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
if !(*free_chunks_head).is_null() {
|
||||
let result = *free_chunks_head;
|
||||
(*free_chunks_head) = (*result).next;
|
||||
let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
|
||||
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
return result.cast();
|
||||
}
|
||||
}
|
||||
|
||||
// The block at the head of the list was full. Grab write lock and retry
|
||||
match block_lists_guard {
|
||||
ReadOrWriteGuard::Read(_) => {
|
||||
acquire_write = true;
|
||||
continue 'outer;
|
||||
}
|
||||
ReadOrWriteGuard::Write(ref mut g) => {
|
||||
// move the node to the list of full blocks
|
||||
unsafe {
|
||||
g.nonfull_blocks.unlink(block_ptr);
|
||||
g.full_blocks.push_head(block_ptr);
|
||||
};
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// no free chunks. Allocate a new block (and the chunk from that)
|
||||
let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
|
||||
self.num_blocks.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Add the block to the list in the SlabDesc
|
||||
unsafe {
|
||||
let mut block_lists_guard = self.block_lists.write();
|
||||
block_lists_guard.nonfull_blocks.push_head(new_block);
|
||||
}
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
new_chunk
|
||||
}
|
||||
|
||||
pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
|
||||
// Find the block it belongs to. You can find the block from the address. (And knowing the
|
||||
// layout, you could calculate the chunk number too.)
|
||||
let block_ptr: *mut SlabBlockHeader = {
|
||||
let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
|
||||
chunk_ptr.with_addr(block_addr).cast()
|
||||
};
|
||||
let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
|
||||
|
||||
// Mark the chunk as free in 'freechunks' list
|
||||
let num_chunks;
|
||||
let num_free_chunks;
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
(*chunk_ptr).next = *free_chunks_head;
|
||||
*free_chunks_head = chunk_ptr;
|
||||
|
||||
num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
num_chunks = (*block_ptr).num_chunks;
|
||||
}
|
||||
|
||||
if num_free_chunks == 1 {
|
||||
// If the block was full previously, add it to the nonfull blocks list. Note that
|
||||
// we're not holding the lock anymore, so it can immediately become full again.
|
||||
// That's harmless, it will be moved back to the full list again when a call
|
||||
// to alloc_chunk() sees it.
|
||||
let mut block_lists = self.block_lists.write();
|
||||
unsafe {
|
||||
block_lists.unlink(block_ptr);
|
||||
block_lists.nonfull_blocks.push_head(block_ptr);
|
||||
};
|
||||
} else if num_free_chunks == num_chunks {
|
||||
// If the block became completely empty, move it to the free list
|
||||
// TODO
|
||||
// FIXME: we're still holding the spinlock. It's not exactly safe to return it to
|
||||
// the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
|
||||
//block_allocator.release_block()
|
||||
}
|
||||
|
||||
// update stats
|
||||
self.num_allocated.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn alloc_block_and_chunk(
|
||||
&self,
|
||||
block_allocator: &BlockAllocator,
|
||||
) -> (*mut SlabBlockHeader, *mut u8) {
|
||||
// fixme: handle OOM
|
||||
let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
|
||||
let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
|
||||
|
||||
let padding = remain.as_ptr().align_offset(self.layout.align());
|
||||
|
||||
let num_chunks = (remain.len() - padding) / self.layout.size();
|
||||
|
||||
let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
|
||||
|
||||
unsafe {
|
||||
let mut chunk_ptr = first_chunk_ptr;
|
||||
for _ in 0..num_chunks - 1 {
|
||||
let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
|
||||
(*chunk_ptr).next = next_chunk_ptr;
|
||||
chunk_ptr = next_chunk_ptr;
|
||||
}
|
||||
(*chunk_ptr).next = std::ptr::null_mut();
|
||||
|
||||
let result_chunk = first_chunk_ptr;
|
||||
|
||||
let block_header = block_header.write(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
num_chunks: num_chunks as u32,
|
||||
num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
|
||||
});
|
||||
|
||||
(block_header, result_chunk.cast())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
eprintln!(
|
||||
"slab dump ({} blocks, {} allocated chunks)",
|
||||
self.num_blocks.load(Ordering::Relaxed),
|
||||
self.num_allocated.load(Ordering::Relaxed)
|
||||
);
|
||||
let lists = self.block_lists.read();
|
||||
|
||||
eprintln!("nonfull blocks:");
|
||||
lists.nonfull_blocks.dump();
|
||||
eprintln!("full blocks:");
|
||||
lists.full_blocks.dump();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use rand::Rng;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
struct TestObject {
|
||||
val: usize,
|
||||
_dummy: [u8; BLOCK_SIZE / 4],
|
||||
}
|
||||
|
||||
struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
|
||||
impl<'a> TestObjectSlab<'a> {
|
||||
fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
|
||||
TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
|
||||
}
|
||||
|
||||
fn alloc(&self, val: usize) -> *mut TestObject {
|
||||
let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
|
||||
unsafe { (*obj).val = val };
|
||||
obj
|
||||
}
|
||||
|
||||
fn dealloc(&self, obj: *mut TestObject) {
|
||||
self.0.dealloc_chunk(obj.cast(), &self.1)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slab_alloc() {
|
||||
const MEM_SIZE: usize = 100000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
let block_allocator = BlockAllocator::new(&mut area);
|
||||
|
||||
let slab = TestObjectSlab::new(block_allocator);
|
||||
|
||||
let mut all: Vec<*mut TestObject> = Vec::new();
|
||||
for i in 0..11 {
|
||||
all.push(slab.alloc(i));
|
||||
}
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for i in 0..11 {
|
||||
assert!(unsafe { (*all[i]).val == i });
|
||||
}
|
||||
|
||||
let distribution = Zipf::new(10.0, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for _ in 0..100000 {
|
||||
slab.0.dump();
|
||||
let idx = rng.sample(distribution) as usize;
|
||||
let ptr: *mut TestObject = all[idx];
|
||||
if !ptr.is_null() {
|
||||
assert_eq!(unsafe { (*ptr).val }, idx);
|
||||
slab.dealloc(ptr);
|
||||
all[idx] = std::ptr::null_mut();
|
||||
} else {
|
||||
all[idx] = slab.alloc(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
|
||||
Box::into_raw(Box::new(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
|
||||
num_free_chunks: AtomicU32::new(0),
|
||||
num_chunks: i,
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
}))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_linked_list() {
|
||||
// note: these are leaked, but that's OK for tests
|
||||
let a = new_test_blk(0);
|
||||
let b = new_test_blk(1);
|
||||
|
||||
let mut list = BlockList::default();
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(a);
|
||||
assert!(!list.is_empty());
|
||||
list.unlink(a);
|
||||
}
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(b);
|
||||
list.push_head(a);
|
||||
assert_eq!(list.head, a);
|
||||
assert_eq!((*a).next, b);
|
||||
assert_eq!((*b).prev, a);
|
||||
assert_eq!(list.tail, b);
|
||||
|
||||
list.unlink(a);
|
||||
list.unlink(b);
|
||||
assert!(list.is_empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,44 +0,0 @@
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub fn alloc_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size());
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { result_ptr.as_mut().unwrap() };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
|
||||
pub fn alloc_array_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
len: usize,
|
||||
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() * len > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size() * len);
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
@@ -1,142 +0,0 @@
|
||||
//! This is similar to crossbeam_epoch crate, but works in shared memory
|
||||
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
|
||||
use crossbeam_utils::CachePadded;
|
||||
|
||||
const NUM_SLOTS: usize = 1000;
|
||||
|
||||
/// This is the struct that is stored in shmem
|
||||
///
|
||||
/// bit 0: is it pinned or not?
|
||||
/// rest of the bits are the epoch counter.
|
||||
pub struct EpochShared {
|
||||
global_epoch: AtomicU64,
|
||||
participants: [CachePadded<AtomicU64>; NUM_SLOTS],
|
||||
|
||||
broadcast_lock: spin::Mutex<()>,
|
||||
}
|
||||
|
||||
impl EpochShared {
|
||||
pub fn new() -> EpochShared {
|
||||
EpochShared {
|
||||
global_epoch: AtomicU64::new(2),
|
||||
participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
|
||||
broadcast_lock: spin::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register(&self) -> LocalHandle {
|
||||
LocalHandle {
|
||||
global: self,
|
||||
last_slot: AtomicUsize::new(0), // todo: choose more intelligently
|
||||
}
|
||||
}
|
||||
|
||||
fn release_pin(&self, slot: usize, _epoch: u64) {
|
||||
let global_epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
self.participants[slot].store(global_epoch, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
|
||||
// pick a slot
|
||||
let mut slot = slot_hint;
|
||||
let epoch = loop {
|
||||
let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
|
||||
if old & 1 == 0 {
|
||||
// Got this slot
|
||||
break old;
|
||||
}
|
||||
|
||||
// the slot was busy by another thread / process. try a different slot
|
||||
slot += 1;
|
||||
if slot == NUM_SLOTS {
|
||||
slot = 0;
|
||||
}
|
||||
continue;
|
||||
};
|
||||
(slot, epoch)
|
||||
}
|
||||
|
||||
pub(crate) fn advance(&self) -> u64 {
|
||||
// Advance the global epoch
|
||||
let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
|
||||
// Anyone that release their pin after this will update their slot.
|
||||
old_epoch + 2
|
||||
}
|
||||
|
||||
pub(crate) fn broadcast(&self) {
|
||||
let Some(_guard) = self.broadcast_lock.try_lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
let old_epoch = epoch.wrapping_sub(2);
|
||||
|
||||
// Update all free slots.
|
||||
for i in 0..NUM_SLOTS {
|
||||
// TODO: check result, as a sanity check. It should either be the old epoch, or pinned
|
||||
let _ = self.participants[i].compare_exchange(
|
||||
old_epoch,
|
||||
epoch,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
}
|
||||
|
||||
// FIXME: memory fence here, since we used Relaxed?
|
||||
}
|
||||
|
||||
pub(crate) fn get_oldest(&self) -> u64 {
|
||||
// Read all slots.
|
||||
let now = self.global_epoch.load(Ordering::Relaxed);
|
||||
let mut oldest = now;
|
||||
for i in 0..NUM_SLOTS {
|
||||
let this_epoch = self.participants[i].load(Ordering::Relaxed);
|
||||
let delta = now.wrapping_sub(this_epoch);
|
||||
if delta > u64::MAX / 2 {
|
||||
// this is very recent
|
||||
} else if delta > now.wrapping_sub(oldest) {
|
||||
oldest = this_epoch;
|
||||
}
|
||||
}
|
||||
oldest
|
||||
}
|
||||
|
||||
pub(crate) fn get_current(&self) -> u64 {
|
||||
self.global_epoch.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct EpochPin<'e> {
|
||||
slot: usize,
|
||||
pub(crate) epoch: u64,
|
||||
|
||||
handle: &'e LocalHandle<'e>,
|
||||
}
|
||||
|
||||
impl<'e> Drop for EpochPin<'e> {
|
||||
fn drop(&mut self) {
|
||||
self.handle.global.release_pin(self.slot, self.epoch);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LocalHandle<'g> {
|
||||
global: &'g EpochShared,
|
||||
|
||||
last_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<'g> LocalHandle<'g> {
|
||||
pub fn pin(&self) -> EpochPin {
|
||||
let (slot, epoch) = self
|
||||
.global
|
||||
.pin_internal(self.last_slot.load(Ordering::Relaxed));
|
||||
self.last_slot.store(slot, Ordering::Relaxed);
|
||||
EpochPin {
|
||||
handle: self,
|
||||
epoch,
|
||||
slot,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,583 +0,0 @@
|
||||
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
|
||||
//!
|
||||
//! The data structure is described in these two papers:
|
||||
//!
|
||||
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
|
||||
//! The adaptive radix tree: ARTful indexing for main-memory databases.
|
||||
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
|
||||
//! https://db.in.tum.de/~leis/papers/ART.pdf
|
||||
//!
|
||||
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
|
||||
//! The ART of practical synchronization.
|
||||
//! 1-8. 10.1145/2933349.2933352.
|
||||
//! https://db.in.tum.de/~leis/papers/artsync.pdf
|
||||
//!
|
||||
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
|
||||
//! use.
|
||||
//!
|
||||
//! The papers mention a few different variants. We have made the following choices in this
|
||||
//! implementation:
|
||||
//!
|
||||
//! - All keys have the same length
|
||||
//!
|
||||
//! - Single-value leaves.
|
||||
//!
|
||||
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
|
||||
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
|
||||
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
|
||||
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
|
||||
//! create create one-way nodes to store them. (There was no particular reason for this choice,
|
||||
//! the "hybrid" approach described in the paper might be better.)
|
||||
//!
|
||||
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
|
||||
//! ROWEX, which generally performs better when there is contention, but that is not important
|
||||
//! for use and Optimisic Lock Coupling is simpler to implement.
|
||||
//!
|
||||
//! ## Requirements
|
||||
//!
|
||||
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
|
||||
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
|
||||
//! requirements, which is why we had to write our own. Namely:
|
||||
//!
|
||||
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
|
||||
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
|
||||
//! feature, which still nightly-only experimental as of this writing).
|
||||
//!
|
||||
//! - The data structure is accessed from multiple processes. Only one process updates the data
|
||||
//! structure, but other processes perform reads. That rules out using built-in Rust locking
|
||||
//! primitives like Mutex and RwLock, and most crates too.
|
||||
//!
|
||||
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
|
||||
//! That rules out using PostgreSQL LWLocks for the locking.
|
||||
//!
|
||||
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
|
||||
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
|
||||
//!
|
||||
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
|
||||
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
|
||||
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
|
||||
//! however.)
|
||||
//!
|
||||
//! - The keys in the integrated cache are 17 bytes long.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
|
||||
//! happens in three stages:
|
||||
//!
|
||||
//! 0. A fixed area of shared memory is allocated at postmaster startup.
|
||||
//!
|
||||
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
|
||||
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
|
||||
//! the processes through fork().
|
||||
//!
|
||||
//! 2. One process may have write-access to the struct, by calling
|
||||
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
|
||||
//!
|
||||
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
|
||||
//!
|
||||
//! "Write access" means that you can insert / update / delete values in the tree.
|
||||
//!
|
||||
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
|
||||
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
|
||||
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
|
||||
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
|
||||
//! problem, the version check could be passed up to the caller, so that the caller could detect the
|
||||
//! lost updates and retry the operation.
|
||||
//!
|
||||
//! ## Implementation
|
||||
//!
|
||||
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
|
||||
//! since there is an Internal and Leaf variant of each)
|
||||
//!
|
||||
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
|
||||
//! node.
|
||||
//!
|
||||
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
|
||||
//! abstractions on top.
|
||||
//!
|
||||
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
|
||||
//!
|
||||
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
|
||||
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
|
||||
//! memory segment).
|
||||
//!
|
||||
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
|
||||
//! immediately deallocated, but stays around for as long as concurrent readers might still have
|
||||
//! pointers to them. This is enforced by an epoch system. This is similar to
|
||||
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
|
||||
//! communicating over the shared memory segment.
|
||||
//!
|
||||
//! ## See also
|
||||
//!
|
||||
//! There are some existing Rust ART implementations out there, but none of them filled all
|
||||
//! the requirements:
|
||||
//!
|
||||
//! - https://github.com/XiangpengHao/congee
|
||||
//! - https://github.com/declanvk/blart
|
||||
//!
|
||||
//! ## TODO
|
||||
//!
|
||||
//! - Removing values has not been implemented
|
||||
|
||||
mod algorithm;
|
||||
pub mod allocator;
|
||||
mod epoch;
|
||||
|
||||
use algorithm::RootPtr;
|
||||
use algorithm::node_ptr::NodePtr;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use allocator::ArtAllocator;
|
||||
pub use allocator::ArtMultiSlabAllocator;
|
||||
pub use allocator::OutOfMemoryError;
|
||||
|
||||
/// Fixed-length key type.
|
||||
///
|
||||
pub trait Key: Debug {
|
||||
const KEY_LEN: usize;
|
||||
|
||||
fn as_bytes(&self) -> &[u8];
|
||||
}
|
||||
|
||||
/// Values stored in the tree
|
||||
///
|
||||
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
|
||||
/// the old sticks around until all readers that might see the old value are gone.
|
||||
// fixme obsolete, no longer needs Clone
|
||||
pub trait Value {}
|
||||
|
||||
const MAX_GARBAGE: usize = 1024;
|
||||
|
||||
/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
|
||||
pub struct Tree<V: Value> {
|
||||
/// For simplicity, so that we never need to grow or shrink the root, the root node is always an
|
||||
/// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
|
||||
/// indirection to every lookup)
|
||||
root: RootPtr<V>,
|
||||
|
||||
writer_attached: AtomicBool,
|
||||
|
||||
epoch: epoch::EpochShared,
|
||||
}
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for Tree<V> {}
|
||||
unsafe impl<V: Value + Send> Send for Tree<V> {}
|
||||
|
||||
struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
|
||||
unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
|
||||
|
||||
impl<V> GarbageQueue<V> {
|
||||
fn new() -> GarbageQueue<V> {
|
||||
GarbageQueue(VecDeque::with_capacity(MAX_GARBAGE))
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
|
||||
self.0.push_front((ptr, epoch));
|
||||
}
|
||||
|
||||
fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
|
||||
if let Some(back) = self.0.back() {
|
||||
if back.1 < cutoff_epoch {
|
||||
return Some(self.0.pop_back().unwrap().0);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct created at postmaster startup
|
||||
pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
allocator: &'t A,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
/// The worker process has a reference to this. The write operations are only safe
|
||||
/// from the worker process
|
||||
pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
pub allocator: &'t A,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
/// Obsolete nodes that cannot be recycled until their epoch expires.
|
||||
garbage: spin::Mutex<GarbageQueue<V>>,
|
||||
}
|
||||
|
||||
/// The backends have a reference to this. It cannot be used to modify the tree
|
||||
pub struct TreeReadAccess<'t, K: Key, V: Value>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
|
||||
pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
|
||||
let tree_ptr = allocator.alloc_tree();
|
||||
let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
|
||||
let init = Tree {
|
||||
root: algorithm::new_root(allocator).expect("out of memory"),
|
||||
writer_attached: AtomicBool::new(false),
|
||||
epoch: epoch::EpochShared::new(),
|
||||
};
|
||||
unsafe { tree_ptr.write(init) };
|
||||
|
||||
TreeInitStruct {
|
||||
tree: unsafe { tree_ptr.as_ref() },
|
||||
allocator,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
|
||||
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
|
||||
if previously_attached {
|
||||
panic!("writer already attached");
|
||||
}
|
||||
TreeWriteAccess {
|
||||
tree: self.tree,
|
||||
allocator: self.allocator,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
garbage: spin::Mutex::new(GarbageQueue::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
|
||||
TreeReadAccess {
|
||||
tree: self.tree,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
|
||||
pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
|
||||
where
|
||||
't: 'g,
|
||||
{
|
||||
TreeWriteGuard {
|
||||
tree_writer: self,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
created_garbage: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeReadGuard<'e, K, V>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'e Tree<V>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
|
||||
pub fn get(&'e self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeWriteGuard<'e, K, V, A>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
created_garbage: bool,
|
||||
}
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
/// Get a value
|
||||
pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if existing.is_some() {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
// FIXME: It's not clear if OOM is expected while removing. It seems
|
||||
// not nice, but shrinking a node can OOM. Then again, we could opt
|
||||
// to not shrink a node if we cannot allocate, to live a little longer.
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Try to remove value and return the old value.
|
||||
pub fn remove_and_return(self, key: &K) -> Option<V>
|
||||
where
|
||||
V: Clone,
|
||||
{
|
||||
let mut old = None;
|
||||
self.update_with_fn(key, |existing| {
|
||||
old = existing.cloned();
|
||||
UpdateAction::Remove
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
old
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
///
|
||||
/// The function is passed a reference to the existing value, if any. If the function
|
||||
/// returns None, the value is removed from the tree (or if there was no existing value,
|
||||
/// does nothing). If the function returns Some, the existing value is replaced, of if there
|
||||
/// was no existing value, it is inserted. FIXME: update comment
|
||||
pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
|
||||
|
||||
if self.created_garbage {
|
||||
let _ = self.collect_garbage();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
|
||||
self.tree_writer
|
||||
.garbage
|
||||
.lock()
|
||||
.remember_obsolete_node(ptr, self.epoch_pin.epoch);
|
||||
self.created_garbage = true;
|
||||
}
|
||||
|
||||
// returns number of nodes recycled
|
||||
fn collect_garbage(&self) -> usize {
|
||||
self.tree_writer.tree.epoch.advance();
|
||||
self.tree_writer.tree.epoch.broadcast();
|
||||
|
||||
let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
|
||||
|
||||
let mut result = 0;
|
||||
let mut garbage_queue = self.tree_writer.garbage.lock();
|
||||
while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
|
||||
ptr.deallocate(self.tree_writer.allocator);
|
||||
result += 1;
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
done: bool,
|
||||
pub next_key: Vec<u8>,
|
||||
max_key: Option<Vec<u8>>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<K> TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
pub fn new_wrapping() -> TreeIterator<K> {
|
||||
TreeIterator {
|
||||
done: false,
|
||||
next_key: vec![0; K::KEY_LEN],
|
||||
max_key: None,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
|
||||
let result = TreeIterator {
|
||||
done: false,
|
||||
next_key: Vec::from(range.start.as_bytes()),
|
||||
max_key: Some(Vec::from(range.end.as_bytes())),
|
||||
phantom_key: PhantomData,
|
||||
};
|
||||
assert_eq!(result.next_key.len(), K::KEY_LEN);
|
||||
assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
|
||||
where
|
||||
V: Value,
|
||||
{
|
||||
if self.done {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut wrapped_around = false;
|
||||
loop {
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
if let Some((k, v)) =
|
||||
algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin)
|
||||
{
|
||||
assert_eq!(k.len(), K::KEY_LEN);
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
|
||||
// Check if we reached the end of the range
|
||||
if let Some(max_key) = &self.max_key {
|
||||
if k.as_slice() >= max_key.as_slice() {
|
||||
self.done = true;
|
||||
break None;
|
||||
}
|
||||
}
|
||||
|
||||
// increment the key
|
||||
self.next_key = k.clone();
|
||||
increment_key(self.next_key.as_mut_slice());
|
||||
let k = k.as_slice().into();
|
||||
|
||||
break Some((k, v));
|
||||
} else {
|
||||
if self.max_key.is_some() {
|
||||
self.done = true;
|
||||
} else {
|
||||
// Start from beginning
|
||||
if !wrapped_around {
|
||||
for i in 0..K::KEY_LEN {
|
||||
self.next_key[i] = 0;
|
||||
}
|
||||
wrapped_around = true;
|
||||
continue;
|
||||
} else {
|
||||
// The tree is completely empty
|
||||
// FIXME: perhaps we should remember the starting point instead.
|
||||
// Currently this will scan some ranges twice.
|
||||
break None;
|
||||
}
|
||||
}
|
||||
break None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn increment_key(key: &mut [u8]) -> bool {
|
||||
for i in (0..key.len()).rev() {
|
||||
let (byte, overflow) = key[i].overflowing_add(1);
|
||||
key[i] = byte;
|
||||
if !overflow {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
// Debugging functions
|
||||
impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
|
||||
pub fn get_statistics(&self) -> ArtTreeStatistics {
|
||||
self.allocator.get_statistics();
|
||||
ArtTreeStatistics {
|
||||
blocks: self.allocator.inner.block_allocator.get_statistics(),
|
||||
slabs: self.allocator.get_statistics(),
|
||||
epoch: self.tree.epoch.get_current(),
|
||||
oldest_epoch: self.tree.epoch.get_oldest(),
|
||||
num_garbage: self.garbage.lock().0.len() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtTreeStatistics {
|
||||
pub blocks: allocator::block::BlockAllocatorStats,
|
||||
pub slabs: allocator::ArtMultiSlabStats,
|
||||
|
||||
pub epoch: u64,
|
||||
pub oldest_epoch: u64,
|
||||
pub num_garbage: u64,
|
||||
}
|
||||
@@ -1,236 +0,0 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crate::ArtAllocator;
|
||||
use crate::ArtMultiSlabAllocator;
|
||||
use crate::TreeInitStruct;
|
||||
use crate::TreeIterator;
|
||||
use crate::TreeWriteAccess;
|
||||
use crate::UpdateAction;
|
||||
|
||||
use crate::{Key, Value};
|
||||
|
||||
use rand::Rng;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl TestKey {
|
||||
const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
|
||||
const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
|
||||
}
|
||||
|
||||
impl Key for TestKey {
|
||||
const KEY_LEN: usize = TEST_KEY_LEN;
|
||||
fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for usize {}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let w = tree_writer.start_write();
|
||||
let res = w.insert(&(*k).into(), idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let r = tree_writer.start_read();
|
||||
let value = r.get(&(*k).into());
|
||||
assert_eq!(value, Some(idx).as_ref());
|
||||
}
|
||||
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.contains(&key) {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
struct TestValue(AtomicUsize);
|
||||
|
||||
impl TestValue {
|
||||
fn new(val: usize) -> TestValue {
|
||||
TestValue(AtomicUsize::new(val))
|
||||
}
|
||||
|
||||
fn load(&self) -> usize {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for TestValue {}
|
||||
|
||||
impl Clone for TestValue {
|
||||
fn clone(&self) -> TestValue {
|
||||
TestValue::new(self.load())
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TestValue {
|
||||
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op<A: ArtAllocator<TestValue>>(
|
||||
op: &TestOp,
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
eprintln!("applying op: {op:?}");
|
||||
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
// apply to Art tree
|
||||
let w = tree.start_write();
|
||||
w.update_with_fn(&op.0, |existing| {
|
||||
assert_eq!(existing.map(TestValue::load), shadow_existing);
|
||||
|
||||
match (existing, op.1) {
|
||||
(None, None) => UpdateAction::Nothing,
|
||||
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
|
||||
(Some(_old_val), None) => UpdateAction::Remove,
|
||||
(Some(old_val), Some(new_val)) => {
|
||||
old_val.0.store(new_val, Ordering::Relaxed);
|
||||
UpdateAction::Nothing
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("out of memory");
|
||||
}
|
||||
|
||||
fn test_iter<A: ArtAllocator<TestValue>>(
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
let mut shadow_iter = shadow.iter();
|
||||
let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
|
||||
|
||||
loop {
|
||||
let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
|
||||
let r = tree.start_read();
|
||||
let item = iter.next(&r);
|
||||
|
||||
if shadow_item != item.map(|(k, v)| (k, v.load())) {
|
||||
eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
|
||||
tree.start_read().dump(&mut std::io::stderr());
|
||||
|
||||
eprintln!("SHADOW:");
|
||||
for si in shadow {
|
||||
eprintln!("key: {:?}, val: {}", si.0, si.1);
|
||||
}
|
||||
panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
|
||||
}
|
||||
if item.is_none() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let mut key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
if rng.random_bool(0.10) {
|
||||
key = TestKey::from(u128::from(&key) | 0xffffffff);
|
||||
}
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &tree_writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user