From bb1e359872ca19b311466d8e3a363fb41b849cdf Mon Sep 17 00:00:00 2001
From: David Freifeld
Date: Mon, 16 Jun 2025 16:01:46 -0700
Subject: [PATCH] Add testing utilities for hash map, freelist bugfixes

The freelist is now doubly linked via PrevPos, so unlinking a bucket
(in finish_shrink and alloc_bucket) must patch both the predecessor's
next pointer and the successor's prev pointer, and must not index the
bucket array with INVALID_POS when the unlinked bucket is the tail.
grow() also chained one bucket too many into the new region
(off-by-one), and begin_shrink() now panics up front on fixed-size
maps. The tests gain do_random_ops/do_deletes/do_shrink helpers plus
coverage for shrink/grow sequences and shrink error paths.
---
 libs/neon-shmem/src/hash.rs       |  31 +++--
 libs/neon-shmem/src/hash/core.rs  |  36 +++---
 libs/neon-shmem/src/hash/entry.rs |  12 +-
 libs/neon-shmem/src/hash/tests.rs | 184 ++++++++++++++++++------------
 4 files changed, 160 insertions(+), 103 deletions(-)

diff --git a/libs/neon-shmem/src/hash.rs b/libs/neon-shmem/src/hash.rs
index 907a32bfca..45e593fc48 100644
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -20,7 +20,7 @@ pub mod entry;
 mod tests;
 
 use core::{CoreHashMap, INVALID_POS};
-use entry::{Entry, OccupiedEntry};
+use entry::{Entry, OccupiedEntry, PrevPos};
 
 #[derive(Debug)]
 pub struct OutOfMemoryError();
@@ -289,15 +289,15 @@ where
         for i in old_num_buckets..num_buckets {
             let bucket_ptr = buckets_ptr.add(i as usize);
             bucket_ptr.write(core::Bucket {
-                next: if i < num_buckets {
+                next: if i < num_buckets - 1 {
                     i as u32 + 1
                 } else {
                     inner.free_head
                 },
                 prev: if i > 0 {
-                    i as u32 - 1
+                    PrevPos::Chained(i as u32 - 1)
                 } else {
-                    INVALID_POS
+                    PrevPos::First(INVALID_POS)
                 },
                 inner: None,
             });
@@ -315,6 +315,10 @@ where
         if num_buckets > map.inner.get_num_buckets() as u32 {
             panic!("shrink called with a larger number of buckets");
         }
+        _ = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
         map.inner.alloc_limit = num_buckets;
     }
 
@@ -342,9 +346,21 @@ where
                 // Would require making a wider error type enum with this and shmem errors.
                 panic!("unevicted entries in shrinked space")
             }
-            let prev_pos = inner.buckets[i].prev;
-            if prev_pos != INVALID_POS {
-                inner.buckets[prev_pos as usize].next = inner.buckets[i].next;
+            match inner.buckets[i].prev {
+                PrevPos::First(_) => {
+                    let next_pos = inner.buckets[i].next;
+                    inner.free_head = next_pos; // bucket i was the freelist head
+                    if next_pos != INVALID_POS {
+                        inner.buckets[next_pos as usize].prev = PrevPos::First(INVALID_POS);
+                    }
+                },
+                PrevPos::Chained(j) => {
+                    let next_pos = inner.buckets[i].next;
+                    inner.buckets[j as usize].next = next_pos; // bridge predecessor j to i's successor
+                    if next_pos != INVALID_POS {
+                        inner.buckets[next_pos as usize].prev = PrevPos::Chained(j);
+                    }
+                }
             }
         }
 
@@ -358,6 +374,7 @@ where
         let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
         let buckets_ptr = inner.buckets.as_mut_ptr();
         self.rehash_dict(inner, buckets_ptr, end_ptr, num_buckets, num_buckets);
+        inner.alloc_limit = INVALID_POS;
         Ok(())
     }
 
diff --git a/libs/neon-shmem/src/hash/core.rs b/libs/neon-shmem/src/hash/core.rs
index 1e0ebede4a..d02d234a87 100644
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -13,7 +13,7 @@ pub(crate) const INVALID_POS: u32 = u32::MAX;
 
 // Bucket
 pub(crate) struct Bucket<K, V> {
     pub(crate) next: u32,
-    pub(crate) prev: u32,
+    pub(crate) prev: PrevPos,
     pub(crate) inner: Option<(K, V)>,
 }
 
@@ -65,9 +65,9 @@ where
                     INVALID_POS
                 },
                 prev: if i > 0 {
-                    i as u32 - 1
+                    PrevPos::Chained(i as u32 - 1)
                 } else {
-                    INVALID_POS
+                    PrevPos::First(INVALID_POS)
                 },
                 inner: None,
             });
@@ -176,24 +176,15 @@
             _key: key.clone(), // TODO(quantumish): clone unavoidable?
             bucket_pos: pos as u32,
             map: self,
-            prev_pos: if prev == INVALID_POS {
-                // TODO(quantumish): populating this correctly would require an O(n) scan over the dictionary
-                // (perhaps not if we refactored the prev field to be itself something like PrevPos). The real
-                // question though is whether this even needs to be populated correctly? All downstream uses of
-                // this function so far are just for deletion, which isn't really concerned with the dictionary.
-                // Then again, it's unintuitive to appear to return a normal OccupiedEntry which really is fake.
-                PrevPos::First(todo!("unclear what to do here"))
-            } else {
-                PrevPos::Chained(prev)
-            }
+            prev_pos: prev,
         })
     }
-    
+
     pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
         let mut pos = self.free_head;
         let mut prev = PrevPos::First(self.free_head);
-        while pos!= INVALID_POS && pos >= self.alloc_limit {
+        while pos != INVALID_POS && pos >= self.alloc_limit {
             let bucket = &mut self.buckets[pos as usize];
             prev = PrevPos::Chained(pos);
             pos = bucket.next;
         }
@@ -204,16 +195,23 @@
         match prev {
             PrevPos::First(_) => {
                 let next_pos = self.buckets[pos as usize].next;
-                self.free_head = next_pos;
-                self.buckets[next_pos as usize].prev = INVALID_POS;
+                self.free_head = next_pos;
+                // HACK(quantumish): `First` should really carry the new head's dictionary
+                // slot, but that position isn't passed into this function, so store
+                // INVALID_POS as a placeholder and let `VacantEntry::insert` patch it up.
+                if next_pos != INVALID_POS {
+                    self.buckets[next_pos as usize].prev = PrevPos::First(INVALID_POS);
+                }
             }
             PrevPos::Chained(p) => if p != INVALID_POS {
                 let next_pos = self.buckets[pos as usize].next;
                 self.buckets[p as usize].next = next_pos;
-                self.buckets[next_pos as usize].prev = p;
+                if next_pos != INVALID_POS {
+                    self.buckets[next_pos as usize].prev = PrevPos::Chained(p);
+                }
             },
         }
-        
+
         let bucket = &mut self.buckets[pos as usize];
         self.buckets_in_use += 1;
         bucket.next = INVALID_POS;
diff --git a/libs/neon-shmem/src/hash/entry.rs b/libs/neon-shmem/src/hash/entry.rs
index cb9efcf742..147a464745 100644
--- a/libs/neon-shmem/src/hash/entry.rs
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -10,6 +10,7 @@ pub enum Entry<'a, 'b, K, V> {
     Vacant(VacantEntry<'a, 'b, K, V>),
 }
 
+#[derive(Clone, Copy)]
 pub(crate) enum PrevPos {
     First(u32),
     Chained(u32),
@@ -58,10 +59,14 @@ impl<'a, 'b, K, V> OccupiedEntry<'a, 'b, K, V> {
         }
 
-        // and add it to the freelist
+        // and add it to the freelist, back-linking the old head to this bucket
+        if self.map.free_head != INVALID_POS {
+            self.map.buckets[self.map.free_head as usize].prev = PrevPos::Chained(self.bucket_pos);
+        }
         let bucket = &mut self.map.buckets[self.bucket_pos as usize];
         let old_value = bucket.inner.take();
-        bucket.next = self.map.free_head;
+        bucket.next = self.map.free_head;
+        bucket.prev = PrevPos::First(INVALID_POS);
         self.map.free_head = self.bucket_pos;
         self.map.buckets_in_use -= 1;
 
@@ -82,6 +87,9 @@ impl<'a, 'b, K: Clone + Hash + Eq, V> VacantEntry<'a, 'b, K, V> {
             return Err(FullError());
         }
         let bucket = &mut self.map.buckets[pos as usize];
+        if let PrevPos::First(INVALID_POS) = bucket.prev {
+            bucket.prev = PrevPos::First(self.dict_pos); // fill the slot alloc_bucket left as a placeholder
+        }
         bucket.next = self.map.dictionary[self.dict_pos as usize];
         self.map.dictionary[self.dict_pos as usize] = pos;
 
diff --git a/libs/neon-shmem/src/hash/tests.rs b/libs/neon-shmem/src/hash/tests.rs
index c207e35a56..b95c33e578 100644
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -1,6 +1,7 @@
 use std::collections::BTreeMap;
 use std::collections::HashSet;
 use std::fmt::{Debug, Formatter};
+use std::mem::MaybeUninit;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
 use crate::hash::HashMapAccess;
@@ -132,8 +133,6 @@ fn apply_op(
     map: &mut HashMapAccess<TestKey, usize>,
     shadow: &mut BTreeMap<TestKey, usize>,
 ) {
-    eprintln!("applying op: {op:?}");
-
     // apply the change to the shadow tree first
     let shadow_existing = if let Some(v) = op.1 {
         shadow.insert(op.0, v)
@@ -161,16 +160,42 @@
     assert_eq!(shadow_existing, hash_existing);
 }
 
+fn do_random_ops(
+    num_ops: usize,
+    size: u32,
+    insert_prob: f64,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    rng: &mut rand::rngs::ThreadRng,
+) {
+    for i in 0..num_ops {
+        let key: TestKey = ((rng.next_u32() % size) as u128).into();
+        let op = TestOp(key, if rng.random_bool(insert_prob) { Some(i) } else { None });
+        apply_op(&op, writer, shadow);
+    }
+}
+
+fn do_deletes(
+    num_ops: usize,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    for _ in 0..num_ops {
+        let (k, _) = shadow.pop_first().unwrap();
+        let hash = writer.get_hash_value(&k);
+        writer.remove_with_hash(&k, hash);
+    }
+}
+
 #[test]
 fn random_ops() {
-    const MAX_MEM_SIZE: usize = 10000000;
-    let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
+    let shmem = ShmemHandle::new("test_inserts", 0, 10000000).unwrap();
     let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
     let mut writer = init_struct.attach_writer();
 
-    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
 
     let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
     let mut rng = rand::rng();
     for i in 0..100000 {
@@ -190,88 +215,97 @@
 #[test]
 fn test_grow() {
-    const MEM_SIZE: usize = 10000000;
-    let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
-
+    let shmem = ShmemHandle::new("test_grow", 0, 10000000).unwrap();
     let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1000, shmem);
     let mut writer = init_struct.attach_writer();
-
     let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
-
     let mut rng = rand::rng();
-    for i in 0..10000 {
-        let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
-
-        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
-
-        apply_op(&op, &mut writer, &mut shadow);
-
-        if i % 1000 == 0 {
-            eprintln!("{i} ops processed");
-            //eprintln!("stats: {:?}", tree_writer.get_statistics());
-            //test_iter(&tree_writer, &shadow);
-        }
-    }
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
 
     writer.grow(1500).unwrap();
-
-    for i in 0..10000 {
-        let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
-
-        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
-
-        apply_op(&op, &mut writer, &mut shadow);
-
-        if i % 1000 == 0 {
-            eprintln!("{i} ops processed");
-            //eprintln!("stats: {:?}", tree_writer.get_statistics());
-            //test_iter(&tree_writer, &shadow);
-        }
-    }
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
 }
 
-#[test]
-fn test_shrink() {
-    const MEM_SIZE: usize = 10000000;
-    let shmem = ShmemHandle::new("test_shrink", 0, MEM_SIZE).unwrap();
-
-    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
-    let mut writer = init_struct.attach_writer();
-
-    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
-
-    let mut rng = rand::rng();
-    for i in 0..100 {
-        let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
-
-        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
-
-        apply_op(&op, &mut writer, &mut shadow);
-
-        if i % 1000 == 0 {
-            eprintln!("{i} ops processed");
-        }
-    }
-
-    writer.begin_shrink(1000);
-    for i in 1000..1500 {
-        if let Some(entry) = writer.entry_at_bucket(i) {
+fn do_shrink(
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    from: u32,
+    to: u32,
+) {
+    writer.begin_shrink(to);
+    for i in to..from {
+        if let Some(entry) = writer.entry_at_bucket(i as usize) {
             shadow.remove(&entry._key);
             entry.remove();
         }
     }
     writer.finish_shrink().unwrap();
-
-    for i in 0..10000 {
-        let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
-        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
-
-        apply_op(&op, &mut writer, &mut shadow);
-
-        if i % 1000 == 0 {
-            eprintln!("{i} ops processed");
-        }
-    }
+}
+
+#[test]
+fn test_shrink() {
+    let shmem = ShmemHandle::new("test_shrink", 0, 10000000).unwrap();
+    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
+    let mut writer = init_struct.attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    do_shrink(&mut writer, &mut shadow, 1500, 1000);
+    do_deletes(500, &mut writer, &mut shadow);
+    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
+    assert!(writer.get_num_buckets_in_use() <= 1000);
+}
+
+#[test]
+fn test_shrink_grow_seq() {
+    let shmem = ShmemHandle::new("test_shrink_grow_seq", 0, 10000000).unwrap();
+    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1000, shmem);
+    let mut writer = init_struct.attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 750");
+    do_shrink(&mut writer, &mut shadow, 1000, 750);
+    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 1500");
+    writer.grow(1500).unwrap();
+    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 200");
+    do_shrink(&mut writer, &mut shadow, 1500, 200);
+    do_deletes(100, &mut writer, &mut shadow);
+    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 10k");
+    writer.grow(10000).unwrap();
+    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_bigger() {
+    let shmem = ShmemHandle::new("test_shrink_bigger", 0, 10000000).unwrap();
+    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
+    let mut writer = init_struct.attach_writer();
+    writer.begin_shrink(2000);
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_early_finish() {
+    let shmem = ShmemHandle::new("test_shrink_early_finish", 0, 10000000).unwrap();
+    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
+    let mut writer = init_struct.attach_writer();
+    writer.finish_shrink().unwrap();
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_fixed_size() {
+    let mut area = [MaybeUninit::uninit(); 10000];
+    let init_struct = HashMapInit::<TestKey, usize>::init_in_fixed_area(3, &mut area);
+    let mut writer = init_struct.attach_writer();
+    writer.begin_shrink(1);
+}
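
-- 
Reviewer note, not part of the patch: the sketch below is a self-contained
illustration of the freelist discipline this diff enforces. `PrevPos` and
`INVALID_POS` mirror the shapes used in the patch; `FreeList`, the reduced
`Bucket`, and `unlink` are hypothetical names invented for the demo, not the
neon-shmem API. It shows the invariant the bugfixes establish: unlinking a
bucket must patch both neighbors, and must never index the bucket array with
INVALID_POS when the unlinked bucket is the freelist tail.

const INVALID_POS: u32 = u32::MAX;

// Back-link of a freelist bucket: either it is the list head (`First`,
// payload = owning dictionary slot, when known) or it follows another
// freelist bucket (`Chained`, payload = that bucket's index).
#[derive(Clone, Copy)]
enum PrevPos {
    First(u32),
    Chained(u32),
}

// Only the link fields of the patch's Bucket matter for this demo.
struct Bucket {
    next: u32,
    prev: PrevPos,
}

struct FreeList {
    buckets: Vec<Bucket>,
    free_head: u32,
}

impl FreeList {
    // Unlink bucket `i`, patching both neighbors. This is the shape of the
    // fixed code in `finish_shrink` and `alloc_bucket`: the pre-patch code
    // only updated `next` pointers and indexed with INVALID_POS at the tail.
    fn unlink(&mut self, i: u32) {
        let next = self.buckets[i as usize].next;
        match self.buckets[i as usize].prev {
            PrevPos::First(_) => {
                // `i` was the head: advance the head pointer.
                self.free_head = next;
                if next != INVALID_POS {
                    self.buckets[next as usize].prev = PrevPos::First(INVALID_POS);
                }
            }
            PrevPos::Chained(p) => {
                // `i` was mid-chain: bridge predecessor and successor.
                self.buckets[p as usize].next = next;
                if next != INVALID_POS {
                    self.buckets[next as usize].prev = PrevPos::Chained(p);
                }
            }
        }
    }
}

fn main() {
    // Freelist 0 -> 1 -> 2; unlink the middle bucket, then the (new) tail.
    let mut fl = FreeList {
        buckets: vec![
            Bucket { next: 1, prev: PrevPos::First(INVALID_POS) },
            Bucket { next: 2, prev: PrevPos::Chained(0) },
            Bucket { next: INVALID_POS, prev: PrevPos::Chained(1) },
        ],
        free_head: 0,
    };
    fl.unlink(1);
    assert_eq!(fl.buckets[0].next, 2);
    assert!(matches!(fl.buckets[2].prev, PrevPos::Chained(0)));
    fl.unlink(2); // tail case: must not touch buckets[INVALID_POS]
    assert_eq!(fl.buckets[0].next, INVALID_POS);
}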