Add testing utilities for the hash map; fix freelist bugs

This commit is contained in:
David Freifeld
2025-06-16 16:01:46 -07:00
parent ac87544e79
commit bb1e359872
4 changed files with 161 additions and 103 deletions

View File

@@ -20,7 +20,7 @@ pub mod entry;
mod tests;
use core::{CoreHashMap, INVALID_POS};
use entry::{Entry, OccupiedEntry};
use entry::{Entry, OccupiedEntry, PrevPos};
#[derive(Debug)]
pub struct OutOfMemoryError();
@@ -289,15 +289,15 @@ where
for i in old_num_buckets..num_buckets {
let bucket_ptr = buckets_ptr.add(i as usize);
bucket_ptr.write(core::Bucket {
next: if i < num_buckets {
next: if i < num_buckets-1 {
i as u32 + 1
} else {
inner.free_head
},
prev: if i > 0 {
i as u32 - 1
PrevPos::Chained(i as u32 - 1)
} else {
INVALID_POS
PrevPos::First(INVALID_POS)
},
inner: None,
});
@@ -315,6 +315,10 @@ where
if num_buckets > map.inner.get_num_buckets() as u32 {
panic!("shrink called with a larger number of buckets");
}
_ = self
.shmem_handle
.as_ref()
.expect("shrink called on a fixed-size hash table");
map.inner.alloc_limit = num_buckets;
}
@@ -342,9 +346,21 @@ where
// Would require making a wider error type enum with this and shmem errors.
panic!("unevicted entries in shrinked space")
}
let prev_pos = inner.buckets[i].prev;
if prev_pos != INVALID_POS {
inner.buckets[prev_pos as usize].next = inner.buckets[i].next;
match inner.buckets[i].prev {
PrevPos::First(_) => {
let next_pos = inner.buckets[i].next;
inner.free_head = next_pos;
if next_pos != INVALID_POS {
inner.buckets[next_pos as usize].prev = PrevPos::First(INVALID_POS);
}
},
PrevPos::Chained(j) => {
let next_pos = inner.buckets[i].next;
inner.buckets[j as usize].next = next_pos;
if next_pos != INVALID_POS {
inner.buckets[next_pos as usize].prev = PrevPos::Chained(j);
}
}
}
}
@@ -358,6 +374,7 @@ where
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
let buckets_ptr = inner.buckets.as_mut_ptr();
self.rehash_dict(inner, buckets_ptr, end_ptr, num_buckets, num_buckets);
inner.alloc_limit = INVALID_POS;
Ok(())
}

View File

@@ -13,7 +13,7 @@ pub(crate) const INVALID_POS: u32 = u32::MAX;
// Bucket
pub(crate) struct Bucket<K, V> {
pub(crate) next: u32,
pub(crate) prev: u32,
pub(crate) prev: PrevPos,
pub(crate) inner: Option<(K, V)>,
}
@@ -65,9 +65,9 @@ where
INVALID_POS
},
prev: if i > 0 {
i as u32 - 1
PrevPos::Chained(i as u32 - 1)
} else {
INVALID_POS
PrevPos::First(INVALID_POS)
},
inner: None,
});
@@ -176,24 +176,15 @@ where
_key: key.clone(), // TODO(quantumish): clone unavoidable?
bucket_pos: pos as u32,
map: self,
prev_pos: if prev == INVALID_POS {
// TODO(quantumish): populating this correctly would require an O(n) scan over the dictionary
// (perhaps not if we refactored the prev field to be itself something like PrevPos). The real
// question though is whether this even needs to be populated correctly? All downstream uses of
// this function so far are just for deletion, which isn't really concerned with the dictionary.
// Then again, it's unintuitive to appear to return a normal OccupiedEntry which really is fake.
PrevPos::First(todo!("unclear what to do here"))
} else {
PrevPos::Chained(prev)
}
prev_pos: prev,
})
}
pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
let mut pos = self.free_head;
let mut prev = PrevPos::First(self.free_head);
while pos!= INVALID_POS && pos >= self.alloc_limit {
while pos != INVALID_POS && pos >= self.alloc_limit {
let bucket = &mut self.buckets[pos as usize];
prev = PrevPos::Chained(pos);
pos = bucket.next;
@@ -204,16 +195,23 @@ where
match prev {
PrevPos::First(_) => {
let next_pos = self.buckets[pos as usize].next;
self.free_head = next_pos;
self.buckets[next_pos as usize].prev = INVALID_POS;
self.free_head = next_pos;
// HACK(quantumish): Really, the INVALID_POS should be the position within the dictionary.
// This isn't passed into this function, though, and so for now rather than changing that
// we can just check it from `alloc_bucket`. Not a great solution.
if next_pos != INVALID_POS {
self.buckets[next_pos as usize].prev = PrevPos::First(INVALID_POS);
}
}
PrevPos::Chained(p) => if p != INVALID_POS {
let next_pos = self.buckets[pos as usize].next;
self.buckets[p as usize].next = next_pos;
self.buckets[next_pos as usize].prev = p;
if next_pos != INVALID_POS {
self.buckets[next_pos as usize].prev = PrevPos::Chained(p);
}
},
}
let bucket = &mut self.buckets[pos as usize];
self.buckets_in_use += 1;
bucket.next = INVALID_POS;

View File

@@ -10,6 +10,7 @@ pub enum Entry<'a, 'b, K, V> {
Vacant(VacantEntry<'a, 'b, K, V>),
}
#[derive(Clone, Copy)]
pub(crate) enum PrevPos {
First(u32),
Chained(u32),
@@ -58,10 +59,14 @@ impl<'a, 'b, K, V> OccupiedEntry<'a, 'b, K, V> {
}
}
// and add it to the freelist
// and add it to the freelist
if self.map.free_head != INVALID_POS {
self.map.buckets[self.map.free_head as usize].prev = PrevPos::Chained(self.bucket_pos);
}
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
let old_value = bucket.inner.take();
bucket.next = self.map.free_head;
bucket.next = self.map.free_head;
bucket.prev = PrevPos::First(INVALID_POS);
self.map.free_head = self.bucket_pos;
self.map.buckets_in_use -= 1;
@@ -82,6 +87,9 @@ impl<'a, 'b, K: Clone + Hash + Eq, V> VacantEntry<'a, 'b, K, V> {
return Err(FullError());
}
let bucket = &mut self.map.buckets[pos as usize];
if let PrevPos::First(INVALID_POS) = bucket.prev {
bucket.prev = PrevPos::First(self.dict_pos);
}
bucket.next = self.map.dictionary[self.dict_pos as usize];
self.map.dictionary[self.dict_pos as usize] = pos;

View File

@@ -1,6 +1,8 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::{Debug, Formatter};
use std::mem::uninitialized;
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicUsize, Ordering};
use crate::hash::HashMapAccess;
@@ -132,8 +134,6 @@ fn apply_op(
map: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
) {
eprintln!("applying op: {op:?}");
// apply the change to the shadow tree first
let shadow_existing = if let Some(v) = op.1 {
shadow.insert(op.0, v)
@@ -161,16 +161,42 @@ fn apply_op(
assert_eq!(shadow_existing, hash_existing);
}
/// Applies `num_ops` random operations to both the real hash map and the
/// shadow `BTreeMap`, letting `apply_op` check that the two stay consistent.
///
/// Keys are drawn uniformly from `[0, size)`; the value stored on an insert
/// is the operation index, so repeated inserts of a key overwrite it.
///
/// NOTE(review): the parameter was originally named `del_prob`, but
/// `random_bool(p) == true` produces `Some(i)`, which `apply_op` treats as
/// an *insert* (shadow.insert) — so this is really the insertion
/// probability. Renamed for clarity; callers pass it positionally, so the
/// rename is backward-compatible and behavior is unchanged.
fn do_random_ops(
    num_ops: usize,
    size: u32,
    insert_prob: f64,
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
    rng: &mut rand::rngs::ThreadRng,
) {
    for i in 0..num_ops {
        let key: TestKey = ((rng.next_u32() % size) as u128).into();
        // Some(i) = insert/overwrite with the op index, None = delete.
        let op = TestOp(key, if rng.random_bool(insert_prob) { Some(i) } else { None });
        apply_op(&op, writer, shadow);
    }
}
/// Removes `num_ops` entries from the map, using the shadow tree as the
/// source of truth for which keys currently exist: the smallest remaining
/// key is popped from the shadow and deleted from the real map each round.
///
/// Panics (with a descriptive message) if the shadow holds fewer than
/// `num_ops` live entries.
fn do_deletes(
    num_ops: usize,
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
) {
    // `_` instead of the original unused `i` silences the compiler warning.
    for _ in 0..num_ops {
        let (k, _) = shadow
            .pop_first()
            .expect("do_deletes called with more ops than live entries");
        let hash = writer.get_hash_value(&k);
        writer.remove_with_hash(&k, hash);
    }
}
#[test]
fn random_ops() {
const MAX_MEM_SIZE: usize = 10000000;
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
let shmem = ShmemHandle::new("test_inserts", 0, 10000000).unwrap();
let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
let mut writer = init_struct.attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
let mut rng = rand::rng();
for i in 0..100000 {
@@ -190,88 +216,97 @@ fn random_ops() {
#[test]
fn test_grow() {
const MEM_SIZE: usize = 10000000;
let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
let shmem = ShmemHandle::new("test_grow", 0, 10000000).unwrap();
let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1000, shmem);
let mut writer = init_struct.attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let mut rng = rand::rng();
for i in 0..10000 {
let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &mut writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
//eprintln!("stats: {:?}", tree_writer.get_statistics());
//test_iter(&tree_writer, &shadow);
}
}
do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
writer.grow(1500).unwrap();
for i in 0..10000 {
let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &mut writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
//eprintln!("stats: {:?}", tree_writer.get_statistics());
//test_iter(&tree_writer, &shadow);
}
}
do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
}
#[test]
fn test_shrink() {
const MEM_SIZE: usize = 10000000;
let shmem = ShmemHandle::new("test_shrink", 0, MEM_SIZE).unwrap();
let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
let mut writer = init_struct.attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let mut rng = rand::rng();
for i in 0..100 {
let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &mut writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
}
}
writer.begin_shrink(1000);
for i in 1000..1500 {
if let Some(entry) = writer.entry_at_bucket(i) {
fn do_shrink(
writer: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
from: u32,
to: u32
) {
writer.begin_shrink(to);
for i in to..from {
if let Some(entry) = writer.entry_at_bucket(i as usize) {
shadow.remove(&entry._key);
entry.remove();
}
}
writer.finish_shrink().unwrap();
for i in 0..10000 {
let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &mut writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
}
}
}
#[test]
fn test_shrink() {
    // Build a 1500-bucket map backed by shared memory and mirror every
    // operation in a BTreeMap so results can be cross-checked.
    let shmem = ShmemHandle::new("test_shrink", 0, 10000000).unwrap();
    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
    let mut writer = init_struct.attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    // Fill the table with random traffic, then shrink 1500 -> 1000 buckets.
    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
    do_shrink(&mut writer, &mut shadow, 1500, 1000);

    // Drop 500 entries and keep exercising the smaller table; occupancy
    // must never exceed the post-shrink capacity.
    do_deletes(500, &mut writer, &mut shadow);
    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
    assert!(writer.get_num_buckets_in_use() <= 1000);
}
#[test]
fn test_shrink_grow_seq() {
    // Interleaves random traffic with shrink/grow cycles to stress the
    // freelist bookkeeping across resizes.
    //
    // BUGFIX(review): this test previously opened its shared memory as
    // "test_shrink", the same name used by test_shrink() (and others). The
    // Rust test harness runs tests in parallel by default, so two live
    // segments could collide on the same name; use a unique name instead.
    let shmem = ShmemHandle::new("test_shrink_grow_seq", 0, 10000000).unwrap();
    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
    let mut writer = init_struct.attach_writer();
    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
    eprintln!("Shrinking to 750");
    do_shrink(&mut writer, &mut shadow, 1000, 750);
    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 1500");
    writer.grow(1500).unwrap();
    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
    eprintln!("Shrinking to 200");
    do_shrink(&mut writer, &mut shadow, 1500, 200);
    do_deletes(100, &mut writer, &mut shadow);
    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 10k");
    writer.grow(10000).unwrap();
    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
}
#[test]
#[should_panic]
fn test_shrink_bigger() {
    // Shrinking to MORE buckets than the table has must panic
    // ("shrink called with a larger number of buckets").
    //
    // BUGFIX(review): use a unique shmem name — "test_shrink" was shared
    // with other tests, which can collide when tests run in parallel.
    let shmem = ShmemHandle::new("test_shrink_bigger", 0, 10000000).unwrap();
    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
    let mut writer = init_struct.attach_writer();
    writer.begin_shrink(2000);
}
#[test]
#[should_panic]
fn test_shrink_early_finish() {
    // Calling finish_shrink without a preceding begin_shrink must fail;
    // the unwrap turns that failure into the expected panic.
    //
    // BUGFIX(review): use a unique shmem name — "test_shrink" was shared
    // with other tests, which can collide when tests run in parallel.
    let shmem = ShmemHandle::new("test_shrink_early_finish", 0, 10000000).unwrap();
    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(1500, shmem);
    let mut writer = init_struct.attach_writer();
    writer.finish_shrink().unwrap();
}
#[test]
#[should_panic]
fn test_shrink_fixed_size() {
    // A map initialized over a caller-provided fixed area has no shared
    // memory handle to resize, so begin_shrink must panic (it `expect`s the
    // handle with "shrink called on a fixed-size hash table").
    let mut area = [MaybeUninit::uninit(); 10000];
    let init_struct = HashMapInit::<TestKey, usize>::init_in_fixed_area(3, &mut area);
    let mut writer = init_struct.attach_writer();
    writer.begin_shrink(1);
}