From fb75e60c6eef99cb73a984a58a4121b91b94c059 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 21 Jun 2017 15:47:55 +0900 Subject: [PATCH] issue/136 Added hashmaps. --- src/datastruct/stacker/hashmap.rs | 53 +++++++++++-------------------- src/datastruct/stacker/heap.rs | 51 +++++++++++++++++++---------- src/lib.rs | 1 - 3 files changed, 53 insertions(+), 52 deletions(-) diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index a39b11512..e66494754 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -52,14 +52,7 @@ mod murmurhash2 { } -impl Default for BytesRef { - fn default() -> BytesRef { - BytesRef { - start: 0u32, - stop: 0u32, - } - } -} + /// `KeyValue` is the item stored in the hash table. /// The key is actually a `BytesRef` object stored in an external heap. @@ -69,15 +62,15 @@ impl Default for BytesRef { /// For this reason, the (start, stop) information is actually redundant /// and can be simplified in the future #[derive(Copy, Clone, Default)] +#[repr(packed)] struct KeyValue { key: BytesRef, - value_addr: u32, - masked_hash: u32, + hash: u32, } impl KeyValue { fn is_empty(&self) -> bool { - self.key.stop == 0u32 + self.key.is_null() } } @@ -95,7 +88,6 @@ pub struct HashMap<'a> { table: Box<[KeyValue]>, heap: &'a Heap, mask: usize, - num_bucket_power_of_2: usize, occupied: Vec, } @@ -131,12 +123,11 @@ impl<'a> HashMap<'a> { table: table.into_boxed_slice(), heap: heap, mask: table_size - 1, - num_bucket_power_of_2: num_bucket_power_of_2, occupied: Vec::with_capacity(table_size / 2), } } - fn probe(&self, hash: u64) -> QuadraticProbing { + fn probe(&self, hash: u32) -> QuadraticProbing { QuadraticProbing::compute(hash as usize, self.mask) } @@ -149,17 +140,15 @@ impl<'a> HashMap<'a> { self.heap.get_slice(bytes_ref) } - pub fn set_bucket(&mut self, masked_hash: u32, key_bytes: &[u8], bucket: usize, addr: u32) -> u32 { + pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) { self.occupied.push(bucket); self.table[bucket] = KeyValue { - key: self.heap.allocate_and_set(key_bytes), - value_addr: addr, - masked_hash: masked_hash, + key: key_bytes_ref, + hash: hash, }; - addr } - pub fn iter<'b: 'a>(&'b self) -> impl Iterator + 'b { + pub fn iter<'b: 'a>(&'b self) -> impl Iterator + 'b { let heap: &'a Heap = self.heap; let table: &'b [KeyValue] = &self.table; self.occupied @@ -167,32 +156,31 @@ impl<'a> HashMap<'a> { .cloned() .map(move |bucket: usize| { let kv = table[bucket]; - let addr = kv.value_addr; + let key = heap.get_slice(kv.key); + let addr: u32 = kv.key.addr() + 2 + key.len() as u32; (heap.get_slice(kv.key), addr) }) } - pub fn mask_hash(&self, hash: u64) -> u32 { - (hash >> self.num_bucket_power_of_2) as u32 - } - pub fn get_or_create, V: HeapAllocable>(&mut self, key: S) -> &mut V { let key_bytes: &[u8] = key.as_ref(); - let hash = murmurhash2::murmurhash2(key) as usize; - let masked_hash = self.mask_hash(hash); + let hash = murmurhash2::murmurhash2(key.as_ref()); let mut probe = self.probe(hash); loop { let bucket = probe.next_probe(); let kv: KeyValue = self.table[bucket]; if kv.is_empty() { + let key_bytes_ref = self.heap.allocate_and_set(key_bytes); let (addr, val): (u32, &mut V) = self.heap.allocate_object(); - self.set_bucket(masked_hash, key.as_ref(), bucket, addr); + assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32); + self.set_bucket(hash, key_bytes_ref, bucket); return val } - if kv.masked_hash == masked_hash { - if self.get_key(kv.key) == key_bytes { - return self.heap.get_mut_ref(kv.value_addr); + if kv.hash == hash { + let stored_key: &[u8] = self.get_key(kv.key); + if stored_key == key_bytes { + return self.heap.get_mut_ref(kv.key.addr() + 2 + stored_key.len() as u32); } } } @@ -208,8 +196,6 @@ mod tests { use super::murmurhash2::murmurhash2; use test::Bencher; use std::collections::HashSet; - use std::collections::hash_map::DefaultHasher; - use std::hash::Hasher; struct TestValue { val: u32, @@ -295,7 +281,6 @@ mod tests { .unwrap() }); } ->>>>>>> master } diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index e642e5908..7ed6439d8 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -1,12 +1,26 @@ use std::cell::UnsafeCell; use std::mem; use std::ptr; +use byteorder::{NativeEndian, ByteOrder}; /// `BytesRef` refers to a slice in tantivy's custom `Heap`. #[derive(Copy, Clone)] -pub struct BytesRef { - pub start: u32, - pub stop: u32, +pub struct BytesRef(u32); + +impl BytesRef { + pub fn is_null(&self) -> bool { + self.0 == u32::max_value() + } + + pub fn addr(&self) -> u32 { + self.0 + } +} + +impl Default for BytesRef { + fn default() -> BytesRef { + BytesRef(u32::max_value()) + } } /// Object that can be allocated in tantivy's custom `Heap`. @@ -70,7 +84,7 @@ impl Heap { /// Fetches the `&[u8]` stored on the slice defined by the `BytesRef` /// given as argumetn pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] { - self.inner().get_slice(bytes_ref.start, bytes_ref.stop) + self.inner().get_slice(bytes_ref) } /// Stores an item's data in the heap, at the given `address`. @@ -144,14 +158,19 @@ impl InnerHeap { } } - fn get_slice(&self, start: u32, stop: u32) -> &[u8] { + + fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] { + let start = bytes_ref.0; if start >= self.buffer_len { self.next_heap .as_ref() .unwrap() - .get_slice(start - self.buffer_len, stop - self.buffer_len) - } else { - &self.buffer[start as usize..stop as usize] + .get_slice(BytesRef(start - self.buffer_len)) + } + else { + let start = start as usize; + let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize; + &self.buffer[start + 2.. start + 2 + len] } } @@ -167,13 +186,13 @@ impl InnerHeap { } fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef { - let start = self.allocate_space(data.len()); - let stop = start + data.len() as u32; - self.get_mut_slice(start, stop).clone_from_slice(data); - BytesRef { - start: start as u32, - stop: stop as u32, - } + assert!(data.len() < u16::max_value() as usize); + let total_len = 2 + data.len(); + let start = self.allocate_space(total_len); + let total_buff = self.get_mut_slice(start, start + total_len as u32); + NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16); + &mut total_buff[2..].clone_from_slice(data); + BytesRef(start) } fn get_mut(&mut self, addr: u32) -> *mut u8 { @@ -188,8 +207,6 @@ impl InnerHeap { } } - - fn get_mut_ref(&mut self, addr: u32) -> &mut Item { if addr >= self.buffer_len { self.next_heap diff --git a/src/lib.rs b/src/lib.rs index ab334c14d..5d13c8299 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,6 @@ extern crate futures; extern crate futures_cpupool; extern crate owning_ref; extern crate stable_deref_trait; -extern crate murmurhash64; #[cfg(test)] extern crate env_logger;