issue/136 Added hashmaps.

This commit is contained in:
Paul Masurel
2017-06-21 15:47:55 +09:00
parent 04b15c6c11
commit fb75e60c6e
3 changed files with 53 additions and 52 deletions

View File

@@ -52,14 +52,7 @@ mod murmurhash2 {
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef {
start: 0u32,
stop: 0u32,
}
}
}
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
@@ -69,15 +62,15 @@ impl Default for BytesRef {
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
#[derive(Copy, Clone, Default)]
#[repr(packed)]
struct KeyValue {
key: BytesRef,
value_addr: u32,
masked_hash: u32,
hash: u32,
}
impl KeyValue {
fn is_empty(&self) -> bool {
self.key.stop == 0u32
self.key.is_null()
}
}
@@ -95,7 +88,6 @@ pub struct HashMap<'a> {
table: Box<[KeyValue]>,
heap: &'a Heap,
mask: usize,
num_bucket_power_of_2: usize,
occupied: Vec<usize>,
}
@@ -131,12 +123,11 @@ impl<'a> HashMap<'a> {
table: table.into_boxed_slice(),
heap: heap,
mask: table_size - 1,
num_bucket_power_of_2: num_bucket_power_of_2,
occupied: Vec::with_capacity(table_size / 2),
}
}
fn probe(&self, hash: u64) -> QuadraticProbing {
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
@@ -149,17 +140,15 @@ impl<'a> HashMap<'a> {
self.heap.get_slice(bytes_ref)
}
pub fn set_bucket(&mut self, masked_hash: u32, key_bytes: &[u8], bucket: usize, addr: u32) -> u32 {
pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key: self.heap.allocate_and_set(key_bytes),
value_addr: addr,
masked_hash: masked_hash,
key: key_bytes_ref,
hash: hash,
};
addr
}
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item=(&'a [u8], u32)> + 'b {
let heap: &'a Heap = self.heap;
let table: &'b [KeyValue] = &self.table;
self.occupied
@@ -167,32 +156,31 @@ impl<'a> HashMap<'a> {
.cloned()
.map(move |bucket: usize| {
let kv = table[bucket];
let addr = kv.value_addr;
let key = heap.get_slice(kv.key);
let addr: u32 = kv.key.addr() + 2 + key.len() as u32;
(heap.get_slice(kv.key), addr)
})
}
pub fn mask_hash(&self, hash: u64) -> u32 {
(hash >> self.num_bucket_power_of_2) as u32
}
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key) as usize;
let masked_hash = self.mask_hash(hash);
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
self.set_bucket(masked_hash, key.as_ref(), bucket, addr);
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
self.set_bucket(hash, key_bytes_ref, bucket);
return val
}
if kv.masked_hash == masked_hash {
if self.get_key(kv.key) == key_bytes {
return self.heap.get_mut_ref(kv.value_addr);
if kv.hash == hash {
let stored_key: &[u8] = self.get_key(kv.key);
if stored_key == key_bytes {
return self.heap.get_mut_ref(kv.key.addr() + 2 + stored_key.len() as u32);
}
}
}
@@ -208,8 +196,6 @@ mod tests {
use super::murmurhash2::murmurhash2;
use test::Bencher;
use std::collections::HashSet;
use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;
struct TestValue {
val: u32,
@@ -295,7 +281,6 @@ mod tests {
.unwrap()
});
}
>>>>>>> master
}

View File

@@ -1,12 +1,26 @@
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
use byteorder::{NativeEndian, ByteOrder};
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
#[derive(Copy, Clone)]
pub struct BytesRef {
pub start: u32,
pub stop: u32,
pub struct BytesRef(u32);
impl BytesRef {
/// Returns `true` when the wrapped value is `u32::max_value()`.
///
/// That is the value produced by `BytesRef::default()`, so a
/// default-constructed `BytesRef` is reported as null.
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
/// Returns the wrapped `u32`.
///
/// `InnerHeap::get_slice` reads a 2-byte length at this offset and
/// returns the bytes that follow it; the hash map derives the address of
/// the associated value as `addr() + 2 + key.len()`.
pub fn addr(&self) -> u32 {
self.0
}
}
/// The default `BytesRef` wraps `u32::max_value()`, which is exactly the
/// value `BytesRef::is_null` tests for.
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef(u32::max_value())
}
}
/// Object that can be allocated in tantivy's custom `Heap`.
@@ -70,7 +84,7 @@ impl Heap {
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
/// given as argument
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
self.inner().get_slice(bytes_ref)
}
/// Stores an item's data in the heap, at the given `address`.
@@ -144,14 +158,19 @@ impl InnerHeap {
}
}
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&self.buffer[start as usize..stop as usize]
.get_slice(BytesRef(start - self.buffer_len))
}
else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
&self.buffer[start + 2.. start + 2 + len]
}
}
@@ -167,13 +186,13 @@ impl InnerHeap {
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
let start = self.allocate_space(data.len());
let stop = start + data.len() as u32;
self.get_mut_slice(start, stop).clone_from_slice(data);
BytesRef {
start: start as u32,
stop: stop as u32,
}
assert!(data.len() < u16::max_value() as usize);
let total_len = 2 + data.len();
let start = self.allocate_space(total_len);
let total_buff = self.get_mut_slice(start, start + total_len as u32);
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
&mut total_buff[2..].clone_from_slice(data);
BytesRef(start)
}
fn get_mut(&mut self, addr: u32) -> *mut u8 {
@@ -188,8 +207,6 @@ impl InnerHeap {
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap

View File

@@ -63,7 +63,6 @@ extern crate futures;
extern crate futures_cpupool;
extern crate owning_ref;
extern crate stable_deref_trait;
extern crate murmurhash64;
#[cfg(test)]
extern crate env_logger;