mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 21:20:40 +00:00
issue/136 Added hashmaps.
This commit is contained in:
@@ -52,14 +52,7 @@ mod murmurhash2 {
|
||||
}
|
||||
|
||||
|
||||
impl Default for BytesRef {
|
||||
fn default() -> BytesRef {
|
||||
BytesRef {
|
||||
start: 0u32,
|
||||
stop: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external heap.
|
||||
@@ -69,15 +62,15 @@ impl Default for BytesRef {
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone, Default)]
|
||||
#[repr(packed)]
|
||||
struct KeyValue {
|
||||
key: BytesRef,
|
||||
value_addr: u32,
|
||||
masked_hash: u32,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key.stop == 0u32
|
||||
self.key.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,7 +88,6 @@ pub struct HashMap<'a> {
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
mask: usize,
|
||||
num_bucket_power_of_2: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
@@ -131,12 +123,11 @@ impl<'a> HashMap<'a> {
|
||||
table: table.into_boxed_slice(),
|
||||
heap: heap,
|
||||
mask: table_size - 1,
|
||||
num_bucket_power_of_2: num_bucket_power_of_2,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u64) -> QuadraticProbing {
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
@@ -149,17 +140,15 @@ impl<'a> HashMap<'a> {
|
||||
self.heap.get_slice(bytes_ref)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, masked_hash: u32, key_bytes: &[u8], bucket: usize, addr: u32) -> u32 {
|
||||
pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key: self.heap.allocate_and_set(key_bytes),
|
||||
value_addr: addr,
|
||||
masked_hash: masked_hash,
|
||||
key: key_bytes_ref,
|
||||
hash: hash,
|
||||
};
|
||||
addr
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item=(&'a [u8], u32)> + 'b {
|
||||
let heap: &'a Heap = self.heap;
|
||||
let table: &'b [KeyValue] = &self.table;
|
||||
self.occupied
|
||||
@@ -167,32 +156,31 @@ impl<'a> HashMap<'a> {
|
||||
.cloned()
|
||||
.map(move |bucket: usize| {
|
||||
let kv = table[bucket];
|
||||
let addr = kv.value_addr;
|
||||
let key = heap.get_slice(kv.key);
|
||||
let addr: u32 = kv.key.addr() + 2 + key.len() as u32;
|
||||
(heap.get_slice(kv.key), addr)
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
pub fn mask_hash(&self, hash: u64) -> u32 {
|
||||
(hash >> self.num_bucket_power_of_2) as u32
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key) as usize;
|
||||
let masked_hash = self.mask_hash(hash);
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
self.set_bucket(masked_hash, key.as_ref(), bucket, addr);
|
||||
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
|
||||
self.set_bucket(hash, key_bytes_ref, bucket);
|
||||
return val
|
||||
}
|
||||
if kv.masked_hash == masked_hash {
|
||||
if self.get_key(kv.key) == key_bytes {
|
||||
return self.heap.get_mut_ref(kv.value_addr);
|
||||
if kv.hash == hash {
|
||||
let stored_key: &[u8] = self.get_key(kv.key);
|
||||
if stored_key == key_bytes {
|
||||
return self.heap.get_mut_ref(kv.key.addr() + 2 + stored_key.len() as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -208,8 +196,6 @@ mod tests {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
use std::collections::HashSet;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::Hasher;
|
||||
|
||||
struct TestValue {
|
||||
val: u32,
|
||||
@@ -295,7 +281,6 @@ mod tests {
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
>>>>>>> master
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,26 @@
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
use byteorder::{NativeEndian, ByteOrder};
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct BytesRef {
|
||||
pub start: u32,
|
||||
pub stop: u32,
|
||||
pub struct BytesRef(u32);
|
||||
|
||||
impl BytesRef {
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
|
||||
pub fn addr(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BytesRef {
|
||||
fn default() -> BytesRef {
|
||||
BytesRef(u32::max_value())
|
||||
}
|
||||
}
|
||||
|
||||
/// Object that can be allocated in tantivy's custom `Heap`.
|
||||
@@ -70,7 +84,7 @@ impl Heap {
|
||||
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
|
||||
/// given as argument
|
||||
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
|
||||
self.inner().get_slice(bytes_ref)
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap, at the given `address`.
|
||||
@@ -144,14 +158,19 @@ impl InnerHeap {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
|
||||
|
||||
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
let start = bytes_ref.0;
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
} else {
|
||||
&self.buffer[start as usize..stop as usize]
|
||||
.get_slice(BytesRef(start - self.buffer_len))
|
||||
}
|
||||
else {
|
||||
let start = start as usize;
|
||||
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
|
||||
&self.buffer[start + 2.. start + 2 + len]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,13 +186,13 @@ impl InnerHeap {
|
||||
}
|
||||
|
||||
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
|
||||
let start = self.allocate_space(data.len());
|
||||
let stop = start + data.len() as u32;
|
||||
self.get_mut_slice(start, stop).clone_from_slice(data);
|
||||
BytesRef {
|
||||
start: start as u32,
|
||||
stop: stop as u32,
|
||||
}
|
||||
assert!(data.len() < u16::max_value() as usize);
|
||||
let total_len = 2 + data.len();
|
||||
let start = self.allocate_space(total_len);
|
||||
let total_buff = self.get_mut_slice(start, start + total_len as u32);
|
||||
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
|
||||
&mut total_buff[2..].clone_from_slice(data);
|
||||
BytesRef(start)
|
||||
}
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
@@ -188,8 +207,6 @@ impl InnerHeap {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
|
||||
@@ -63,7 +63,6 @@ extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate owning_ref;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate murmurhash64;
|
||||
|
||||
#[cfg(test)]
|
||||
extern crate env_logger;
|
||||
|
||||
Reference in New Issue
Block a user