Merge branch 'master' into exp/hash_intable

Conflicts:
	src/datastruct/stacker/hashmap.rs
This commit is contained in:
Paul Masurel
2017-06-21 11:40:49 +09:00
36 changed files with 556 additions and 281 deletions

View File

@@ -18,7 +18,7 @@ impl<T: BinarySerializable> LayerBuilder<T> {
}
fn write(&self, output: &mut Write) -> Result<(), io::Error> {
try!(output.write_all(&self.buffer));
output.write_all(&self.buffer)?;
Ok(())
}
@@ -36,8 +36,8 @@ impl<T: BinarySerializable> LayerBuilder<T> {
self.remaining -= 1;
self.len += 1;
let offset = self.written_size() as u32;
try!(doc_id.serialize(&mut self.buffer));
try!(value.serialize(&mut self.buffer));
doc_id.serialize(&mut self.buffer)?;
value.serialize(&mut self.buffer)?;
Ok(if self.remaining == 0 {
self.remaining = self.period;
Some((doc_id, offset))
@@ -89,7 +89,7 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
}
}
pub fn write<W: Write>(self, output: &mut Write) -> io::Result<()> {
pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {
let mut size: u32 = 0;
let mut layer_sizes: Vec<u32> = Vec::new();
size += self.data_layer.buffer.len() as u32;
@@ -98,10 +98,10 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
size += layer.buffer.len() as u32;
layer_sizes.push(size);
}
try!(layer_sizes.serialize(output));
try!(self.data_layer.write(output));
layer_sizes.serialize(output)?;
self.data_layer.write(output)?;
for layer in self.skip_layers.iter().rev() {
try!(layer.write(output));
layer.write(output)?;
}
Ok(())
}

View File

@@ -13,7 +13,7 @@ pub fn jump_needed(val: u32) -> bool {
}
#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct ExpUnrolledLinkedList {
len: u32,
end: u32,
@@ -52,6 +52,12 @@ impl ExpUnrolledLinkedList {
}
/// Lets a plain `u32` be stored as a heap-allocated value.
impl HeapAllocable for u32 {
    /// Builds the initial value for a freshly allocated slot.
    ///
    /// The address of the slot is ignored: the value always starts at zero.
    fn with_addr(_addr: u32) -> u32 {
        0
    }
}
impl HeapAllocable for ExpUnrolledLinkedList {
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;

View File

@@ -1,11 +1,54 @@
use std::iter;
use super::heap::{Heap, HeapAllocable, BytesRef};
use murmurhash64::murmur_hash64a;
const SEED: u64 = 2915580697u64;
mod murmurhash2 {
fn hash(key: &[u8]) -> u64 {
murmur_hash64a(key, SEED)
const SEED: u32 = 3_242_157_231u32;

/// Computes the 32-bit MurmurHash2 of `key`, seeded with `SEED`.
///
/// The key is consumed as a sequence of native-endian 4-byte blocks, followed
/// by a tail of up to three bytes, and the state then goes through the final
/// avalanche step of the algorithm.
///
/// Only the low 32 bits of `key.len()` are mixed into the initial state.
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
    const M: u32 = 0x5bd1_e995;
    const R: u32 = 24;

    let mut h: u32 = SEED ^ (key.len() as u32);

    let num_blocks = key.len() / 4;
    let mut block_ptr = key.as_ptr() as *const u32;
    for _ in 0..num_blocks {
        // SAFETY: `block_ptr` starts at the first byte of `key` and advances by
        // four bytes per iteration, so each of the `num_blocks` reads covers
        // bytes that belong to `key`. A byte slice gives no `u32` alignment
        // guarantee, hence the unaligned read rather than a plain dereference.
        let mut k: u32 = unsafe { ::std::ptr::read_unaligned(block_ptr) };
        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);
        // The running state must be multiplied before the block is folded in.
        // Folding blocks with a bare XOR would make the hash independent of
        // the order of the blocks.
        h = h.wrapping_mul(M);
        h ^= k;
        block_ptr = block_ptr.wrapping_offset(1);
    }

    // Handle the last few bytes of the input array: the i-th trailing byte is
    // folded into bits `8 * i ..` of the state.
    let tail = &key[num_blocks * 4..];
    if !tail.is_empty() {
        for (i, &byte) in tail.iter().enumerate() {
            h ^= u32::from(byte) << (8 * i);
        }
        h = h.wrapping_mul(M);
    }

    // Final avalanche.
    h ^= h >> 13;
    h = h.wrapping_mul(M);
    h ^ (h >> 15)
}
}
@@ -136,7 +179,7 @@ impl<'a> HashMap<'a> {
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
let key_bytes: &[u8] = key.as_ref();
let hash = hash(key.as_ref());
let hash = murmurhash2::murmurhash2(key) as usize;
let masked_hash = self.mask_hash(hash);
let mut probe = self.probe(hash);
loop {
@@ -162,7 +205,9 @@ mod tests {
use super::*;
use super::super::heap::{Heap, HeapAllocable};
use super::murmurhash2::murmurhash2;
use test::Bencher;
use std::collections::HashSet;
use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;
@@ -216,21 +261,41 @@ mod tests {
assert!(iter_values.next().is_none());
}
// #[bench]
// fn bench_djb2(bench: &mut Bencher) {
// let v = String::from("abwer");
// bench.iter(|| djb2(v.as_bytes()));
// }
/// Two strings that differ only in their sixth byte must hash identically
/// once that byte is sliced away, whatever the starting offset of the slice.
#[test]
fn test_murmur() {
    let left = "abcdef";
    let right = "abcdeg";
    for start in 0..5 {
        // Both slices stop before the differing byte, so they hold the same bytes.
        let left_bytes: &[u8] = left[start..5].as_bytes();
        let right_bytes: &[u8] = right[start..5].as_bytes();
        assert_eq!(murmurhash2(left_bytes), murmurhash2(right_bytes));
    }
}
// #[bench]
// fn bench_siphasher(bench: &mut Bencher) {
// let v = String::from("abwer");
// bench.iter(|| {
// let mut h = DefaultHasher::new();
// h.write(v.as_bytes());
// h.finish()
// });
// }
/// Hashes the keys `hash0` .. `hash9999` and checks that no two of them collide.
#[test]
fn test_murmur_collisions() {
    let distinct_hashes: HashSet<u32> = (0..10_000)
        .map(|i| murmurhash2(format!("hash{}", i).as_bytes()))
        .collect();
    assert_eq!(distinct_hashes.len(), 10_000);
}
#[bench]
fn bench_murmurhash_2(b: &mut Bencher) {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}
>>>>>>> master
}

View File

@@ -93,8 +93,9 @@ impl Heap {
struct InnerHeap {
buffer: Vec<u8>,
buffer_len: u32,
used: u32,
has_been_resized: bool,
next_heap: Option<Box<InnerHeap>>,
}
@@ -103,13 +104,15 @@ impl InnerHeap {
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer: buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
has_been_resized: false,
}
}
/// Resets the heap so that it can be reused.
///
/// Any overflow heap is dropped and the `used` counter goes back to zero.
/// The bytes of the buffer itself are left untouched.
pub fn clear(&mut self) {
    self.next_heap = None;
    self.used = 0u32;
}
pub fn capacity(&self) -> u32 {
@@ -119,30 +122,48 @@ impl InnerHeap {
// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed to another buffer, returns 0.
pub fn num_free_bytes(&self) -> u32 {
if self.has_been_resized {
if self.next_heap.is_some() {
0u32
} else {
(self.buffer.len() as u32) - self.used
self.buffer_len - self.used
}
}
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
let addr = self.used;
self.used += num_bytes as u32;
let buffer_len = self.buffer.len();
if self.used > buffer_len as u32 {
self.buffer.resize(buffer_len * 2, 0u8);
self.has_been_resized = true
if self.used <= self.buffer_len {
addr
} else {
if self.next_heap.is_none() {
info!(r#"Exceeded heap size.
The segment will be committed right after indexing this document."#,);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
addr
}
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
&self.buffer[start as usize..stop as usize]
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&self.buffer[start as usize..stop as usize]
}
}
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
&mut self.buffer[start as usize..stop as usize]
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&mut self.buffer[start as usize..stop as usize]
}
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
@@ -156,23 +177,46 @@ impl InnerHeap {
}
fn get_mut(&mut self, addr: u32) -> *mut u8 {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
}
fn set<Item>(&mut self, addr: u32, val: &Item) {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
}
}
}