Merge branch 'master' into issue/136

Conflicts:
	src/datastruct/stacker/hashmap.rs
	src/datastruct/stacker/heap.rs
	src/datastruct/stacker/mod.rs
	src/indexer/index_writer.rs
	src/indexer/merger.rs
	src/indexer/segment_updater.rs
	src/indexer/segment_writer.rs
	src/postings/postings_writer.rs
	src/postings/recorder.rs
	src/schema/term.rs
This commit is contained in:
Paul Masurel
2017-05-17 18:40:09 +09:00
118 changed files with 3234 additions and 2946 deletions

View File

@@ -1,5 +1,3 @@
#![allow(should_implement_trait)]
use std::io;
use std::io::Write;
use fst;
@@ -20,18 +18,17 @@ pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
}
/// Horribly unsafe, nobody should ever do that... except me :)
///
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
@@ -39,8 +36,8 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
/// in a nice way.
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
Ok(())
}
@@ -53,17 +50,14 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
#[cfg(test)]
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
try!(value.serialize(&mut self.data));
Ok(())
}
pub fn finish(self,) -> io::Result<W> {
let mut file = try!(
self.fst_builder
.into_inner()
.map_err(convert_fst_error));
pub fn finish(self) -> io::Result<W> {
let mut file = try!(self.fst_builder.into_inner().map_err(convert_fst_error));
let footer_size = self.data.len() as u32;
try!(file.write_all(&self.data));
try!((footer_size as u32).serialize(&mut file));
@@ -81,31 +75,35 @@ pub struct FstMap<V: BinarySerializable> {
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)),
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
}))
ReadOnlySource::Anonymous(data) => {
try!(Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error))
}
ReadOnlySource::Mmap(mmap_readonly) => {
try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error))
}
}))
}
impl<V: BinarySerializable> FstMap<V> {
pub fn keys(&self,) -> fst::map::Keys {
pub fn keys(&self) -> fst::map::Keys {
self.fst_index.keys()
}
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize;
let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = try!(open_fst_index(fst_source));
Ok(FstMap {
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
})
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
})
}
fn read_value(&self, offset: u64) -> V {

View File

@@ -114,9 +114,9 @@ mod tests {
let mut skip_list: SkipList<()> = SkipList::from(output.as_slice());
assert_eq!(skip_list.next().unwrap(), (0, ()));
skip_list.seek(431);
assert_eq!(skip_list.next().unwrap(), (431,()) );
assert_eq!(skip_list.next().unwrap(), (431, ()));
skip_list.seek(1003);
assert_eq!(skip_list.next().unwrap(), (1004,()) );
assert_eq!(skip_list.next().unwrap(), (1004, ()));
assert_eq!(skip_list.next(), None);
}

View File

@@ -13,14 +13,12 @@ struct Layer<'a, T> {
}
impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
type Item = (DocId, T);
fn next(&mut self,)-> Option<(DocId, T)> {
fn next(&mut self) -> Option<(DocId, T)> {
if self.next_id == u32::max_value() {
None
}
else {
} else {
let cur_val = T::deserialize(&mut self.cursor).unwrap();
let cur_id = self.next_id;
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
@@ -31,7 +29,7 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
fn from(data: &'a [u8]) -> Layer<'a, T> {
let mut cursor = data;
let mut cursor = data;
let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value());
Layer {
data: data,
@@ -43,7 +41,6 @@ impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
}
impl<'a, T: BinarySerializable> Layer<'a, T> {
fn empty() -> Layer<'a, T> {
Layer {
data: &EMPTY,
@@ -53,11 +50,11 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
}
}
fn seek_offset(&mut self, offset: usize) {
fn seek_offset(&mut self, offset: usize) {
self.cursor = &self.data[offset..];
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
}
// Returns the last element (key, val)
// such that (key < doc_id)
//
@@ -67,8 +64,12 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
let mut val = None;
while self.next_id < doc_id {
match self.next() {
None => { break; },
v => { val = v; }
None => {
break;
}
v => {
val = v;
}
}
}
val
@@ -82,16 +83,14 @@ pub struct SkipList<'a, T: BinarySerializable> {
}
impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
type Item = (DocId, T);
fn next(&mut self,)-> Option<(DocId, T)> {
fn next(&mut self) -> Option<(DocId, T)> {
self.data_layer.next()
}
}
impl<'a, T: BinarySerializable> SkipList<'a, T> {
pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
let mut next_layer_skip: Option<(DocId, u32)> = None;
for skip_layer in &mut self.skip_layers {
@@ -99,39 +98,33 @@ impl<'a, T: BinarySerializable> SkipList<'a, T> {
skip_layer.seek_offset(offset as usize);
}
next_layer_skip = skip_layer.seek(doc_id);
}
if let Some((_, offset)) = next_layer_skip {
self.data_layer.seek_offset(offset as usize);
}
self.data_layer.seek(doc_id)
}
if let Some((_, offset)) = next_layer_skip {
self.data_layer.seek_offset(offset as usize);
}
self.data_layer.seek(doc_id)
}
}
impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> {
fn from(mut data: &'a [u8]) -> SkipList<'a, T> {
let offsets: Vec<u32> = Vec::deserialize(&mut data).unwrap();
let num_layers = offsets.len();
let layers_data: &[u8] = data;
let data_layer: Layer<'a, T> =
if num_layers == 0 { Layer::empty() }
else {
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
Layer::from(first_layer_data)
};
let data_layer: Layer<'a, T> = if num_layers == 0 {
Layer::empty()
} else {
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
Layer::from(first_layer_data)
};
let skip_layers = (0..max(1, num_layers) - 1)
.map(|i| (offsets[i] as usize, offsets[i + 1] as usize))
.map(|(start, stop)| {
Layer::from(&layers_data[start..stop])
})
.map(|(start, stop)| Layer::from(&layers_data[start..stop]))
.collect();
SkipList {
skip_layers: skip_layers,
data_layer: data_layer,
}
}
}

View File

@@ -13,8 +13,7 @@ struct LayerBuilder<T: BinarySerializable> {
}
impl<T: BinarySerializable> LayerBuilder<T> {
fn written_size(&self,) -> usize {
fn written_size(&self) -> usize {
self.buffer.len()
}
@@ -42,8 +41,9 @@ impl<T: BinarySerializable> LayerBuilder<T> {
Ok(if self.remaining == 0 {
self.remaining = self.period;
Some((doc_id, offset))
}
else { None })
} else {
None
})
}
}
@@ -56,7 +56,6 @@ pub struct SkipListBuilder<T: BinarySerializable> {
impl<T: BinarySerializable> SkipListBuilder<T> {
pub fn new(period: usize) -> SkipListBuilder<T> {
SkipListBuilder {
period: period,
@@ -78,11 +77,13 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
let mut skip_pointer = try!(self.data_layer.insert(doc_id, dest));
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) =>
try!(self
.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)),
None => { return Ok(()); }
Some((skip_doc_id, skip_offset)) => {
try!(self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset))
}
None => {
return Ok(());
}
};
layer_id += 1;
}

View File

@@ -9,7 +9,7 @@ pub fn is_power_of_2(val: u32) -> bool {
#[inline]
pub fn jump_needed(val: u32) -> bool {
val > 3 && is_power_of_2(val)
val > 3 && is_power_of_2(val)
}
@@ -24,7 +24,6 @@ pub struct ExpUnrolledLinkedList {
}
impl ExpUnrolledLinkedList {
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap: heap,
@@ -42,10 +41,10 @@ impl ExpUnrolledLinkedList {
// the next block has a size of (length so far),
// and we need to add 1u32 to store the pointer
// to the next element.
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
let new_block_addr: u32 = heap.allocate_space(new_block_size);
heap.set(self.end, &new_block_addr);
self.end = new_block_addr;
self.end = new_block_addr;
}
heap.set(self.end, &val);
self.end += mem::size_of::<u32>() as u32;
@@ -77,23 +76,21 @@ pub struct ExpUnrolledLinkedListIterator<'a> {
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self,) -> Option<u32> {
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
}
else {
} else {
let addr: u32;
self.consumed += 1;
if jump_needed(self.consumed) {
addr = *self.heap.get_mut_ref(self.addr);
}
else {
} else {
addr = self.addr;
}
self.addr = addr + mem::size_of::<u32>() as u32;
Some(*self.heap.get_mut_ref(addr))
}
}
}
}
@@ -103,7 +100,7 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::Heap;
use test::Bencher;
@@ -147,7 +144,7 @@ mod tests {
#[bench]
fn bench_push_stack(bench: &mut Bencher) {
let heap = Heap::with_capacity(64_000_000);
let heap = Heap::with_capacity(64_000_000);
bench.iter(|| {
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
@@ -163,4 +160,4 @@ mod tests {
heap.clear();
});
}
}
}

View File

@@ -3,7 +3,7 @@ use super::heap::{Heap, HeapAllocable, BytesRef};
/// djb2 hash function
fn djb2(key: &[u8]) -> u64 {
let mut state: u64 = 5381;
let mut state: u64 = 5381;
for &b in key {
state = (state << 5).wrapping_add(state).wrapping_add(b as u64);
}
@@ -25,7 +25,7 @@ impl Default for BytesRef {
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
/// and can be simplified in the future
#[derive(Copy, Clone, Default)]
struct KeyValue {
key: BytesRef,
@@ -33,7 +33,7 @@ struct KeyValue {
}
impl KeyValue {
fn is_empty(&self,) -> bool {
fn is_empty(&self) -> bool {
self.key.stop == 0u32
}
}
@@ -45,7 +45,7 @@ pub enum Entry {
/// Customized `HashMap` with string keys
///
///
/// This `HashMap` takes String as keys. Keys are
/// stored in a user defined heap.
///
@@ -60,6 +60,7 @@ pub struct HashMap<'a> {
occupied: Vec<usize>,
}
struct QuadraticProbing {
hash: usize,
i: usize,
@@ -77,7 +78,7 @@ impl QuadraticProbing {
}
#[inline]
fn next(&mut self) -> usize {
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i * self.i) & self.mask
}
@@ -88,9 +89,7 @@ impl<'a> HashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
.take(table_size)
.collect();
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
HashMap {
table: table.into_boxed_slice(),
heap: heap,
@@ -136,9 +135,7 @@ impl<'a> HashMap<'a> {
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
let entry = self.lookup(key.as_ref());
match entry {
Entry::Occupied(addr) => {
self.heap.get_mut_ref(addr)
}
Entry::Occupied(addr) => self.heap.get_mut_ref(addr),
Entry::Vacant(bucket) => {
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
self.set_bucket(key.as_ref(), bucket, addr);
@@ -146,12 +143,12 @@ impl<'a> HashMap<'a> {
}
}
}
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
let key_bytes: &[u8] = key.as_ref();
let mut probe = self.probe(key_bytes);
loop {
let bucket = probe.next();
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return Entry::Vacant(bucket);
@@ -166,7 +163,7 @@ impl<'a> HashMap<'a> {
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::{Heap, HeapAllocable};
use super::djb2;
@@ -227,20 +224,17 @@ mod tests {
#[bench]
fn bench_djb2(bench: &mut Bencher) {
let v = String::from("abwer");
bench.iter(|| {
djb2(v.as_bytes())
});
bench.iter(|| djb2(v.as_bytes()));
}
#[bench]
fn bench_siphasher(bench: &mut Bencher) {
let v = String::from("abwer");
bench.iter(|| {
let mut h = DefaultHasher::new();
h.write(v.as_bytes());
h.finish()
});
let mut h = DefaultHasher::new();
h.write(v.as_bytes());
h.finish()
});
}
}

View File

@@ -19,20 +19,17 @@ pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(
InnerHeap::with_capacity(num_bytes)
),
}
Heap { inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)) }
}
fn inner(&self,) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
fn inner(&self) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}
/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
@@ -40,14 +37,14 @@ impl Heap {
pub fn clear(&self) {
    // `inner()` hands out the mutable `InnerHeap` held in the `UnsafeCell`;
    // the actual reset is delegated to it.
    let inner_heap = self.inner();
    inner_heap.clear();
}
/// Return the heap capacity.
pub fn capacity(&self,) -> u32 {
pub fn capacity(&self) -> u32 {
self.inner().capacity()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self,) -> u32 {
pub fn num_free_bytes(&self) -> u32 {
self.inner().num_free_bytes()
}
@@ -56,40 +53,40 @@ impl Heap {
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
    // Forward the request to the inner heap and hand back the `u32`
    // address it returns for the `num_bytes` requested.
    let inner_heap = self.inner();
    inner_heap.allocate_space(num_bytes)
}
/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}
/// Stores the bytes of `data` in the heap.
///
/// Returns the `BytesRef` produced by the inner heap for the
/// destination of the stored bytes.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
    let inner_heap = self.inner();
    inner_heap.allocate_and_set(data)
}
/// Returns the `&[u8]` stored in the heap between the `start` and `stop`
/// addresses of the `BytesRef` given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
    let (start, stop) = (bytes_ref.start, bytes_ref.stop);
    self.inner().get_slice(start, stop)
}
/// Stores the data of `val` in the heap, at the address `addr`.
///
/// The write itself is performed by the inner heap.
pub fn set<Item>(&self, addr: u32, val: &Item) {
    let inner_heap = self.inner();
    inner_heap.set(addr, val);
}
/// Returns a reference to an `Item` at a given `addr`.
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
/// Returns a mutable reference to the `Item` stored at the given `addr`.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
/// Returns a mutable reference to an `Item` at a given `addr`.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
self.get_mut_ref(addr)
}
}
@@ -102,7 +99,6 @@ struct InnerHeap {
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
@@ -116,7 +112,7 @@ impl InnerHeap {
self.used = 0u32;
}
pub fn capacity(&self,) -> u32 {
pub fn capacity(&self) -> u32 {
self.buffer.len() as u32
}
@@ -125,10 +121,9 @@ impl InnerHeap {
pub fn num_free_bytes(&self,) -> u32 {
if self.has_been_resized {
0u32
}
else {
} else {
(self.buffer.len() as u32) - self.used
}
}
}
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
@@ -141,11 +136,11 @@ impl InnerHeap {
}
addr
}
// Borrows the bytes of the buffer in the half-open range `start..stop`.
// Slice indexing panics if the range is out of bounds.
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
    let (from, to) = (start as usize, stop as usize);
    &self.buffer[from..to]
}
// Mutably borrows the bytes of the buffer in the half-open range
// `start..stop`. Slice indexing panics if the range is out of bounds.
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
    let (from, to) = (start as usize, stop as usize);
    &mut self.buffer[from..to]
}
@@ -180,4 +175,4 @@ impl InnerHeap {
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
}
}
}

View File

@@ -22,7 +22,7 @@ fn test_unrolled_linked_list() {
for j in 0..k {
for i in 0..500 {
let mut list: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
list.push(i*j, &heap);
list.push(i * j, &heap);
}
}
for i in 0..500 {
@@ -31,7 +31,7 @@ fn test_unrolled_linked_list() {
let v: &mut ExpUnrolledLinkedList = heap.get_mut_ref(addr);
let mut it = v.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i*j);
assert_eq!(it.next().unwrap(), i * j);
}
assert!(!it.next().is_some());
}
@@ -41,6 +41,6 @@ fn test_unrolled_linked_list() {
}
}
}
}
}
}