Added documentation / HeapAllocable

This commit is contained in:
Paul Masurel
2016-09-22 14:32:44 +09:00
parent 994f223e35
commit ca331e7fe5
12 changed files with 177 additions and 78 deletions

View File

@@ -1,5 +1,5 @@
use std::mem;
use super::heap::Heap;
use super::heap::{Heap, HeapAllocable};
#[inline]
@@ -53,8 +53,8 @@ impl ExpUnrolledLinkedList {
}
impl From<u32> for ExpUnrolledLinkedList {
fn from(addr: u32) -> ExpUnrolledLinkedList {
impl HeapAllocable for ExpUnrolledLinkedList {
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
ExpUnrolledLinkedList {
len: 0u32,
@@ -67,22 +67,6 @@ impl From<u32> for ExpUnrolledLinkedList {
}
}
impl Default for ExpUnrolledLinkedList {
/// Returns an `ExpUnrolledLinkedList` with every field zeroed.
///
/// NOTE(review): unlike `with_addr`, this does not reserve any
/// heap space — presumably only used as a placeholder value;
/// confirm against callers.
fn default() -> ExpUnrolledLinkedList {
ExpUnrolledLinkedList {
len: 0u32,
end: 0u32,
val0: 0u32,
val1: 0u32,
val2: 0u32,
next: 0u32,
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a Heap,
addr: u32,

View File

@@ -1,6 +1,9 @@
use std::iter;
use std::marker::PhantomData;
use super::heap::{Heap, BytesRef};
use super::heap::{Heap, HeapAllocable, BytesRef};
/// djb2 hash function
fn djb2(key: &[u8]) -> u64 {
@@ -54,7 +57,7 @@ pub enum Entry {
/// the computation of the hash of the key twice,
/// or copying the key as long as there is no insert.
///
pub struct HashMap<'a, V> where V: From<u32> {
pub struct HashMap<'a, V> where V: HeapAllocable {
table: Box<[KeyValue]>,
heap: &'a Heap,
_phantom: PhantomData<V>,
@@ -62,7 +65,7 @@ pub struct HashMap<'a, V> where V: From<u32> {
occupied: Vec<usize>,
}
impl<'a, V> HashMap<'a, V> where V: From<u32> {
impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
let table_size = 1 << num_bucket_power_of_2;
@@ -157,7 +160,7 @@ impl<'a, V> HashMap<'a, V> where V: From<u32> {
mod tests {
use super::*;
use super::super::heap::Heap;
use super::super::heap::{Heap, HeapAllocable};
use super::djb2;
use test::Bencher;
use std::hash::SipHasher;
@@ -168,8 +171,8 @@ mod tests {
_addr: u32,
}
impl From<u32> for TestValue {
fn from(addr: u32) -> TestValue {
impl HeapAllocable for TestValue {
fn with_addr(addr: u32) -> TestValue {
TestValue {
val: 0u32,
_addr: addr,

View File

@@ -3,19 +3,26 @@ use std::mem;
use std::ptr;
use std::iter;
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
#[derive(Copy, Clone)]
pub struct BytesRef {
/// Start offset (in bytes) of the slice within the heap.
pub start: u32,
/// Stop offset (in bytes) of the slice within the heap.
/// Presumably exclusive — cf. `Heap::get_slice(start, stop)`; confirm.
pub stop: u32,
}
/// Object that can be allocated in tantivy's custom `Heap`.
pub trait HeapAllocable {
/// Builds an instance of the object, given the address (`addr`)
/// at which it was allocated within the heap.
fn with_addr(addr: u32) -> Self;
}
/// Tantivy's custom `Heap`.
pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(
@@ -27,46 +34,62 @@ impl Heap {
fn inner(&self,) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}
/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
/// This method is the only way to free memory.
pub fn clear(&self) {
self.inner().clear();
}
/// Return the heap capacity.
pub fn capacity(&self,) -> u32 {
self.inner().capacity()
}
/// Return the amount of memory that has been allocated so far.
pub fn len(&self,) -> u32 {
self.inner().len()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self,) -> u32 {
self.inner().num_free_bytes()
}
/// Allocate a given amount of space and returns an address
/// in the Heap.
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
self.inner().allocate_space(num_bytes)
}
pub fn allocate_object<V: From<u32>>(&self,) -> (u32, &mut V) {
/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::from(addr);
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
self.inner().allocate_and_set(data)
}
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
/// given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
}
/// Stores an item's data in the heap, at the given `address`.
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}
/// Returns a mutable reference to the object at the given address.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
@@ -183,7 +206,7 @@ impl InnerHeap {
}
}
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
}

View File

@@ -2,7 +2,7 @@ mod hashmap;
mod heap;
mod expull;
pub use self::heap::Heap;
pub use self::heap::{Heap, HeapAllocable};
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::{HashMap, Entry};

View File

@@ -7,7 +7,7 @@ use super::compute_num_bits;
/// `FastFieldSerializer` is in charge of serializing
/// a fastfield on disk.
/// fastfields on disk.
///
/// FastField are encoded using bit-packing.
///

View File

@@ -118,7 +118,7 @@ impl<'a> Iterator for PostingsMerger<'a> {
let next_heap_it = self.heap.pop().expect("This is only reached if an element was peeked beforehand.");
self.append_segment(&next_heap_it, &mut segment_postings_list);
}
let chained_posting = ChainedPostings::new(segment_postings_list);
let chained_posting = ChainedPostings::from(segment_postings_list);
Some((heap_it.term, chained_posting))
},
None => None

View File

@@ -4,19 +4,26 @@ use postings::OffsetPostings;
use postings::DocSet;
use postings::HasLen;
/// Creates a posting object that chains two postings
/// together.
///
/// When iterating over the chained postings,
/// it will consume all of the documents of the first postings,
/// and then iterate over the documents of the second postings.
///
/// The chained postings is used when merging segments.
pub struct ChainedPostings<'a> {
chained_postings: Vec<OffsetPostings<'a>>,
posting_id: usize,
len: usize,
}
impl<'a> ChainedPostings<'a> {
pub fn new(chained_postings: Vec<OffsetPostings<'a>>) -> ChainedPostings {
impl<'a> From<Vec<OffsetPostings<'a>>> for ChainedPostings<'a> {
fn from(chained_postings: Vec<OffsetPostings<'a>>) -> ChainedPostings {
let len: usize = chained_postings
.iter()
.map(|segment_postings| segment_postings.len())
.fold(0, |sum, addition| sum + addition);
.sum();
ChainedPostings {
chained_postings: chained_postings,
posting_id: 0,

View File

@@ -20,7 +20,10 @@ pub use self::postings_writer::PostingsWriter;
pub use self::postings_writer::SpecializedPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
#[cfg(test)]
pub use self::vec_postings::VecPostings;
pub use self::chained_postings::ChainedPostings;
pub use self::segment_postings::SegmentPostings;
pub use self::intersection::intersection;

View File

@@ -1,20 +1,38 @@
use DocId;
use std::io;
use postings::PostingsSerializer;
use datastruct::stacker::{ExpUnrolledLinkedList, Heap};
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = 4294967295;
pub trait Recorder: From<u32> {
/// Recorder is in charge of recording relevant information about
/// the presence of a term in a document.
///
/// Depending on the `TextIndexingOptions` associated to the
/// field, the recorder may record
/// * the document frequency
/// * the document id
/// * the term frequency
/// * the term positions
pub trait Recorder: HeapAllocable {
/// Returns the current document
fn current_doc(&self,) -> u32;
/// Starts recording information about a new document
/// This method shall only be called if the term is within the document.
fn new_doc(&mut self, doc: DocId, heap: &Heap);
/// Record the position of a term. For each document,
/// this method will be called `term_freq` times.
fn record_position(&mut self, position: u32, heap: &Heap);
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
/// Returns the number of documents that have been seen so far.
fn doc_freq(&self,) -> u32;
/// Pushes the postings information to the serializer.
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
}
/// Only records the doc ids
#[repr(C, packed)]
pub struct NothingRecorder {
stack: ExpUnrolledLinkedList,
@@ -22,10 +40,10 @@ pub struct NothingRecorder {
doc_freq: u32,
}
impl From<u32> for NothingRecorder {
fn from(addr: u32) -> NothingRecorder {
impl HeapAllocable for NothingRecorder {
fn with_addr(addr: u32) -> NothingRecorder {
NothingRecorder {
stack: ExpUnrolledLinkedList::from(addr),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
doc_freq: 0u32,
}
@@ -33,12 +51,11 @@ impl From<u32> for NothingRecorder {
}
impl Recorder for NothingRecorder {
fn current_doc(&self,) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
self.current_doc = doc;
self.stack.push(doc, heap);
@@ -59,9 +76,10 @@ impl Recorder for NothingRecorder {
}
Ok(())
}
}
/// Recorder encoding document ids, and term frequencies
#[repr(C, packed)]
pub struct TermFrequencyRecorder {
stack: ExpUnrolledLinkedList,
@@ -70,10 +88,10 @@ pub struct TermFrequencyRecorder {
doc_freq: u32,
}
impl From<u32> for TermFrequencyRecorder {
fn from(addr: u32) -> TermFrequencyRecorder {
impl HeapAllocable for TermFrequencyRecorder {
fn with_addr(addr: u32) -> TermFrequencyRecorder {
TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::from(addr),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
current_tf: 0u32,
doc_freq: 0u32
@@ -82,6 +100,8 @@ impl From<u32> for TermFrequencyRecorder {
}
impl Recorder for TermFrequencyRecorder {
fn current_doc(&self,) -> DocId {
self.current_doc
@@ -120,9 +140,10 @@ impl Recorder for TermFrequencyRecorder {
}
Ok(())
}
}
/// Recorder encoding term frequencies as well as positions.
#[repr(C, packed)]
pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
@@ -130,18 +151,18 @@ pub struct TFAndPositionRecorder {
doc_freq: u32,
}
impl From<u32> for TFAndPositionRecorder {
fn from(addr: u32) -> TFAndPositionRecorder {
impl HeapAllocable for TFAndPositionRecorder {
fn with_addr(addr: u32) -> TFAndPositionRecorder {
TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::from(addr),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
doc_freq: 0u32,
}
}
}
impl Recorder for TFAndPositionRecorder {
fn current_doc(&self,) -> DocId {
self.current_doc
@@ -191,6 +212,7 @@ impl Recorder for TFAndPositionRecorder {
}
Ok(())
}
}

View File

@@ -17,6 +17,35 @@ use common::VInt;
use common::BinarySerializable;
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
/// * `.idx` (inverted index)
/// * `.pos` (positions file)
/// * `.term` (term dictionary)
///
/// `PostingsWriter` are in charge of pushing the data to the
/// serializer.
///
/// The serializer expects to receive the following calls
/// in this order :
///
/// * `new_term(...)`
/// * `write_doc(...)`
/// * `write_doc(...)`
/// * `write_doc(...)`
/// * ...
/// * `close_term()`
/// * `new_term(...)`
/// * `write_doc(...)`
/// * ...
/// * `close_term()`
/// * `close()`
///
/// Terms have to be pushed in a lexicographically-sorted order.
/// Within a term, documents have to be pushed in increasing order.
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
postings_write: WritePtr,
@@ -35,7 +64,8 @@ pub struct PostingsSerializer {
}
impl PostingsSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
@@ -59,8 +89,8 @@ impl PostingsSerializer {
term_open: false,
})
}
pub fn load_indexing_options(&mut self, field: Field) {
fn load_indexing_options(&mut self, field: Field) {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
@@ -76,7 +106,11 @@ impl PostingsSerializer {
}
};
}
/// Starts the postings for a new term.
/// * term - the term. It needs to come after the previous term according
/// to the lexicographical order.
/// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
if self.term_open {
panic!("Called new_term, while the previous term was not closed.");
@@ -95,7 +129,11 @@ impl PostingsSerializer {
self.terms_fst_builder
.insert(term.as_slice(), &term_info)
}
/// Finish the serialization for this term postings.
///
/// If the current block is incomplete, it need to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self,) -> io::Result<()> {
if self.term_open {
if !self.doc_ids.is_empty() {
@@ -133,7 +171,17 @@ impl PostingsSerializer {
}
Ok(())
}
/// Serialize the information that a document contains the current term,
/// its term frequency, and the position deltas.
///
/// At this point, the positions are already `delta-encoded`.
/// For instance, if the positions are `2, 3, 17`,
/// `position_deltas` is `2, 1, 14`
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
self.doc_ids.push(doc_id);
if self.text_indexing_options.is_termfreq_enabled() {
@@ -161,7 +209,8 @@ impl PostingsSerializer {
}
Ok(())
}
/// Closes the serializer.
pub fn close(mut self,) -> io::Result<()> {
try!(self.close_term());
try!(self.terms_fst_builder.finish());

View File

@@ -2,20 +2,23 @@ use common::BinarySerializable;
use std::io;
// `TermInfo` contains all of the information
// associated to terms in the `.term` file.
//
// It consists of
// * doc_freq : the number of document in the segment
// containing this term. It is also the length of the
// posting list associated to this term
// * postings_offset: an offset in the `.idx` file
// addressing the start of the posting list associated
// to this term.
/// `TermInfo` contains all of the information
/// associated to terms in the `.term` file.
///
/// It consists of
/// * doc_freq : the number of documents in the segment
/// containing this term. It is also the length of the
/// posting list associated to this term
/// * postings_offset: an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
/// Offset within the postings (`.idx`) file.
pub postings_offset: u32,
/// Offset within the position (`.pos`) file.
pub positions_offset: u32,
}

View File

@@ -7,6 +7,11 @@ use std::cmp::Ordering;
const EMPTY_ARRAY: [u32; 0] = [];
/// Simulate a `Postings` objects from a `VecPostings`.
/// `VecPostings` only exist for testing purposes.
///
/// Term frequencies always return 1.
/// No positions are returned.
pub struct VecPostings {
doc_ids: Vec<DocId>,
cursor: Wrapping<usize>,