diff --git a/src/datastruct/stacker/expull.rs b/src/datastruct/stacker/expull.rs
index 96f3a766a..93aa80b08 100644
--- a/src/datastruct/stacker/expull.rs
+++ b/src/datastruct/stacker/expull.rs
@@ -1,5 +1,5 @@
 use std::mem;
-use super::heap::Heap;
+use super::heap::{Heap, HeapAllocable};


 #[inline]
@@ -53,8 +53,8 @@ impl ExpUnrolledLinkedList {
 }

-impl From<u32> for ExpUnrolledLinkedList {
-    fn from(addr: u32) -> ExpUnrolledLinkedList {
+impl HeapAllocable for ExpUnrolledLinkedList {
+    fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
         let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
         ExpUnrolledLinkedList {
             len: 0u32,
@@ -67,22 +67,6 @@ impl From<u32> for ExpUnrolledLinkedList {
     }
 }
-
-
-impl Default for ExpUnrolledLinkedList {
-    fn default() -> ExpUnrolledLinkedList {
-        ExpUnrolledLinkedList {
-            len: 0u32,
-            end: 0u32,
-            val0: 0u32,
-            val1: 0u32,
-            val2: 0u32,
-            next: 0u32,
-        }
-    }
-}
-
-
 pub struct ExpUnrolledLinkedListIterator<'a> {
     heap: &'a Heap,
     addr: u32,
diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs
index 730d7c9f8..2ec92dd7c 100644
--- a/src/datastruct/stacker/hashmap.rs
+++ b/src/datastruct/stacker/hashmap.rs
@@ -1,6 +1,9 @@
 use std::iter;
 use std::marker::PhantomData;
-use super::heap::{Heap, BytesRef};
+use super::heap::{Heap, HeapAllocable, BytesRef};
+
+
+

 /// djb2 hash function
 fn djb2(key: &[u8]) -> u64 {
@@ -54,7 +57,7 @@ pub enum Entry {
 /// the computation of the hash of the key twice,
 /// or copying the key as long as there is no insert.
 ///
-pub struct HashMap<'a, V> where V: From<u32> {
+pub struct HashMap<'a, V> where V: HeapAllocable {
     table: Box<[KeyValue]>,
     heap: &'a Heap,
     _phantom: PhantomData<V>,
@@ -62,7 +65,7 @@ pub struct HashMap<'a, V> where V: From<u32> {
     occupied: Vec<usize>,
 }

-impl<'a, V> HashMap<'a, V> where V: From<u32> {
+impl<'a, V> HashMap<'a, V> where V: HeapAllocable {

     pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
         let table_size = 1 << num_bucket_power_of_2;
@@ -157,7 +160,7 @@ impl<'a, V> HashMap<'a, V> where V: From<u32> {
 mod tests {

     use super::*;
-    use super::super::heap::Heap;
+    use super::super::heap::{Heap, HeapAllocable};
     use super::djb2;
     use test::Bencher;
     use std::hash::SipHasher;
@@ -168,8 +171,8 @@ mod tests {
         _addr: u32,
     }

-    impl From<u32> for TestValue {
-        fn from(addr: u32) -> TestValue {
+    impl HeapAllocable for TestValue {
+        fn with_addr(addr: u32) -> TestValue {
             TestValue {
                 val: 0u32,
                 _addr: addr,
diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs
index ead18c054..df73ab242 100644
--- a/src/datastruct/stacker/heap.rs
+++ b/src/datastruct/stacker/heap.rs
@@ -3,19 +3,26 @@ use std::mem;
 use std::ptr;
 use std::iter;

+
+/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
 #[derive(Copy, Clone)]
 pub struct BytesRef {
     pub start: u32,
     pub stop: u32,
 }

+/// Object that can be allocated in tantivy's custom `Heap`.
+pub trait HeapAllocable {
+    fn with_addr(addr: u32) -> Self;
+}

-
+/// Tantivy's custom `Heap`.
 pub struct Heap {
     inner: UnsafeCell<InnerHeap>,
 }

 impl Heap {
+    /// Creates a new heap with a given capacity.
     pub fn with_capacity(num_bytes: usize) -> Heap {
         Heap {
             inner: UnsafeCell::new(
@@ -27,46 +34,62 @@ impl Heap {
     fn inner(&self,) -> &mut InnerHeap {
         unsafe { &mut *self.inner.get() }
     }
-
+
+    /// Clears the heap. All the underlying data is lost.
+    ///
+    /// This heap does not support deallocation.
+    /// This method is the only way to free memory.
     pub fn clear(&self) {
         self.inner().clear();
     }
-
+
+    /// Returns the heap capacity.
     pub fn capacity(&self,) -> u32 {
         self.inner().capacity()
     }
-
+
+    /// Returns the amount of memory that has been allocated so far.
     pub fn len(&self,) -> u32 {
         self.inner().len()
     }
-
+
+    /// Returns the amount of free space, in bytes.
     pub fn num_free_bytes(&self,) -> u32 {
         self.inner().num_free_bytes()
     }

+    /// Allocates a given amount of space and returns an address
+    /// in the heap.
     pub fn allocate_space(&self, num_bytes: usize) -> u32 {
         self.inner().allocate_space(num_bytes)
     }
-
-    pub fn allocate_object<V: From<u32>>(&self,) -> (u32, &mut V) {
+
+    /// Allocates an object in the heap.
+    pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
         let addr = self.inner().allocate_space(mem::size_of::<V>());
-        let v: V = V::from(addr);
+        let v: V = V::with_addr(addr);
         self.inner().set(addr, &v);
         (addr, self.inner().get_mut_ref(addr))
     }
-
+
+    /// Stores a `&[u8]` in the heap and returns the destination `BytesRef`.
     pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
         self.inner().allocate_and_set(data)
     }
-
+
+    /// Fetches the `&[u8]` stored in the slice defined by the `BytesRef`
+    /// given as argument.
     pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
         self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
     }
-
+
+    /// Stores an item's data in the heap, at the given `address`.
     pub fn set<Item>(&self, addr: u32, val: &Item) {
         self.inner().set(addr, val);
     }
-
+
+    /// Returns a mutable reference to the `Item` at the given address.
     pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
         self.inner().get_mut_ref(addr)
     }
@@ -183,7 +206,7 @@ impl InnerHeap {
         }
     }

-    pub fn set<Item>(&mut self, addr: u32, val: &Item) {
+    fn set<Item>(&mut self, addr: u32, val: &Item) {
         if addr >= self.buffer_len {
             self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
         }
diff --git a/src/datastruct/stacker/mod.rs b/src/datastruct/stacker/mod.rs
index c93695ec5..66aaee2d0 100644
--- a/src/datastruct/stacker/mod.rs
+++ b/src/datastruct/stacker/mod.rs
@@ -2,7 +2,7 @@ mod hashmap;
 mod heap;
 mod expull;

-pub use self::heap::Heap;
+pub use self::heap::{Heap, HeapAllocable};
 pub use self::expull::ExpUnrolledLinkedList;
 pub use self::hashmap::{HashMap, Entry};

diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs
index 11b740715..9dfbfceb8 100644
--- a/src/fastfield/serializer.rs
+++ b/src/fastfield/serializer.rs
@@ -7,7 +7,7 @@ use super::compute_num_bits;


 /// `FastFieldSerializer` is in charge of serializing
-/// a fastfield on disk.
+/// fastfields on disk.
 ///
 /// FastField are encoded using bit-packing.
 ///
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 92e211071..cc94ac95b 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -118,7 +118,7 @@ impl<'a> Iterator for PostingsMerger<'a> {
                     let next_heap_it = self.heap.pop().expect("This is only reached if an element was peeked beforehand.");
                     self.append_segment(&next_heap_it, &mut segment_postings_list);
                 }
-                let chained_posting = ChainedPostings::new(segment_postings_list);
+                let chained_posting = ChainedPostings::from(segment_postings_list);
                 Some((heap_it.term, chained_posting))
             },
             None => None
diff --git a/src/postings/chained_postings.rs b/src/postings/chained_postings.rs
index 9b1408c64..f07185918 100644
--- a/src/postings/chained_postings.rs
+++ b/src/postings/chained_postings.rs
@@ -4,19 +4,26 @@ use postings::OffsetPostings;
 use postings::DocSet;
 use postings::HasLen;

+/// Postings object that chains several postings
+/// together.
+///
+/// When iterating over the chained postings,
+/// it will consume all of the documents of the first postings,
+/// and then iterate over the documents of the second postings,
+/// and so on.
+///
+/// Chained postings are used when merging segments.
 pub struct ChainedPostings<'a> {
     chained_postings: Vec<OffsetPostings<'a>>,
     posting_id: usize,
     len: usize,
 }

-impl<'a> ChainedPostings<'a> {
-
-    pub fn new(chained_postings: Vec<OffsetPostings<'a>>) -> ChainedPostings {
+impl<'a> From<Vec<OffsetPostings<'a>>> for ChainedPostings<'a> {
+    fn from(chained_postings: Vec<OffsetPostings<'a>>) -> ChainedPostings {
         let len: usize = chained_postings
             .iter()
             .map(|segment_postings| segment_postings.len())
-            .fold(0, |sum, addition| sum + addition);
+            .sum();
         ChainedPostings {
             chained_postings: chained_postings,
             posting_id: 0,
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index a30817e70..7f955b9fd 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -20,7 +20,10 @@ pub use self::postings_writer::PostingsWriter;
 pub use self::postings_writer::SpecializedPostingsWriter;
 pub use self::term_info::TermInfo;
 pub use self::postings::Postings;
+
+#[cfg(test)]
 pub use self::vec_postings::VecPostings;
+
 pub use self::chained_postings::ChainedPostings;
 pub use self::segment_postings::SegmentPostings;
 pub use self::intersection::intersection;
diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs
index 31d384aeb..095102a3d 100644
--- a/src/postings/recorder.rs
+++ b/src/postings/recorder.rs
@@ -1,20 +1,38 @@
 use DocId;
 use std::io;
 use postings::PostingsSerializer;
-use datastruct::stacker::{ExpUnrolledLinkedList, Heap};
+use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};

 const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
 const POSITION_END: u32 = 4294967295;

-pub trait Recorder: From<u32> {
+/// `Recorder` is in charge of recording relevant information about
+/// the presence of a term in a document.
+///
+/// Depending on the `TextIndexingOptions` associated to the
+/// field, the recorder may record
+/// * the document frequency
+/// * the document id
+/// * the term frequency
+/// * the term positions
+pub trait Recorder: HeapAllocable {
+    /// Returns the current document.
     fn current_doc(&self,) -> u32;
+    /// Starts recording information about a new document.
+    /// This method shall only be called if the term is within the document.
     fn new_doc(&mut self, doc: DocId, heap: &Heap);
+    /// Records the position of a term. For each document,
+    /// this method will be called `term_freq` times.
     fn record_position(&mut self, position: u32, heap: &Heap);
+    /// Closes the document. This makes it possible to record the term frequency.
     fn close_doc(&mut self, heap: &Heap);
+    /// Returns the number of documents that have been seen so far.
     fn doc_freq(&self,) -> u32;
+    /// Pushes the postings information to the serializer.
     fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
 }

+/// Only records the doc ids.
 #[repr(C, packed)]
 pub struct NothingRecorder {
     stack: ExpUnrolledLinkedList,
@@ -22,10 +40,10 @@ pub struct NothingRecorder {
     current_doc: u32,
     doc_freq: u32,
 }
-impl From<u32> for NothingRecorder {
-    fn from(addr: u32) -> NothingRecorder {
+impl HeapAllocable for NothingRecorder {
+    fn with_addr(addr: u32) -> NothingRecorder {
         NothingRecorder {
-            stack: ExpUnrolledLinkedList::from(addr),
+            stack: ExpUnrolledLinkedList::with_addr(addr),
             current_doc: u32::max_value(),
             doc_freq: 0u32,
         }
@@ -33,12 +51,11 @@ impl From<u32> for NothingRecorder {
 }

 impl Recorder for NothingRecorder {
-
+
     fn current_doc(&self,) -> DocId {
         self.current_doc
     }
-
     fn new_doc(&mut self, doc: DocId, heap: &Heap) {
         self.current_doc = doc;
         self.stack.push(doc, heap);
@@ -59,9 +76,10 @@ impl Recorder for NothingRecorder {
         }
         Ok(())
     }
+
 }

-
+/// Recorder encoding document ids and term frequencies.
 #[repr(C, packed)]
 pub struct TermFrequencyRecorder {
     stack: ExpUnrolledLinkedList,
@@ -70,10 +88,10 @@ pub struct TermFrequencyRecorder {
     doc_freq: u32,
 }

-impl From<u32> for TermFrequencyRecorder {
-    fn from(addr: u32) -> TermFrequencyRecorder {
+impl HeapAllocable for TermFrequencyRecorder {
+    fn with_addr(addr: u32) -> TermFrequencyRecorder {
         TermFrequencyRecorder {
-            stack: ExpUnrolledLinkedList::from(addr),
+            stack: ExpUnrolledLinkedList::with_addr(addr),
             current_doc: u32::max_value(),
             current_tf: 0u32,
             doc_freq: 0u32
@@ -82,6 +100,8 @@ impl From<u32> for TermFrequencyRecorder {
 }

 impl Recorder for TermFrequencyRecorder {
+
+
     fn current_doc(&self,) -> DocId {
         self.current_doc
@@ -120,9 +140,10 @@ impl Recorder for TermFrequencyRecorder {
         }
         Ok(())
     }
+
 }

-
+/// Recorder encoding term frequencies as well as positions.
 #[repr(C, packed)]
 pub struct TFAndPositionRecorder {
     stack: ExpUnrolledLinkedList,
@@ -130,18 +151,18 @@ pub struct TFAndPositionRecorder {
     current_doc: DocId,
     doc_freq: u32,
 }

-impl From<u32> for TFAndPositionRecorder {
-    fn from(addr: u32) -> TFAndPositionRecorder {
+impl HeapAllocable for TFAndPositionRecorder {
+    fn with_addr(addr: u32) -> TFAndPositionRecorder {
         TFAndPositionRecorder {
-            stack: ExpUnrolledLinkedList::from(addr),
+            stack: ExpUnrolledLinkedList::with_addr(addr),
             current_doc: u32::max_value(),
             doc_freq: 0u32,
         }
     }
-
 }

 impl Recorder for TFAndPositionRecorder {
+
     fn current_doc(&self,) -> DocId {
         self.current_doc
@@ -191,6 +212,7 @@ impl Recorder for TFAndPositionRecorder {
         }
         Ok(())
     }
+
 }

diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index f42d857e8..f1724ad46 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -17,6 +17,35 @@ use common::VInt;
 use common::BinarySerializable;


+/// `PostingsSerializer` is in charge of serializing
+/// postings on disk, in the
+/// * `.idx` (inverted index)
+/// * `.pos` (positions file)
+/// * `.term` (term dictionary)
+///
+/// `PostingsWriter`s are in charge of pushing the data to the
+/// serializer.
+///
+/// The serializer expects to receive the following calls
+/// in this order:
+///
+/// * `new_term(...)`
+/// * `write_doc(...)`
+/// * `write_doc(...)`
+/// * `write_doc(...)`
+/// * ...
+/// * `close_term()`
+/// * `new_term(...)`
+/// * `write_doc(...)`
+/// * ...
+/// * `close_term()`
+/// * `close()`
+///
+/// Terms have to be pushed in lexicographically-sorted order.
+/// Within a term, documents have to be pushed in increasing order.
+///
+/// A description of the serialization format is
+/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
 pub struct PostingsSerializer {
     terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
     postings_write: WritePtr,
@@ -35,7 +64,8 @@ pub struct PostingsSerializer {
 }

 impl PostingsSerializer {
-
+
+    /// Opens a new `PostingsSerializer` for the given segment.
     pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
         let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
         let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
@@ -59,8 +89,8 @@ impl PostingsSerializer {
             term_open: false,
         })
     }
-
-    pub fn load_indexing_options(&mut self, field: Field) {
+
+    fn load_indexing_options(&mut self, field: Field) {
         let field_entry: &FieldEntry = self.schema.get_field_entry(field);
         self.text_indexing_options = match *field_entry.field_type() {
             FieldType::Str(ref text_options) => {
@@ -76,7 +106,11 @@ impl PostingsSerializer {
             }
         };
     }
-
+
+    /// Starts the postings for a new term.
+    /// * term - the term. It needs to come after the previous term according
+    ///   to the lexicographical order.
+    /// * doc_freq - the number of documents containing the term.
     pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
         if self.term_open {
             panic!("Called new_term, while the previous term was not closed.");
@@ -95,7 +129,11 @@ impl PostingsSerializer {
         self.terms_fst_builder
             .insert(term.as_slice(), &term_info)
     }
-
+
+    /// Finishes the serialization for this term's postings.
+    ///
+    /// If the current block is incomplete, it needs to be encoded
+    /// using `VInt` encoding.
     pub fn close_term(&mut self,) -> io::Result<()> {
         if self.term_open {
             if !self.doc_ids.is_empty() {
@@ -133,7 +171,17 @@ impl PostingsSerializer {
         }
         Ok(())
     }
-
+
+
+    /// Serializes the information that a document contains the current term,
+    /// its term frequency, and the position deltas.
+    ///
+    /// At this point, the positions are already `delta-encoded`.
+    /// For instance, if the positions are `2, 3, 17`,
+    /// `position_deltas` is `2, 1, 14`.
+    ///
+    /// Term frequencies and positions may be ignored by the serializer depending
+    /// on the configuration of the field in the `Schema`.
     pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
         self.doc_ids.push(doc_id);
         if self.text_indexing_options.is_termfreq_enabled() {
@@ -161,7 +209,8 @@ impl PostingsSerializer {
         }
         Ok(())
     }
-
+
+    /// Closes the serializer.
     pub fn close(mut self,) -> io::Result<()> {
         try!(self.close_term());
         try!(self.terms_fst_builder.finish());
diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs
index e75539dc0..640caac7d 100644
--- a/src/postings/term_info.rs
+++ b/src/postings/term_info.rs
@@ -2,20 +2,23 @@ use common::BinarySerializable;
 use std::io;


-// `TermInfo` contains all of the information
-// associated to terms in the `.term` file.
-//
-// It consists of
-// * doc_freq : the number of document in the segment
-// containing this term. It is also the length of the
-// posting list associated to this term
-// * postings_offset: an offset in the `.idx` file
-// addressing the start of the posting list associated
-// to this term.
+/// `TermInfo` contains all of the information
+/// associated to terms in the `.term` file.
+///
+/// It consists of
+/// * doc_freq: the number of documents in the segment
+///   containing this term. It is also the length of the
+///   posting list associated to this term.
+/// * postings_offset: an offset in the `.idx` file
+///   addressing the start of the posting list associated
+///   to this term.
 #[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)]
 pub struct TermInfo {
+    /// Number of documents in the segment containing the term.
     pub doc_freq: u32,
+    /// Offset within the postings (`.idx`) file.
     pub postings_offset: u32,
+    /// Offset within the position (`.pos`) file.
     pub positions_offset: u32,
 }
diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs
index c980bc4ac..8d4ba0d48 100644
--- a/src/postings/vec_postings.rs
+++ b/src/postings/vec_postings.rs
@@ -7,6 +7,11 @@ use std::cmp::Ordering;

 const EMPTY_ARRAY: [u32; 0] = [];

+/// Simulates a `Postings` object from a `Vec` of doc ids.
+/// `VecPostings` only exists for testing purposes.
+///
+/// Term frequencies always return 1.
+/// No positions are returned.
 pub struct VecPostings {
     doc_ids: Vec<DocId>,
     cursor: Wrapping<usize>,
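The heap changes above replace the `From<u32>` bound on heap-allocated values with the dedicated `HeapAllocable` trait. Below is a minimal usage sketch, not part of the patch: the `Counter` type and the `count_something` function are hypothetical, and the `use` path is the crate-internal one exposed by `datastruct/stacker/mod.rs`.

```rust
use datastruct::stacker::{Heap, HeapAllocable};

/// Hypothetical heap-allocated value, standing in for the recorders above.
/// `#[repr(C, packed)]` mirrors their layout, since the heap copies the
/// raw bytes of the value.
#[repr(C, packed)]
struct Counter {
    count: u32,
    addr: u32,
}

impl HeapAllocable for Counter {
    /// `with_addr` receives the address that `Heap::allocate_object`
    /// just reserved, so the value can remember where it lives.
    fn with_addr(addr: u32) -> Counter {
        Counter {
            count: 0u32,
            addr: addr,
        }
    }
}

fn count_something(heap: &Heap) {
    // `allocate_object` reserves `size_of::<Counter>()` bytes, builds the
    // value with `with_addr`, copies it into the heap, and returns its
    // address together with a mutable reference pointing into the heap.
    let (addr, counter): (u32, &mut Counter) = heap.allocate_object();
    counter.count += 1;
    // The same object can later be retrieved through its address.
    let same_counter: &mut Counter = heap.get_mut_ref(addr);
    same_counter.count += 1;
}
```

Handing the allocation address to `with_addr` is what lets a value such as `ExpUnrolledLinkedList` compute `last_addr` relative to its own position in the heap.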
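The doc comment added to `PostingsSerializer` spells out a strict call protocol. The sketch below shows a driver loop that respects it; it is not code from the patch, the sorted `terms_and_postings` input is hypothetical, and the `use` paths are the approximate crate-internal ones.

```rust
use std::io;

use DocId;
use schema::Term; // path is an assumption
use postings::PostingsSerializer;

/// Sketch only: feeds an already lexicographically-sorted collection of
/// (term, [(doc id, term frequency, position deltas)]) pairs to the
/// serializer, respecting the documented call order.
fn write_all(mut serializer: PostingsSerializer,
             terms_and_postings: Vec<(Term, Vec<(DocId, u32, Vec<u32>)>)>)
             -> io::Result<()> {
    for (term, postings) in terms_and_postings {
        // Terms must be pushed in lexicographical order.
        try!(serializer.new_term(&term, postings.len() as u32));
        for (doc_id, term_freq, position_deltas) in postings {
            // Within a term, doc ids must be pushed in increasing order.
            try!(serializer.write_doc(doc_id, term_freq, &position_deltas));
        }
        try!(serializer.close_term());
    }
    // Flushes the last block and the term dictionary.
    serializer.close()
}
```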