Added documentation / HeapAllocable

This commit is contained in:
Paul Masurel
2016-09-22 14:32:44 +09:00
parent 994f223e35
commit ca331e7fe5
12 changed files with 177 additions and 78 deletions

View File

@@ -1,5 +1,5 @@
use std::mem;
use super::heap::Heap;
use super::heap::{Heap, HeapAllocable};
#[inline]
@@ -53,8 +53,8 @@ impl ExpUnrolledLinkedList {
}
impl From<u32> for ExpUnrolledLinkedList {
fn from(addr: u32) -> ExpUnrolledLinkedList {
impl HeapAllocable for ExpUnrolledLinkedList {
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
ExpUnrolledLinkedList {
len: 0u32,
@@ -67,22 +67,6 @@ impl From<u32> for ExpUnrolledLinkedList {
}
}
impl Default for ExpUnrolledLinkedList {
/// Returns an `ExpUnrolledLinkedList` with every field zeroed.
///
/// NOTE(review): unlike `with_addr`, this does not reserve any
/// heap space — presumably only used as a placeholder value;
/// confirm against callers.
fn default() -> ExpUnrolledLinkedList {
ExpUnrolledLinkedList {
len: 0u32,
end: 0u32,
val0: 0u32,
val1: 0u32,
val2: 0u32,
next: 0u32,
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a Heap,
addr: u32,

View File

@@ -1,6 +1,9 @@
use std::iter;
use std::marker::PhantomData;
use super::heap::{Heap, BytesRef};
use super::heap::{Heap, HeapAllocable, BytesRef};
/// djb2 hash function
fn djb2(key: &[u8]) -> u64 {
@@ -54,7 +57,7 @@ pub enum Entry {
/// the computation of the hash of the key twice,
/// or copying the key as long as there is no insert.
///
pub struct HashMap<'a, V> where V: From<u32> {
pub struct HashMap<'a, V> where V: HeapAllocable {
table: Box<[KeyValue]>,
heap: &'a Heap,
_phantom: PhantomData<V>,
@@ -62,7 +65,7 @@ pub struct HashMap<'a, V> where V: From<u32> {
occupied: Vec<usize>,
}
impl<'a, V> HashMap<'a, V> where V: From<u32> {
impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
let table_size = 1 << num_bucket_power_of_2;
@@ -157,7 +160,7 @@ impl<'a, V> HashMap<'a, V> where V: From<u32> {
mod tests {
use super::*;
use super::super::heap::Heap;
use super::super::heap::{Heap, HeapAllocable};
use super::djb2;
use test::Bencher;
use std::hash::SipHasher;
@@ -168,8 +171,8 @@ mod tests {
_addr: u32,
}
impl From<u32> for TestValue {
fn from(addr: u32) -> TestValue {
impl HeapAllocable for TestValue {
fn with_addr(addr: u32) -> TestValue {
TestValue {
val: 0u32,
_addr: addr,

View File

@@ -3,19 +3,26 @@ use std::mem;
use std::ptr;
use std::iter;
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
#[derive(Copy, Clone)]
pub struct BytesRef {
/// Start offset (in bytes) of the slice within the heap.
pub start: u32,
/// Stop offset (in bytes) of the slice within the heap.
/// Presumably exclusive — cf. `Heap::get_slice(start, stop)`; confirm.
pub stop: u32,
}
/// Object that can be allocated in tantivy's custom `Heap`.
pub trait HeapAllocable {
/// Builds an instance of the object, given the address (`addr`)
/// at which it was allocated within the heap.
fn with_addr(addr: u32) -> Self;
}
/// Tantivy's custom `Heap`.
pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(
@@ -27,46 +34,62 @@ impl Heap {
fn inner(&self,) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}
/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
/// This method is the only way to free memory.
pub fn clear(&self) {
self.inner().clear();
}
/// Return the heap capacity.
pub fn capacity(&self,) -> u32 {
self.inner().capacity()
}
/// Return the amount of memory that has been allocated so far.
pub fn len(&self,) -> u32 {
self.inner().len()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self,) -> u32 {
self.inner().num_free_bytes()
}
/// Allocate a given amount of space and returns an address
/// in the Heap.
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
self.inner().allocate_space(num_bytes)
}
pub fn allocate_object<V: From<u32>>(&self,) -> (u32, &mut V) {
/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::from(addr);
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
self.inner().allocate_and_set(data)
}
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
/// given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
}
/// Stores an item's data in the heap, at the given `address`.
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}
/// Returns a mutable reference to the object at the given address.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
@@ -183,7 +206,7 @@ impl InnerHeap {
}
}
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
}

View File

@@ -2,7 +2,7 @@ mod hashmap;
mod heap;
mod expull;
pub use self::heap::Heap;
pub use self::heap::{Heap, HeapAllocable};
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::{HashMap, Entry};

View File

@@ -7,7 +7,7 @@ use super::compute_num_bits;
/// `FastFieldSerializer` is in charge of serializing
/// a fastfield on disk.
/// fastfields on disk.
///
/// FastField are encoded using bit-packing.
///

View File

@@ -118,7 +118,7 @@ impl<'a> Iterator for PostingsMerger<'a> {
let next_heap_it = self.heap.pop().expect("This is only reached if an element was peeked beforehand.");
self.append_segment(&next_heap_it, &mut segment_postings_list);
}
let chained_posting = ChainedPostings::new(segment_postings_list);
let chained_posting = ChainedPostings::from(segment_postings_list);
Some((heap_it.term, chained_posting))
},
None => None

View File

@@ -4,19 +4,26 @@ use postings::OffsetPostings;
use postings::DocSet;
use postings::HasLen;
/// Creates a posting object that chains two postings
/// together.
///
/// When iterating over the chained postings,
/// it will consume all of the documents of the first postings,
/// and then iterate over the documents of the second postings.
///
/// The chained postings is used when merging segments.
pub struct ChainedPostings<'a> {
chained_postings: Vec<OffsetPostings<'a>>,
posting_id: usize,
len: usize,
}
impl<'a> ChainedPostings<'a> {
pub fn new(chained_postings: Vec<OffsetPostings<'a>>) -> ChainedPostings {
impl<'a> From<Vec<OffsetPostings<'a>>> for ChainedPostings<'a> {
fn from(chained_postings: Vec<OffsetPostings<'a>>) -> ChainedPostings {
let len: usize = chained_postings
.iter()
.map(|segment_postings| segment_postings.len())
.fold(0, |sum, addition| sum + addition);
.sum();
ChainedPostings {
chained_postings: chained_postings,
posting_id: 0,

View File

@@ -20,7 +20,10 @@ pub use self::postings_writer::PostingsWriter;
pub use self::postings_writer::SpecializedPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
#[cfg(test)]
pub use self::vec_postings::VecPostings;
pub use self::chained_postings::ChainedPostings;
pub use self::segment_postings::SegmentPostings;
pub use self::intersection::intersection;

View File

@@ -1,20 +1,38 @@
use DocId;
use std::io;
use postings::PostingsSerializer;
use datastruct::stacker::{ExpUnrolledLinkedList, Heap};
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = 4294967295;
pub trait Recorder: From<u32> {
/// Recorder is in charge of recording relevant information about
/// the presence of a term in a document.
///
/// Depending on the `TextIndexingOptions` associated to the
/// field, the recorder may record
/// * the document frequency
/// * the document id
/// * the term frequency
/// * the term positions
pub trait Recorder: HeapAllocable {
/// Returns the current document
fn current_doc(&self,) -> u32;
/// Starts recording information about a new document
/// This method shall only be called if the term is within the document.
fn new_doc(&mut self, doc: DocId, heap: &Heap);
/// Record the position of a term. For each document,
/// this method will be called `term_freq` times.
fn record_position(&mut self, position: u32, heap: &Heap);
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
/// Returns the number of documents that have been seen so far.
fn doc_freq(&self,) -> u32;
/// Pushes the postings information to the serializer.
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
}
/// Only records the doc ids
#[repr(C, packed)]
pub struct NothingRecorder {
stack: ExpUnrolledLinkedList,
@@ -22,10 +40,10 @@ pub struct NothingRecorder {
doc_freq: u32,
}
impl From<u32> for NothingRecorder {
fn from(addr: u32) -> NothingRecorder {
impl HeapAllocable for NothingRecorder {
fn with_addr(addr: u32) -> NothingRecorder {
NothingRecorder {
stack: ExpUnrolledLinkedList::from(addr),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
doc_freq: 0u32,
}
@@ -33,12 +51,11 @@ impl From<u32> for NothingRecorder {
}
impl Recorder for NothingRecorder {
fn current_doc(&self,) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
self.current_doc = doc;
self.stack.push(doc, heap);
@@ -59,9 +76,10 @@ impl Recorder for NothingRecorder {
}
Ok(())
}
}
/// Recorder encoding document ids, and term frequencies
#[repr(C, packed)]
pub struct TermFrequencyRecorder {
stack: ExpUnrolledLinkedList,
@@ -70,10 +88,10 @@ pub struct TermFrequencyRecorder {
doc_freq: u32,
}
impl From<u32> for TermFrequencyRecorder {
fn from(addr: u32) -> TermFrequencyRecorder {
impl HeapAllocable for TermFrequencyRecorder {
fn with_addr(addr: u32) -> TermFrequencyRecorder {
TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::from(addr),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
current_tf: 0u32,
doc_freq: 0u32
@@ -82,6 +100,8 @@ impl From<u32> for TermFrequencyRecorder {
}
impl Recorder for TermFrequencyRecorder {
fn current_doc(&self,) -> DocId {
self.current_doc
@@ -120,9 +140,10 @@ impl Recorder for TermFrequencyRecorder {
}
Ok(())
}
}
/// Recorder encoding term frequencies as well as positions.
#[repr(C, packed)]
pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
@@ -130,18 +151,18 @@ pub struct TFAndPositionRecorder {
doc_freq: u32,
}
impl From<u32> for TFAndPositionRecorder {
fn from(addr: u32) -> TFAndPositionRecorder {
impl HeapAllocable for TFAndPositionRecorder {
fn with_addr(addr: u32) -> TFAndPositionRecorder {
TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::from(addr),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
doc_freq: 0u32,
}
}
}
impl Recorder for TFAndPositionRecorder {
fn current_doc(&self,) -> DocId {
self.current_doc
@@ -191,6 +212,7 @@ impl Recorder for TFAndPositionRecorder {
}
Ok(())
}
}

View File

@@ -17,6 +17,35 @@ use common::VInt;
use common::BinarySerializable;
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
/// * `.idx` (inverted index)
/// * `.pos` (positions file)
/// * `.term` (term dictionary)
///
/// `PostingsWriter` are in charge of pushing the data to the
/// serializer.
///
/// The serializer expects to receive the following calls
/// in this order :
///
/// * `new_term(...)`
/// * `write_doc(...)`
/// * `write_doc(...)`
/// * `write_doc(...)`
/// * ...
/// * `close_term()`
/// * `new_term(...)`
/// * `write_doc(...)`
/// * ...
/// * `close_term()`
/// * `close()`
///
/// Terms have to be pushed in a lexicographically-sorted order.
/// Within a term, documents have to be pushed in increasing order.
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
postings_write: WritePtr,
@@ -35,7 +64,8 @@ pub struct PostingsSerializer {
}
impl PostingsSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
@@ -59,8 +89,8 @@ impl PostingsSerializer {
term_open: false,
})
}
pub fn load_indexing_options(&mut self, field: Field) {
fn load_indexing_options(&mut self, field: Field) {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
@@ -76,7 +106,11 @@ impl PostingsSerializer {
}
};
}
/// Starts the postings for a new term.
/// * term - the term. It needs to come after the previous term according
/// to the lexicographical order.
/// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
if self.term_open {
panic!("Called new_term, while the previous term was not closed.");
@@ -95,7 +129,11 @@ impl PostingsSerializer {
self.terms_fst_builder
.insert(term.as_slice(), &term_info)
}
/// Finish the serialization for this term postings.
///
/// If the current block is incomplete, it need to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self,) -> io::Result<()> {
if self.term_open {
if !self.doc_ids.is_empty() {
@@ -133,7 +171,17 @@ impl PostingsSerializer {
}
Ok(())
}
/// Serialize the information that a document contains the current term,
/// its term frequency, and the position deltas.
///
/// At this point, the positions are already `delta-encoded`.
/// For instance, if the positions are `2, 3, 17`,
/// `position_deltas` is `2, 1, 14`
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
self.doc_ids.push(doc_id);
if self.text_indexing_options.is_termfreq_enabled() {
@@ -161,7 +209,8 @@ impl PostingsSerializer {
}
Ok(())
}
/// Closes the serializer.
pub fn close(mut self,) -> io::Result<()> {
try!(self.close_term());
try!(self.terms_fst_builder.finish());

View File

@@ -2,20 +2,23 @@ use common::BinarySerializable;
use std::io;
// `TermInfo` contains all of the information
// associated to terms in the `.term` file.
//
// It consists of
// * doc_freq : the number of document in the segment
// containing this term. It is also the length of the
// posting list associated to this term
// * postings_offset: an offset in the `.idx` file
// addressing the start of the posting list associated
// to this term.
/// `TermInfo` contains all of the information
/// associated to terms in the `.term` file.
///
/// It consists of
/// * doc_freq : the number of documents in the segment
/// containing this term. It is also the length of the
/// posting list associated to this term
/// * postings_offset: an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
/// Offset within the postings (`.idx`) file.
pub postings_offset: u32,
/// Offset within the position (`.pos`) file.
pub positions_offset: u32,
}

View File

@@ -7,6 +7,11 @@ use std::cmp::Ordering;
const EMPTY_ARRAY: [u32; 0] = [];
/// Simulate a `Postings` objects from a `VecPostings`.
/// `VecPostings` only exist for testing purposes.
///
/// Term frequencies always return 1.
/// No positions are returned.
pub struct VecPostings {
doc_ids: Vec<DocId>,
cursor: Wrapping<usize>,