From 025ab3c7ab04cc4d3babc633e4d5b54ff273f6d0 Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul.masurel@gmail.com>
Date: Wed, 21 Sep 2016 10:27:43 +0900
Subject: [PATCH] NOBUG Added comments.

---
 src/core/segment_reader.rs       | 12 +++++-----
 src/fastfield/serializer.rs      | 41 ++++++++++++++++++++++++++++----
 src/postings/docset.rs           | 23 ++++++++++++++----
 src/postings/freq_handler.rs     | 28 ++++++++++++++++++----
 src/postings/intersection.rs     |  7 +++++-
 src/postings/offset_postings.rs  |  2 ++
 src/postings/postings_writer.rs  | 31 +++++++++++++++++++-----
 src/postings/segment_postings.rs | 33 +++++++++++++------------
 8 files changed, 132 insertions(+), 45 deletions(-)
diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index b43058f16..5e874c8e8 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -157,11 +157,12 @@ impl SegmentReader {
     /// Returns the segment postings associated with the term, and with the given option,
     /// or `None` if the term has never been encounterred and indexed. 
     /// 
-    /// # Panics
-    /// This method panics if the field was not indexed with the indexing options that cover 
-    /// the requested options.
+    /// If the field was not indexed with the indexing options that cover
+    /// the requested options, the method does not fail
+    /// and returns a `SegmentPostings` with as much information as possible.
+    ///
     /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
-    /// that does not index position will panic.
+    /// that does not index positions will return a `SegmentPostings` with `DocId`s and frequencies.
     pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
         let field = term.field();
         let field_entry = self.schema.get_field_entry(field);
@@ -204,8 +205,7 @@ impl SegmentReader {
         };
         Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler))
     }
-    
-    
+        
     /// Returns the posting list associated with a term.
     pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
         let field_entry = self.schema.get_field_entry(term.field());
diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs
index 756119596..11b740715 100644
--- a/src/fastfield/serializer.rs
+++ b/src/fastfield/serializer.rs
@@ -5,6 +5,27 @@ use std::io;
 use std::io::{SeekFrom, Write};
 use super::compute_num_bits;
 
+
+/// `FastFieldSerializer` is in charge of serializing
+/// a fastfield on disk.
+/// 
+/// FastField are encoded using bit-packing.
+/// 
+/// `FastFieldWriter`s are in charge of pushing the data to
+/// the serializer.
+/// The serializer expects to receive the following calls.
+///
+/// * `new_u32_fast_field(...)`
+/// * `add_val(...)`
+/// * `add_val(...)`
+/// * `add_val(...)`
+/// * ...
+/// * `close_field()`
+/// * `new_u32_fast_field(...)`
+/// * `add_val(...)`
+/// * ...
+/// * `close_field()`
+/// * `close()`
 pub struct FastFieldSerializer {
     write: WritePtr,
     written_size: usize,
@@ -12,13 +33,15 @@ pub struct FastFieldSerializer {
     num_bits: u8,
     min_value: u32,
     field_open: bool,
-
-
+    
     mini_buffer_written: usize,
     mini_buffer: u32,
 }
 
+
+
 impl FastFieldSerializer {
+    /// Constructor
     pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
         // just making room for the pointer to header.
         let written_size: usize = try!(0u32.serialize(&mut write));
@@ -34,7 +57,8 @@ impl FastFieldSerializer {
             mini_buffer: 0u32,
         })
     }
-
+    
+    /// Start serializing a new u32 fast field
     pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
         if self.field_open {
             return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
@@ -56,6 +80,8 @@ impl FastFieldSerializer {
         Ok(())
     }
 
+
+    /// Pushes a new value to the currently open u32 fast field. 
     pub fn add_val(&mut self, val: u32) -> io::Result<()> {
         let write: &mut Write = &mut self.write;
         let val_to_write: u32 = val - self.min_value;
@@ -77,7 +103,8 @@ impl FastFieldSerializer {
         }
         Ok(())
     }
-
+    
+    /// Close the u32 fast field. 
     pub fn close_field(&mut self,) -> io::Result<()> {
         if !self.field_open {
             return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
@@ -94,7 +121,11 @@ impl FastFieldSerializer {
         self.mini_buffer = 0;
         Ok(())
     }
-
+    
+    
+    /// Closes the serializer
+    /// 
+    /// After this call, the data must be persistently saved on disk.
     pub fn close(mut self,) -> io::Result<usize> {
         if self.field_open {
             return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
diff --git a/src/postings/docset.rs b/src/postings/docset.rs
index 2f35384f7..e7fb59315 100644
--- a/src/postings/docset.rs
+++ b/src/postings/docset.rs
@@ -3,23 +3,33 @@ use std::borrow::Borrow;
 use std::borrow::BorrowMut;
 use std::cmp::Ordering;
 
+
+/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
 #[derive(PartialEq, Eq, Debug)]
 pub enum SkipResult {
+    /// target was in the docset
     Reached,
+    /// target was not in the docset, skipping stopped as a greater element was found
     OverStep,
+    /// the docset was entirely consumed without finding the target, nor any
+    /// element greater than the target.
     End,
 }
 
 
 /// Represents an iterable set of sorted doc ids. 
 pub trait DocSet {
-    // goes to the next element.
-    // next needs to be called a first time to point to the correct element.
+    /// Goes to the next element.
+    /// `.advance(...)` needs to be called a first time to point to the correct
+    /// element.
     fn advance(&mut self,) -> bool;
     
-    // after skipping position
-    // the iterator in such a way that doc() will return a
-    // value greater or equal to target.
+    /// After skipping, positions the iterator in such a way that `.doc()`
+    /// will return a value greater than or equal to `target`.
+    ///
+    /// `SkipResult` expresses whether the `target` value was reached, overstepped,
+    /// or if the `DocSet` was entirely consumed without finding any value
+    /// greater than or equal to the `target`.
     fn skip_next(&mut self, target: DocId) -> SkipResult {
         loop {
             match self.doc().cmp(&target) {
@@ -37,6 +47,9 @@ pub trait DocSet {
     /// Returns the current document
     fn doc(&self,) -> DocId;
     
+    /// Advances the cursor to the next document.
+    /// `None` is returned if the `DocSet`
+    /// has already been entirely consumed.
     fn next(&mut self,) -> Option<DocId> {
         if self.advance() {
             Some(self.doc())
diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs
index b02ec8d1f..4f44fb9e0 100644
--- a/src/postings/freq_handler.rs
+++ b/src/postings/freq_handler.rs
@@ -6,6 +6,9 @@ use compression::CompositeDecoder;
 use postings::SegmentPostingsOption;
 use compression::NUM_DOCS_PER_BLOCK;
 
+
+/// The FreqHandler object is in charge of decompressing
+/// frequencies and/or positions.
 pub struct FreqHandler {
     freq_decoder: SIMDBlockDecoder,
     positions: Vec<u32>,
@@ -28,6 +31,7 @@ fn read_positions(data: &[u8]) -> Vec<u32> {
 
 impl FreqHandler {
     
+    /// Returns a `FreqHandler` that just decodes `DocId`s.
     pub fn new_without_freq() -> FreqHandler {
         FreqHandler {
             freq_decoder: SIMDBlockDecoder::with_val(1u32),
@@ -37,6 +41,7 @@ impl FreqHandler {
         }
     }
     
+    /// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
     pub fn new_with_freq() -> FreqHandler {
         FreqHandler {
             freq_decoder: SIMDBlockDecoder::new(),
@@ -46,6 +51,8 @@ impl FreqHandler {
         }
     }
 
+
+    /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
     pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
         let positions = read_positions(position_data);
         FreqHandler {
@@ -75,12 +82,26 @@ impl FreqHandler {
         }
     }
     
+    
+    /// Accessor to term frequency
+    ///
+    /// idx is the offset of the current doc in the block.
+    /// It takes a value between 0 and 128.
+    pub fn freq(&self, idx: usize)-> u32 {
+        self.freq_decoder.output(idx)
+    }
+    
+    /// Accessor to the positions
+    ///
+    /// idx is the offset of the current doc in the block.
+    /// It takes a value between 0 and 128.
     pub fn positions(&self, idx: usize) -> &[u32] {
         let start = self.positions_offsets[idx];
         let stop = self.positions_offsets[idx + 1];
         &self.positions[start..stop]        
     }
     
+    /// Decompresses a complete frequency block
     pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
         match self.option {
             SegmentPostingsOption::NoFreq => {
@@ -96,7 +117,8 @@ impl FreqHandler {
             }
         }
     }
-
+        
+    /// Decompresses an incomplete frequency block
     pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
         match self.option {
             SegmentPostingsOption::NoFreq => {}
@@ -110,8 +132,4 @@ impl FreqHandler {
         }
     }
     
-    #[inline]
-    pub fn freq(&self, idx: usize)-> u32 {
-        self.freq_decoder.output(idx)
-    }
 }
\ No newline at end of file
diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs
index 4069a602b..72dd804e7 100644
--- a/src/postings/intersection.rs
+++ b/src/postings/intersection.rs
@@ -2,7 +2,9 @@ use postings::DocSet;
 use std::cmp::Ordering;
 use DocId;
 
+// TODO Find a way to specialize IntersectionDocSet
 
+/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
 pub struct IntersectionDocSet<'a> {
     left: Box<DocSet + 'a>,
     right: Box<DocSet + 'a>,
@@ -10,7 +12,8 @@ pub struct IntersectionDocSet<'a> {
 }
 
 impl<'a> IntersectionDocSet<'a> {
-    
+        
+    /// Intersect two `DocSet`s
     fn from_pair(left: Box<DocSet + 'a>, right: Box<DocSet + 'a>) -> IntersectionDocSet<'a> {
         IntersectionDocSet {
             left: left,
@@ -19,6 +22,7 @@ impl<'a> IntersectionDocSet<'a> {
         }         
     }
     
+    /// Intersect a list of `DocSet`s
     pub fn new(mut postings: Vec<Box<DocSet + 'a>>) -> IntersectionDocSet<'a> {
         let left = postings.pop().unwrap();
         let right = 
@@ -74,6 +78,7 @@ impl<'a> DocSet for IntersectionDocSet<'a> {
     }
 }
 
+/// Intersects a `Vec` of `DocSets`
 pub fn intersection<'a, TDocSet: DocSet + 'a>(postings: Vec<TDocSet>) -> IntersectionDocSet<'a> {
     let boxed_postings: Vec<Box<DocSet + 'a>> = postings
         .into_iter()
diff --git a/src/postings/offset_postings.rs b/src/postings/offset_postings.rs
index c5c0b5888..fe7ea453d 100644
--- a/src/postings/offset_postings.rs
+++ b/src/postings/offset_postings.rs
@@ -15,6 +15,8 @@ pub struct OffsetPostings<'a> {
 }
 
 impl<'a> OffsetPostings<'a> {
+    
+    /// Constructor
     pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings {
         OffsetPostings {
             underlying: underlying,
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 2ccaaa5fc..7afe42b14 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -9,14 +9,29 @@ use schema::Field;
 use analyzer::StreamingIterator;
 use datastruct::stacker::{HashMap, Heap};
 
+/// The `PostingsWriter` is in charge of receiving documents
+/// and building a `Segment` in anonymous memory.
+///
+/// `PostingsWriter` writes in a `Heap`.
 pub trait PostingsWriter {
     
-    fn close(&mut self, heap: &Heap);
-
+    /// Record that a document contains a term at a given position.
+    ///
+    /// * doc  - the document id
+    /// * pos  - the term position (expressed in tokens)
+    /// * term - the term
+    /// * heap - heap used to store the postings information as well as the terms
+    /// in the hashmap.
     fn suscribe(&mut self,  doc: DocId, pos: u32, term: &Term, heap: &Heap);
-
+    
+    /// Serializes the postings on disk.
+    /// The actual serialization format is handled by the `PostingsSerializer`.
     fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
     
+    /// Closes all of the currently open `Recorder`s.
+    fn close(&mut self, heap: &Heap);
+        
+    /// Tokenize a text and suscribe all of its tokens.
     fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32  {
         let mut pos = 0u32;
         let mut num_tokens: u32 = 0u32;
@@ -39,10 +54,13 @@ pub trait PostingsWriter {
     }
 }
 
+/// The SpecializedPostingsWriter is just here to remove dynamic
+/// dispatch to the recorder information.
 pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
     term_index: HashMap<'a, Rec>,
 }
 
+/// Given a `Heap` size, computes a relevant size for the `HashMap`.
 fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
     let num_buckets_usable = heap_capacity / 100;
     let hash_table_size = num_buckets_usable * 2;
@@ -57,7 +75,8 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
 }
 
 impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
-
+    
+    /// constructor
     pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
         let capacity = heap.capacity();
         let hashmap_size = hashmap_size_in_bits(capacity);
@@ -66,9 +85,9 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
         }
     }
     
+    /// Builds a `SpecializedPostingsWriter` storing its data in a heap.
     pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
-        let res = SpecializedPostingsWriter::<Rec>::new(heap);
-        Box::new(res)
+        Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
     } 
 
 }
diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs
index 99b2aa65c..8de872f90 100644
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -6,7 +6,11 @@ use std::num::Wrapping;
 
 
 
-// No Term Frequency, no postings.
+/// `SegmentPostings` represents the inverted list or postings associated to 
+/// a term in a `Segment`.
+/// 
+/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
+/// Positions on the other hand, are optionally entirely decoded upfront.
 pub struct SegmentPostings<'a> {
     len: usize,
     doc_offset: u32,
@@ -16,22 +20,9 @@ pub struct SegmentPostings<'a> {
     cur: Wrapping<usize>,
 }
 
-const EMPTY_ARRAY: [u8; 0] = [];
-
 impl<'a> SegmentPostings<'a> {
-
-    pub fn empty() -> SegmentPostings<'a> {
-        SegmentPostings {
-            len: 0,
-            doc_offset: 0,
-            block_decoder: SIMDBlockDecoder::new(),
-            freq_handler: FreqHandler::new_without_freq(),
-            remaining_data: &EMPTY_ARRAY,
-            cur: Wrapping(usize::max_value()),
-        }
-    }
     
-    pub fn load_next_block(&mut self,) {
+    fn load_next_block(&mut self,) {
         let num_remaining_docs = self.len - self.cur.0;
         if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
             self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
@@ -44,6 +35,12 @@ impl<'a> SegmentPostings<'a> {
         }
     }
 
+    /// Reads a Segment postings from an &[u8]
+    ///
+    /// * `len` - number of documents in the posting list.
+    /// * `data` - data array. The complete data is not necessarily used.
+    /// * `freq_handler` - the freq handler is in charge of decoding 
+    ///   frequencies and/or positions
     pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
         SegmentPostings {
             len: len as usize,
@@ -54,7 +51,9 @@ impl<'a> SegmentPostings<'a> {
             cur: Wrapping(usize::max_value()),
         }
     }
-
+    
+    /// Index within a block is used as an address when
+    /// interacting with the `FreqHandler` 
     fn index_within_block(&self,) -> usize {
         self.cur.0 % NUM_DOCS_PER_BLOCK
     }
@@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
         }
         true
     }
-
+    
     #[inline]
     fn doc(&self,) -> DocId {
         self.block_decoder.output(self.index_within_block())