NOBUG Added comments.

2026-05-30 07:00:41 +00:00 · 2016-09-21 10:27:43 +09:00
parent b337adbd78
commit 025ab3c7ab
8 changed files with 132 additions and 45 deletions
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -157,11 +157,12 @@ impl SegmentReader {
    /// Returns the segment postings associated with the term, and with the given option,
    /// or `None` if the term has never been encounterred and indexed. 
    /// 
-    /// # Panics
-    /// This method panics if the field was not indexed with the indexing options that cover 
-    /// the requested options.
+    /// If the field was not indexed with the indexing options that cover 
+    /// the requested options, the method does not fail
+    /// and returns a `SegmentPostings` with as much information as possible.
+    ///
    /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
-    /// that does not index position will panic.
+    /// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
    pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
        let field = term.field();
        let field_entry = self.schema.get_field_entry(field);
@@ -204,8 +205,7 @@ impl SegmentReader {
        };
        Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler))
    }
-    
-    
+        
    /// Returns the posting list associated with a term.
    pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
        let field_entry = self.schema.get_field_entry(term.field());
--- a/src/fastfield/serializer.rs
+++ b/src/fastfield/serializer.rs
@@ -5,6 +5,27 @@ use std::io;
 use std::io::{SeekFrom, Write};
 use super::compute_num_bits;

+
+/// `FastFieldSerializer` is in charge of serializing
+/// a fastfield on disk.
+/// 
+/// FastFields are encoded using bit-packing.
+/// 
+/// `FastFieldWriter`s are in charge of pushing the data to
+/// the serializer.
+/// The serializer expects to receive the following calls.
+///
+/// * `new_u32_fast_field(...)`
+/// * `add_val(...)`
+/// * `add_val(...)`
+/// * `add_val(...)`
+/// * ...
+/// * `close_field()`
+/// * `new_u32_fast_field(...)`
+/// * `add_val(...)`
+/// * ...
+/// * `close_field()`
+/// * `close()`
 pub struct FastFieldSerializer {
    write: WritePtr,
    written_size: usize,
@@ -12,13 +33,15 @@ pub struct FastFieldSerializer {
    num_bits: u8,
    min_value: u32,
    field_open: bool,
-
-
+    
    mini_buffer_written: usize,
    mini_buffer: u32,
 }

+
+
 impl FastFieldSerializer {
+    /// Constructor
    pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
        // just making room for the pointer to header.
        let written_size: usize = try!(0u32.serialize(&mut write));
@@ -34,7 +57,8 @@ impl FastFieldSerializer {
            mini_buffer: 0u32,
        })
    }
-
+    
+    /// Start serializing a new u32 fast field
    pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
@@ -56,6 +80,8 @@ impl FastFieldSerializer {
        Ok(())
    }

+
+    /// Pushes a new value to the currently open u32 fast field. 
    pub fn add_val(&mut self, val: u32) -> io::Result<()> {
        let write: &mut Write = &mut self.write;
        let val_to_write: u32 = val - self.min_value;
@@ -77,7 +103,8 @@ impl FastFieldSerializer {
        }
        Ok(())
    }
-
+    
+    /// Close the u32 fast field. 
    pub fn close_field(&mut self,) -> io::Result<()> {
        if !self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
@@ -94,7 +121,11 @@ impl FastFieldSerializer {
        self.mini_buffer = 0;
        Ok(())
    }
-
+    
+    
+    /// Closes the serializer
+    /// 
+    /// After this call, the data must be persistently saved on disk.
    pub fn close(mut self,) -> io::Result<usize> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
--- a/src/postings/docset.rs
+++ b/src/postings/docset.rs
@@ -3,23 +3,33 @@ use std::borrow::Borrow;
 use std::borrow::BorrowMut;
 use std::cmp::Ordering;

+
+/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
 #[derive(PartialEq, Eq, Debug)]
 pub enum SkipResult {
+    /// target was in the docset
    Reached,
+    /// target was not in the docset, skipping stopped as a greater element was found
    OverStep,
+    /// the docset was entirely consumed without finding the target, nor any
+    /// element greater than the target.
    End,
 }


 /// Represents an iterable set of sorted doc ids. 
 pub trait DocSet {
-    // goes to the next element.
-    // next needs to be called a first time to point to the correct element.
+    /// Goes to the next element.
+    /// `.advance(...)` needs to be called a first time to point to the correct
+    /// element.
    fn advance(&mut self,) -> bool;
    
-    // after skipping position
-    // the iterator in such a way that doc() will return a
-    // value greater or equal to target.
+    /// After skipping, positions the iterator in such a way that `.doc()`
+    /// will return a value greater than or equal to the target.
+    /// 
+    /// `SkipResult` expresses whether the `target` value was reached, overstepped,
+    /// or if the `DocSet` was entirely consumed without finding any value
+    /// greater or equal to the `target`.  
    fn skip_next(&mut self, target: DocId) -> SkipResult {
        loop {
            match self.doc().cmp(&target) {
@@ -37,6 +47,9 @@ pub trait DocSet {
    /// Returns the current document
    fn doc(&self,) -> DocId;
    
+    /// Advances the cursor to the next document.
+    /// `None` is returned if the `DocSet`
+    /// has already been entirely consumed.
    fn next(&mut self,) -> Option<DocId> {
        if self.advance() {
            Some(self.doc())
--- a/src/postings/freq_handler.rs
+++ b/src/postings/freq_handler.rs
@@ -6,6 +6,9 @@ use compression::CompositeDecoder;
 use postings::SegmentPostingsOption;
 use compression::NUM_DOCS_PER_BLOCK;

+
+/// The FreqHandler object is in charge of decompressing
+/// frequencies and/or positions.
 pub struct FreqHandler {
    freq_decoder: SIMDBlockDecoder,
    positions: Vec<u32>,
@@ -28,6 +31,7 @@ fn read_positions(data: &[u8]) -> Vec<u32> {

 impl FreqHandler {
    
+    /// Returns a `FreqHandler` that just decodes `DocId`s.
    pub fn new_without_freq() -> FreqHandler {
        FreqHandler {
            freq_decoder: SIMDBlockDecoder::with_val(1u32),
@@ -37,6 +41,7 @@ impl FreqHandler {
        }
    }
    
+    /// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
    pub fn new_with_freq() -> FreqHandler {
        FreqHandler {
            freq_decoder: SIMDBlockDecoder::new(),
@@ -46,6 +51,8 @@ impl FreqHandler {
        }
    }

+
+    /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
    pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
        let positions = read_positions(position_data);
        FreqHandler {
@@ -75,12 +82,26 @@ impl FreqHandler {
        }
    }
    
+    
+    /// Accessor to term frequency
+    ///
+    /// idx is the offset of the current doc in the block.
+    /// It takes a value between 0 and 128.
+    pub fn freq(&self, idx: usize)-> u32 {
+        self.freq_decoder.output(idx)
+    }
+    
+    /// Accessor to the positions
+    ///
+    /// idx is the offset of the current doc in the block.
+    /// It takes a value between 0 and 128.
    pub fn positions(&self, idx: usize) -> &[u32] {
        let start = self.positions_offsets[idx];
        let stop = self.positions_offsets[idx + 1];
        &self.positions[start..stop]        
    }
    
+    /// Decompresses a complete frequency block
    pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        match self.option {
            SegmentPostingsOption::NoFreq => {
@@ -96,7 +117,8 @@ impl FreqHandler {
            }
        }
    }
-
+        
+    /// Decompresses an incomplete frequency block
    pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
        match self.option {
            SegmentPostingsOption::NoFreq => {}
@@ -110,8 +132,4 @@ impl FreqHandler {
        }
    }
    
-    #[inline]
-    pub fn freq(&self, idx: usize)-> u32 {
-        self.freq_decoder.output(idx)
-    }
 }
--- a/src/postings/intersection.rs
+++ b/src/postings/intersection.rs
@@ -2,7 +2,9 @@ use postings::DocSet;
 use std::cmp::Ordering;
 use DocId;

+// TODO Find a way to specialize IntersectionDocSet

+/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
 pub struct IntersectionDocSet<'a> {
    left: Box<DocSet + 'a>,
    right: Box<DocSet + 'a>,
@@ -10,7 +12,8 @@ pub struct IntersectionDocSet<'a> {
 }

 impl<'a> IntersectionDocSet<'a> {
-    
+        
+    /// Intersect two `DocSet`s
    fn from_pair(left: Box<DocSet + 'a>, right: Box<DocSet + 'a>) -> IntersectionDocSet<'a> {
        IntersectionDocSet {
            left: left,
@@ -19,6 +22,7 @@ impl<'a> IntersectionDocSet<'a> {
        }         
    }
    
+    /// Intersect a list of `DocSet`s
    pub fn new(mut postings: Vec<Box<DocSet + 'a>>) -> IntersectionDocSet<'a> {
        let left = postings.pop().unwrap();
        let right = 
@@ -74,6 +78,7 @@ impl<'a> DocSet for IntersectionDocSet<'a> {
    }
 }

+/// Intersects a `Vec` of `DocSet`s
 pub fn intersection<'a, TDocSet: DocSet + 'a>(postings: Vec<TDocSet>) -> IntersectionDocSet<'a> {
    let boxed_postings: Vec<Box<DocSet + 'a>> = postings
        .into_iter()
--- a/src/postings/offset_postings.rs
+++ b/src/postings/offset_postings.rs
@@ -15,6 +15,8 @@ pub struct OffsetPostings<'a> {
 }

 impl<'a> OffsetPostings<'a> {
+    
+    /// Constructor
    pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings {
        OffsetPostings {
            underlying: underlying,
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -9,14 +9,29 @@ use schema::Field;
 use analyzer::StreamingIterator;
 use datastruct::stacker::{HashMap, Heap};

+/// The `PostingsWriter` is in charge of receiving documents
+/// and building a `Segment` in anonymous memory.
+///
+/// `PostingsWriter` writes in a `Heap`.
 pub trait PostingsWriter {
    
-    fn close(&mut self, heap: &Heap);
-
+    /// Record that a document contains a term at a given position.
+    ///
+    /// * doc  - the document id
+    /// * pos  - the term position (expressed in tokens)
+    /// * term - the term
+    /// * heap - heap used to store the postings information as well as the terms
+    /// in the hashmap.
    fn suscribe(&mut self,  doc: DocId, pos: u32, term: &Term, heap: &Heap);
-
+    
+    /// Serializes the postings on disk.
+    /// The actual serialization format is handled by the `PostingsSerializer`.
    fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
    
+    /// Closes all of the currently open `Recorder`s.
+    fn close(&mut self, heap: &Heap);
+        
+    /// Tokenizes a text and `suscribe`s all of its tokens.
    fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32  {
        let mut pos = 0u32;
        let mut num_tokens: u32 = 0u32;
@@ -39,10 +54,13 @@ pub trait PostingsWriter {
    }
 }

+/// The SpecializedPostingsWriter is just here to remove dynamic
+/// dispatch to the recorder information.
 pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
    term_index: HashMap<'a, Rec>,
 }

+/// Given a `Heap` size, computes a relevant size for the `HashMap`.
 fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
    let num_buckets_usable = heap_capacity / 100;
    let hash_table_size = num_buckets_usable * 2;
@@ -57,7 +75,8 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
 }

 impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
-
+    
+    /// constructor
    pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
        let capacity = heap.capacity();
        let hashmap_size = hashmap_size_in_bits(capacity);
@@ -66,9 +85,9 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
        }
    }
    
+    /// Builds a `SpecializedPostingsWriter` storing its data in a heap.
    pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
-        let res = SpecializedPostingsWriter::<Rec>::new(heap);
-        Box::new(res)
+        Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
    } 

 }
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -6,7 +6,11 @@ use std::num::Wrapping;



-// No Term Frequency, no postings.
+/// `SegmentPostings` represents the inverted list or postings associated with
+/// a term in a `Segment`.
+/// 
+/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
+/// Positions on the other hand, are optionally entirely decoded upfront.
 pub struct SegmentPostings<'a> {
    len: usize,
    doc_offset: u32,
@@ -16,22 +20,9 @@ pub struct SegmentPostings<'a> {
    cur: Wrapping<usize>,
 }

-const EMPTY_ARRAY: [u8; 0] = [];
-
 impl<'a> SegmentPostings<'a> {
-
-    pub fn empty() -> SegmentPostings<'a> {
-        SegmentPostings {
-            len: 0,
-            doc_offset: 0,
-            block_decoder: SIMDBlockDecoder::new(),
-            freq_handler: FreqHandler::new_without_freq(),
-            remaining_data: &EMPTY_ARRAY,
-            cur: Wrapping(usize::max_value()),
-        }
-    }
    
-    pub fn load_next_block(&mut self,) {
+    fn load_next_block(&mut self,) {
        let num_remaining_docs = self.len - self.cur.0;
        if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
            self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
@@ -44,6 +35,12 @@ impl<'a> SegmentPostings<'a> {
        }
    }

+    /// Reads a Segment postings from an &[u8]
+    ///
+    /// * `len` - number of documents in the posting list.
+    /// * `data` - data array. The complete data is not necessarily used.
+    /// * `freq_handler` - the freq handler is in charge of decoding 
+    ///   frequencies and/or positions
    pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
        SegmentPostings {
            len: len as usize,
@@ -54,7 +51,9 @@ impl<'a> SegmentPostings<'a> {
            cur: Wrapping(usize::max_value()),
        }
    }
-
+    
+    /// Index within a block is used as an address when
+    /// interacting with the `FreqHandler` 
    fn index_within_block(&self,) -> usize {
        self.cur.0 % NUM_DOCS_PER_BLOCK
    }
@@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
        }
        true
    }
-
+    
    #[inline]
    fn doc(&self,) -> DocId {
        self.block_decoder.output(self.index_within_block())