From 025ab3c7ab04cc4d3babc633e4d5b54ff273f6d0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 21 Sep 2016 10:27:43 +0900 Subject: [PATCH] NOBUG Added comments. --- src/core/segment_reader.rs | 12 +++++----- src/fastfield/serializer.rs | 41 ++++++++++++++++++++++++++++---- src/postings/docset.rs | 23 ++++++++++++++---- src/postings/freq_handler.rs | 28 ++++++++++++++++++---- src/postings/intersection.rs | 7 +++++- src/postings/offset_postings.rs | 2 ++ src/postings/postings_writer.rs | 31 +++++++++++++++++++----- src/postings/segment_postings.rs | 33 +++++++++++++------------ 8 files changed, 132 insertions(+), 45 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index b43058f16..5e874c8e8 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -157,11 +157,12 @@ impl SegmentReader { /// Returns the segment postings associated with the term, and with the given option, /// or `None` if the term has never been encounterred and indexed. /// - /// # Panics - /// This method panics if the field was not indexed with the indexing options that cover - /// the requested options. + /// If the field was not indexed with the indexing options that cover + /// the requested options, the method does not fail + /// and returns a `SegmentPostings` with as much information as possible. + /// /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions` - /// that does not index position will panic. + /// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies. pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option { let field = term.field(); let field_entry = self.schema.get_field_entry(field); @@ -204,8 +205,7 @@ impl SegmentReader { }; Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler)) } - - + /// Returns the posting list associated with a term. 
pub fn read_postings_all_info(&self, term: &Term) -> Option { let field_entry = self.schema.get_field_entry(term.field()); diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 756119596..11b740715 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -5,6 +5,27 @@ use std::io; use std::io::{SeekFrom, Write}; use super::compute_num_bits; + +/// `FastFieldSerializer` is in charge of serializing +/// a fastfield on disk. +/// +/// FastField are encoded using bit-packing. +/// +/// `FastFieldWriter`s are in charge of pushing the data to +/// the serializer. +/// The serializer expects to receive the following calls. +/// +/// * `new_u32_fast_field(...)` +/// * `add_val(...)` +/// * `add_val(...)` +/// * `add_val(...)` +/// * ... +/// * `close_field()` +/// * `new_u32_fast_field(...)` +/// * `add_val(...)` +/// * ... +/// * `close_field()` +/// * `close()` pub struct FastFieldSerializer { write: WritePtr, written_size: usize, @@ -12,13 +33,15 @@ pub struct FastFieldSerializer { num_bits: u8, min_value: u32, field_open: bool, - - + mini_buffer_written: usize, mini_buffer: u32, } + + impl FastFieldSerializer { + /// Constructor pub fn new(mut write: WritePtr) -> io::Result { // just making room for the pointer to header. let written_size: usize = try!(0u32.serialize(&mut write)); @@ -34,7 +57,8 @@ impl FastFieldSerializer { mini_buffer: 0u32, }) } - + + /// Start serializing a new u32 fast field pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> { if self.field_open { return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed")); @@ -56,6 +80,8 @@ impl FastFieldSerializer { Ok(()) } + + /// Pushes a new value to the currently open u32 fast field. 
pub fn add_val(&mut self, val: u32) -> io::Result<()> { let write: &mut Write = &mut self.write; let val_to_write: u32 = val - self.min_value; @@ -77,7 +103,8 @@ impl FastFieldSerializer { } Ok(()) } - + + /// Close the u32 fast field. pub fn close_field(&mut self,) -> io::Result<()> { if !self.field_open { return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed")); @@ -94,7 +121,11 @@ impl FastFieldSerializer { self.mini_buffer = 0; Ok(()) } - + + + /// Closes the serializer + /// + /// After this call the data must be persistently saved on disk. pub fn close(mut self,) -> io::Result { if self.field_open { return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed")); diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 2f35384f7..e7fb59315 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -3,23 +3,33 @@ use std::borrow::Borrow; use std::borrow::BorrowMut; use std::cmp::Ordering; + +/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`. #[derive(PartialEq, Eq, Debug)] pub enum SkipResult { + /// target was in the docset Reached, + /// target was not in the docset, skipping stopped as a greater element was found OverStep, + /// the docset was entirely consumed without finding the target, nor any + /// element greater than the target. End, } /// Represents an iterable set of sorted doc ids. pub trait DocSet { - // goes to the next element. - // next needs to be called a first time to point to the correct element. + /// Goes to the next element. + /// `.advance(...)` needs to be called a first time to point to the correct + /// element. fn advance(&mut self,) -> bool; - // after skipping position - // the iterator in such a way that doc() will return a - // value greater or equal to target. + /// After skipping, positions the iterator in such a way that `.doc()` + /// will return a value greater or equal to target. 
+ /// + /// `SkipResult` expresses whether the `target` value was reached, overstepped, + /// or if the `DocSet` was entirely consumed without finding any value + /// greater or equal to the `target`. fn skip_next(&mut self, target: DocId) -> SkipResult { loop { match self.doc().cmp(&target) { @@ -37,6 +47,9 @@ pub trait DocSet { /// Returns the current document fn doc(&self,) -> DocId; + /// Advances the cursor to the next document. + /// `None` is returned if the `DocSet` + /// has already been entirely consumed. fn next(&mut self,) -> Option { if self.advance() { Some(self.doc()) diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index b02ec8d1f..4f44fb9e0 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -6,6 +6,9 @@ use compression::CompositeDecoder; use postings::SegmentPostingsOption; use compression::NUM_DOCS_PER_BLOCK; + +/// The FreqHandler object is in charge of decompressing +/// frequencies and/or positions. pub struct FreqHandler { freq_decoder: SIMDBlockDecoder, positions: Vec, @@ -28,6 +31,7 @@ fn read_positions(data: &[u8]) -> Vec { impl FreqHandler { + /// Returns a `FreqHandler` that just decodes `DocId`s. pub fn new_without_freq() -> FreqHandler { FreqHandler { freq_decoder: SIMDBlockDecoder::with_val(1u32), @@ -37,6 +41,7 @@ impl FreqHandler { } } + /// Returns a `FreqHandler` that decodes `DocId`s and term frequencies. pub fn new_with_freq() -> FreqHandler { FreqHandler { freq_decoder: SIMDBlockDecoder::new(), @@ -46,6 +51,8 @@ impl FreqHandler { } } + + /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions. pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler { let positions = read_positions(position_data); FreqHandler { @@ -75,12 +82,26 @@ impl FreqHandler { } } + + /// Accessor to term frequency + /// + /// idx is the offset of the current doc in the block. + /// It takes a value between 0 and 128. 
+ pub fn freq(&self, idx: usize)-> u32 { + self.freq_decoder.output(idx) + } + + /// Accessor to the positions + /// + /// idx is the offset of the current doc in the block. + /// It takes a value between 0 and 128. pub fn positions(&self, idx: usize) -> &[u32] { let start = self.positions_offsets[idx]; let stop = self.positions_offsets[idx + 1]; &self.positions[start..stop] } + /// Decompresses a complete frequency block pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { match self.option { SegmentPostingsOption::NoFreq => { @@ -96,7 +117,8 @@ impl FreqHandler { } } } - + + /// Decompresses an incomplete frequency block pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) { match self.option { SegmentPostingsOption::NoFreq => {} @@ -110,8 +132,4 @@ impl FreqHandler { } } - #[inline] - pub fn freq(&self, idx: usize)-> u32 { - self.freq_decoder.output(idx) - } } \ No newline at end of file diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index 4069a602b..72dd804e7 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -2,7 +2,9 @@ use postings::DocSet; use std::cmp::Ordering; use DocId; +// TODO Find a way to specialize IntersectionDocSet +/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s. 
pub struct IntersectionDocSet<'a> { left: Box, right: Box, @@ -10,7 +12,8 @@ pub struct IntersectionDocSet<'a> { } impl<'a> IntersectionDocSet<'a> { - + + /// Intersect two `DocSet`s fn from_pair(left: Box, right: Box) -> IntersectionDocSet<'a> { IntersectionDocSet { left: left, @@ -19,6 +22,7 @@ impl<'a> IntersectionDocSet<'a> { } } + /// Intersect a list of `DocSet`s pub fn new(mut postings: Vec>) -> IntersectionDocSet<'a> { let left = postings.pop().unwrap(); let right = @@ -74,6 +78,7 @@ impl<'a> DocSet for IntersectionDocSet<'a> { } } +/// Intersects a `Vec` of `DocSet`s pub fn intersection<'a, TDocSet: DocSet + 'a>(postings: Vec) -> IntersectionDocSet<'a> { let boxed_postings: Vec> = postings .into_iter() diff --git a/src/postings/offset_postings.rs b/src/postings/offset_postings.rs index c5c0b5888..fe7ea453d 100644 --- a/src/postings/offset_postings.rs +++ b/src/postings/offset_postings.rs @@ -15,6 +15,8 @@ pub struct OffsetPostings<'a> { } impl<'a> OffsetPostings<'a> { + + /// Constructor pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings { OffsetPostings { underlying: underlying, diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 2ccaaa5fc..7afe42b14 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -9,14 +9,29 @@ use schema::Field; use analyzer::StreamingIterator; use datastruct::stacker::{HashMap, Heap}; +/// The `PostingsWriter` is in charge of receiving documents +/// and building a `Segment` in anonymous memory. +/// +/// `PostingsWriter` writes in a `Heap`. pub trait PostingsWriter { - fn close(&mut self, heap: &Heap); - + /// Record that a document contains a term at a given position. + /// + /// * doc - the document id + /// * pos - the term position (expressed in tokens) + /// * term - the term + /// * heap - heap used to store the postings information as well as the terms + /// in the hashmap. 
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap); - + + /// Serializes the postings on disk. + /// The actual serialization format is handled by the `PostingsSerializer`. fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>; + /// Closes all of the currently open `Recorder`s. + fn close(&mut self, heap: &Heap); + + /// Tokenizes a text and suscribes all of its tokens. fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32 { let mut pos = 0u32; let mut num_tokens: u32 = 0u32; @@ -39,10 +54,13 @@ pub trait PostingsWriter { } } +/// The SpecializedPostingsWriter is just here to remove dynamic +/// dispatch to the recorder information. pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> { term_index: HashMap<'a, Rec>, } +/// Given a `Heap` size, computes a relevant size for the `HashMap`. fn hashmap_size_in_bits(heap_capacity: u32) -> usize { let num_buckets_usable = heap_capacity / 100; let hash_table_size = num_buckets_usable * 2; @@ -57,7 +75,8 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize { } impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> { - + + /// Constructor pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> { let capacity = heap.capacity(); let hashmap_size = hashmap_size_in_bits(capacity); @@ -66,9 +85,9 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> { } } + /// Builds a `SpecializedPostingsWriter` storing its data in a heap. pub fn new_boxed(heap: &'a Heap) -> Box { - let res = SpecializedPostingsWriter::::new(heap); - Box::new(res) + Box::new(SpecializedPostingsWriter::::new(heap)) } } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 99b2aa65c..8de872f90 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -6,7 +6,11 @@ use std::num::Wrapping; -// No Term Frequency, no postings. 
+/// `SegmentPostings` represents the inverted list or postings associated with +/// a term in a `Segment`. +/// +/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. +/// Positions, on the other hand, are optionally entirely decoded upfront. pub struct SegmentPostings<'a> { len: usize, doc_offset: u32, @@ -16,22 +20,9 @@ pub struct SegmentPostings<'a> { cur: Wrapping, } -const EMPTY_ARRAY: [u8; 0] = []; - impl<'a> SegmentPostings<'a> { - - pub fn empty() -> SegmentPostings<'a> { - SegmentPostings { - len: 0, - doc_offset: 0, - block_decoder: SIMDBlockDecoder::new(), - freq_handler: FreqHandler::new_without_freq(), - remaining_data: &EMPTY_ARRAY, - cur: Wrapping(usize::max_value()), - } - } - pub fn load_next_block(&mut self,) { + fn load_next_block(&mut self,) { let num_remaining_docs = self.len - self.cur.0; if num_remaining_docs >= NUM_DOCS_PER_BLOCK { self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset); @@ -44,6 +35,12 @@ impl<'a> SegmentPostings<'a> { } } + /// Reads a `SegmentPostings` from a `&[u8]` + /// + /// * `len` - number of documents in the posting list. + /// * `data` - data array. The complete data is not necessarily used. + /// * `freq_handler` - the freq handler is in charge of decoding + /// frequencies and/or positions pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> { SegmentPostings { len: len as usize, @@ -54,7 +51,9 @@ impl<'a> SegmentPostings<'a> { cur: Wrapping(usize::max_value()), } } - + + /// Index within a block is used as an address when + /// interacting with the `FreqHandler` fn index_within_block(&self,) -> usize { self.cur.0 % NUM_DOCS_PER_BLOCK } @@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> { } true } - + #[inline] fn doc(&self,) -> DocId { self.block_decoder.output(self.index_within_block())