NOBUG Added comments.

This commit is contained in:
Paul Masurel
2016-09-21 10:27:43 +09:00
parent b337adbd78
commit 025ab3c7ab
8 changed files with 132 additions and 45 deletions

View File

@@ -157,11 +157,12 @@ impl SegmentReader {
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// # Panics
/// This method panics if the field was not indexed with the indexing options that cover
/// the requested options.
/// If the field was not indexed with the indexing options that cover
/// the requested options, the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
/// that does not index position will panic.
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
@@ -204,8 +205,7 @@ impl SegmentReader {
};
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler))
}
/// Returns the posting list associated with a term.
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
let field_entry = self.schema.get_field_entry(term.field());

View File

@@ -5,6 +5,27 @@ use std::io;
use std::io::{SeekFrom, Write};
use super::compute_num_bits;
/// `FastFieldSerializer` is in charge of serializing
/// a fastfield on disk.
///
/// Fast fields are encoded using bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u32_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u32_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
write: WritePtr,
written_size: usize,
@@ -12,13 +33,15 @@ pub struct FastFieldSerializer {
num_bits: u8,
min_value: u32,
field_open: bool,
mini_buffer_written: usize,
mini_buffer: u32,
}
impl FastFieldSerializer {
/// Constructor
pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let written_size: usize = try!(0u32.serialize(&mut write));
@@ -34,7 +57,8 @@ impl FastFieldSerializer {
mini_buffer: 0u32,
})
}
/// Start serializing a new u32 fast field
pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
@@ -56,6 +80,8 @@ impl FastFieldSerializer {
Ok(())
}
/// Pushes a new value to the currently open u32 fast field.
pub fn add_val(&mut self, val: u32) -> io::Result<()> {
let write: &mut Write = &mut self.write;
let val_to_write: u32 = val - self.min_value;
@@ -77,7 +103,8 @@ impl FastFieldSerializer {
}
Ok(())
}
/// Close the u32 fast field.
pub fn close_field(&mut self,) -> io::Result<()> {
if !self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
@@ -94,7 +121,11 @@ impl FastFieldSerializer {
self.mini_buffer = 0;
Ok(())
}
/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(mut self,) -> io::Result<usize> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));

View File

@@ -3,23 +3,33 @@ use std::borrow::Borrow;
use std::borrow::BorrowMut;
use std::cmp::Ordering;
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
#[derive(PartialEq, Eq, Debug)]
pub enum SkipResult {
/// The target was in the docset.
Reached,
/// The target was not in the docset; skipping stopped as a greater element was found.
OverStep,
/// The docset was entirely consumed without finding the target, nor any
/// element greater than the target.
End,
}
/// Represents an iterable set of sorted doc ids.
pub trait DocSet {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
/// Goes to the next element.
/// `.advance(...)` needs to be called a first time to point to the correct
/// element.
fn advance(&mut self,) -> bool;
// after skipping position
// the iterator in such a way that doc() will return a
// value greater or equal to target.
/// After skipping, positions the iterator in such a way that `.doc()`
/// will return a value greater than or equal to `target`.
///
/// SkipResult expresses whether the `target value` was reached, overstepped,
/// or if the `DocSet` was entirely consumed without finding any value
/// greater or equal to the `target`.
fn skip_next(&mut self, target: DocId) -> SkipResult {
loop {
match self.doc().cmp(&target) {
@@ -37,6 +47,9 @@ pub trait DocSet {
/// Returns the current document
fn doc(&self,) -> DocId;
/// Advances the cursor to the next document
/// `None` is returned if the `DocSet`
/// has already been entirely consumed.
fn next(&mut self,) -> Option<DocId> {
if self.advance() {
Some(self.doc())

View File

@@ -6,6 +6,9 @@ use compression::CompositeDecoder;
use postings::SegmentPostingsOption;
use compression::NUM_DOCS_PER_BLOCK;
/// The FreqHandler object is in charge of decompressing
/// frequencies and/or positions.
pub struct FreqHandler {
freq_decoder: SIMDBlockDecoder,
positions: Vec<u32>,
@@ -28,6 +31,7 @@ fn read_positions(data: &[u8]) -> Vec<u32> {
impl FreqHandler {
/// Returns a `FreqHandler` that just decodes `DocId`s.
pub fn new_without_freq() -> FreqHandler {
FreqHandler {
freq_decoder: SIMDBlockDecoder::with_val(1u32),
@@ -37,6 +41,7 @@ impl FreqHandler {
}
}
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
pub fn new_with_freq() -> FreqHandler {
FreqHandler {
freq_decoder: SIMDBlockDecoder::new(),
@@ -46,6 +51,8 @@ impl FreqHandler {
}
}
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
let positions = read_positions(position_data);
FreqHandler {
@@ -75,12 +82,26 @@ impl FreqHandler {
}
}
/// Returns the term frequency decoded for one document of the current block.
///
/// `idx` is the offset of the document within the block.
/// NOTE(review): the previous doc stated that `idx` lies between 0 and 128;
/// the upper bound is presumably exclusive — confirm against the block size.
pub fn freq(&self, idx: usize) -> u32 {
    let decoder = &self.freq_decoder;
    decoder.output(idx)
}
/// Returns the term positions recorded for one document of the current block.
///
/// `idx` is the offset of the document within the block. The positions of that
/// document are the slice of `self.positions` delimited by the offsets stored at
/// `idx` and `idx + 1`, so both entries must exist in `positions_offsets`.
/// NOTE(review): the previous doc stated that `idx` lies between 0 and 128;
/// the upper bound is presumably exclusive — confirm against the block size.
pub fn positions(&self, idx: usize) -> &[u32] {
    let offsets = &self.positions_offsets;
    let (first, past_last) = (offsets[idx], offsets[idx + 1]);
    &self.positions[first..past_last]
}
/// Decompresses a complete frequency block
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
match self.option {
SegmentPostingsOption::NoFreq => {
@@ -96,7 +117,8 @@ impl FreqHandler {
}
}
}
/// Decompresses an incomplete frequency block
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
match self.option {
SegmentPostingsOption::NoFreq => {}
@@ -110,8 +132,4 @@ impl FreqHandler {
}
}
/// Accessor to the term frequency of the doc at offset `idx` in the current block.
#[inline]
pub fn freq(&self, idx: usize)-> u32 {
self.freq_decoder.output(idx)
}
}

View File

@@ -2,7 +2,9 @@ use postings::DocSet;
use std::cmp::Ordering;
use DocId;
// TODO Find a way to specialize IntersectionDocSet
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
pub struct IntersectionDocSet<'a> {
left: Box<DocSet + 'a>,
right: Box<DocSet + 'a>,
@@ -10,7 +12,8 @@ pub struct IntersectionDocSet<'a> {
}
impl<'a> IntersectionDocSet<'a> {
/// Intersect two `DocSet`s
fn from_pair(left: Box<DocSet + 'a>, right: Box<DocSet + 'a>) -> IntersectionDocSet<'a> {
IntersectionDocSet {
left: left,
@@ -19,6 +22,7 @@ impl<'a> IntersectionDocSet<'a> {
}
}
/// Intersect a list of `DocSet`s
pub fn new(mut postings: Vec<Box<DocSet + 'a>>) -> IntersectionDocSet<'a> {
let left = postings.pop().unwrap();
let right =
@@ -74,6 +78,7 @@ impl<'a> DocSet for IntersectionDocSet<'a> {
}
}
/// Intersects a `Vec` of `DocSets`
pub fn intersection<'a, TDocSet: DocSet + 'a>(postings: Vec<TDocSet>) -> IntersectionDocSet<'a> {
let boxed_postings: Vec<Box<DocSet + 'a>> = postings
.into_iter()

View File

@@ -15,6 +15,8 @@ pub struct OffsetPostings<'a> {
}
impl<'a> OffsetPostings<'a> {
/// Constructor
pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings {
OffsetPostings {
underlying: underlying,

View File

@@ -9,14 +9,29 @@ use schema::Field;
use analyzer::StreamingIterator;
use datastruct::stacker::{HashMap, Heap};
/// The `PostingsWriter` is in charge of receiving documents
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `Heap`.
pub trait PostingsWriter {
fn close(&mut self, heap: &Heap);
/// Record that a document contains a term at a given position.
///
/// * doc - the document id
/// * pos - the term position (expressed in tokens)
/// * term - the term
/// * heap - heap used to store the postings information as well as the terms
/// in the hashmap.
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
/// Closes all of the currently open `Recorder`s.
fn close(&mut self, heap: &Heap);
/// Tokenizes a text and subscribes all of its tokens.
fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
@@ -39,10 +54,13 @@ pub trait PostingsWriter {
}
}
/// The `SpecializedPostingsWriter` is just here to remove dynamic
/// dispatch to the recorder information.
///
/// It is generic over the `Recorder` implementation `Rec`.
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
// Map holding one `Rec` value per entry.
// NOTE(review): presumably keyed by term and backed by the `Heap` — confirm.
term_index: HashMap<'a, Rec>,
}
/// Given a `Heap` size, computes a relevant size for the `HashMap`.
fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
let num_buckets_usable = heap_capacity / 100;
let hash_table_size = num_buckets_usable * 2;
@@ -57,7 +75,8 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
}
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
/// constructor
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
let capacity = heap.capacity();
let hashmap_size = hashmap_size_in_bits(capacity);
@@ -66,9 +85,9 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
}
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
let res = SpecializedPostingsWriter::<Rec>::new(heap);
Box::new(res)
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
}
}

View File

@@ -6,7 +6,11 @@ use std::num::Wrapping;
// No Term Frequency, no postings.
/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings<'a> {
len: usize,
doc_offset: u32,
@@ -16,22 +20,9 @@ pub struct SegmentPostings<'a> {
cur: Wrapping<usize>,
}
const EMPTY_ARRAY: [u8; 0] = [];
impl<'a> SegmentPostings<'a> {
/// Returns a `SegmentPostings` with a `len` of 0 over an empty data slice.
///
/// It is built with a `FreqHandler` that does not decode frequencies.
pub fn empty() -> SegmentPostings<'a> {
SegmentPostings {
len: 0,
doc_offset: 0,
block_decoder: SIMDBlockDecoder::new(),
freq_handler: FreqHandler::new_without_freq(),
remaining_data: &EMPTY_ARRAY,
// NOTE(review): the cursor starts at `usize::max_value()`, presumably so that
// the first wrapping increment lands on 0 — confirm against `advance`.
cur: Wrapping(usize::max_value()),
}
}
pub fn load_next_block(&mut self,) {
fn load_next_block(&mut self,) {
let num_remaining_docs = self.len - self.cur.0;
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
@@ -44,6 +35,12 @@ impl<'a> SegmentPostings<'a> {
}
}
/// Reads a Segment postings from an &[u8]
///
/// * `len` - number of documents in the posting list.
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding
/// frequencies and/or positions
pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
SegmentPostings {
len: len as usize,
@@ -54,7 +51,9 @@ impl<'a> SegmentPostings<'a> {
cur: Wrapping(usize::max_value()),
}
}
/// Index within a block is used as an address when
/// interacting with the `FreqHandler`
fn index_within_block(&self,) -> usize {
self.cur.0 % NUM_DOCS_PER_BLOCK
}
@@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
}
true
}
#[inline]
fn doc(&self,) -> DocId {
self.block_decoder.output(self.index_within_block())