mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 07:00:41 +00:00
NOBUG Added comments.
This commit is contained in:
@@ -157,11 +157,12 @@ impl SegmentReader {
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
/// # Panics
|
||||
/// This method panics if the field was not indexed with the indexing options that cover
|
||||
/// the requested options.
|
||||
/// If the field was not indexed with the indexing options that cover
|
||||
/// the requested options, the method does not fail
|
||||
/// and returns a `SegmentPostings` with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
|
||||
/// that does not index position will panic.
|
||||
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
@@ -204,8 +205,7 @@ impl SegmentReader {
|
||||
};
|
||||
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler))
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Returns the posting list associated with a term.
|
||||
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
|
||||
let field_entry = self.schema.get_field_entry(term.field());
|
||||
|
||||
@@ -5,6 +5,27 @@ use std::io;
|
||||
use std::io::{SeekFrom, Write};
|
||||
use super::compute_num_bits;
|
||||
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
/// a fastfield on disk.
|
||||
///
|
||||
/// FastField are encoded using bit-packing.
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `new_u32_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `new_u32_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `close()`
|
||||
pub struct FastFieldSerializer {
|
||||
write: WritePtr,
|
||||
written_size: usize,
|
||||
@@ -12,13 +33,15 @@ pub struct FastFieldSerializer {
|
||||
num_bits: u8,
|
||||
min_value: u32,
|
||||
field_open: bool,
|
||||
|
||||
|
||||
|
||||
mini_buffer_written: usize,
|
||||
mini_buffer: u32,
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl FastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let written_size: usize = try!(0u32.serialize(&mut write));
|
||||
@@ -34,7 +57,8 @@ impl FastFieldSerializer {
|
||||
mini_buffer: 0u32,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Start serializing a new u32 fast field
|
||||
pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
|
||||
@@ -56,6 +80,8 @@ impl FastFieldSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Pushes a new value to the currently open u32 fast field.
|
||||
pub fn add_val(&mut self, val: u32) -> io::Result<()> {
|
||||
let write: &mut Write = &mut self.write;
|
||||
let val_to_write: u32 = val - self.min_value;
|
||||
@@ -77,7 +103,8 @@ impl FastFieldSerializer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Close the u32 fast field.
|
||||
pub fn close_field(&mut self,) -> io::Result<()> {
|
||||
if !self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
|
||||
@@ -94,7 +121,11 @@ impl FastFieldSerializer {
|
||||
self.mini_buffer = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently saved on disk.
|
||||
pub fn close(mut self,) -> io::Result<usize> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
|
||||
|
||||
@@ -3,23 +3,33 @@ use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
|
||||
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum SkipResult {
|
||||
/// target was in the docset
|
||||
Reached,
|
||||
/// target was not in the docset, skipping stopped as a greater element was found
|
||||
OverStep,
|
||||
/// the docset was entirely consumed without finding the target, nor any
|
||||
/// element greater than the target.
|
||||
End,
|
||||
}
|
||||
|
||||
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
pub trait DocSet {
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
/// Goes to the next element.
|
||||
/// `.advance(...)` needs to be called a first time to point to the correct
|
||||
/// element.
|
||||
fn advance(&mut self,) -> bool;
|
||||
|
||||
// after skipping position
|
||||
// the iterator in such a way that doc() will return a
|
||||
// value greater or equal to target.
|
||||
/// After skipping, the iterator is positioned in such a way that `.doc()`
|
||||
/// will return a value greater or equal to target.
|
||||
///
|
||||
/// SkipResult expresses whether the `target value` was reached, overstepped,
|
||||
/// or if the `DocSet` was entirely consumed without finding any value
|
||||
/// greater or equal to the `target`.
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
loop {
|
||||
match self.doc().cmp(&target) {
|
||||
@@ -37,6 +47,9 @@ pub trait DocSet {
|
||||
/// Returns the current document
|
||||
fn doc(&self,) -> DocId;
|
||||
|
||||
/// Advances the cursor to the next document
|
||||
/// `None` is returned if the `DocSet`
|
||||
/// has already been entirely consumed.
|
||||
fn next(&mut self,) -> Option<DocId> {
|
||||
if self.advance() {
|
||||
Some(self.doc())
|
||||
|
||||
@@ -6,6 +6,9 @@ use compression::CompositeDecoder;
|
||||
use postings::SegmentPostingsOption;
|
||||
use compression::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
|
||||
/// The FreqHandler object is in charge of decompressing
|
||||
/// frequencies and/or positions.
|
||||
pub struct FreqHandler {
|
||||
freq_decoder: SIMDBlockDecoder,
|
||||
positions: Vec<u32>,
|
||||
@@ -28,6 +31,7 @@ fn read_positions(data: &[u8]) -> Vec<u32> {
|
||||
|
||||
impl FreqHandler {
|
||||
|
||||
/// Returns a `FreqHandler` that just decodes `DocId`s.
|
||||
pub fn new_without_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
freq_decoder: SIMDBlockDecoder::with_val(1u32),
|
||||
@@ -37,6 +41,7 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
|
||||
pub fn new_with_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
freq_decoder: SIMDBlockDecoder::new(),
|
||||
@@ -46,6 +51,8 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
|
||||
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
|
||||
let positions = read_positions(position_data);
|
||||
FreqHandler {
|
||||
@@ -75,12 +82,26 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to term frequency
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
/// It takes a value between 0 and 128.
|
||||
pub fn freq(&self, idx: usize)-> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Accessor to the positions
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
/// It takes a value between 0 and 128.
|
||||
pub fn positions(&self, idx: usize) -> &[u32] {
|
||||
let start = self.positions_offsets[idx];
|
||||
let stop = self.positions_offsets[idx + 1];
|
||||
&self.positions[start..stop]
|
||||
}
|
||||
|
||||
/// Decompresses a complete frequency block
|
||||
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
|
||||
match self.option {
|
||||
SegmentPostingsOption::NoFreq => {
|
||||
@@ -96,7 +117,8 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Decompresses an incomplete frequency block
|
||||
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
|
||||
match self.option {
|
||||
SegmentPostingsOption::NoFreq => {}
|
||||
@@ -110,8 +132,4 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize)-> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
}
|
||||
@@ -2,7 +2,9 @@ use postings::DocSet;
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
|
||||
// TODO Find a way to specialize IntersectionDocSet
|
||||
|
||||
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
|
||||
pub struct IntersectionDocSet<'a> {
|
||||
left: Box<DocSet + 'a>,
|
||||
right: Box<DocSet + 'a>,
|
||||
@@ -10,7 +12,8 @@ pub struct IntersectionDocSet<'a> {
|
||||
}
|
||||
|
||||
impl<'a> IntersectionDocSet<'a> {
|
||||
|
||||
|
||||
/// Intersect two `DocSet`s
|
||||
fn from_pair(left: Box<DocSet + 'a>, right: Box<DocSet + 'a>) -> IntersectionDocSet<'a> {
|
||||
IntersectionDocSet {
|
||||
left: left,
|
||||
@@ -19,6 +22,7 @@ impl<'a> IntersectionDocSet<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Intersect a list of `DocSet`s
|
||||
pub fn new(mut postings: Vec<Box<DocSet + 'a>>) -> IntersectionDocSet<'a> {
|
||||
let left = postings.pop().unwrap();
|
||||
let right =
|
||||
@@ -74,6 +78,7 @@ impl<'a> DocSet for IntersectionDocSet<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Intersects a `Vec` of `DocSets`
|
||||
pub fn intersection<'a, TDocSet: DocSet + 'a>(postings: Vec<TDocSet>) -> IntersectionDocSet<'a> {
|
||||
let boxed_postings: Vec<Box<DocSet + 'a>> = postings
|
||||
.into_iter()
|
||||
|
||||
@@ -15,6 +15,8 @@ pub struct OffsetPostings<'a> {
|
||||
}
|
||||
|
||||
impl<'a> OffsetPostings<'a> {
|
||||
|
||||
/// Constructor
|
||||
pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings {
|
||||
OffsetPostings {
|
||||
underlying: underlying,
|
||||
|
||||
@@ -9,14 +9,29 @@ use schema::Field;
|
||||
use analyzer::StreamingIterator;
|
||||
use datastruct::stacker::{HashMap, Heap};
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `Heap`.
|
||||
pub trait PostingsWriter {
|
||||
|
||||
fn close(&mut self, heap: &Heap);
|
||||
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
/// * doc - the document id
|
||||
/// * pos - the term position (expressed in tokens)
|
||||
/// * term - the term
|
||||
/// * heap - heap used to store the postings information as well as the terms
|
||||
/// in the hashmap.
|
||||
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
|
||||
/// Closes all of the currently open `Recorder`s.
|
||||
fn close(&mut self, heap: &Heap);
|
||||
|
||||
/// Tokenize a text and suscribe all of its tokens.
|
||||
fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32 {
|
||||
let mut pos = 0u32;
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
@@ -39,10 +54,13 @@ pub trait PostingsWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The SpecializedPostingsWriter is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
|
||||
term_index: HashMap<'a, Rec>,
|
||||
}
|
||||
|
||||
/// Given a `Heap` size, computes a relevant size for the `HashMap`.
|
||||
fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
|
||||
let num_buckets_usable = heap_capacity / 100;
|
||||
let hash_table_size = num_buckets_usable * 2;
|
||||
@@ -57,7 +75,8 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
|
||||
|
||||
/// constructor
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
let capacity = heap.capacity();
|
||||
let hashmap_size = hashmap_size_in_bits(capacity);
|
||||
@@ -66,9 +85,9 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
let res = SpecializedPostingsWriter::<Rec>::new(heap);
|
||||
Box::new(res)
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -6,7 +6,11 @@ use std::num::Wrapping;
|
||||
|
||||
|
||||
|
||||
// No Term Frequency, no postings.
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// a term in a `Segment`.
|
||||
///
|
||||
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings<'a> {
|
||||
len: usize,
|
||||
doc_offset: u32,
|
||||
@@ -16,22 +20,9 @@ pub struct SegmentPostings<'a> {
|
||||
cur: Wrapping<usize>,
|
||||
}
|
||||
|
||||
const EMPTY_ARRAY: [u8; 0] = [];
|
||||
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
|
||||
pub fn empty() -> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
len: 0,
|
||||
doc_offset: 0,
|
||||
block_decoder: SIMDBlockDecoder::new(),
|
||||
freq_handler: FreqHandler::new_without_freq(),
|
||||
remaining_data: &EMPTY_ARRAY,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn load_next_block(&mut self,) {
|
||||
fn load_next_block(&mut self,) {
|
||||
let num_remaining_docs = self.len - self.cur.0;
|
||||
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
|
||||
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
@@ -44,6 +35,12 @@ impl<'a> SegmentPostings<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of documents in the posting list.
|
||||
/// * `data` - data array. The complete data is not necessarily used.
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// frequencies and/or positions
|
||||
pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
len: len as usize,
|
||||
@@ -54,7 +51,9 @@ impl<'a> SegmentPostings<'a> {
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Index within a block is used as an address when
|
||||
/// interacting with the `FreqHandler`
|
||||
fn index_within_block(&self,) -> usize {
|
||||
self.cur.0 % NUM_DOCS_PER_BLOCK
|
||||
}
|
||||
@@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
fn doc(&self,) -> DocId {
|
||||
self.block_decoder.output(self.index_within_block())
|
||||
|
||||
Reference in New Issue
Block a user