issue/50 Small formatting change.

This commit is contained in:
Paul Masurel
2016-11-04 00:11:46 +09:00
parent 9d3c9999cb
commit f2df0bf0e9
19 changed files with 290 additions and 297 deletions

View File

@@ -17,23 +17,23 @@ pub enum SkipResult {
}
/// Represents an iterable set of sorted doc ids.
/// Represents an iterable set of sorted doc ids.
pub trait DocSet {
/// Goes to the next element.
/// `.advance(...)` needs to be called a first time to point to the correct
/// element.
fn advance(&mut self,) -> bool;
fn advance(&mut self) -> bool;
/// After skipping, position the iterator in such a way that `.doc()`
/// will return a value greater than or equal to target.
///
///
/// SkipResult expresses whether the `target value` was reached, overstepped,
/// or if the `DocSet` was entirely consumed without finding any value
/// greater or equal to the `target`.
///
/// WARNING: Calling skip always advances the docset.
/// More specifically, if the docset is already positioned on the target
/// skipping will advance to the next position and return SkipResult::Overstep.
/// skipping will advance to the next position and return SkipResult::Overstep.
///
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.advance();
@@ -43,32 +43,30 @@ pub trait DocSet {
if !self.advance() {
return SkipResult::End;
}
},
Ordering::Equal => { return SkipResult::Reached },
Ordering::Greater => { return SkipResult::OverStep },
}
Ordering::Equal => return SkipResult::Reached,
Ordering::Greater => return SkipResult::OverStep,
}
}
}
/// Returns the current document
fn doc(&self,) -> DocId;
fn doc(&self) -> DocId;
/// Advances the cursor to the next document
/// `None` is returned if the `DocSet`
/// has already been entirely consumed.
fn next(&mut self,) -> Option<DocId> {
/// `None` is returned if the `DocSet`
/// has already been entirely consumed.
fn next(&mut self) -> Option<DocId> {
if self.advance() {
Some(self.doc())
}
else {
} else {
None
}
}
}
}
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
fn advance(&mut self,) -> bool {
fn advance(&mut self) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.advance()
}
@@ -78,28 +76,25 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.skip_next(target)
}
fn doc(&self,) -> DocId {
fn doc(&self) -> DocId {
let unboxed: &TDocSet = self.borrow();
unboxed.doc()
}
}
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
fn advance(&mut self,) -> bool {
fn advance(&mut self) -> bool {
let unref: &mut TDocSet = *self;
unref.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
let unref: &mut TDocSet = *self;
unref.skip_next(target)
}
fn doc(&self,) -> DocId {
fn doc(&self) -> DocId {
let unref: &TDocSet = *self;
unref.doc()
}
}

View File

@@ -17,7 +17,7 @@ pub struct FreqHandler {
fn read_positions(data: &[u8]) -> Vec<u32> {
let mut composite_reader = CompositeDecoder::new();
let mut composite_reader = CompositeDecoder::new();
let mut readable: &[u8] = data;
let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize;
composite_reader.uncompress_unsorted(readable, uncompressed_len);
@@ -27,17 +27,16 @@ fn read_positions(data: &[u8]) -> Vec<u32> {
impl FreqHandler {
/// Returns a `FreqHandler` that just decodes `DocId`s.
pub fn new_without_freq() -> FreqHandler {
FreqHandler {
freq_decoder: SIMDBlockDecoder::with_val(1u32),
positions: Vec::new(),
positions: Vec::new(),
option: SegmentPostingsOption::NoFreq,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
pub fn new_with_freq() -> FreqHandler {
FreqHandler {
@@ -54,15 +53,15 @@ impl FreqHandler {
let positions = read_positions(position_data);
FreqHandler {
freq_decoder: SIMDBlockDecoder::new(),
positions: positions,
positions: positions,
option: SegmentPostingsOption::FreqAndPositions,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
fn fill_positions_offset(&mut self,) {
fn fill_positions_offset(&mut self) {
let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK];
let mut i: usize = 0;
let mut i: usize = 0;
self.positions_offsets[i] = cur_position;
let mut last_cur_position = cur_position;
for &doc_freq in self.freq_decoder.output_array() {
@@ -78,16 +77,16 @@ impl FreqHandler {
last_cur_position = cur_position;
}
}
/// Accessor to term frequency
///
/// idx is the offset of the current doc in the block.
/// It takes value between 0 and 128.
pub fn freq(&self, idx: usize)-> u32 {
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Accessor to the positions
///
/// idx is the offset of the current doc in the block.
@@ -97,16 +96,12 @@ impl FreqHandler {
let stop = self.positions_offsets[idx + 1];
&self.positions[start..stop]
}
/// Decompresses a complete frequency block
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
match self.option {
SegmentPostingsOption::NoFreq => {
data
}
SegmentPostingsOption::Freq => {
self.freq_decoder.uncompress_block_unsorted(data)
}
SegmentPostingsOption::NoFreq => data,
SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data),
SegmentPostingsOption::FreqAndPositions => {
let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data);
self.fill_positions_offset();
@@ -114,7 +109,7 @@ impl FreqHandler {
}
}
}
/// Decompresses an incomplete frequency block
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
match self.option {
@@ -128,5 +123,4 @@ impl FreqHandler {
}
}
}
}

View File

@@ -7,7 +7,7 @@ use DocId;
/// Creates a `DocSet` that iterates through the intersection of several `DocSet`s.
pub struct IntersectionDocSet<TDocSet: DocSet> {
docsets: Vec<TDocSet>,
finished: bool,
finished: bool,
doc: DocId,
}
@@ -18,11 +18,14 @@ impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
docsets: docsets,
finished: false,
doc: DocId::max_value(),
}
}
}
}
impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
/// Returns an array to the underlying `DocSet`s of the intersection.
/// These `DocSet` are in the same position as the `IntersectionDocSet`,
/// so that user can access their `docfreq` and `positions`.
pub fn docsets(&self) -> &[TDocSet] {
&self.docsets[..]
}
@@ -30,8 +33,7 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
fn advance(&mut self,) -> bool {
fn advance(&mut self) -> bool {
if self.finished {
return false;
}
@@ -71,8 +73,8 @@ impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
}
}
}
fn doc(&self,) -> DocId {
fn doc(&self) -> DocId {
self.doc
}
}

View File

@@ -15,7 +15,6 @@ pub struct OffsetPostings<'a> {
}
impl<'a> OffsetPostings<'a> {
/// Constructor
pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings {
OffsetPostings {
@@ -26,38 +25,35 @@ impl<'a> OffsetPostings<'a> {
}
impl<'a> DocSet for OffsetPostings<'a> {
fn advance(&mut self,) -> bool {
fn advance(&mut self) -> bool {
self.underlying.advance()
}
fn doc(&self,) -> DocId {
fn doc(&self) -> DocId {
self.underlying.doc() + self.offset
}
/// Skips to the first document whose offset doc id is greater than or
/// equal to `target`.
///
/// `doc()` reports `underlying.doc() + offset`, so the target is
/// translated back into the underlying doc id space before delegating.
fn skip_next(&mut self, target: DocId) -> SkipResult {
    if target >= self.offset {
        // The subtraction cannot underflow in this branch.
        self.underlying.skip_next(target - self.offset)
    } else if self.underlying.advance() {
        // Every doc id exposed by this docset is >= offset > target, so
        // whichever document comes next necessarily oversteps the target.
        // `skip_next` is documented as always advancing, hence the advance.
        SkipResult::OverStep
    } else {
        SkipResult::End
    }
}
}
impl<'a> HasLen for OffsetPostings<'a> {
fn len(&self,) -> usize {
fn len(&self) -> usize {
self.underlying.len()
}
}
impl<'a> Postings for OffsetPostings<'a> {
fn term_freq(&self,) -> u32 {
fn term_freq(&self) -> u32 {
self.underlying.term_freq()
}
fn positions(&self) -> &[u32] {
self.underlying.positions()
}
}

View File

@@ -7,45 +7,38 @@ use postings::docset::DocSet;
/// containing the term. Optionally, for each document,
/// it may also give access to the term frequency
/// as well as the list of term positions.
///
///
/// Its main implementation is `SegmentPostings`,
/// but other implementations mocking `SegmentPostings` exist,
/// for merging segments or for testing.
pub trait Postings: DocSet {
/// Returns the term frequency
fn term_freq(&self,) -> u32;
fn term_freq(&self) -> u32;
/// Returns the list of positions of the term, expressed as a list of
/// token ordinals.
fn positions(&self) -> &[u32];
}
impl<TPostings: Postings> Postings for Box<TPostings> {
fn term_freq(&self,) -> u32 {
fn term_freq(&self) -> u32 {
let unboxed: &TPostings = self.borrow();
unboxed.term_freq()
}
fn positions(&self) -> &[u32] {
let unboxed: &TPostings = self.borrow();
unboxed.positions()
}
}
impl<'a, TPostings: Postings> Postings for &'a mut TPostings {
fn term_freq(&self,) -> u32 {
fn term_freq(&self) -> u32 {
let unref: &TPostings = *self;
unref.term_freq()
}
fn positions(&self) -> &[u32] {
let unref: &TPostings = *self;
unref.positions()
}
}

View File

@@ -9,12 +9,11 @@ use schema::Field;
use analyzer::StreamingIterator;
use datastruct::stacker::{HashMap, Heap};
/// The `PostingsWriter` is in charge of receiving documents
/// The `PostingsWriter` is in charge of receiving documents
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `Heap`.
pub trait PostingsWriter {
/// Record that a document contains a term at a given position.
///
/// * doc - the document id
@@ -22,17 +21,22 @@ pub trait PostingsWriter {
/// * term - the term
/// * heap - heap used to store the postings information as well as the terms
/// in the hashmap.
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
/// Closes all of the currently open `Recorder`'s.
fn close(&mut self, heap: &Heap);
/// Tokenizes a text and calls `suscribe` for each of its tokens.
fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32 {
fn index_text<'a>(&mut self,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
heap: &Heap)
-> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
let mut term = Term::allocate(field, 100);
@@ -65,7 +69,7 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
let num_buckets_usable = heap_capacity / 100;
let hash_table_size = num_buckets_usable * 2;
let mut pow = 512;
for num_bits in 10 .. 32 {
for num_bits in 10..32 {
pow <<= 1;
if pow > hash_table_size {
return num_bits;
@@ -75,31 +79,26 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
}
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
/// constructor
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
let capacity = heap.capacity();
let hashmap_size = hashmap_size_in_bits(capacity);
SpecializedPostingsWriter {
term_index: HashMap::new(hashmap_size, heap),
}
SpecializedPostingsWriter { term_index: HashMap::new(hashmap_size, heap) }
}
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
}
}
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn close(&mut self, heap: &Heap) {
for recorder in self.term_index.values_mut() {
recorder.close_doc(heap);
}
}
#[inline]
fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) {
let mut recorder = self.term_index.get_or_create(term);
@@ -112,9 +111,9 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
}
recorder.record_position(position, heap);
}
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
.iter()
.collect();
term_offsets.sort_by_key(|&(k, _v)| k);
@@ -128,8 +127,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
}
Ok(())
}
}

View File

@@ -4,32 +4,36 @@ use postings::PostingsSerializer;
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = 4294967295;
const POSITION_END: u32 = 4294967295;
/// Recorder is in charge of recording relevant information about
/// the presence of a term in a document.
///
/// Depending on the `TextIndexingOptions` associated to the
/// Depending on the `TextIndexingOptions` associated to the
/// field, the recorder may record
/// * the document frequency
/// * the document id
/// * the document id
/// * the term frequency
/// * the term positions
pub trait Recorder: HeapAllocable {
/// Returns the current document
fn current_doc(&self,) -> u32;
fn current_doc(&self) -> u32;
/// Starts recording information about a new document
/// This method shall only be called if the term is within the document.
/// This method shall only be called if the term is within the document.
fn new_doc(&mut self, doc: DocId, heap: &Heap);
/// Record the position of a term. For each document,
/// Record the position of a term. For each document,
/// this method will be called `term_freq` times.
fn record_position(&mut self, position: u32, heap: &Heap);
/// Close the document. It will help record the term frequency.
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
/// Returns the number of documents that have been seen so far
fn doc_freq(&self,) -> u32;
fn doc_freq(&self) -> u32;
/// Pushes the postings information to the serializer.
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()>;
}
/// Only records the doc ids
@@ -51,11 +55,10 @@ impl HeapAllocable for NothingRecorder {
}
impl Recorder for NothingRecorder {
fn current_doc(&self,) -> DocId {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
self.current_doc = doc;
self.stack.push(doc, heap);
@@ -66,17 +69,20 @@ impl Recorder for NothingRecorder {
fn close_doc(&mut self, _heap: &Heap) {}
fn doc_freq(&self,) -> u32 {
fn doc_freq(&self) -> u32 {
self.doc_freq
}
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
for doc in self.stack.iter(self_addr, heap) {
try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY));
}
Ok(())
}
}
/// Recorder encoding document ids, and term frequencies
@@ -94,16 +100,13 @@ impl HeapAllocable for TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
current_tf: 0u32,
doc_freq: 0u32
}
doc_freq: 0u32,
}
}
}
impl Recorder for TermFrequencyRecorder {
fn current_doc(&self,) -> DocId {
fn current_doc(&self) -> DocId {
self.current_doc
}
@@ -112,22 +115,26 @@ impl Recorder for TermFrequencyRecorder {
self.current_doc = doc;
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &Heap) {
self.current_tf += 1;
}
fn close_doc(&mut self, heap: &Heap) {
debug_assert!(self.current_tf > 0);
self.stack.push(self.current_tf, heap);
self.current_tf = 0;
}
fn doc_freq(&self,) -> u32 {
fn doc_freq(&self) -> u32 {
self.doc_freq
}
fn serialize(&self, self_addr:u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
let mut doc_iter = self.stack.iter(self_addr, heap);
loop {
if let Some(doc) = doc_iter.next() {
@@ -140,7 +147,6 @@ impl Recorder for TermFrequencyRecorder {
}
Ok(())
}
}
/// Recorder encoding term frequencies as well as positions.
@@ -162,12 +168,10 @@ impl HeapAllocable for TFAndPositionRecorder {
}
impl Recorder for TFAndPositionRecorder {
fn current_doc(&self,) -> DocId {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
self.doc_freq += 1;
self.current_doc = doc;
@@ -177,16 +181,20 @@ impl Recorder for TFAndPositionRecorder {
fn record_position(&mut self, position: u32, heap: &Heap) {
self.stack.push(position, heap);
}
fn close_doc(&mut self, heap: &Heap) {
self.stack.push(POSITION_END, heap);
}
fn doc_freq(&self,) -> u32 {
fn doc_freq(&self) -> u32 {
self.doc_freq
}
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(self_addr, heap);
while let Some(doc) = positions_iter.next() {
@@ -197,8 +205,7 @@ impl Recorder for TFAndPositionRecorder {
Some(position) => {
if position == POSITION_END {
break;
}
else {
} else {
doc_positions.push(position - prev_position);
prev_position = position;
}
@@ -212,7 +219,4 @@ impl Recorder for TFAndPositionRecorder {
}
Ok(())
}
}

View File

@@ -6,9 +6,9 @@ use std::num::Wrapping;
const EMPTY_DATA: [u8; 0] = [0u8; 0];
/// `SegmentPostings` represents the inverted list or postings associated to
/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
///
///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings<'a> {
@@ -16,21 +16,21 @@ pub struct SegmentPostings<'a> {
doc_offset: u32,
block_decoder: SIMDBlockDecoder,
freq_handler: FreqHandler,
remaining_data: &'a[u8],
remaining_data: &'a [u8],
cur: Wrapping<usize>,
}
impl<'a> SegmentPostings<'a> {
fn load_next_block(&mut self,) {
fn load_next_block(&mut self) {
let num_remaining_docs = self.len - self.cur.0;
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.block_decoder
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
}
else {
self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
} else {
self.remaining_data = self.block_decoder
.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
}
}
@@ -39,7 +39,7 @@ impl<'a> SegmentPostings<'a> {
///
/// * `len` - number of documents in the posting list.
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding
/// * `freq_handler` - the freq handler is in charge of decoding
/// frequencies and/or positions
pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
SegmentPostings {
@@ -51,7 +51,7 @@ impl<'a> SegmentPostings<'a> {
cur: Wrapping(usize::max_value()),
}
}
/// Returns an empty segment postings object
pub fn empty() -> SegmentPostings<'static> {
SegmentPostings {
@@ -65,11 +65,10 @@ impl<'a> SegmentPostings<'a> {
}
/// Index within a block is used as an address when
/// interacting with the `FreqHandler`
fn index_within_block(&self,) -> usize {
/// interacting with the `FreqHandler`
fn index_within_block(&self) -> usize {
self.cur.0 % NUM_DOCS_PER_BLOCK
}
}
@@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self,) -> bool {
fn advance(&mut self) -> bool {
self.cur += Wrapping(1);
if self.cur.0 >= self.len {
return false;
@@ -87,27 +86,25 @@ impl<'a> DocSet for SegmentPostings<'a> {
}
true
}
#[inline]
fn doc(&self,) -> DocId {
fn doc(&self) -> DocId {
self.block_decoder.output(self.index_within_block())
}
}
impl<'a> HasLen for SegmentPostings<'a> {
fn len(&self,) -> usize {
fn len(&self) -> usize {
self.len
}
}
impl<'a> Postings for SegmentPostings<'a> {
fn term_freq(&self,) -> u32 {
fn term_freq(&self) -> u32 {
self.freq_handler.freq(self.index_within_block())
}
fn positions(&self) -> &[u32] {
self.freq_handler.positions(self.index_within_block())
}
}

View File

@@ -2,7 +2,7 @@
/// Object describing the amount of information required when reading postings.
///
/// Since decoding information is not free, this makes it possible to
/// Since decoding information is not free, this makes it possible to
/// avoid this extra cost when the information is not required.
/// For instance, positions are useful when running phrase queries
/// but useless in other queries.
@@ -14,4 +14,4 @@ pub enum SegmentPostingsOption {
Freq,
/// DocIds, term frequencies and positions will be decoded.
FreqAndPositions,
}
}

View File

@@ -19,14 +19,14 @@ use common::BinarySerializable;
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
/// postings on disk, in the
/// * `.idx` (inverted index)
/// * `.pos` (positions file)
/// * `.term` (term dictionary)
///
/// `PostingsWriter` are in charge of pushing the data to the
///
/// `PostingsWriter` are in charge of pushing the data to the
/// serializer.
///
///
/// The serializer expects to receive the following calls
/// in this order :
///
@@ -45,10 +45,10 @@ use common::BinarySerializable;
/// Terms have to be pushed in a lexicographically-sorted order.
/// Within a term, documents have to be pushed in increasing order.
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, /* TODO find an alternative to work around the "move" */
postings_write: WritePtr,
positions_write: WritePtr,
written_bytes_postings: usize,
@@ -65,14 +65,12 @@ pub struct PostingsSerializer {
}
impl PostingsSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn new(
terms_write: WritePtr,
postings_write: WritePtr,
positions_write: WritePtr,
schema: Schema
) -> Result<PostingsSerializer> {
/// Open a new `PostingsSerializer` for the given segment
pub fn new(terms_write: WritePtr,
postings_write: WritePtr,
positions_write: WritePtr,
schema: Schema)
-> Result<PostingsSerializer> {
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
@@ -91,41 +89,36 @@ impl PostingsSerializer {
term_open: false,
})
}
/// Open a new `PostingsSerializer` for the given segment
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
PostingsSerializer::new(
terms_write,
postings_write,
positions_write,
segment.schema()
)
PostingsSerializer::new(terms_write,
postings_write,
positions_write,
segment.schema())
}
/// Looks up the schema entry of `field` and stores the matching
/// `TextIndexingOptions` in `self.text_indexing_options`.
///
/// String fields use the indexing options declared on the field itself;
/// `u32` fields are mapped to one of two fixed options depending on
/// `is_indexed()`.
fn load_indexing_options(&mut self, field: Field) {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
text_options.get_indexing_options()
}
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
FieldType::U32(ref u32_options) => {
// NOTE(review): an *indexed* u32 field yields `Unindexed` and a
// non-indexed one yields `Untokenized` — this reads as inverted.
// Confirm against the callers of `text_indexing_options` before
// changing it.
if u32_options.is_indexed() {
TextIndexingOptions::Unindexed
}
else {
TextIndexingOptions::Untokenized
} else {
TextIndexingOptions::Untokenized
}
}
};
}
/// Starts the postings for a new term.
/// * term - the term. It needs to come after the previous term according
/// to the lexicographical order.
/// to the lexicographical order.
/// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
if self.term_open {
@@ -145,31 +138,34 @@ impl PostingsSerializer {
self.terms_fst_builder
.insert(term.as_slice(), &term_info)
}
/// Finish the serialization for this term postings.
///
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self,) -> io::Result<()> {
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
if !self.doc_ids.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self.block_encoder.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
let block_encoded = self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.written_bytes_postings += block_encoded.len();
try!(self.postings_write.write_all(block_encoded));
self.doc_ids.clear();
}
// ... Idem for term frequencies
// ... Idem for term frequencies
if self.text_indexing_options.is_termfreq_enabled() {
let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
for num in block_encoded {
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
self.written_bytes_postings +=
try!(num.serialize(&mut self.postings_write));
}
self.term_freqs.clear();
}
@@ -177,8 +173,10 @@ impl PostingsSerializer {
// On the other hand, positions are entirely buffered until the
// end of the term, at which point they are compressed and written.
if self.text_indexing_options.is_position_enabled() {
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64)
.serialize(&mut self.positions_write));
let positions_encoded: &[u8] = self.positions_encoder
.compress_unsorted(&self.position_deltas[..]);
try!(self.positions_write.write_all(positions_encoded));
self.written_bytes_positions += positions_encoded.len();
self.position_deltas.clear();
@@ -187,8 +185,8 @@ impl PostingsSerializer {
}
Ok(())
}
/// Serialize the information that a document contains the current term,
/// its term frequency, and the position deltas.
///
@@ -198,7 +196,11 @@ impl PostingsSerializer {
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
pub fn write_doc(&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32])
-> io::Result<()> {
self.doc_ids.push(doc_id);
if self.text_indexing_options.is_termfreq_enabled() {
self.term_freqs.push(term_freq as u32);
@@ -209,14 +211,16 @@ impl PostingsSerializer {
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
{
// encode the doc ids
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
let block_encoded: &[u8] = self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
try!(self.postings_write.write_all(block_encoded));
self.written_bytes_postings += block_encoded.len();
}
if self.text_indexing_options.is_termfreq_enabled() {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs);
let block_encoded: &[u8] = self.block_encoder
.compress_block_unsorted(&self.term_freqs);
try!(self.postings_write.write_all(block_encoded));
self.written_bytes_postings += block_encoded.len();
self.term_freqs.clear();
@@ -225,9 +229,9 @@ impl PostingsSerializer {
}
Ok(())
}
/// Closes the serializer.
pub fn close(mut self,) -> io::Result<()> {
pub fn close(mut self) -> io::Result<()> {
try!(self.close_term());
try!(self.terms_fst_builder.finish());
try!(self.postings_write.flush());

View File

@@ -4,7 +4,7 @@ use DocId;
use postings::{Postings, DocSet, HasLen};
use std::num::Wrapping;
const EMPTY_ARRAY: [u32; 0] = [];
const EMPTY_ARRAY: [u32; 0] = [];
/// Simulates a `Postings` object backed by a vector of doc ids.
/// `VecPostings` only exists for testing purposes.
@@ -26,43 +26,43 @@ impl From<Vec<DocId>> for VecPostings {
}
impl DocSet for VecPostings {
fn advance(&mut self,) -> bool {
fn advance(&mut self) -> bool {
self.cursor += Wrapping(1);
self.doc_ids.len() > self.cursor.0
}
fn doc(&self,) -> DocId {
fn doc(&self) -> DocId {
self.doc_ids[self.cursor.0]
}
}
impl HasLen for VecPostings {
fn len(&self,) -> usize {
fn len(&self) -> usize {
self.doc_ids.len()
}
}
impl Postings for VecPostings {
fn term_freq(&self,) -> u32 {
fn term_freq(&self) -> u32 {
1u32
}
fn positions(&self) -> &[u32] {
&EMPTY_ARRAY
}
}
}
#[cfg(test)]
pub mod tests {
use super::*;
use DocId;
use postings::{Postings, SkipResult, DocSet};
use DocId;
use postings::{Postings, SkipResult, DocSet};
#[test]
pub fn test_vec_postings() {
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e*3).collect();
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
let mut postings = VecPostings::from(doc_ids);
assert!(postings.advance());
assert_eq!(postings.doc(), 0u32);
@@ -77,4 +77,3 @@ pub mod tests {
}
}

View File

@@ -12,7 +12,7 @@ impl BooleanClause {
pub fn new(query: Box<Query>, occur: Occur) -> BooleanClause {
BooleanClause {
query: query,
occur: occur
occur: occur,
}
}
}
}

View File

@@ -12,11 +12,11 @@ use query::OccurFilter;
///
/// The documents matched by the boolean query are
/// those which
/// * match all of the sub queries associated with the
/// * match all of the sub queries associated with the
/// `Must` occurrence
/// * match none of the sub queries associated with the
/// * match none of the sub queries associated with the
/// `MustNot` occurrence.
/// * match at least one of the subqueries that is not
/// * match at least one of the subqueries that is not
/// a `MustNot` occurrence.
#[derive(Debug)]
pub struct BooleanQuery {
@@ -25,14 +25,11 @@ pub struct BooleanQuery {
impl From<Vec<BooleanClause>> for BooleanQuery {
fn from(clauses: Vec<BooleanClause>) -> BooleanQuery {
BooleanQuery {
clauses: clauses,
}
}
BooleanQuery { clauses: clauses }
}
}
impl Query for BooleanQuery {
fn as_any(&self) -> &Any {
self
}
@@ -41,8 +38,7 @@ impl Query for BooleanQuery {
let sub_weights = try!(self.clauses
.iter()
.map(|clause| clause.query.weight(searcher))
.collect()
);
.collect());
let occurs: Vec<Occur> = self.clauses
.iter()
.map(|clause| clause.occur)
@@ -50,5 +46,4 @@ impl Query for BooleanQuery {
let filter = OccurFilter::new(&occurs);
Ok(box BooleanWeight::new(sub_weights, filter))
}
}

View File

@@ -33,7 +33,7 @@ impl Ord for HeapItem {
}
pub struct BooleanScorer<TScorer: Scorer> {
postings: Vec<TScorer>,
scorers: Vec<TScorer>,
queue: BinaryHeap<HeapItem>,
doc: DocId,
score_combiner: ScoreCombiner,
@@ -43,20 +43,20 @@ pub struct BooleanScorer<TScorer: Scorer> {
impl<TScorer: Scorer> BooleanScorer<TScorer> {
pub fn scorers(&self) -> &[TScorer] {
&self.postings
&self.scorers
}
pub fn new(postings: Vec<TScorer>,
pub fn new(scorers: Vec<TScorer>,
occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
let score_combiner = ScoreCombiner::default_for_num_scorers(postings.len());
let mut non_empty_postings: Vec<TScorer> = Vec::new();
for mut posting in postings {
let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len());
let mut non_empty_scorers: Vec<TScorer> = Vec::new();
for mut posting in scorers {
let non_empty = posting.advance();
if non_empty {
non_empty_postings.push(posting);
non_empty_scorers.push(posting);
}
}
let heap_items: Vec<HeapItem> = non_empty_postings
let heap_items: Vec<HeapItem> = non_empty_scorers
.iter()
.map(|posting| posting.doc())
.enumerate()
@@ -68,7 +68,7 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
})
.collect();
BooleanScorer {
postings: non_empty_postings,
scorers: non_empty_scorers,
queue: BinaryHeap::from(heap_items),
doc: 0u32,
score_combiner: score_combiner,
@@ -77,7 +77,7 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
}
}
/// Advances the head of our heap (the segment postings with the lowest doc)
/// Advances the head of our heap (the segment posting with the lowest doc)
/// It will also update the new current `DocId` as well as the term frequency
/// associated with the segment postings.
///
@@ -89,9 +89,9 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
fn advance_head(&mut self,) {
{
let mut mutable_head = self.queue.peek_mut().unwrap();
let cur_postings = &mut self.postings[mutable_head.ord as usize];
if cur_postings.advance() {
mutable_head.doc = cur_postings.doc();
let cur_scorers = &mut self.scorers[mutable_head.ord as usize];
if cur_scorers.advance() {
mutable_head.doc = cur_scorers.doc();
return;
}
}
@@ -108,7 +108,7 @@ impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
Some(heap_item) => {
let ord = heap_item.ord as usize;
self.doc = heap_item.doc;
let score = self.postings[ord].score();
let score = self.scorers[ord].score();
self.score_combiner.update(score);
ord_bitset |= 1 << ord;
}
@@ -120,7 +120,7 @@ impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
while let Some(&HeapItem {doc, ord}) = self.queue.peek() {
if doc == self.doc {
let ord = ord as usize;
let score = self.postings[ord].score();
let score = self.scorers[ord].score();
self.score_combiner.update(score);
ord_bitset |= 1 << ord;
}

View File

@@ -11,8 +11,7 @@ pub struct BooleanWeight {
}
impl BooleanWeight {
pub fn new(weights: Vec<Box<Weight>>,
occur_filter: OccurFilter) -> BooleanWeight {
pub fn new(weights: Vec<Box<Weight>>, occur_filter: OccurFilter) -> BooleanWeight {
BooleanWeight {
weights: weights,
occur_filter: occur_filter,
@@ -22,15 +21,12 @@ impl BooleanWeight {
impl Weight for BooleanWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(
self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect()
);
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect());
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
Ok(box boolean_scorer)
}
}

View File

@@ -12,20 +12,21 @@ use postings::SegmentPostingsOption;
/// Query involving one or more terms.
#[derive(Eq, Clone, PartialEq, Debug)]
pub struct MultiTermQuery {
// TODO need a better Debug
occur_terms: Vec<(Occur, Term)>
pub struct MultiTermQuery {
// TODO need a better Debug
occur_terms: Vec<(Occur, Term)>,
}
impl MultiTermQuery {
/// Accessor for the number of terms
pub fn num_terms(&self,) -> usize {
pub fn num_terms(&self) -> usize {
self.occur_terms.len()
}
/// Same as `weight()`, except that rather than a boxed trait,
/// `specialized_weight` returns a specific type of the weight, allowing for
/// compile-time optimization.
pub fn specialized_weight(&self, searcher: &Searcher) -> MultiTermWeight {
let term_queries: Vec<TermQuery> = self.occur_terms
.iter()
@@ -33,7 +34,7 @@ impl MultiTermQuery {
.collect();
let occurs: Vec<Occur> = self.occur_terms
.iter()
.map(|&(occur, _) | occur.clone())
.map(|&(occur, _)| occur.clone())
.collect();
let occur_filter = OccurFilter::new(&occurs);
let weights = term_queries.iter()
@@ -43,21 +44,17 @@ impl MultiTermQuery {
term_weight
})
.collect();
MultiTermWeight {
weights: weights,
occur_filter: occur_filter,
}
MultiTermWeight::new(weights, occur_filter)
}
}
impl Query for MultiTermQuery {
fn as_any(&self) -> &Any {
self
}
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
Ok(box self.specialized_weight(searcher))
}
@@ -66,16 +63,13 @@ impl Query for MultiTermQuery {
impl From<Vec<(Occur, Term)>> for MultiTermQuery {
fn from(occur_terms: Vec<(Occur, Term)>) -> MultiTermQuery {
MultiTermQuery {
occur_terms: occur_terms
}
MultiTermQuery { occur_terms: occur_terms }
}
}
impl From<Vec<Term>> for MultiTermQuery {
fn from(terms: Vec<Term>) -> MultiTermQuery {
let should_terms: Vec<(Occur, Term)> = terms
.into_iter()
let should_terms: Vec<(Occur, Term)> = terms.into_iter()
.map(|term| (Occur::Should, term))
.collect();
MultiTermQuery::from(should_terms)

View File

@@ -7,14 +7,28 @@ use postings::SegmentPostings;
use query::term_query::{TermWeight, TermScorer};
use query::boolean_query::BooleanScorer;
/// Weight object associated to a [`MultiTermQuery`](./struct.MultiTermQuery.html).
pub struct MultiTermWeight {
pub weights: Vec<TermWeight>,
pub occur_filter: OccurFilter,
weights: Vec<TermWeight>,
occur_filter: OccurFilter,
}
impl MultiTermWeight {
/// `MultiTermWeight` constructor.
/// The `OccurFilter` is tied with the weights order.
pub fn new(weights: Vec<TermWeight>, occur_filter: OccurFilter) -> MultiTermWeight {
MultiTermWeight {
weights: weights,
occur_filter: occur_filter,
}
}
pub fn specialized_scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<BooleanScorer<TermScorer<SegmentPostings<'a>>>> {
/// Same as `scorer()`, except that rather than a boxed trait,
/// `specialized_scorer` returns a specific type of the scorer, allowing for
/// compile-time optimization.
pub fn specialized_scorer<'a>(&'a self,
reader: &'a SegmentReader)
-> Result<BooleanScorer<TermScorer<SegmentPostings<'a>>>> {
let mut term_scorers: Vec<TermScorer<_>> = Vec::new();
for term_weight in &self.weights {
let term_scorer = try!(term_weight.specialized_scorer(reader));
@@ -22,12 +36,10 @@ impl MultiTermWeight {
}
Ok(BooleanScorer::new(term_scorers, self.occur_filter))
}
}
impl Weight for MultiTermWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
Ok(box try!(self.specialized_scorer(reader)))
Ok(box try!(self.specialized_scorer(reader)))
}
}
}

View File

@@ -7,6 +7,19 @@ use query::Weight;
use Result;
/// `PhraseQuery` matches a specific sequence of words.
/// For instance the phrase query for `"part time"` will match
/// the sentence
///
/// **Alan just got a part time job.**
///
/// On the other hand it will not match the sentence.
///
/// **This is my favorite part of the job.**
///
/// Using a `PhraseQuery` on a field requires positions
/// to be indexed for this field.
///
#[derive(Debug)]
pub struct PhraseQuery {
phrase_terms: Vec<Term>,
@@ -24,7 +37,7 @@ impl Query for PhraseQuery {
/// Create the weight associated to a query.
///
/// See [Weight](./trait.Weight.html).
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
fn weight(&self, _searcher: &Searcher) -> Result<Box<Weight>> {
Ok(box PhraseWeight::from(self.phrase_terms.clone()))
}

View File

@@ -37,7 +37,9 @@ impl<'a> Scorer for Box<Scorer + 'a> {
}
}
/// EmptyScorer is a dummy Scorer in which no document matches.
///
/// It is useful for tests and handling edge cases.
pub struct EmptyScorer;
impl DocSet for EmptyScorer {