mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 00:20:42 +00:00
issue/50 Small formatting change.
This commit is contained in:
@@ -17,23 +17,23 @@ pub enum SkipResult {
|
||||
}
|
||||
|
||||
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
pub trait DocSet {
|
||||
/// Goes to the next element.
|
||||
/// `.advance(...)` needs to be called a first time to point to the correct
|
||||
/// element.
|
||||
fn advance(&mut self,) -> bool;
|
||||
|
||||
fn advance(&mut self) -> bool;
|
||||
|
||||
/// After skipping, position the iterator in such a way that `.doc()`
|
||||
/// will return a value greater than or equal to target.
|
||||
///
|
||||
///
|
||||
/// SkipResult expresses whether the `target value` was reached, overstepped,
|
||||
/// or if the `DocSet` was entirely consumed without finding any value
|
||||
/// greater or equal to the `target`.
|
||||
///
|
||||
/// WARNING: Calling skip always advances the docset.
|
||||
/// More specifically, if the docset is already positioned on the target
|
||||
/// skipping will advance to the next position and return SkipResult::OverStep.
|
||||
/// skipping will advance to the next position and return SkipResult::OverStep.
|
||||
///
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.advance();
|
||||
@@ -43,32 +43,30 @@ pub trait DocSet {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
},
|
||||
Ordering::Equal => { return SkipResult::Reached },
|
||||
Ordering::Greater => { return SkipResult::OverStep },
|
||||
}
|
||||
Ordering::Equal => return SkipResult::Reached,
|
||||
Ordering::Greater => return SkipResult::OverStep,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns the current document
|
||||
fn doc(&self,) -> DocId;
|
||||
|
||||
fn doc(&self) -> DocId;
|
||||
|
||||
/// Advances the cursor to the next document
|
||||
/// None is returned if the `DocSet`
|
||||
/// has already been entirely consumed.
|
||||
fn next(&mut self,) -> Option<DocId> {
|
||||
/// None is returned if the `DocSet`
|
||||
/// has already been entirely consumed.
|
||||
fn next(&mut self) -> Option<DocId> {
|
||||
if self.advance() {
|
||||
Some(self.doc())
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.advance()
|
||||
}
|
||||
@@ -78,28 +76,25 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.skip_next(target)
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
fn doc(&self) -> DocId {
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.doc()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.advance()
|
||||
}
|
||||
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.skip_next(target)
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
fn doc(&self) -> DocId {
|
||||
let unref: &TDocSet = *self;
|
||||
unref.doc()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ pub struct FreqHandler {
|
||||
|
||||
|
||||
fn read_positions(data: &[u8]) -> Vec<u32> {
|
||||
let mut composite_reader = CompositeDecoder::new();
|
||||
let mut composite_reader = CompositeDecoder::new();
|
||||
let mut readable: &[u8] = data;
|
||||
let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize;
|
||||
composite_reader.uncompress_unsorted(readable, uncompressed_len);
|
||||
@@ -27,17 +27,16 @@ fn read_positions(data: &[u8]) -> Vec<u32> {
|
||||
|
||||
|
||||
impl FreqHandler {
|
||||
|
||||
/// Returns a `FreqHandler` that just decodes `DocId`s.
|
||||
pub fn new_without_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
freq_decoder: SIMDBlockDecoder::with_val(1u32),
|
||||
positions: Vec::new(),
|
||||
positions: Vec::new(),
|
||||
option: SegmentPostingsOption::NoFreq,
|
||||
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
|
||||
pub fn new_with_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
@@ -54,15 +53,15 @@ impl FreqHandler {
|
||||
let positions = read_positions(position_data);
|
||||
FreqHandler {
|
||||
freq_decoder: SIMDBlockDecoder::new(),
|
||||
positions: positions,
|
||||
positions: positions,
|
||||
option: SegmentPostingsOption::FreqAndPositions,
|
||||
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_positions_offset(&mut self,) {
|
||||
|
||||
fn fill_positions_offset(&mut self) {
|
||||
let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK];
|
||||
let mut i: usize = 0;
|
||||
let mut i: usize = 0;
|
||||
self.positions_offsets[i] = cur_position;
|
||||
let mut last_cur_position = cur_position;
|
||||
for &doc_freq in self.freq_decoder.output_array() {
|
||||
@@ -78,16 +77,16 @@ impl FreqHandler {
|
||||
last_cur_position = cur_position;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/// Accessor to term frequency
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
/// It takes value between 0 and 128.
|
||||
pub fn freq(&self, idx: usize)-> u32 {
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to the positions
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
@@ -97,16 +96,12 @@ impl FreqHandler {
|
||||
let stop = self.positions_offsets[idx + 1];
|
||||
&self.positions[start..stop]
|
||||
}
|
||||
|
||||
|
||||
/// Decompresses a complete frequency block
|
||||
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
|
||||
match self.option {
|
||||
SegmentPostingsOption::NoFreq => {
|
||||
data
|
||||
}
|
||||
SegmentPostingsOption::Freq => {
|
||||
self.freq_decoder.uncompress_block_unsorted(data)
|
||||
}
|
||||
SegmentPostingsOption::NoFreq => data,
|
||||
SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data),
|
||||
SegmentPostingsOption::FreqAndPositions => {
|
||||
let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data);
|
||||
self.fill_positions_offset();
|
||||
@@ -114,7 +109,7 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Decompresses an incomplete frequency block
|
||||
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
|
||||
match self.option {
|
||||
@@ -128,5 +123,4 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -7,7 +7,7 @@ use DocId;
|
||||
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
|
||||
pub struct IntersectionDocSet<TDocSet: DocSet> {
|
||||
docsets: Vec<TDocSet>,
|
||||
finished: bool,
|
||||
finished: bool,
|
||||
doc: DocId,
|
||||
}
|
||||
|
||||
@@ -18,11 +18,14 @@ impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
|
||||
docsets: docsets,
|
||||
finished: false,
|
||||
doc: DocId::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
|
||||
/// Returns an array to the underlying `DocSet`s of the intersection.
|
||||
/// These `DocSet` are in the same position as the `IntersectionDocSet`,
|
||||
/// so that user can access their `docfreq` and `positions`.
|
||||
pub fn docsets(&self) -> &[TDocSet] {
|
||||
&self.docsets[..]
|
||||
}
|
||||
@@ -30,8 +33,7 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
|
||||
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.finished {
|
||||
return false;
|
||||
}
|
||||
@@ -71,8 +73,8 @@ impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,6 @@ pub struct OffsetPostings<'a> {
|
||||
}
|
||||
|
||||
impl<'a> OffsetPostings<'a> {
|
||||
|
||||
/// Constructor
|
||||
pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings {
|
||||
OffsetPostings {
|
||||
@@ -26,38 +25,35 @@ impl<'a> OffsetPostings<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DocSet for OffsetPostings<'a> {
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.underlying.advance()
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.underlying.doc() + self.offset
|
||||
}
|
||||
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if target >= self.offset {
|
||||
SkipResult::OverStep
|
||||
}
|
||||
else {
|
||||
self.underlying.skip_next(target - self.offset)
|
||||
} else {
|
||||
self.underlying.skip_next(target - self.offset)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> HasLen for OffsetPostings<'a> {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.underlying.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Postings for OffsetPostings<'a> {
|
||||
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.underlying.term_freq()
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.underlying.positions()
|
||||
}
|
||||
|
||||
}
|
||||
@@ -7,45 +7,38 @@ use postings::docset::DocSet;
|
||||
/// containing the term. Optionally, for each document,
|
||||
/// it may also give access to the term frequency
|
||||
/// as well as the list of term positions.
|
||||
///
|
||||
///
|
||||
/// Its main implementation is `SegmentPostings`,
|
||||
/// but other implementations mocking `SegmentPostings` exist,
|
||||
/// for merging segments or for testing.
|
||||
pub trait Postings: DocSet {
|
||||
/// Returns the term frequency
|
||||
fn term_freq(&self,) -> u32;
|
||||
fn term_freq(&self) -> u32;
|
||||
/// Returns the list of positions of the term, expressed as a list of
|
||||
/// token ordinals.
|
||||
fn positions(&self) -> &[u32];
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> Postings for Box<TPostings> {
|
||||
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let unboxed: &TPostings = self.borrow();
|
||||
unboxed.term_freq()
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
let unboxed: &TPostings = self.borrow();
|
||||
unboxed.positions()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl<'a, TPostings: Postings> Postings for &'a mut TPostings {
|
||||
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let unref: &TPostings = *self;
|
||||
unref.term_freq()
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
let unref: &TPostings = *self;
|
||||
unref.positions()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -9,12 +9,11 @@ use schema::Field;
|
||||
use analyzer::StreamingIterator;
|
||||
use datastruct::stacker::{HashMap, Heap};
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `Heap`.
|
||||
pub trait PostingsWriter {
|
||||
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
/// * doc - the document id
|
||||
@@ -22,17 +21,22 @@ pub trait PostingsWriter {
|
||||
/// * term - the term
|
||||
/// * heap - heap used to store the postings information as well as the terms
|
||||
/// in the hashmap.
|
||||
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
|
||||
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
|
||||
|
||||
/// Closes all of the currently open `Recorder`'s.
|
||||
fn close(&mut self, heap: &Heap);
|
||||
|
||||
|
||||
/// Tokenize a text and suscribe all of its tokens.
|
||||
fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32 {
|
||||
fn index_text<'a>(&mut self,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
heap: &Heap)
|
||||
-> u32 {
|
||||
let mut pos = 0u32;
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = Term::allocate(field, 100);
|
||||
@@ -65,7 +69,7 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
|
||||
let num_buckets_usable = heap_capacity / 100;
|
||||
let hash_table_size = num_buckets_usable * 2;
|
||||
let mut pow = 512;
|
||||
for num_bits in 10 .. 32 {
|
||||
for num_bits in 10..32 {
|
||||
pow <<= 1;
|
||||
if pow > hash_table_size {
|
||||
return num_bits;
|
||||
@@ -75,31 +79,26 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
|
||||
/// constructor
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
let capacity = heap.capacity();
|
||||
let hashmap_size = hashmap_size_in_bits(capacity);
|
||||
SpecializedPostingsWriter {
|
||||
term_index: HashMap::new(hashmap_size, heap),
|
||||
}
|
||||
SpecializedPostingsWriter { term_index: HashMap::new(hashmap_size, heap) }
|
||||
}
|
||||
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
|
||||
fn close(&mut self, heap: &Heap) {
|
||||
for recorder in self.term_index.values_mut() {
|
||||
recorder.close_doc(heap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) {
|
||||
let mut recorder = self.term_index.get_or_create(term);
|
||||
@@ -112,9 +111,9 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
}
|
||||
recorder.record_position(position, heap);
|
||||
}
|
||||
|
||||
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
|
||||
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
|
||||
.iter()
|
||||
.collect();
|
||||
term_offsets.sort_by_key(|&(k, _v)| k);
|
||||
@@ -128,8 +127,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -4,32 +4,36 @@ use postings::PostingsSerializer;
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
const POSITION_END: u32 = 4294967295;
|
||||
const POSITION_END: u32 = 4294967295;
|
||||
|
||||
/// Recorder is in charge of recording relevant information about
|
||||
/// the presence of a term in a document.
|
||||
///
|
||||
/// Depending on the `TextIndexingOptions` associated to the
|
||||
/// Depending on the `TextIndexingOptions` associated to the
|
||||
/// field, the recorder may record
|
||||
/// * the document frequency
|
||||
/// * the document id
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub trait Recorder: HeapAllocable {
|
||||
/// Returns the current document
|
||||
fn current_doc(&self,) -> u32;
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
/// This method shall only be called if the term is within the document.
|
||||
/// This method shall only be called if the term is within the document.
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap);
|
||||
/// Record the position of a term. For each document,
|
||||
/// Record the position of a term. For each document,
|
||||
/// this method will be called `term_freq` times.
|
||||
fn record_position(&mut self, position: u32, heap: &Heap);
|
||||
/// Close the document. It will help record the term frequency.
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &Heap);
|
||||
/// Returns the number of documents that have been seen so far
|
||||
fn doc_freq(&self,) -> u32;
|
||||
fn doc_freq(&self) -> u32;
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
@@ -51,11 +55,10 @@ impl HeapAllocable for NothingRecorder {
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
|
||||
fn current_doc(&self,) -> DocId {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
@@ -66,17 +69,20 @@ impl Recorder for NothingRecorder {
|
||||
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
|
||||
fn doc_freq(&self,) -> u32 {
|
||||
fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Recorder encoding document ids, and term frequencies
|
||||
@@ -94,16 +100,13 @@ impl HeapAllocable for TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
current_tf: 0u32,
|
||||
doc_freq: 0u32
|
||||
}
|
||||
doc_freq: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
|
||||
|
||||
fn current_doc(&self,) -> DocId {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
@@ -112,22 +115,26 @@ impl Recorder for TermFrequencyRecorder {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {
|
||||
self.current_tf += 1;
|
||||
}
|
||||
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
self.stack.push(self.current_tf, heap);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
fn doc_freq(&self,) -> u32 {
|
||||
|
||||
fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
fn serialize(&self, self_addr:u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
let mut doc_iter = self.stack.iter(self_addr, heap);
|
||||
loop {
|
||||
if let Some(doc) = doc_iter.next() {
|
||||
@@ -140,7 +147,6 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Recorder encoding term frequencies as well as positions.
|
||||
@@ -162,12 +168,10 @@ impl HeapAllocable for TFAndPositionRecorder {
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
|
||||
|
||||
fn current_doc(&self,) -> DocId {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.doc_freq += 1;
|
||||
self.current_doc = doc;
|
||||
@@ -177,16 +181,20 @@ impl Recorder for TFAndPositionRecorder {
|
||||
fn record_position(&mut self, position: u32, heap: &Heap) {
|
||||
self.stack.push(position, heap);
|
||||
}
|
||||
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn doc_freq(&self,) -> u32 {
|
||||
|
||||
fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
@@ -197,8 +205,7 @@ impl Recorder for TFAndPositionRecorder {
|
||||
Some(position) => {
|
||||
if position == POSITION_END {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
doc_positions.push(position - prev_position);
|
||||
prev_position = position;
|
||||
}
|
||||
@@ -212,7 +219,4 @@ impl Recorder for TFAndPositionRecorder {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@ use std::num::Wrapping;
|
||||
|
||||
const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// a term in a `Segment`.
|
||||
///
|
||||
///
|
||||
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings<'a> {
|
||||
@@ -16,21 +16,21 @@ pub struct SegmentPostings<'a> {
|
||||
doc_offset: u32,
|
||||
block_decoder: SIMDBlockDecoder,
|
||||
freq_handler: FreqHandler,
|
||||
remaining_data: &'a[u8],
|
||||
remaining_data: &'a [u8],
|
||||
cur: Wrapping<usize>,
|
||||
}
|
||||
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
|
||||
fn load_next_block(&mut self,) {
|
||||
fn load_next_block(&mut self) {
|
||||
let num_remaining_docs = self.len - self.cur.0;
|
||||
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
|
||||
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
}
|
||||
else {
|
||||
self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
|
||||
} else {
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
|
||||
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
}
|
||||
}
|
||||
@@ -39,7 +39,7 @@ impl<'a> SegmentPostings<'a> {
|
||||
///
|
||||
/// * `len` - number of documents in the posting list.
|
||||
/// * `data` - data array. The complete data is not necessarily used.
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// frequencies and/or positions
|
||||
pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
@@ -51,7 +51,7 @@ impl<'a> SegmentPostings<'a> {
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> SegmentPostings<'static> {
|
||||
SegmentPostings {
|
||||
@@ -65,11 +65,10 @@ impl<'a> SegmentPostings<'a> {
|
||||
}
|
||||
|
||||
/// Index within a block is used as an address when
|
||||
/// interacting with the `FreqHandler`
|
||||
fn index_within_block(&self,) -> usize {
|
||||
/// interacting with the `FreqHandler`
|
||||
fn index_within_block(&self) -> usize {
|
||||
self.cur.0 % NUM_DOCS_PER_BLOCK
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.cur += Wrapping(1);
|
||||
if self.cur.0 >= self.len {
|
||||
return false;
|
||||
@@ -87,27 +86,25 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
fn doc(&self,) -> DocId {
|
||||
fn doc(&self) -> DocId {
|
||||
self.block_decoder.output(self.index_within_block())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl<'a> HasLen for SegmentPostings<'a> {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Postings for SegmentPostings<'a> {
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.freq_handler.freq(self.index_within_block())
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.freq_handler.positions(self.index_within_block())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
/// Object describing the amount of information required when reading a postings.
|
||||
///
|
||||
/// Since decoding information is not free, this makes it possible to
|
||||
/// Since decoding information is not free, this makes it possible to
|
||||
/// avoid this extra cost when the information is not required.
|
||||
/// For instance, positions are useful when running phrase queries
|
||||
/// but useless in other queries.
|
||||
@@ -14,4 +14,4 @@ pub enum SegmentPostingsOption {
|
||||
Freq,
|
||||
/// DocIds, term frequencies and positions will be decoded.
|
||||
FreqAndPositions,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,14 +19,14 @@ use common::BinarySerializable;
|
||||
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
/// postings on disk, in the
|
||||
/// * `.idx` (inverted index)
|
||||
/// * `.pos` (positions file)
|
||||
/// * `.term` (term dictionary)
|
||||
///
|
||||
/// `PostingsWriter` are in charge of pushing the data to the
|
||||
///
|
||||
/// `PostingsWriter` are in charge of pushing the data to the
|
||||
/// serializer.
|
||||
///
|
||||
///
|
||||
/// The serializer expects to receive the following calls
|
||||
/// in this order :
|
||||
///
|
||||
@@ -45,10 +45,10 @@ use common::BinarySerializable;
|
||||
/// Terms have to be pushed in a lexicographically-sorted order.
|
||||
/// Within a term, documents have to be pushed in increasing order.
|
||||
///
|
||||
/// A description of the serialization format is
|
||||
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
|
||||
/// A description of the serialization format is
|
||||
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
|
||||
pub struct PostingsSerializer {
|
||||
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
|
||||
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, /* TODO find an alternative to work around the "move" */
|
||||
postings_write: WritePtr,
|
||||
positions_write: WritePtr,
|
||||
written_bytes_postings: usize,
|
||||
@@ -65,14 +65,12 @@ pub struct PostingsSerializer {
|
||||
}
|
||||
|
||||
impl PostingsSerializer {
|
||||
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn new(
|
||||
terms_write: WritePtr,
|
||||
postings_write: WritePtr,
|
||||
positions_write: WritePtr,
|
||||
schema: Schema
|
||||
) -> Result<PostingsSerializer> {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn new(terms_write: WritePtr,
|
||||
postings_write: WritePtr,
|
||||
positions_write: WritePtr,
|
||||
schema: Schema)
|
||||
-> Result<PostingsSerializer> {
|
||||
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
|
||||
Ok(PostingsSerializer {
|
||||
terms_fst_builder: terms_fst_builder,
|
||||
@@ -91,41 +89,36 @@ impl PostingsSerializer {
|
||||
term_open: false,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
|
||||
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
|
||||
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
|
||||
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
|
||||
let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
|
||||
PostingsSerializer::new(
|
||||
terms_write,
|
||||
postings_write,
|
||||
positions_write,
|
||||
segment.schema()
|
||||
)
|
||||
PostingsSerializer::new(terms_write,
|
||||
postings_write,
|
||||
positions_write,
|
||||
segment.schema())
|
||||
}
|
||||
|
||||
|
||||
fn load_indexing_options(&mut self, field: Field) {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
self.text_indexing_options = match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
text_options.get_indexing_options()
|
||||
}
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
|
||||
FieldType::U32(ref u32_options) => {
|
||||
if u32_options.is_indexed() {
|
||||
TextIndexingOptions::Unindexed
|
||||
}
|
||||
else {
|
||||
TextIndexingOptions::Untokenized
|
||||
} else {
|
||||
TextIndexingOptions::Untokenized
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/// Starts the postings for a new term.
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - the number of documents containing the term.
|
||||
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
@@ -145,31 +138,34 @@ impl PostingsSerializer {
|
||||
self.terms_fst_builder
|
||||
.insert(term.as_slice(), &term_info)
|
||||
}
|
||||
|
||||
|
||||
/// Finish the serialization for this term postings.
|
||||
///
|
||||
/// If the current block is incomplete, it needs to be encoded
|
||||
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self,) -> io::Result<()> {
|
||||
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
if !self.doc_ids.is_empty() {
|
||||
// we have doc ids waiting to be written
|
||||
// this happens when the number of doc ids is
|
||||
// this happens when the number of doc ids is
|
||||
// not a perfect multiple of our block size.
|
||||
//
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self.block_encoder.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
// ... Idem for term frequencies
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
for num in block_encoded {
|
||||
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
|
||||
self.written_bytes_postings +=
|
||||
try!(num.serialize(&mut self.postings_write));
|
||||
}
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
@@ -177,8 +173,10 @@ impl PostingsSerializer {
|
||||
// On the other hand, positions are entirely buffered until the
|
||||
// end of the term, at which point they are compressed and written.
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
|
||||
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
|
||||
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64)
|
||||
.serialize(&mut self.positions_write));
|
||||
let positions_encoded: &[u8] = self.positions_encoder
|
||||
.compress_unsorted(&self.position_deltas[..]);
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
self.written_bytes_positions += positions_encoded.len();
|
||||
self.position_deltas.clear();
|
||||
@@ -187,8 +185,8 @@ impl PostingsSerializer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/// Serialize the information that a document contains the current term,
|
||||
/// its term frequency, and the position deltas.
|
||||
///
|
||||
@@ -198,7 +196,11 @@ impl PostingsSerializer {
|
||||
///
|
||||
/// Term frequencies and positions may be ignored by the serializer depending
|
||||
/// on the configuration of the field in the `Schema`.
|
||||
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
|
||||
pub fn write_doc(&mut self,
|
||||
doc_id: DocId,
|
||||
term_freq: u32,
|
||||
position_deltas: &[u32])
|
||||
-> io::Result<()> {
|
||||
self.doc_ids.push(doc_id);
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
self.term_freqs.push(term_freq as u32);
|
||||
@@ -209,14 +211,16 @@ impl PostingsSerializer {
|
||||
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
}
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs);
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_unsorted(&self.term_freqs);
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
self.term_freqs.clear();
|
||||
@@ -225,9 +229,9 @@ impl PostingsSerializer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Closes the serializer.
|
||||
pub fn close(mut self,) -> io::Result<()> {
|
||||
pub fn close(mut self) -> io::Result<()> {
|
||||
try!(self.close_term());
|
||||
try!(self.terms_fst_builder.finish());
|
||||
try!(self.postings_write.flush());
|
||||
|
||||
@@ -4,7 +4,7 @@ use DocId;
|
||||
use postings::{Postings, DocSet, HasLen};
|
||||
use std::num::Wrapping;
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [];
|
||||
const EMPTY_ARRAY: [u32; 0] = [];
|
||||
|
||||
/// Simulate a `Postings` objects from a `VecPostings`.
|
||||
/// `VecPostings` only exist for testing purposes.
|
||||
@@ -26,43 +26,43 @@ impl From<Vec<DocId>> for VecPostings {
|
||||
}
|
||||
|
||||
impl DocSet for VecPostings {
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.cursor += Wrapping(1);
|
||||
self.doc_ids.len() > self.cursor.0
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc_ids[self.cursor.0]
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for VecPostings {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.doc_ids.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for VecPostings {
|
||||
fn term_freq(&self,) -> u32 {
|
||||
fn term_freq(&self) -> u32 {
|
||||
1u32
|
||||
}
|
||||
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
&EMPTY_ARRAY
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use DocId;
|
||||
use postings::{Postings, SkipResult, DocSet};
|
||||
|
||||
|
||||
use DocId;
|
||||
use postings::{Postings, SkipResult, DocSet};
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_vec_postings() {
|
||||
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e*3).collect();
|
||||
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
|
||||
let mut postings = VecPostings::from(doc_ids);
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0u32);
|
||||
@@ -77,4 +77,3 @@ pub mod tests {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ impl BooleanClause {
|
||||
pub fn new(query: Box<Query>, occur: Occur) -> BooleanClause {
|
||||
BooleanClause {
|
||||
query: query,
|
||||
occur: occur
|
||||
occur: occur,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -12,11 +12,11 @@ use query::OccurFilter;
|
||||
///
|
||||
/// The documents matched by the boolean query are
|
||||
/// those which
|
||||
/// * match all of the sub queries associated with the
|
||||
/// * match all of the sub queries associated with the
|
||||
/// `Must` occurence
|
||||
/// * match none of the sub queries associated with the
|
||||
/// * match none of the sub queries associated with the
|
||||
/// `MustNot` occurence.
|
||||
/// * match at least one of the subqueries that is not
|
||||
/// * match at least one of the subqueries that is not
|
||||
/// a `MustNot` occurence.
|
||||
#[derive(Debug)]
|
||||
pub struct BooleanQuery {
|
||||
@@ -25,14 +25,11 @@ pub struct BooleanQuery {
|
||||
|
||||
impl From<Vec<BooleanClause>> for BooleanQuery {
|
||||
fn from(clauses: Vec<BooleanClause>) -> BooleanQuery {
|
||||
BooleanQuery {
|
||||
clauses: clauses,
|
||||
}
|
||||
}
|
||||
BooleanQuery { clauses: clauses }
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for BooleanQuery {
|
||||
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
@@ -41,8 +38,7 @@ impl Query for BooleanQuery {
|
||||
let sub_weights = try!(self.clauses
|
||||
.iter()
|
||||
.map(|clause| clause.query.weight(searcher))
|
||||
.collect()
|
||||
);
|
||||
.collect());
|
||||
let occurs: Vec<Occur> = self.clauses
|
||||
.iter()
|
||||
.map(|clause| clause.occur)
|
||||
@@ -50,5 +46,4 @@ impl Query for BooleanQuery {
|
||||
let filter = OccurFilter::new(&occurs);
|
||||
Ok(box BooleanWeight::new(sub_weights, filter))
|
||||
}
|
||||
|
||||
}
|
||||
@@ -33,7 +33,7 @@ impl Ord for HeapItem {
|
||||
}
|
||||
|
||||
pub struct BooleanScorer<TScorer: Scorer> {
|
||||
postings: Vec<TScorer>,
|
||||
scorers: Vec<TScorer>,
|
||||
queue: BinaryHeap<HeapItem>,
|
||||
doc: DocId,
|
||||
score_combiner: ScoreCombiner,
|
||||
@@ -43,20 +43,20 @@ pub struct BooleanScorer<TScorer: Scorer> {
|
||||
impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
|
||||
pub fn scorers(&self) -> &[TScorer] {
|
||||
&self.postings
|
||||
&self.scorers
|
||||
}
|
||||
|
||||
pub fn new(postings: Vec<TScorer>,
|
||||
pub fn new(scorers: Vec<TScorer>,
|
||||
occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
|
||||
let score_combiner = ScoreCombiner::default_for_num_scorers(postings.len());
|
||||
let mut non_empty_postings: Vec<TScorer> = Vec::new();
|
||||
for mut posting in postings {
|
||||
let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len());
|
||||
let mut non_empty_scorers: Vec<TScorer> = Vec::new();
|
||||
for mut posting in scorers {
|
||||
let non_empty = posting.advance();
|
||||
if non_empty {
|
||||
non_empty_postings.push(posting);
|
||||
non_empty_scorers.push(posting);
|
||||
}
|
||||
}
|
||||
let heap_items: Vec<HeapItem> = non_empty_postings
|
||||
let heap_items: Vec<HeapItem> = non_empty_scorers
|
||||
.iter()
|
||||
.map(|posting| posting.doc())
|
||||
.enumerate()
|
||||
@@ -68,7 +68,7 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
})
|
||||
.collect();
|
||||
BooleanScorer {
|
||||
postings: non_empty_postings,
|
||||
scorers: non_empty_scorers,
|
||||
queue: BinaryHeap::from(heap_items),
|
||||
doc: 0u32,
|
||||
score_combiner: score_combiner,
|
||||
@@ -77,7 +77,7 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the head of our heap (the segment postings with the lowest doc)
|
||||
/// Advances the head of our heap (the segment posting with the lowest doc)
|
||||
/// It will also update the new current `DocId` as well as the term frequency
|
||||
/// associated with the segment postings.
|
||||
///
|
||||
@@ -89,9 +89,9 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
fn advance_head(&mut self,) {
|
||||
{
|
||||
let mut mutable_head = self.queue.peek_mut().unwrap();
|
||||
let cur_postings = &mut self.postings[mutable_head.ord as usize];
|
||||
if cur_postings.advance() {
|
||||
mutable_head.doc = cur_postings.doc();
|
||||
let cur_scorers = &mut self.scorers[mutable_head.ord as usize];
|
||||
if cur_scorers.advance() {
|
||||
mutable_head.doc = cur_scorers.doc();
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -108,7 +108,7 @@ impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
|
||||
Some(heap_item) => {
|
||||
let ord = heap_item.ord as usize;
|
||||
self.doc = heap_item.doc;
|
||||
let score = self.postings[ord].score();
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
@@ -120,7 +120,7 @@ impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
|
||||
while let Some(&HeapItem {doc, ord}) = self.queue.peek() {
|
||||
if doc == self.doc {
|
||||
let ord = ord as usize;
|
||||
let score = self.postings[ord].score();
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
|
||||
@@ -11,8 +11,7 @@ pub struct BooleanWeight {
|
||||
}
|
||||
|
||||
impl BooleanWeight {
|
||||
pub fn new(weights: Vec<Box<Weight>>,
|
||||
occur_filter: OccurFilter) -> BooleanWeight {
|
||||
pub fn new(weights: Vec<Box<Weight>>, occur_filter: OccurFilter) -> BooleanWeight {
|
||||
BooleanWeight {
|
||||
weights: weights,
|
||||
occur_filter: occur_filter,
|
||||
@@ -22,15 +21,12 @@ impl BooleanWeight {
|
||||
|
||||
|
||||
impl Weight for BooleanWeight {
|
||||
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(
|
||||
self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect()
|
||||
);
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect());
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,20 +12,21 @@ use postings::SegmentPostingsOption;
|
||||
|
||||
|
||||
/// Query involving one or more terms.
|
||||
|
||||
#[derive(Eq, Clone, PartialEq, Debug)]
|
||||
pub struct MultiTermQuery {
|
||||
// TODO need a better Debug
|
||||
occur_terms: Vec<(Occur, Term)>
|
||||
pub struct MultiTermQuery {
|
||||
// TODO need a better Debug
|
||||
occur_terms: Vec<(Occur, Term)>,
|
||||
}
|
||||
|
||||
impl MultiTermQuery {
|
||||
|
||||
/// Accessor for the number of terms
|
||||
pub fn num_terms(&self,) -> usize {
|
||||
pub fn num_terms(&self) -> usize {
|
||||
self.occur_terms.len()
|
||||
}
|
||||
|
||||
/// Same as `weight()`, except that rather than a boxed trait,
|
||||
/// `specialized_weight` returns a specific type of the weight, allowing for
|
||||
/// compile-time optimization.
|
||||
pub fn specialized_weight(&self, searcher: &Searcher) -> MultiTermWeight {
|
||||
let term_queries: Vec<TermQuery> = self.occur_terms
|
||||
.iter()
|
||||
@@ -33,7 +34,7 @@ impl MultiTermQuery {
|
||||
.collect();
|
||||
let occurs: Vec<Occur> = self.occur_terms
|
||||
.iter()
|
||||
.map(|&(occur, _) | occur.clone())
|
||||
.map(|&(occur, _)| occur.clone())
|
||||
.collect();
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
let weights = term_queries.iter()
|
||||
@@ -43,21 +44,17 @@ impl MultiTermQuery {
|
||||
term_weight
|
||||
})
|
||||
.collect();
|
||||
MultiTermWeight {
|
||||
weights: weights,
|
||||
occur_filter: occur_filter,
|
||||
}
|
||||
MultiTermWeight::new(weights, occur_filter)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl Query for MultiTermQuery {
|
||||
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box self.specialized_weight(searcher))
|
||||
}
|
||||
@@ -66,16 +63,13 @@ impl Query for MultiTermQuery {
|
||||
|
||||
impl From<Vec<(Occur, Term)>> for MultiTermQuery {
|
||||
fn from(occur_terms: Vec<(Occur, Term)>) -> MultiTermQuery {
|
||||
MultiTermQuery {
|
||||
occur_terms: occur_terms
|
||||
}
|
||||
MultiTermQuery { occur_terms: occur_terms }
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<Term>> for MultiTermQuery {
|
||||
fn from(terms: Vec<Term>) -> MultiTermQuery {
|
||||
let should_terms: Vec<(Occur, Term)> = terms
|
||||
.into_iter()
|
||||
let should_terms: Vec<(Occur, Term)> = terms.into_iter()
|
||||
.map(|term| (Occur::Should, term))
|
||||
.collect();
|
||||
MultiTermQuery::from(should_terms)
|
||||
|
||||
@@ -7,14 +7,28 @@ use postings::SegmentPostings;
|
||||
use query::term_query::{TermWeight, TermScorer};
|
||||
use query::boolean_query::BooleanScorer;
|
||||
|
||||
/// Weight object associated to a [`MultiTermQuery`](./struct.MultiTermQuery.html).
|
||||
pub struct MultiTermWeight {
|
||||
pub weights: Vec<TermWeight>,
|
||||
pub occur_filter: OccurFilter,
|
||||
weights: Vec<TermWeight>,
|
||||
occur_filter: OccurFilter,
|
||||
}
|
||||
|
||||
impl MultiTermWeight {
|
||||
/// MultiTermWeigh constructor.
|
||||
/// The `OccurFilter` is tied with the weights order.
|
||||
pub fn new(weights: Vec<TermWeight>, occur_filter: OccurFilter) -> MultiTermWeight {
|
||||
MultiTermWeight {
|
||||
weights: weights,
|
||||
occur_filter: occur_filter,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn specialized_scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<BooleanScorer<TermScorer<SegmentPostings<'a>>>> {
|
||||
/// Same as `scorer()`, except that rather than a boxed trait,
|
||||
/// `specialized_scorer` returns a specific type of the scorer, allowing for
|
||||
/// compile-time optimization.
|
||||
pub fn specialized_scorer<'a>(&'a self,
|
||||
reader: &'a SegmentReader)
|
||||
-> Result<BooleanScorer<TermScorer<SegmentPostings<'a>>>> {
|
||||
let mut term_scorers: Vec<TermScorer<_>> = Vec::new();
|
||||
for term_weight in &self.weights {
|
||||
let term_scorer = try!(term_weight.specialized_scorer(reader));
|
||||
@@ -22,12 +36,10 @@ impl MultiTermWeight {
|
||||
}
|
||||
Ok(BooleanScorer::new(term_scorers, self.occur_filter))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl Weight for MultiTermWeight {
|
||||
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
Ok(box try!(self.specialized_scorer(reader)))
|
||||
Ok(box try!(self.specialized_scorer(reader)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,19 @@ use query::Weight;
|
||||
use Result;
|
||||
|
||||
|
||||
/// `PhraseQuery` matches a specific sequence of word.
|
||||
/// For instance the phrase query for `"part time"` will match
|
||||
/// the sentence
|
||||
///
|
||||
/// **Alan just got a part time job.**
|
||||
///
|
||||
/// On the other hand it will not match the sentence.
|
||||
///
|
||||
/// **This is my favorite part of the job.**
|
||||
///
|
||||
/// Using a `PhraseQuery` on a field requires positions
|
||||
/// to be indexed for this field.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct PhraseQuery {
|
||||
phrase_terms: Vec<Term>,
|
||||
@@ -24,7 +37,7 @@ impl Query for PhraseQuery {
|
||||
/// Create the weight associated to a query.
|
||||
///
|
||||
/// See [Weight](./trait.Weight.html).
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
fn weight(&self, _searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box PhraseWeight::from(self.phrase_terms.clone()))
|
||||
}
|
||||
|
||||
|
||||
@@ -37,7 +37,9 @@ impl<'a> Scorer for Box<Scorer + 'a> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// EmptyScorer is a dummy Scorer in which no document matches.
|
||||
///
|
||||
/// It is useful for tests and handling edge cases.
|
||||
pub struct EmptyScorer;
|
||||
|
||||
impl DocSet for EmptyScorer {
|
||||
|
||||
Reference in New Issue
Block a user