mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-01 16:10:42 +00:00
Merge branch 'issue/162'
This commit is contained in:
@@ -19,12 +19,10 @@ use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use schema::Field;
|
||||
use postings::SegmentPostingsOption;
|
||||
use postings::SegmentPostings;
|
||||
use postings::{SegmentPostings, BlockSegmentPostings};
|
||||
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
|
||||
use schema::Schema;
|
||||
use schema::FieldType;
|
||||
use postings::FreqHandler;
|
||||
use schema::TextIndexingOptions;
|
||||
|
||||
|
||||
|
||||
@@ -219,6 +217,20 @@ impl SegmentReader {
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption)
|
||||
-> SegmentPostings {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
|
||||
let delete_bitset = self.delete_bitset.clone();
|
||||
SegmentPostings::from_block_postings(block_postings, delete_bitset)
|
||||
}
|
||||
|
||||
|
||||
/// Returns a block postings given a `term_info`.
|
||||
/// This method is for advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using `read_postings` instead.
|
||||
pub fn read_block_postings_from_terminfo(&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption)
|
||||
-> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = &self.postings_data[offset..];
|
||||
let freq_handler = match option {
|
||||
@@ -230,34 +242,26 @@ impl SegmentReader {
|
||||
FreqHandler::new_with_freq_and_position(offseted_position_data)
|
||||
}
|
||||
};
|
||||
SegmentPostings::from_data(term_info.doc_freq,
|
||||
postings_data,
|
||||
&self.delete_bitset,
|
||||
freq_handler)
|
||||
BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the posting list associated with a term.
|
||||
/// Resets the block segment to another position of the postings
|
||||
/// file.
|
||||
///
|
||||
/// If the term is not found, return None.
|
||||
/// Even when non-null, because of deletes, the posting object
|
||||
/// returned by this method may contain no documents.
|
||||
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
|
||||
let field_entry = self.schema.get_field_entry(term.field());
|
||||
let segment_posting_option = match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq,
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => {
|
||||
SegmentPostingsOption::FreqAndPositions
|
||||
}
|
||||
_ => SegmentPostingsOption::NoFreq,
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) |
|
||||
FieldType::I64(_) => SegmentPostingsOption::NoFreq,
|
||||
};
|
||||
self.read_postings(term, segment_posting_option)
|
||||
/// This is useful for enumerating through a list of terms,
|
||||
/// and consuming the associated posting lists while avoiding
|
||||
/// reallocating a `BlockSegmentPostings`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This does not reset the positions list.
|
||||
pub fn reset_block_postings_from_terminfo<'a>(&'a self,
|
||||
term_info: &TermInfo,
|
||||
block_postings: &mut BlockSegmentPostings<'a>) {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data: &'a [u8] = &self.postings_data[offset..];
|
||||
block_postings.reset(term_info.doc_freq as usize, postings_data);
|
||||
}
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
|
||||
@@ -66,6 +66,7 @@ impl DeleteBitSet {
|
||||
}
|
||||
|
||||
/// Returns true iff the document is deleted.
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
|
||||
56
src/lib.rs
56
src/lib.rs
@@ -207,6 +207,7 @@ mod tests {
|
||||
use schema::*;
|
||||
use DocSet;
|
||||
use IndexWriter;
|
||||
use postings::SegmentPostingsOption::FreqAndPositions;
|
||||
use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader};
|
||||
use Postings;
|
||||
use rand::{XorShiftRng, Rng, SeedableRng};
|
||||
@@ -338,6 +339,10 @@ mod tests {
|
||||
fn test_delete_postings1() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let term_abcd = Term::from_field_text(text_field, "abcd");
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let term_b = Term::from_field_text(text_field, "b");
|
||||
let term_c = Term::from_field_text(text_field, "c");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
@@ -385,21 +390,15 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
{
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
|
||||
.unwrap();
|
||||
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
|
||||
.unwrap();
|
||||
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -425,21 +424,16 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
{
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
|
||||
.unwrap();
|
||||
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
|
||||
.unwrap();
|
||||
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -465,19 +459,13 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
{
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
|
||||
.unwrap();
|
||||
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
|
||||
.unwrap();
|
||||
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -485,9 +473,7 @@ mod tests {
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "c"))
|
||||
.unwrap();
|
||||
let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!postings.advance());
|
||||
@@ -596,12 +582,10 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "af"))
|
||||
.unwrap();
|
||||
let term_abcd = Term::from_field_text(text_field, "abcd");
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
let term_af = Term::from_field_text(text_field, "af");
|
||||
let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 3);
|
||||
|
||||
@@ -47,7 +47,6 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
|
||||
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
|
||||
let positions = read_positions(position_data);
|
||||
|
||||
@@ -26,7 +26,7 @@ pub use self::postings::Postings;
|
||||
#[cfg(test)]
|
||||
pub use self::vec_postings::VecPostings;
|
||||
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
|
||||
pub use self::intersection::IntersectionDocSet;
|
||||
pub use self::freq_handler::FreqHandler;
|
||||
pub use self::segment_postings_option::SegmentPostingsOption;
|
||||
@@ -42,6 +42,7 @@ mod tests {
|
||||
use indexer::SegmentWriter;
|
||||
use core::SegmentReader;
|
||||
use core::Index;
|
||||
use postings::SegmentPostingsOption::FreqAndPositions;
|
||||
use std::iter;
|
||||
use datastruct::stacker::Heap;
|
||||
use fastfield::FastFieldReader;
|
||||
@@ -128,11 +129,15 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "abcdef");
|
||||
assert!(segment_reader.read_postings_all_info(&term_a).is_none());
|
||||
assert!(segment_reader
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.is_none());
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let mut postings_a = segment_reader.read_postings_all_info(&term_a).unwrap();
|
||||
let mut postings_a = segment_reader
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings_a.len(), 1000);
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.doc(), 0);
|
||||
@@ -151,7 +156,9 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let term_e = Term::from_field_text(text_field, "e");
|
||||
let mut postings_e = segment_reader.read_postings_all_info(&term_e).unwrap();
|
||||
let mut postings_e = segment_reader
|
||||
.read_postings(&term_e, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings_e.len(), 1000 - 2);
|
||||
for i in 2u32..1000u32 {
|
||||
assert!(postings_e.advance());
|
||||
@@ -474,7 +481,9 @@ mod tests {
|
||||
let mut segment_postings = segment_reader
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
let mut existing_docs = Vec::new();
|
||||
segment_postings.advance();
|
||||
for doc in &docs {
|
||||
if *doc >= segment_postings.doc() {
|
||||
existing_docs.push(*doc);
|
||||
|
||||
@@ -2,101 +2,50 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
|
||||
use DocId;
|
||||
use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult};
|
||||
use std::cmp;
|
||||
use std::num::Wrapping;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fst::Streamer;
|
||||
|
||||
|
||||
const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// a term in a `Segment`.
|
||||
///
|
||||
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings<'a> {
|
||||
len: usize,
|
||||
// Removing this makes the code slower
|
||||
// See https://github.com/tantivy-search/tantivy/issues/89
|
||||
block_len: usize,
|
||||
doc_offset: u32,
|
||||
block_decoder: BlockDecoder,
|
||||
freq_handler: FreqHandler,
|
||||
remaining_data: &'a [u8],
|
||||
cur: Wrapping<usize>,
|
||||
block_cursor: BlockSegmentPostings<'a>,
|
||||
cur: usize,
|
||||
delete_bitset: DeleteBitSet,
|
||||
}
|
||||
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
fn load_next_block(&mut self) {
|
||||
let num_remaining_docs = self.len - self.cur.0;
|
||||
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
self.block_len = NUM_DOCS_PER_BLOCK;
|
||||
} else {
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data,
|
||||
self.doc_offset,
|
||||
num_remaining_docs);
|
||||
self.freq_handler
|
||||
.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
self.block_len = num_remaining_docs;
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of document in the posting lists.
|
||||
/// * `data` - data array. The complete data is not necessarily used.
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// frequencies and/or positions
|
||||
pub fn from_data(len: u32,
|
||||
data: &'a [u8],
|
||||
delete_bitset: &'a DeleteBitSet,
|
||||
freq_handler: FreqHandler)
|
||||
-> SegmentPostings<'a> {
|
||||
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
|
||||
delete_bitset: DeleteBitSet)
|
||||
-> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
len: len as usize,
|
||||
block_len: len as usize,
|
||||
doc_offset: 0,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: freq_handler,
|
||||
remaining_data: data,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
delete_bitset: delete_bitset.clone(),
|
||||
block_cursor: segment_block_postings,
|
||||
cur: NUM_DOCS_PER_BLOCK, // cursor within the block
|
||||
delete_bitset: delete_bitset,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> SegmentPostings<'static> {
|
||||
let empty_block_cursor = BlockSegmentPostings::empty();
|
||||
SegmentPostings {
|
||||
len: 0,
|
||||
block_len: 0,
|
||||
doc_offset: 0,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: FreqHandler::new_without_freq(),
|
||||
remaining_data: &EMPTY_DATA,
|
||||
block_cursor: empty_block_cursor,
|
||||
delete_bitset: DeleteBitSet::empty(),
|
||||
cur: Wrapping(usize::max_value()),
|
||||
cur: NUM_DOCS_PER_BLOCK,
|
||||
}
|
||||
}
|
||||
|
||||
/// Index within a block is used as an address when
|
||||
/// interacting with the `FreqHandler`
|
||||
fn index_within_block(&self) -> usize {
|
||||
self.cur.0 % NUM_DOCS_PER_BLOCK
|
||||
}
|
||||
|
||||
/// Sets the current position to a location relative
|
||||
/// to the current block
|
||||
#[inline]
|
||||
fn set_within_block(&mut self, inner_pos: usize) {
|
||||
self.cur = Wrapping(self.cur.0 & !(NUM_DOCS_PER_BLOCK - 1)) + Wrapping(inner_pos)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -106,12 +55,13 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
self.cur += Wrapping(1);
|
||||
if self.cur.0 >= self.len {
|
||||
return false;
|
||||
}
|
||||
if self.index_within_block() == 0 {
|
||||
self.load_next_block();
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur = NUM_DOCS_PER_BLOCK;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if !self.delete_bitset.is_deleted(self.doc()) {
|
||||
return true;
|
||||
@@ -119,78 +69,82 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
|
||||
let mut pos = self.index_within_block();
|
||||
// skip blocks until one that might contain the target
|
||||
loop {
|
||||
// check if we need to go to the next block
|
||||
if target > self.block_decoder.output(self.block_len - 1) {
|
||||
self.cur += Wrapping(self.block_len - pos);
|
||||
self.load_next_block();
|
||||
pos = 0;
|
||||
|
||||
// there was no more data
|
||||
if self.cur.0 == self.len {
|
||||
let (current_doc, last_doc_in_block) = {
|
||||
let block_docs = self.block_cursor.docs();
|
||||
(block_docs[self.cur], block_docs[block_docs.len() - 1])
|
||||
};
|
||||
if target > last_doc_in_block {
|
||||
if !self.block_cursor.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
} else if target < self.block_decoder.output(pos) {
|
||||
// We've overpassed the target after the first `advance` call
|
||||
// or we're at the beginning of a block.
|
||||
// Either way, we're on the first `DocId` greater than `target`
|
||||
return SkipResult::OverStep;
|
||||
self.cur = 0;
|
||||
} else {
|
||||
if target < current_doc {
|
||||
// We've overpassed the target after the first `advance` call
|
||||
// or we're at the beginning of a block.
|
||||
// Either way, we're on the first `DocId` greater than `target`
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
{
|
||||
// we're in the right block now, start with an exponential search
|
||||
let block_docs = self.block_cursor.docs();
|
||||
let block_len = block_docs.len();
|
||||
|
||||
debug_assert!(target >= self.block_decoder.output(pos));
|
||||
debug_assert!(target <= self.block_decoder.output(self.block_len - 1));
|
||||
debug_assert!(target >= block_docs[self.cur]);
|
||||
debug_assert!(target <= block_docs[block_len - 1]);
|
||||
|
||||
// we're in the right block now, start with an exponential search
|
||||
let mut start = pos;
|
||||
let mut end = self.block_len;
|
||||
let mut count = 1;
|
||||
loop {
|
||||
let new = start + count;
|
||||
if new < end && self.block_decoder.output(new) < target {
|
||||
start = new;
|
||||
count *= 2;
|
||||
} else {
|
||||
break;
|
||||
let mut start = self.cur;
|
||||
let mut end = block_len;
|
||||
let mut count = 1;
|
||||
loop {
|
||||
let new = start + count;
|
||||
if new < end && block_docs[new] < target {
|
||||
start = new;
|
||||
count *= 2;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
end = cmp::min(start + count, end);
|
||||
|
||||
// now do a binary search
|
||||
let mut count = end - start;
|
||||
while count > 0 {
|
||||
let step = count / 2;
|
||||
let mid = start + step;
|
||||
let doc = block_docs[mid];
|
||||
if doc < target {
|
||||
start = mid + 1;
|
||||
count -= step + 1;
|
||||
} else {
|
||||
count = step;
|
||||
}
|
||||
}
|
||||
|
||||
// `doc` is now >= `target`
|
||||
let doc = block_docs[start];
|
||||
self.cur = start;
|
||||
|
||||
if !self.delete_bitset.is_deleted(doc) {
|
||||
if doc == target {
|
||||
return SkipResult::Reached;
|
||||
} else {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
}
|
||||
}
|
||||
end = cmp::min(start + count, end);
|
||||
|
||||
// now do a binary search
|
||||
let mut count = end - start;
|
||||
while count > 0 {
|
||||
let step = count / 2;
|
||||
let mid = start + step;
|
||||
let doc = self.block_decoder.output(mid);
|
||||
if doc < target {
|
||||
start = mid + 1;
|
||||
count -= step + 1;
|
||||
} else {
|
||||
count = step;
|
||||
}
|
||||
}
|
||||
|
||||
// `doc` is now >= `target`
|
||||
let doc = self.block_decoder.output(start);
|
||||
self.set_within_block(start);
|
||||
|
||||
if !self.delete_bitset.is_deleted(doc) {
|
||||
if doc == target {
|
||||
return SkipResult::Reached;
|
||||
} else {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
}
|
||||
|
||||
if self.advance() {
|
||||
SkipResult::OverStep
|
||||
} else {
|
||||
@@ -200,22 +154,268 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
self.block_decoder.output(self.index_within_block())
|
||||
let docs = self.block_cursor.docs();
|
||||
assert!(self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling .doc().");
|
||||
docs[self.cur]
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> HasLen for SegmentPostings<'a> {
|
||||
fn len(&self) -> usize {
|
||||
self.len
|
||||
self.block_cursor.doc_freq()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Postings for SegmentPostings<'a> {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.freq_handler.freq(self.index_within_block())
|
||||
self.block_cursor.freq_handler().freq(self.cur)
|
||||
}
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.freq_handler.positions(self.index_within_block())
|
||||
self.block_cursor.freq_handler().positions(self.cur)
|
||||
}
|
||||
}
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
|
||||
/// of documents.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// While it is useful for some very specific high-performance
|
||||
/// use cases, you should prefer using `SegmentPostings` for most usage.
|
||||
pub struct BlockSegmentPostings<'a> {
|
||||
block_decoder: BlockDecoder,
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
num_binpacked_blocks: usize,
|
||||
num_vint_docs: usize,
|
||||
remaining_data: &'a [u8],
|
||||
freq_handler: FreqHandler,
|
||||
}
|
||||
|
||||
impl<'a> BlockSegmentPostings<'a> {
|
||||
pub(crate) fn from_data(doc_freq: usize,
|
||||
data: &'a [u8],
|
||||
freq_handler: FreqHandler)
|
||||
-> BlockSegmentPostings<'a> {
|
||||
let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK;
|
||||
let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
|
||||
BlockSegmentPostings {
|
||||
num_binpacked_blocks: num_binpacked_blocks,
|
||||
num_vint_docs: num_vint_docs,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: freq_handler,
|
||||
remaining_data: data,
|
||||
doc_offset: 0,
|
||||
doc_freq: doc_freq,
|
||||
}
|
||||
}
|
||||
|
||||
// Resets the block segment postings on another position
|
||||
// in the postings file.
|
||||
//
|
||||
// This is useful for enumerating through a list of terms,
|
||||
// and consuming the associated posting lists while avoiding
|
||||
// reallocating a `BlockSegmentPostings`.
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) {
|
||||
let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK;
|
||||
let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
|
||||
self.num_binpacked_blocks = num_binpacked_blocks;
|
||||
self.num_vint_docs = num_vint_docs;
|
||||
self.remaining_data = postings_data;
|
||||
self.doc_offset = 0;
|
||||
self.doc_freq = doc_freq;
|
||||
}
|
||||
|
||||
/// Returns the document frequency associated to this block postings.
|
||||
///
|
||||
/// This `doc_freq` is simply the sum of the lengths of all of the blocks,
|
||||
/// and it does not take into account deleted documents.
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.block_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
#[inline]
|
||||
fn block_len(&self) -> usize {
|
||||
self.block_decoder.output_len
|
||||
}
|
||||
|
||||
|
||||
/// Returns a reference to the frequency handler.
|
||||
pub fn freq_handler(&self) -> &FreqHandler {
|
||||
&self.freq_handler
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
///
|
||||
/// Returns false iff there were no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.num_binpacked_blocks > 0 {
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
self.num_binpacked_blocks -= 1;
|
||||
true
|
||||
} else if self.num_vint_docs > 0 {
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data,
|
||||
self.doc_offset,
|
||||
self.num_vint_docs);
|
||||
self.freq_handler
|
||||
.read_freq_vint(self.remaining_data, self.num_vint_docs);
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty block segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings<'static> {
|
||||
BlockSegmentPostings {
|
||||
num_binpacked_blocks: 0,
|
||||
num_vint_docs: 0,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: FreqHandler::new_without_freq(),
|
||||
remaining_data: &EMPTY_DATA,
|
||||
doc_offset: 0,
|
||||
doc_freq: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> {
|
||||
type Item = &'b [DocId];
|
||||
|
||||
fn next(&'b mut self) -> Option<&'b [DocId]> {
|
||||
if self.advance() {
|
||||
Some(self.docs())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use DocSet;
|
||||
use super::SegmentPostings;
|
||||
use schema::{Document, SchemaBuilder};
|
||||
use core::Index;
|
||||
use schema::INT_INDEXED;
|
||||
use schema::Term;
|
||||
use fst::Streamer;
|
||||
use postings::SegmentPostingsOption;
|
||||
use common::HasLen;
|
||||
use super::BlockSegmentPostings;
|
||||
use schema::FieldValue;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
let mut postings = SegmentPostings::empty();
|
||||
assert!(!postings.advance());
|
||||
assert!(!postings.advance());
|
||||
assert_eq!(postings.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_block_segment_postings() {
|
||||
let mut postings = BlockSegmentPostings::empty();
|
||||
assert!(!postings.advance());
|
||||
assert_eq!(postings.doc_freq(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
for _ in 0..100_000 {
|
||||
let doc = doc!(int_field=>0u64);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = segment_reader.get_term_info(&term).unwrap();
|
||||
let mut block_segments =
|
||||
segment_reader
|
||||
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the block before calling advance is empty
|
||||
assert!(block_segments.docs().is_empty());
|
||||
// checking that the `doc_freq` is correct
|
||||
assert_eq!(block_segments.doc_freq(), 100_000);
|
||||
while let Some(block) = block_segments.next() {
|
||||
for (i, doc) in block.iter().cloned().enumerate() {
|
||||
assert_eq!(offset + (i as u32), doc);
|
||||
}
|
||||
offset += block.len() as u32;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_reset_block_segment_postings() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
// create two postings lists, one containing even numbers,
|
||||
// the other containing odd numbers.
|
||||
for i in 0..6 {
|
||||
let doc = doc!(int_field=> (i % 2) as u64);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
let mut block_segments;
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = segment_reader.get_term_info(&term).unwrap();
|
||||
block_segments =
|
||||
segment_reader
|
||||
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
|
||||
}
|
||||
assert!(block_segments.advance());
|
||||
assert!(block_segments.docs() == &[0, 2, 4]);
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 1u64);
|
||||
let term_info = segment_reader.get_term_info(&term).unwrap();
|
||||
segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
|
||||
}
|
||||
assert!(block_segments.advance());
|
||||
assert!(block_segments.docs() == &[1, 3, 5]);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user