Merge branch 'issue/162'

This commit is contained in:
Paul Masurel
2017-05-21 20:04:03 +09:00
6 changed files with 393 additions and 196 deletions

View File

@@ -19,12 +19,10 @@ use std::sync::Arc;
use std::fmt;
use schema::Field;
use postings::SegmentPostingsOption;
use postings::SegmentPostings;
use postings::{SegmentPostings, BlockSegmentPostings};
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
use schema::Schema;
use schema::FieldType;
use postings::FreqHandler;
use schema::TextIndexingOptions;
@@ -219,6 +217,20 @@ impl SegmentReader {
term_info: &TermInfo,
option: SegmentPostingsOption)
-> SegmentPostings {
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
let delete_bitset = self.delete_bitset.clone();
SegmentPostings::from_block_postings(block_postings, delete_bitset)
}
/// Returns a block postings given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_block_postings_from_terminfo(&self,
term_info: &TermInfo,
option: SegmentPostingsOption)
-> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data[offset..];
let freq_handler = match option {
@@ -230,34 +242,26 @@ impl SegmentReader {
FreqHandler::new_with_freq_and_position(offseted_position_data)
}
};
SegmentPostings::from_data(term_info.doc_freq,
postings_data,
&self.delete_bitset,
freq_handler)
BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler)
}
/// Returns the posting list associated with a term.
/// Resets the block segment to another position of the postings
/// file.
///
/// If the term is not found, return None.
/// Even when non-null, because of deletes, the posting object
/// returned by this method may contain no documents.
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
let field_entry = self.schema.get_field_entry(term.field());
let segment_posting_option = match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq,
TextIndexingOptions::TokenizedWithFreqAndPosition => {
SegmentPostingsOption::FreqAndPositions
}
_ => SegmentPostingsOption::NoFreq,
}
}
FieldType::U64(_) |
FieldType::I64(_) => SegmentPostingsOption::NoFreq,
};
self.read_postings(term, segment_posting_option)
/// This is useful for enumerating through a list of terms,
/// and consuming the associated posting lists while avoiding
/// reallocating a `BlockSegmentPostings`.
///
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo<'a>(&'a self,
                                              term_info: &TermInfo,
                                              block_postings: &mut BlockSegmentPostings<'a>) {
    // Number of documents the cursor should expect for this term.
    let doc_freq = term_info.doc_freq as usize;
    // The cursor is handed everything from the term's postings offset onwards.
    let start = term_info.postings_offset as usize;
    block_postings.reset(doc_freq, &self.postings_data[start..]);
}
/// Returns the term info associated with the term.

View File

@@ -66,6 +66,7 @@ impl DeleteBitSet {
}
/// Returns true iff the document is deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
if self.len == 0 {
false

View File

@@ -207,6 +207,7 @@ mod tests {
use schema::*;
use DocSet;
use IndexWriter;
use postings::SegmentPostingsOption::FreqAndPositions;
use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader};
use Postings;
use rand::{XorShiftRng, Rng, SeedableRng};
@@ -338,6 +339,10 @@ mod tests {
fn test_delete_postings1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd");
let term_a = Term::from_field_text(text_field, "a");
let term_b = Term::from_field_text(text_field, "b");
let term_c = Term::from_field_text(text_field, "c");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
@@ -385,21 +390,15 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
.is_none());
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
{
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
.unwrap();
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
.unwrap();
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -425,21 +424,16 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
.is_none());
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
{
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
.unwrap();
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
.unwrap();
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -465,19 +459,13 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
.is_none());
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
{
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
.unwrap();
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
assert!(!postings.advance());
}
{
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
.unwrap();
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -485,9 +473,7 @@ mod tests {
assert!(!postings.advance());
}
{
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "c"))
.unwrap();
let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
@@ -596,12 +582,10 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
.is_none());
let mut postings = reader
.read_postings_all_info(&Term::from_field_text(text_field, "af"))
.unwrap();
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
let term_af = Term::from_field_text(text_field, "af");
let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3);

View File

@@ -47,7 +47,6 @@ impl FreqHandler {
}
}
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
let positions = read_positions(position_data);

View File

@@ -26,7 +26,7 @@ pub use self::postings::Postings;
#[cfg(test)]
pub use self::vec_postings::VecPostings;
pub use self::segment_postings::SegmentPostings;
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use self::freq_handler::FreqHandler;
pub use self::segment_postings_option::SegmentPostingsOption;
@@ -42,6 +42,7 @@ mod tests {
use indexer::SegmentWriter;
use core::SegmentReader;
use core::Index;
use postings::SegmentPostingsOption::FreqAndPositions;
use std::iter;
use datastruct::stacker::Heap;
use fastfield::FastFieldReader;
@@ -128,11 +129,15 @@ mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader.read_postings_all_info(&term_a).is_none());
assert!(segment_reader
.read_postings(&term_a, FreqAndPositions)
.is_none());
}
{
let term_a = Term::from_field_text(text_field, "a");
let mut postings_a = segment_reader.read_postings_all_info(&term_a).unwrap();
let mut postings_a = segment_reader
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert_eq!(postings_a.len(), 1000);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 0);
@@ -151,7 +156,9 @@ mod tests {
}
{
let term_e = Term::from_field_text(text_field, "e");
let mut postings_e = segment_reader.read_postings_all_info(&term_e).unwrap();
let mut postings_e = segment_reader
.read_postings(&term_e, FreqAndPositions)
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
for i in 2u32..1000u32 {
assert!(postings_e.advance());
@@ -474,7 +481,9 @@ mod tests {
let mut segment_postings = segment_reader
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
let mut existing_docs = Vec::new();
segment_postings.advance();
for doc in &docs {
if *doc >= segment_postings.doc() {
existing_docs.push(*doc);

View File

@@ -2,101 +2,50 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
use DocId;
use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult};
use std::cmp;
use std::num::Wrapping;
use fastfield::DeleteBitSet;
use fst::Streamer;
const EMPTY_DATA: [u8; 0] = [0u8; 0];
/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings<'a> {
len: usize,
// Removing this makes the code slower
// See https://github.com/tantivy-search/tantivy/issues/89
block_len: usize,
doc_offset: u32,
block_decoder: BlockDecoder,
freq_handler: FreqHandler,
remaining_data: &'a [u8],
cur: Wrapping<usize>,
block_cursor: BlockSegmentPostings<'a>,
cur: usize,
delete_bitset: DeleteBitSet,
}
impl<'a> SegmentPostings<'a> {
fn load_next_block(&mut self) {
let num_remaining_docs = self.len - self.cur.0;
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
self.remaining_data =
self.block_decoder
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
self.block_len = NUM_DOCS_PER_BLOCK;
} else {
self.remaining_data =
self.block_decoder
.uncompress_vint_sorted(self.remaining_data,
self.doc_offset,
num_remaining_docs);
self.freq_handler
.read_freq_vint(self.remaining_data, num_remaining_docs);
self.block_len = num_remaining_docs;
}
}
/// Reads a Segment postings from an &[u8]
///
/// * `len` - number of document in the posting lists.
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding
/// frequencies and/or positions
pub fn from_data(len: u32,
data: &'a [u8],
delete_bitset: &'a DeleteBitSet,
freq_handler: FreqHandler)
-> SegmentPostings<'a> {
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
delete_bitset: DeleteBitSet)
-> SegmentPostings<'a> {
SegmentPostings {
len: len as usize,
block_len: len as usize,
doc_offset: 0,
block_decoder: BlockDecoder::new(),
freq_handler: freq_handler,
remaining_data: data,
cur: Wrapping(usize::max_value()),
delete_bitset: delete_bitset.clone(),
block_cursor: segment_block_postings,
cur: NUM_DOCS_PER_BLOCK, // cursor within the block
delete_bitset: delete_bitset,
}
}
/// Returns an empty segment postings object
pub fn empty() -> SegmentPostings<'static> {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings {
len: 0,
block_len: 0,
doc_offset: 0,
block_decoder: BlockDecoder::new(),
freq_handler: FreqHandler::new_without_freq(),
remaining_data: &EMPTY_DATA,
block_cursor: empty_block_cursor,
delete_bitset: DeleteBitSet::empty(),
cur: Wrapping(usize::max_value()),
cur: NUM_DOCS_PER_BLOCK,
}
}
/// Index within a block is used as an address when
/// interacting with the `FreqHandler`
fn index_within_block(&self) -> usize {
self.cur.0 % NUM_DOCS_PER_BLOCK
}
/// Sets the current position to a location relative
/// to the current block
#[inline]
fn set_within_block(&mut self, inner_pos: usize) {
self.cur = Wrapping(self.cur.0 & !(NUM_DOCS_PER_BLOCK - 1)) + Wrapping(inner_pos)
}
}
@@ -106,12 +55,13 @@ impl<'a> DocSet for SegmentPostings<'a> {
#[inline]
fn advance(&mut self) -> bool {
loop {
self.cur += Wrapping(1);
if self.cur.0 >= self.len {
return false;
}
if self.index_within_block() == 0 {
self.load_next_block();
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = NUM_DOCS_PER_BLOCK;
return false;
}
}
if !self.delete_bitset.is_deleted(self.doc()) {
return true;
@@ -119,78 +69,82 @@ impl<'a> DocSet for SegmentPostings<'a> {
}
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
}
let mut pos = self.index_within_block();
// skip blocks until one that might contain the target
loop {
// check if we need to go to the next block
if target > self.block_decoder.output(self.block_len - 1) {
self.cur += Wrapping(self.block_len - pos);
self.load_next_block();
pos = 0;
// there was no more data
if self.cur.0 == self.len {
let (current_doc, last_doc_in_block) = {
let block_docs = self.block_cursor.docs();
(block_docs[self.cur], block_docs[block_docs.len() - 1])
};
if target > last_doc_in_block {
if !self.block_cursor.advance() {
return SkipResult::End;
}
} else if target < self.block_decoder.output(pos) {
// We've overpassed the target after the first `advance` call
// or we're at the beginning of a block.
// Either way, we're on the first `DocId` greater than `target`
return SkipResult::OverStep;
self.cur = 0;
} else {
if target < current_doc {
// We've overpassed the target after the first `advance` call
// or we're at the beginning of a block.
// Either way, we're on the first `DocId` greater than `target`
return SkipResult::OverStep;
}
break;
}
}
{
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
let block_len = block_docs.len();
debug_assert!(target >= self.block_decoder.output(pos));
debug_assert!(target <= self.block_decoder.output(self.block_len - 1));
debug_assert!(target >= block_docs[self.cur]);
debug_assert!(target <= block_docs[block_len - 1]);
// we're in the right block now, start with an exponential search
let mut start = pos;
let mut end = self.block_len;
let mut count = 1;
loop {
let new = start + count;
if new < end && self.block_decoder.output(new) < target {
start = new;
count *= 2;
} else {
break;
let mut start = self.cur;
let mut end = block_len;
let mut count = 1;
loop {
let new = start + count;
if new < end && block_docs[new] < target {
start = new;
count *= 2;
} else {
break;
}
}
end = cmp::min(start + count, end);
// now do a binary search
let mut count = end - start;
while count > 0 {
let step = count / 2;
let mid = start + step;
let doc = block_docs[mid];
if doc < target {
start = mid + 1;
count -= step + 1;
} else {
count = step;
}
}
// `doc` is now >= `target`
let doc = block_docs[start];
self.cur = start;
if !self.delete_bitset.is_deleted(doc) {
if doc == target {
return SkipResult::Reached;
} else {
return SkipResult::OverStep;
}
}
}
end = cmp::min(start + count, end);
// now do a binary search
let mut count = end - start;
while count > 0 {
let step = count / 2;
let mid = start + step;
let doc = self.block_decoder.output(mid);
if doc < target {
start = mid + 1;
count -= step + 1;
} else {
count = step;
}
}
// `doc` is now >= `target`
let doc = self.block_decoder.output(start);
self.set_within_block(start);
if !self.delete_bitset.is_deleted(doc) {
if doc == target {
return SkipResult::Reached;
} else {
return SkipResult::OverStep;
}
}
if self.advance() {
SkipResult::OverStep
} else {
@@ -200,22 +154,268 @@ impl<'a> DocSet for SegmentPostings<'a> {
#[inline]
fn doc(&self) -> DocId {
self.block_decoder.output(self.index_within_block())
let docs = self.block_cursor.docs();
assert!(self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc().");
docs[self.cur]
}
}
impl<'a> HasLen for SegmentPostings<'a> {
fn len(&self) -> usize {
self.len
self.block_cursor.doc_freq()
}
}
impl<'a> Postings for SegmentPostings<'a> {
fn term_freq(&self) -> u32 {
self.freq_handler.freq(self.index_within_block())
self.block_cursor.freq_handler().freq(self.cur)
}
fn positions(&self) -> &[u32] {
self.freq_handler.positions(self.index_within_block())
self.block_cursor.freq_handler().positions(self.cur)
}
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// Each call to `.advance()` decodes the next block of `DocId`s, which is
/// then exposed through `.docs()`.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings<'a> {
/// Decoder holding the `DocId`s of the block decoded by the last `.advance()`.
block_decoder: BlockDecoder,
/// Total number of documents in the posting list, as given at construction
/// or on `reset`. Deleted documents are not taken into account.
doc_freq: usize,
/// Offset handed to the decoder when decoding the next block. After a full
/// block has been decoded it is set to the last `DocId` of that block.
doc_offset: DocId,
/// Number of full blocks (`NUM_DOCS_PER_BLOCK` documents each) that have
/// not been decoded yet.
num_binpacked_blocks: usize,
/// Number of documents in the trailing, partially filled block.
/// Set to 0 once that block has been decoded.
num_vint_docs: usize,
/// Part of the postings data that has not been decoded yet.
remaining_data: &'a [u8],
/// Decodes the frequencies that accompany each block of documents.
freq_handler: FreqHandler,
}
impl<'a> BlockSegmentPostings<'a> {
    /// Splits a document frequency into
    /// `(number of full blocks, number of trailing documents)`.
    ///
    /// Full blocks contain exactly `NUM_DOCS_PER_BLOCK` documents; the
    /// trailing documents are the ones that do not fill a whole block.
    fn split_doc_freq(doc_freq: usize) -> (usize, usize) {
        (doc_freq / NUM_DOCS_PER_BLOCK, doc_freq % NUM_DOCS_PER_BLOCK)
    }

    /// Creates a block cursor over the posting list stored at the
    /// beginning of `data`.
    ///
    /// * `doc_freq` - number of documents in the posting list.
    /// * `data` - postings data. The slice may extend past the end
    ///   of the posting list.
    /// * `freq_handler` - in charge of decoding the frequencies
    ///   associated with each block.
    ///
    /// No block is decoded until `.advance()` is called.
    pub(crate) fn from_data(doc_freq: usize,
                            data: &'a [u8],
                            freq_handler: FreqHandler)
                            -> BlockSegmentPostings<'a> {
        let (num_binpacked_blocks, num_vint_docs) = Self::split_doc_freq(doc_freq);
        BlockSegmentPostings {
            num_binpacked_blocks: num_binpacked_blocks,
            num_vint_docs: num_vint_docs,
            block_decoder: BlockDecoder::new(),
            freq_handler: freq_handler,
            remaining_data: data,
            doc_offset: 0,
            doc_freq: doc_freq,
        }
    }

    /// Resets the block segment postings on another position
    /// in the postings file.
    ///
    /// This is useful for enumerating through a list of terms,
    /// and consuming the associated posting lists while avoiding
    /// reallocating a `BlockSegmentPostings`.
    ///
    /// # Warning
    ///
    /// This does not reset the positions list.
    ///
    /// The block decoder is left untouched as well: until `.advance()`
    /// is called again, `.docs()` keeps returning the block that was
    /// decoded before the reset.
    pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) {
        let (num_binpacked_blocks, num_vint_docs) = Self::split_doc_freq(doc_freq);
        self.num_binpacked_blocks = num_binpacked_blocks;
        self.num_vint_docs = num_vint_docs;
        self.remaining_data = postings_data;
        self.doc_offset = 0;
        self.doc_freq = doc_freq;
    }

    /// Returns the document frequency associated to this block postings.
    ///
    /// This `doc_freq` is simply the sum of the lengths of all of the
    /// blocks, and it does not take deleted documents into account.
    pub fn doc_freq(&self) -> usize {
        self.doc_freq
    }

    /// Returns the array of docs in the current block.
    ///
    /// Before the first call to `.advance()`, the block
    /// returned by `.docs()` is empty.
    #[inline]
    pub fn docs(&self) -> &[DocId] {
        self.block_decoder.output_array()
    }

    /// Returns the length of the current block.
    ///
    /// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
    /// except the last block that may have a length
    /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`.
    #[inline]
    fn block_len(&self) -> usize {
        self.block_decoder.output_len
    }

    /// Returns a reference to the frequency handler.
    pub fn freq_handler(&self) -> &FreqHandler {
        &self.freq_handler
    }

    /// Advance to the next block.
    ///
    /// Full blocks are consumed first, then the trailing partial block,
    /// if any.
    ///
    /// Returns false iff there was no remaining blocks.
    pub fn advance(&mut self) -> bool {
        if self.num_binpacked_blocks > 0 {
            // Full block: decode the doc ids, then the frequencies that
            // follow them in the data.
            self.remaining_data =
                self.block_decoder
                    .uncompress_block_sorted(self.remaining_data, self.doc_offset);
            self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
            // The last doc of this block is the offset used to decode
            // the next one.
            self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
            self.num_binpacked_blocks -= 1;
            true
        } else if self.num_vint_docs > 0 {
            // Trailing partial block. It is the last one, so neither
            // `remaining_data` nor `doc_offset` is updated after the
            // frequencies have been read.
            self.remaining_data =
                self.block_decoder
                    .uncompress_vint_sorted(self.remaining_data,
                                            self.doc_offset,
                                            self.num_vint_docs);
            self.freq_handler
                .read_freq_vint(self.remaining_data, self.num_vint_docs);
            self.num_vint_docs = 0;
            true
        } else {
            false
        }
    }

    /// Returns an empty block segment postings object.
    ///
    /// Calling `.advance()` on it returns `false` right away and its
    /// `doc_freq` is 0.
    pub fn empty() -> BlockSegmentPostings<'static> {
        BlockSegmentPostings {
            num_binpacked_blocks: 0,
            num_vint_docs: 0,
            block_decoder: BlockDecoder::new(),
            freq_handler: FreqHandler::new_without_freq(),
            remaining_data: &EMPTY_DATA,
            doc_offset: 0,
            doc_freq: 0,
        }
    }
}
/// Streams the posting list block by block: each call to `next` advances
/// the cursor and yields the freshly decoded block of `DocId`s.
impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> {
    type Item = &'b [DocId];

    fn next(&'b mut self) -> Option<&'b [DocId]> {
        // No block left: the stream is exhausted.
        if !self.advance() {
            return None;
        }
        Some(self.docs())
    }
}
#[cfg(test)]
mod tests {

    use DocSet;
    use super::SegmentPostings;
    use schema::{Document, SchemaBuilder};
    use core::Index;
    use schema::INT_INDEXED;
    use schema::Term;
    use fst::Streamer;
    use postings::SegmentPostingsOption;
    use common::HasLen;
    use super::BlockSegmentPostings;
    use schema::FieldValue;

    /// An empty `SegmentPostings` never advances, no matter how many times
    /// `advance` is called, and reports a length of 0.
    #[test]
    fn test_empty_segment_postings() {
        let mut postings = SegmentPostings::empty();
        assert!(!postings.advance());
        assert!(!postings.advance());
        assert_eq!(postings.len(), 0);
    }

    /// An empty `BlockSegmentPostings` has no block and a `doc_freq` of 0.
    #[test]
    fn test_empty_block_segment_postings() {
        let mut postings = BlockSegmentPostings::empty();
        assert!(!postings.advance());
        assert_eq!(postings.doc_freq(), 0);
    }

    /// Streams through the blocks of a posting list of 100_000 documents
    /// and checks that the doc ids are consecutive.
    #[test]
    fn test_block_segment_postings() {
        let mut schema_builder = SchemaBuilder::default();
        let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
        for _ in 0..100_000 {
            let doc = doc!(int_field=>0u64);
            index_writer.add_document(doc);
        }
        index_writer.commit().unwrap();
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let segment_reader = searcher.segment_reader(0);
        let term = Term::from_field_u64(int_field, 0u64);
        let term_info = segment_reader.get_term_info(&term).unwrap();
        let mut block_segments =
            segment_reader
                .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
        let mut offset: u32 = 0u32;
        // checking that the block before calling advance is empty
        assert!(block_segments.docs().is_empty());
        // checking that the `doc_freq` is correct
        assert_eq!(block_segments.doc_freq(), 100_000);
        while let Some(block) = block_segments.next() {
            for (i, doc) in block.iter().cloned().enumerate() {
                assert_eq!(offset + (i as u32), doc);
            }
            offset += block.len() as u32;
        }
        // checking that the blocks add up to `doc_freq`: without this the
        // test would pass even if the stream yielded no block at all.
        assert_eq!(offset, 100_000);
    }

    /// Checks that a `BlockSegmentPostings` can be reused for another term
    /// through `reset_block_postings_from_terminfo`.
    #[test]
    fn test_reset_block_segment_postings() {
        let mut schema_builder = SchemaBuilder::default();
        let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
        // create two postings lists, one containing even numbers,
        // the other containing odd numbers.
        for i in 0..6 {
            let doc = doc!(int_field=> (i % 2) as u64);
            index_writer.add_document(doc);
        }
        index_writer.commit().unwrap();
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let segment_reader = searcher.segment_reader(0);

        let mut block_segments;
        {
            let term = Term::from_field_u64(int_field, 0u64);
            let term_info = segment_reader.get_term_info(&term).unwrap();
            block_segments =
                segment_reader
                    .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
        }
        assert!(block_segments.advance());
        assert_eq!(block_segments.docs(), &[0, 2, 4]);
        {
            let term = Term::from_field_u64(int_field, 1u64);
            let term_info = segment_reader.get_term_info(&term).unwrap();
            segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
        }
        assert!(block_segments.advance());
        assert_eq!(block_segments.docs(), &[1, 3, 5]);
    }
}