diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index ba05fb632..77ee2148d 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -19,12 +19,10 @@ use std::sync::Arc; use std::fmt; use schema::Field; use postings::SegmentPostingsOption; -use postings::SegmentPostings; +use postings::{SegmentPostings, BlockSegmentPostings}; use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; use schema::Schema; -use schema::FieldType; use postings::FreqHandler; -use schema::TextIndexingOptions; @@ -219,6 +217,20 @@ impl SegmentReader { term_info: &TermInfo, option: SegmentPostingsOption) -> SegmentPostings { + let block_postings = self.read_block_postings_from_terminfo(term_info, option); + let delete_bitset = self.delete_bitset.clone(); + SegmentPostings::from_block_postings(block_postings, delete_bitset) + } + + + /// Returns a block postings given a `term_info`. + /// This method is for advanced usage only. + /// + /// Most users should prefer using `read_postings` instead. + pub fn read_block_postings_from_terminfo(&self, + term_info: &TermInfo, + option: SegmentPostingsOption) + -> BlockSegmentPostings { let offset = term_info.postings_offset as usize; let postings_data = &self.postings_data[offset..]; let freq_handler = match option { @@ -230,34 +242,26 @@ impl SegmentReader { FreqHandler::new_with_freq_and_position(offseted_position_data) } }; - SegmentPostings::from_data(term_info.doc_freq, - postings_data, - &self.delete_bitset, - freq_handler) + BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler) } - /// Returns the posting list associated with a term. + /// Resets the block segment to another position of the postings + /// file. /// - /// If the term is not found, return None. - /// Even when non-null, because of deletes, the posting object - /// returned by this method may contain no documents. 
- pub fn read_postings_all_info(&self, term: &Term) -> Option { - let field_entry = self.schema.get_field_entry(term.field()); - let segment_posting_option = match *field_entry.field_type() { - FieldType::Str(ref text_options) => { - match text_options.get_indexing_options() { - TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq, - TextIndexingOptions::TokenizedWithFreqAndPosition => { - SegmentPostingsOption::FreqAndPositions - } - _ => SegmentPostingsOption::NoFreq, - } - } - FieldType::U64(_) | - FieldType::I64(_) => SegmentPostingsOption::NoFreq, - }; - self.read_postings(term, segment_posting_option) + /// This is useful for enumerating through a list of terms, + /// and consuming the associated posting lists while avoiding + /// reallocating a `BlockSegmentPostings`. + /// + /// # Warning + /// + /// This does not reset the positions list. + pub fn reset_block_postings_from_terminfo<'a>(&'a self, + term_info: &TermInfo, + block_postings: &mut BlockSegmentPostings<'a>) { + let offset = term_info.postings_offset as usize; + let postings_data: &'a [u8] = &self.postings_data[offset..]; + block_postings.reset(term_info.doc_freq as usize, postings_data); } /// Returns the term info associated with the term. diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 8923437c8..d5ab7cbc9 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -66,6 +66,7 @@ impl DeleteBitSet { } /// Returns true iff the document is deleted. 
+ #[inline] pub fn is_deleted(&self, doc: DocId) -> bool { if self.len == 0 { false diff --git a/src/lib.rs b/src/lib.rs index bfd098a96..61ecd4bbc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -207,6 +207,7 @@ mod tests { use schema::*; use DocSet; use IndexWriter; + use postings::SegmentPostingsOption::FreqAndPositions; use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader}; use Postings; use rand::{XorShiftRng, Rng, SeedableRng}; @@ -338,6 +339,10 @@ mod tests { fn test_delete_postings1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); + let term_abcd = Term::from_field_text(text_field, "abcd"); + let term_a = Term::from_field_text(text_field, "a"); + let term_b = Term::from_field_text(text_field, "b"); + let term_c = Term::from_field_text(text_field, "c"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { @@ -385,21 +390,15 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader - .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "a")) - .unwrap(); + let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "b")) - .unwrap(); + let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -425,21 +424,16 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader - 
.read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); + + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "a")) - .unwrap(); + let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "b")) - .unwrap(); + let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -465,19 +459,13 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader - .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "a")) - .unwrap(); + let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(!postings.advance()); } { - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "b")) - .unwrap(); + let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -485,9 +473,7 @@ mod tests { assert!(!postings.advance()); } { - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "c")) - .unwrap(); + let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); @@ -596,12 +582,10 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = 
searcher.segment_reader(0); - assert!(reader - .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); - let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "af")) - .unwrap(); + let term_abcd = Term::from_field_text(text_field, "abcd"); + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); + let term_af = Term::from_field_text(text_field, "af"); + let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 627261696..f1e3f256c 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -47,7 +47,6 @@ impl FreqHandler { } } - /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions. pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler { let positions = read_positions(position_data); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index db2bf8e3c..6f83ef3b9 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -26,7 +26,7 @@ pub use self::postings::Postings; #[cfg(test)] pub use self::vec_postings::VecPostings; -pub use self::segment_postings::SegmentPostings; +pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings}; pub use self::intersection::IntersectionDocSet; pub use self::freq_handler::FreqHandler; pub use self::segment_postings_option::SegmentPostingsOption; @@ -42,6 +42,7 @@ mod tests { use indexer::SegmentWriter; use core::SegmentReader; use core::Index; + use postings::SegmentPostingsOption::FreqAndPositions; use std::iter; use datastruct::stacker::Heap; use fastfield::FastFieldReader; @@ -128,11 +129,15 @@ mod tests { } { let term_a = Term::from_field_text(text_field, "abcdef"); - assert!(segment_reader.read_postings_all_info(&term_a).is_none()); + assert!(segment_reader + 
.read_postings(&term_a, FreqAndPositions) + .is_none()); } { let term_a = Term::from_field_text(text_field, "a"); - let mut postings_a = segment_reader.read_postings_all_info(&term_a).unwrap(); + let mut postings_a = segment_reader + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert_eq!(postings_a.len(), 1000); assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 0); @@ -151,7 +156,9 @@ mod tests { } { let term_e = Term::from_field_text(text_field, "e"); - let mut postings_e = segment_reader.read_postings_all_info(&term_e).unwrap(); + let mut postings_e = segment_reader + .read_postings(&term_e, FreqAndPositions) + .unwrap(); assert_eq!(postings_e.len(), 1000 - 2); for i in 2u32..1000u32 { assert!(postings_e.advance()); @@ -474,7 +481,9 @@ mod tests { let mut segment_postings = segment_reader .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); + let mut existing_docs = Vec::new(); + segment_postings.advance(); for doc in &docs { if *doc >= segment_postings.doc() { existing_docs.push(*doc); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index debc423c1..f76142035 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -2,101 +2,50 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; use DocId; use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; use std::cmp; -use std::num::Wrapping; use fastfield::DeleteBitSet; +use fst::Streamer; const EMPTY_DATA: [u8; 0] = [0u8; 0]; + /// `SegmentPostings` represents the inverted list or postings associated to /// a term in a `Segment`. /// /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// Positions on the other hand, are optionally entirely decoded upfront. 
pub struct SegmentPostings<'a> { - len: usize, - // Removing this makes the code slower - // See https://github.com/tantivy-search/tantivy/issues/89 - block_len: usize, - doc_offset: u32, - block_decoder: BlockDecoder, - freq_handler: FreqHandler, - remaining_data: &'a [u8], - cur: Wrapping, + block_cursor: BlockSegmentPostings<'a>, + cur: usize, delete_bitset: DeleteBitSet, } impl<'a> SegmentPostings<'a> { - fn load_next_block(&mut self) { - let num_remaining_docs = self.len - self.cur.0; - if num_remaining_docs >= NUM_DOCS_PER_BLOCK { - self.remaining_data = - self.block_decoder - .uncompress_block_sorted(self.remaining_data, self.doc_offset); - self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); - self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); - self.block_len = NUM_DOCS_PER_BLOCK; - } else { - self.remaining_data = - self.block_decoder - .uncompress_vint_sorted(self.remaining_data, - self.doc_offset, - num_remaining_docs); - self.freq_handler - .read_freq_vint(self.remaining_data, num_remaining_docs); - self.block_len = num_remaining_docs; - } - } - /// Reads a Segment postings from an &[u8] /// /// * `len` - number of document in the posting lists. /// * `data` - data array. The complete data is not necessarily used. 
/// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions - pub fn from_data(len: u32, - data: &'a [u8], - delete_bitset: &'a DeleteBitSet, - freq_handler: FreqHandler) - -> SegmentPostings<'a> { + pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>, + delete_bitset: DeleteBitSet) + -> SegmentPostings<'a> { SegmentPostings { - len: len as usize, - block_len: len as usize, - doc_offset: 0, - block_decoder: BlockDecoder::new(), - freq_handler: freq_handler, - remaining_data: data, - cur: Wrapping(usize::max_value()), - delete_bitset: delete_bitset.clone(), + block_cursor: segment_block_postings, + cur: NUM_DOCS_PER_BLOCK, // cursor within the block + delete_bitset: delete_bitset, } } /// Returns an empty segment postings object pub fn empty() -> SegmentPostings<'static> { + let empty_block_cursor = BlockSegmentPostings::empty(); SegmentPostings { - len: 0, - block_len: 0, - doc_offset: 0, - block_decoder: BlockDecoder::new(), - freq_handler: FreqHandler::new_without_freq(), - remaining_data: &EMPTY_DATA, + block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), - cur: Wrapping(usize::max_value()), + cur: NUM_DOCS_PER_BLOCK, } } - - /// Index within a block is used as an address when - /// interacting with the `FreqHandler` - fn index_within_block(&self) -> usize { - self.cur.0 % NUM_DOCS_PER_BLOCK - } - - /// Sets the current position to a location relative - /// to the current block - #[inline] - fn set_within_block(&mut self, inner_pos: usize) { - self.cur = Wrapping(self.cur.0 & !(NUM_DOCS_PER_BLOCK - 1)) + Wrapping(inner_pos) - } } @@ -106,12 +55,13 @@ impl<'a> DocSet for SegmentPostings<'a> { #[inline] fn advance(&mut self) -> bool { loop { - self.cur += Wrapping(1); - if self.cur.0 >= self.len { - return false; - } - if self.index_within_block() == 0 { - self.load_next_block(); + self.cur += 1; + if self.cur >= self.block_cursor.block_len() { + self.cur = 0; + if 
!self.block_cursor.advance() { + self.cur = NUM_DOCS_PER_BLOCK; + return false; + } } if !self.delete_bitset.is_deleted(self.doc()) { return true; @@ -119,78 +69,82 @@ impl<'a> DocSet for SegmentPostings<'a> { } } + fn skip_next(&mut self, target: DocId) -> SkipResult { if !self.advance() { return SkipResult::End; } - let mut pos = self.index_within_block(); // skip blocks until one that might contain the target loop { // check if we need to go to the next block - if target > self.block_decoder.output(self.block_len - 1) { - self.cur += Wrapping(self.block_len - pos); - self.load_next_block(); - pos = 0; - - // there was no more data - if self.cur.0 == self.len { + let (current_doc, last_doc_in_block) = { + let block_docs = self.block_cursor.docs(); + (block_docs[self.cur], block_docs[block_docs.len() - 1]) + }; + if target > last_doc_in_block { + if !self.block_cursor.advance() { return SkipResult::End; } - } else if target < self.block_decoder.output(pos) { - // We've overpassed the target after the first `advance` call - // or we're at the beginning of a block. - // Either way, we're on the first `DocId` greater than `target` - return SkipResult::OverStep; + self.cur = 0; } else { + if target < current_doc { + // We've overpassed the target after the first `advance` call + // or we're at the beginning of a block. 
+ // Either way, we're on the first `DocId` greater than `target` + return SkipResult::OverStep; + } break; } } + { + // we're in the right block now, start with an exponential search + let block_docs = self.block_cursor.docs(); + let block_len = block_docs.len(); - debug_assert!(target >= self.block_decoder.output(pos)); - debug_assert!(target <= self.block_decoder.output(self.block_len - 1)); + debug_assert!(target >= block_docs[self.cur]); + debug_assert!(target <= block_docs[block_len - 1]); - // we're in the right block now, start with an exponential search - let mut start = pos; - let mut end = self.block_len; - let mut count = 1; - loop { - let new = start + count; - if new < end && self.block_decoder.output(new) < target { - start = new; - count *= 2; - } else { - break; + let mut start = self.cur; + let mut end = block_len; + let mut count = 1; + loop { + let new = start + count; + if new < end && block_docs[new] < target { + start = new; + count *= 2; + } else { + break; + } + } + end = cmp::min(start + count, end); + + // now do a binary search + let mut count = end - start; + while count > 0 { + let step = count / 2; + let mid = start + step; + let doc = block_docs[mid]; + if doc < target { + start = mid + 1; + count -= step + 1; + } else { + count = step; + } + } + + // `doc` is now >= `target` + let doc = block_docs[start]; + self.cur = start; + + if !self.delete_bitset.is_deleted(doc) { + if doc == target { + return SkipResult::Reached; + } else { + return SkipResult::OverStep; + } } } - end = cmp::min(start + count, end); - - // now do a binary search - let mut count = end - start; - while count > 0 { - let step = count / 2; - let mid = start + step; - let doc = self.block_decoder.output(mid); - if doc < target { - start = mid + 1; - count -= step + 1; - } else { - count = step; - } - } - - // `doc` is now >= `target` - let doc = self.block_decoder.output(start); - self.set_within_block(start); - - if !self.delete_bitset.is_deleted(doc) { - if doc 
== target { - return SkipResult::Reached; - } else { - return SkipResult::OverStep; - } - } - if self.advance() { SkipResult::OverStep } else { @@ -200,22 +154,268 @@ impl<'a> DocSet for SegmentPostings<'a> { #[inline] fn doc(&self) -> DocId { - self.block_decoder.output(self.index_within_block()) + let docs = self.block_cursor.docs(); + assert!(self.cur < docs.len(), + "Have you forgotten to call `.advance()` at least once before calling .doc()."); + docs[self.cur] } } impl<'a> HasLen for SegmentPostings<'a> { fn len(&self) -> usize { - self.len + self.block_cursor.doc_freq() } } impl<'a> Postings for SegmentPostings<'a> { fn term_freq(&self) -> u32 { - self.freq_handler.freq(self.index_within_block()) + self.block_cursor.freq_handler().freq(self.cur) } fn positions(&self) -> &[u32] { - self.freq_handler.positions(self.index_within_block()) + self.block_cursor.freq_handler().positions(self.cur) + } +} + +/// `BlockSegmentPostings` is a cursor iterating over blocks +/// of documents. +/// +/// # Warning +/// +/// While it is useful for some very specific high-performance +/// use cases, you should prefer using `SegmentPostings` for most usage. 
+pub struct BlockSegmentPostings<'a> { + block_decoder: BlockDecoder, + doc_freq: usize, + doc_offset: DocId, + num_binpacked_blocks: usize, + num_vint_docs: usize, + remaining_data: &'a [u8], + freq_handler: FreqHandler, +} + +impl<'a> BlockSegmentPostings<'a> { + pub(crate) fn from_data(doc_freq: usize, + data: &'a [u8], + freq_handler: FreqHandler) + -> BlockSegmentPostings<'a> { + let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; + let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + BlockSegmentPostings { + num_binpacked_blocks: num_binpacked_blocks, + num_vint_docs: num_vint_docs, + block_decoder: BlockDecoder::new(), + freq_handler: freq_handler, + remaining_data: data, + doc_offset: 0, + doc_freq: doc_freq, + } + } + + // Resets the block segment postings on another position + // in the postings file. + // + // This is useful for enumerating through a list of terms, + // and consuming the associated posting lists while avoiding + // reallocating a `BlockSegmentPostings`. + // + // # Warning + // + // This does not reset the positions list. + pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) { + let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; + let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + self.num_binpacked_blocks = num_binpacked_blocks; + self.num_vint_docs = num_vint_docs; + self.remaining_data = postings_data; + self.doc_offset = 0; + self.doc_freq = doc_freq; + } + + /// Returns the document frequency associated to this block postings. + /// + /// This `doc_freq` is simply the sum of the lengths of all of the + /// blocks, and it does not take into account deleted documents. + pub fn doc_freq(&self) -> usize { + self.doc_freq + } + + /// Returns the array of docs in the current block. + /// + /// Before the first call to `.advance()`, the block + /// returned by `.docs()` is empty. 
+ #[inline] + pub fn docs(&self) -> &[DocId] { + self.block_decoder.output_array() + } + + /// Returns the length of the current block. + /// + /// All blocks have a length of `NUM_DOCS_PER_BLOCK`, + /// except the last block that may have a length + /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` + #[inline] + fn block_len(&self) -> usize { + self.block_decoder.output_len + } + + + /// Returns a reference to the frequency handler. + pub fn freq_handler(&self) -> &FreqHandler { + &self.freq_handler + } + + /// Advance to the next block. + /// + /// Returns false iff there were no remaining blocks. + pub fn advance(&mut self) -> bool { + if self.num_binpacked_blocks > 0 { + self.remaining_data = + self.block_decoder + .uncompress_block_sorted(self.remaining_data, self.doc_offset); + self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); + self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); + self.num_binpacked_blocks -= 1; + true + } else if self.num_vint_docs > 0 { + self.remaining_data = + self.block_decoder + .uncompress_vint_sorted(self.remaining_data, + self.doc_offset, + self.num_vint_docs); + self.freq_handler + .read_freq_vint(self.remaining_data, self.num_vint_docs); + self.num_vint_docs = 0; + true + } else { + false + } + } + + /// Returns an empty block segment postings object + pub fn empty() -> BlockSegmentPostings<'static> { + BlockSegmentPostings { + num_binpacked_blocks: 0, + num_vint_docs: 0, + block_decoder: BlockDecoder::new(), + freq_handler: FreqHandler::new_without_freq(), + remaining_data: &EMPTY_DATA, + doc_offset: 0, + doc_freq: 0, + } + } +} + +impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> { + type Item = &'b [DocId]; + + fn next(&'b mut self) -> Option<&'b [DocId]> { + if self.advance() { + Some(self.docs()) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + + use DocSet; + use super::SegmentPostings; + use schema::{Document, SchemaBuilder}; + use core::Index; + use 
schema::INT_INDEXED; + use schema::Term; + use fst::Streamer; + use postings::SegmentPostingsOption; + use common::HasLen; + use super::BlockSegmentPostings; + use schema::FieldValue; + + #[test] + fn test_empty_segment_postings() { + let mut postings = SegmentPostings::empty(); + assert!(!postings.advance()); + assert!(!postings.advance()); + assert_eq!(postings.len(), 0); + } + + #[test] + fn test_empty_block_segment_postings() { + let mut postings = BlockSegmentPostings::empty(); + assert!(!postings.advance()); + assert_eq!(postings.doc_freq(), 0); + } + + #[test] + fn test_block_segment_postings() { + let mut schema_builder = SchemaBuilder::default(); + let int_field = schema_builder.add_u64_field("id", INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + for _ in 0..100_000 { + let doc = doc!(int_field=>0u64); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let segment_reader = searcher.segment_reader(0); + let term = Term::from_field_u64(int_field, 0u64); + let term_info = segment_reader.get_term_info(&term).unwrap(); + let mut block_segments = + segment_reader + .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + let mut offset: u32 = 0u32; + // checking that the block before calling advance is empty + assert!(block_segments.docs().is_empty()); + // checking that the `doc_freq` is correct + assert_eq!(block_segments.doc_freq(), 100_000); + while let Some(block) = block_segments.next() { + for (i, doc) in block.iter().cloned().enumerate() { + assert_eq!(offset + (i as u32), doc); + } + offset += block.len() as u32; + } + } + + + #[test] + fn test_reset_block_segment_postings() { + let mut schema_builder = SchemaBuilder::default(); + let int_field = schema_builder.add_u64_field("id", INT_INDEXED); + let schema = 
schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + // create two postings lists, one containing even numbers, + // the other containing odd numbers. + for i in 0..6 { + let doc = doc!(int_field=> (i % 2) as u64); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let segment_reader = searcher.segment_reader(0); + + let mut block_segments; + { + let term = Term::from_field_u64(int_field, 0u64); + let term_info = segment_reader.get_term_info(&term).unwrap(); + block_segments = + segment_reader + .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + } + assert!(block_segments.advance()); + assert!(block_segments.docs() == &[0, 2, 4]); + { + let term = Term::from_field_u64(int_field, 1u64); + let term_info = segment_reader.get_term_info(&term).unwrap(); + segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments); + } + assert!(block_segments.advance()); + assert!(block_segments.docs() == &[1, 3, 5]); } }