From 3f49d65a87f5d03a55951650f67b099399bd85c3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 20 May 2017 00:46:23 +0900 Subject: [PATCH 1/7] issue/162 Create block postings --- src/core/segment_reader.rs | 37 ++---- src/lib.rs | 40 +++--- src/postings/mod.rs | 12 +- src/postings/segment_postings.rs | 201 +++++++++++++++++++++++++++++-- 4 files changed, 232 insertions(+), 58 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index ba05fb632..62a9347d8 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use std::fmt; use schema::Field; use postings::SegmentPostingsOption; -use postings::SegmentPostings; +use postings::{SegmentPostings, BlockSegmentPostings}; use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; use schema::Schema; use schema::FieldType; @@ -219,6 +219,12 @@ impl SegmentReader { term_info: &TermInfo, option: SegmentPostingsOption) -> SegmentPostings { + let block_postings = self.read_block_postings_from_terminfo(term_info, option); + let delete_bitset = self.delete_bitset.clone(); + SegmentPostings::from_block_postings(block_postings, delete_bitset) + } + + pub fn read_block_postings_from_terminfo(&self, term_info: &TermInfo, option: SegmentPostingsOption) -> BlockSegmentPostings { let offset = term_info.postings_offset as usize; let postings_data = &self.postings_data[offset..]; let freq_handler = match option { @@ -230,34 +236,7 @@ impl SegmentReader { FreqHandler::new_with_freq_and_position(offseted_position_data) } }; - SegmentPostings::from_data(term_info.doc_freq, - postings_data, - &self.delete_bitset, - freq_handler) - } - - - /// Returns the posting list associated with a term. - /// - /// If the term is not found, return None. - /// Even when non-null, because of deletes, the posting object - /// returned by this method may contain no documents. - pub fn read_postings_all_info(&self, term: &Term) -> Option { - let field_entry = self.schema.get_field_entry(term.field()); - let segment_posting_option = match *field_entry.field_type() { - FieldType::Str(ref text_options) => { - match text_options.get_indexing_options() { - TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq, - TextIndexingOptions::TokenizedWithFreqAndPosition => { - SegmentPostingsOption::FreqAndPositions - } - _ => SegmentPostingsOption::NoFreq, - } - } - FieldType::U64(_) | - FieldType::I64(_) => SegmentPostingsOption::NoFreq, - }; - self.read_postings(term, segment_posting_option) + BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler) } /// Returns the term info associated with the term. diff --git a/src/lib.rs b/src/lib.rs index bfd098a96..ea45de7c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -207,6 +207,7 @@ mod tests { use schema::*; use DocSet; use IndexWriter; + use postings::SegmentPostingsOption::FreqAndPositions; use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader}; use Postings; use rand::{XorShiftRng, Rng, SeedableRng}; @@ -338,6 +339,10 @@ mod tests { fn test_delete_postings1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); + let term_abcd = Term::from_field_text(text_field, "abcd"); + let term_a = Term::from_field_text(text_field, "a"); + let term_b = Term::from_field_text(text_field, "b"); + let term_c = Term::from_field_text(text_field, "c"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { @@ -386,11 +391,11 @@ mod tests { let searcher = index.searcher(); let reader = searcher.segment_reader(0); assert!(reader - .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); + .read_postings(&term_abcd, FreqAndPositions) + .is_none()); { let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "a")) + .read_postings(&term_a, FreqAndPositions) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); @@ -398,7 +403,7 @@ mod tests { } { let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "b")) + .read_postings(&term_b, FreqAndPositions) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); @@ -425,12 +430,13 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); + assert!(reader - .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); + .read_postings(&term_abcd, FreqAndPositions) + .is_none()); { let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "a")) + .read_postings(&term_a, FreqAndPositions) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); @@ -438,7 +444,7 @@ mod tests { } { let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "b")) + .read_postings(&term_b, FreqAndPositions) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); @@ -466,17 +472,17 @@ mod tests { let searcher = index.searcher(); let reader = searcher.segment_reader(0); assert!(reader - .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); + .read_postings(&term_abcd, FreqAndPositions) + .is_none()); { let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "a")) + .read_postings(&term_a, FreqAndPositions) .unwrap(); assert!(!postings.advance()); } { let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "b")) + .read_postings(&term_b, FreqAndPositions) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); @@ -486,7 +492,7 @@ mod tests { } { let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "c")) + .read_postings(&term_c, FreqAndPositions) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); @@ -596,11 +602,13 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); + let term_abcd = Term::from_field_text(text_field, "abcd"); assert!(reader - .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) - .is_none()); + .read_postings(&term_abcd, FreqAndPositions) + .is_none()); + let term_af = Term::from_field_text(text_field, "af"); let mut postings = reader - .read_postings_all_info(&Term::from_field_text(text_field, "af")) + .read_postings(&term_af, FreqAndPositions) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 6dbb55ab0..483b7ed46 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -26,7 +26,7 @@ pub use self::postings::Postings; #[cfg(test)] pub use self::vec_postings::VecPostings; -pub use self::segment_postings::SegmentPostings; +pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings}; pub use self::intersection::IntersectionDocSet; pub use self::freq_handler::FreqHandler; pub use self::segment_postings_option::SegmentPostingsOption; @@ -42,6 +42,7 @@ mod tests { use indexer::SegmentWriter; use core::SegmentReader; use core::Index; + use postings::SegmentPostingsOption::FreqAndPositions; use std::iter; use datastruct::stacker::Heap; use fastfield::FastFieldReader; @@ -129,11 +130,11 @@ mod tests { } { let term_a = Term::from_field_text(text_field, "abcdef"); - assert!(segment_reader.read_postings_all_info(&term_a).is_none()); + assert!(segment_reader.read_postings(&term_a, FreqAndPositions).is_none()); } { let term_a = Term::from_field_text(text_field, "a"); - let mut postings_a = segment_reader.read_postings_all_info(&term_a).unwrap(); + let mut postings_a = segment_reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert_eq!(postings_a.len(), 1000); assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 0); @@ -152,7 +153,7 @@ mod tests { } { let term_e = Term::from_field_text(text_field, "e"); - let mut postings_e = segment_reader.read_postings_all_info(&term_e).unwrap(); + let mut postings_e = segment_reader.read_postings(&term_e, FreqAndPositions).unwrap(); assert_eq!(postings_e.len(), 1000 - 2); for i in 2u32..1000u32 { assert!(postings_e.advance()); @@ -467,6 +468,7 @@ mod tests { }); } + fn bench_skip_next(p: f32, b: &mut Bencher) { let searcher = INDEX.searcher(); let segment_reader = searcher.segment_reader(0); @@ -475,6 +477,7 @@ mod tests { let mut segment_postings = segment_reader .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); + let mut existing_docs = Vec::new(); for doc in &docs { if *doc >= segment_postings.doc() { @@ -490,6 +493,7 @@ mod tests { .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); for doc in &existing_docs { + println!("doc {}", doc); if segment_postings.skip_next(*doc) == SkipResult::End { break; } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index debc423c1..f917449b2 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -8,6 +8,8 @@ use fastfield::DeleteBitSet; const EMPTY_DATA: [u8; 0] = [0u8; 0]; + +/* /// `SegmentPostings` represents the inverted list or postings associated to /// a term in a `Segment`. /// @@ -85,11 +87,7 @@ impl<'a> SegmentPostings<'a> { } } - /// Index within a block is used as an address when - /// interacting with the `FreqHandler` - fn index_within_block(&self) -> usize { - self.cur.0 % NUM_DOCS_PER_BLOCK - } + /// Sets the current position to a location relative /// to the current block @@ -119,6 +117,107 @@ impl<'a> DocSet for SegmentPostings<'a> { } } + + + #[inline] + fn doc(&self) -> DocId { + self.block_decoder.output(self.index_within_block()) + } +} + +impl<'a> HasLen for SegmentPostings<'a> { + fn len(&self) -> usize { + self.len + } +} + +impl<'a> Postings for SegmentPostings<'a> { + fn term_freq(&self) -> u32 { + self.freq_handler.freq(self.index_within_block()) + } + + fn positions(&self) -> &[u32] { + self.freq_handler.positions(self.index_within_block()) + } +} + +*/ + + + +/// `SegmentPostings` represents the inverted list or postings associated to +/// a term in a `Segment`. +/// +/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. +/// Positions on the other hand, are optionally entirely decoded upfront. +pub struct SegmentPostings<'a> { + len: usize, + cur: Wrapping, + block_cursor: BlockSegmentPostings<'a>, + cur_block_len: usize, + delete_bitset: DeleteBitSet, +} + +impl<'a> SegmentPostings<'a> { + + + /// Reads a Segment postings from an &[u8] + /// + /// * `len` - number of document in the posting lists. + /// * `data` - data array. The complete data is not necessarily used. + /// * `freq_handler` - the freq handler is in charge of decoding + /// frequencies and/or positions + pub fn from_block_postings( + segment_block_postings: BlockSegmentPostings<'a>, + delete_bitset: DeleteBitSet) -> SegmentPostings<'a> { + SegmentPostings { + len: segment_block_postings.len, + block_cursor: segment_block_postings, + cur: Wrapping(usize::max_value()), + cur_block_len: 0, + delete_bitset: delete_bitset, + } + } + + /// Returns an empty segment postings object + pub fn empty() -> SegmentPostings<'static> { + let empty_block_cursor = BlockSegmentPostings::empty(); + SegmentPostings { + len: 0, + block_cursor: empty_block_cursor, + delete_bitset: DeleteBitSet::empty(), + cur: Wrapping(usize::max_value()), + cur_block_len: 0, + } + } +} + + +impl<'a> DocSet for SegmentPostings<'a> { + // goes to the next element. + // next needs to be called a first time to point to the correct element. + #[inline] + fn advance(&mut self) -> bool { + loop { + self.cur += Wrapping(1); + assert!(self.cur.0 >= 0); + assert!(self.cur.0 <= self.cur_block_len); + if self.cur.0 == self.cur_block_len { + self.cur = Wrapping(0); + if !self.block_cursor.advance() { + self.cur_block_len = 0; + self.cur = Wrapping(usize::max_value()); + return false; + } + self.cur_block_len = self.block_cursor.docs().len(); + } + if !self.delete_bitset.is_deleted(self.doc()) { + return true; + } + } + } + + /* fn skip_next(&mut self, target: DocId) -> SkipResult { if !self.advance() { return SkipResult::End; @@ -197,10 +296,11 @@ impl<'a> DocSet for SegmentPostings<'a> { SkipResult::End } } - + */ + #[inline] fn doc(&self) -> DocId { - self.block_decoder.output(self.index_within_block()) + self.block_cursor.docs()[self.cur.0] } } @@ -212,10 +312,93 @@ impl<'a> HasLen for SegmentPostings<'a> { impl<'a> Postings for SegmentPostings<'a> { fn term_freq(&self) -> u32 { - self.freq_handler.freq(self.index_within_block()) + self.block_cursor.freq_handler().freq(self.cur.0) } fn positions(&self) -> &[u32] { - self.freq_handler.positions(self.index_within_block()) + self.block_cursor.freq_handler().positions(self.cur.0) } } + + + + +pub struct BlockSegmentPostings<'a> { + num_binpacked_blocks: usize, + num_vint_docs: usize, + block_decoder: BlockDecoder, + freq_handler: FreqHandler, + remaining_data: &'a [u8], + doc_offset: DocId, + len: usize, +} + +impl<'a> BlockSegmentPostings<'a> { + + pub fn from_data(len: usize, data: &'a [u8], freq_handler: FreqHandler) -> BlockSegmentPostings<'a> { + let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK; + let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + BlockSegmentPostings { + num_binpacked_blocks: num_binpacked_blocks, + num_vint_docs: num_vint_docs, + block_decoder: BlockDecoder::new(), + freq_handler: freq_handler, + remaining_data: data, + doc_offset: 0, + len: len, + } + } + + pub fn reset(&mut self, len: usize, data: &'a [u8]) { + let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK; + let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + self.num_binpacked_blocks = num_binpacked_blocks; + self.num_vint_docs = num_vint_docs; + self.remaining_data = data; + self.doc_offset = 0; + self.len = len; + } + + pub fn docs(&self) -> &[DocId] { + self.block_decoder.output_array() + } + + pub fn freq_handler(&self) -> &FreqHandler { + &self.freq_handler + } + + pub fn advance(&mut self) -> bool { + if self.num_binpacked_blocks > 0 { + self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset); + self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); + self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); + self.num_binpacked_blocks -= 1; + true + } + else { + if self.num_vint_docs > 0 { + self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, self.num_vint_docs); + self.freq_handler.read_freq_vint(self.remaining_data, self.num_vint_docs); + self.num_vint_docs = 0; + true + } + else { + false + } + } + } + + /// Returns an empty segment postings object + pub fn empty() -> BlockSegmentPostings<'static> { + BlockSegmentPostings { + num_binpacked_blocks: 0, + num_vint_docs: 0, + block_decoder: BlockDecoder::new(), + freq_handler: FreqHandler::new_without_freq(), + remaining_data: &EMPTY_DATA, + doc_offset: 0, + len: 0, + } + } + +} From 2be5f08cd6b567806531e8a3f6ce840e48adadce Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 20 May 2017 11:46:40 +0900 Subject: [PATCH 2/7] issue/162 Added block iteration API --- src/core/segment_reader.rs | 2 - src/postings/mod.rs | 3 +- src/postings/segment_postings.rs | 295 ++++++++++--------------------- 3 files changed, 90 insertions(+), 210 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 62a9347d8..03ad7d248 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -22,9 +22,7 @@ use postings::SegmentPostingsOption; use postings::{SegmentPostings, BlockSegmentPostings}; use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; use schema::Schema; -use schema::FieldType; use postings::FreqHandler; -use schema::TextIndexingOptions; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 483b7ed46..a338ae8db 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -468,7 +468,6 @@ mod tests { }); } - fn bench_skip_next(p: f32, b: &mut Bencher) { let searcher = INDEX.searcher(); let segment_reader = searcher.segment_reader(0); @@ -479,6 +478,7 @@ mod tests { .unwrap(); let mut existing_docs = Vec::new(); + segment_postings.advance(); for doc in &docs { if *doc >= segment_postings.doc() { existing_docs.push(*doc); @@ -493,7 +493,6 @@ mod tests { .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); for doc in &existing_docs { - println!("doc {}", doc); if segment_postings.skip_next(*doc) == SkipResult::End { break; } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index f917449b2..b5a191d3c 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -2,14 +2,12 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; use DocId; use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; use std::cmp; -use std::num::Wrapping; use fastfield::DeleteBitSet; const EMPTY_DATA: [u8; 0] = [0u8; 0]; -/* /// `SegmentPostings` represents the inverted list or postings associated to /// a term in a `Segment`. /// @@ -17,142 +15,7 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0]; /// Positions on the other hand, are optionally entirely decoded upfront. pub struct SegmentPostings<'a> { len: usize, - // Removing this makes the code slower - // See https://github.com/tantivy-search/tantivy/issues/89 - block_len: usize, - doc_offset: u32, - block_decoder: BlockDecoder, - freq_handler: FreqHandler, - remaining_data: &'a [u8], - cur: Wrapping, - delete_bitset: DeleteBitSet, -} - -impl<'a> SegmentPostings<'a> { - fn load_next_block(&mut self) { - let num_remaining_docs = self.len - self.cur.0; - if num_remaining_docs >= NUM_DOCS_PER_BLOCK { - self.remaining_data = - self.block_decoder - .uncompress_block_sorted(self.remaining_data, self.doc_offset); - self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); - self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); - self.block_len = NUM_DOCS_PER_BLOCK; - } else { - self.remaining_data = - self.block_decoder - .uncompress_vint_sorted(self.remaining_data, - self.doc_offset, - num_remaining_docs); - self.freq_handler - .read_freq_vint(self.remaining_data, num_remaining_docs); - self.block_len = num_remaining_docs; - } - } - - /// Reads a Segment postings from an &[u8] - /// - /// * `len` - number of document in the posting lists. - /// * `data` - data array. The complete data is not necessarily used. - /// * `freq_handler` - the freq handler is in charge of decoding - /// frequencies and/or positions - pub fn from_data(len: u32, - data: &'a [u8], - delete_bitset: &'a DeleteBitSet, - freq_handler: FreqHandler) - -> SegmentPostings<'a> { - SegmentPostings { - len: len as usize, - block_len: len as usize, - doc_offset: 0, - block_decoder: BlockDecoder::new(), - freq_handler: freq_handler, - remaining_data: data, - cur: Wrapping(usize::max_value()), - delete_bitset: delete_bitset.clone(), - } - } - - /// Returns an empty segment postings object - pub fn empty() -> SegmentPostings<'static> { - SegmentPostings { - len: 0, - block_len: 0, - doc_offset: 0, - block_decoder: BlockDecoder::new(), - freq_handler: FreqHandler::new_without_freq(), - remaining_data: &EMPTY_DATA, - delete_bitset: DeleteBitSet::empty(), - cur: Wrapping(usize::max_value()), - } - } - - - - /// Sets the current position to a location relative - /// to the current block - #[inline] - fn set_within_block(&mut self, inner_pos: usize) { - self.cur = Wrapping(self.cur.0 & !(NUM_DOCS_PER_BLOCK - 1)) + Wrapping(inner_pos) - } -} - - -impl<'a> DocSet for SegmentPostings<'a> { - // goes to the next element. - // next needs to be called a first time to point to the correct element. - #[inline] - fn advance(&mut self) -> bool { - loop { - self.cur += Wrapping(1); - if self.cur.0 >= self.len { - return false; - } - if self.index_within_block() == 0 { - self.load_next_block(); - } - if !self.delete_bitset.is_deleted(self.doc()) { - return true; - } - } - } - - - - #[inline] - fn doc(&self) -> DocId { - self.block_decoder.output(self.index_within_block()) - } -} - -impl<'a> HasLen for SegmentPostings<'a> { - fn len(&self) -> usize { - self.len - } -} - -impl<'a> Postings for SegmentPostings<'a> { - fn term_freq(&self) -> u32 { - self.freq_handler.freq(self.index_within_block()) - } - - fn positions(&self) -> &[u32] { - self.freq_handler.positions(self.index_within_block()) - } -} - -*/ - - - -/// `SegmentPostings` represents the inverted list or postings associated to -/// a term in a `Segment`. -/// -/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. -/// Positions on the other hand, are optionally entirely decoded upfront. -pub struct SegmentPostings<'a> { - len: usize, - cur: Wrapping, + cur: usize, block_cursor: BlockSegmentPostings<'a>, cur_block_len: usize, delete_bitset: DeleteBitSet, @@ -173,7 +36,7 @@ impl<'a> SegmentPostings<'a> { SegmentPostings { len: segment_block_postings.len, block_cursor: segment_block_postings, - cur: Wrapping(usize::max_value()), + cur: NUM_DOCS_PER_BLOCK, // cursor within the block cur_block_len: 0, delete_bitset: delete_bitset, } @@ -186,7 +49,7 @@ impl<'a> SegmentPostings<'a> { len: 0, block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), - cur: Wrapping(usize::max_value()), + cur: NUM_DOCS_PER_BLOCK, cur_block_len: 0, } } @@ -198,15 +61,13 @@ impl<'a> DocSet for SegmentPostings<'a> { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> bool { - loop { - self.cur += Wrapping(1); - assert!(self.cur.0 >= 0); - assert!(self.cur.0 <= self.cur_block_len); - if self.cur.0 == self.cur_block_len { - self.cur = Wrapping(0); + loop { + self.cur += 1; + if self.cur >= self.cur_block_len { + self.cur = 0; if !self.block_cursor.advance() { self.cur_block_len = 0; - self.cur = Wrapping(usize::max_value()); + self.cur = NUM_DOCS_PER_BLOCK; return false; } self.cur_block_len = self.block_cursor.docs().len(); @@ -217,90 +78,96 @@ impl<'a> DocSet for SegmentPostings<'a> { } } - /* + fn skip_next(&mut self, target: DocId) -> SkipResult { if !self.advance() { return SkipResult::End; } - let mut pos = self.index_within_block(); // skip blocks until one that might contain the target loop { // check if we need to go to the next block - if target > self.block_decoder.output(self.block_len - 1) { - self.cur += Wrapping(self.block_len - pos); - self.load_next_block(); - pos = 0; - - // there was no more data - if self.cur.0 == self.len { + let last_doc_in_block = { + let block_docs = self.block_cursor.docs(); + block_docs[block_docs.len() - 1] + }; + if target > last_doc_in_block { + if !self.block_cursor.advance() { return SkipResult::End; } - } else if target < self.block_decoder.output(pos) { - // We've overpassed the target after the first `advance` call - // or we're at the beginning of a block. - // Either way, we're on the first `DocId` greater than `target` - return SkipResult::OverStep; + self.cur = 0; } else { + let block_docs = self.block_cursor.docs(); + if target < block_docs[self.cur] { + // We've overpassed the target after the first `advance` call + // or we're at the beginning of a block. + // Either way, we're on the first `DocId` greater than `target` + return SkipResult::OverStep; + } break; } } + { + // we're in the right block now, start with an exponential search + let block_docs = self.block_cursor.docs(); + let block_len = block_docs.len(); - debug_assert!(target >= self.block_decoder.output(pos)); - debug_assert!(target <= self.block_decoder.output(self.block_len - 1)); + debug_assert!(target >= block_docs[self.cur]); + debug_assert!(target <= block_docs[block_len - 1]); - // we're in the right block now, start with an exponential search - let mut start = pos; - let mut end = self.block_len; - let mut count = 1; - loop { - let new = start + count; - if new < end && self.block_decoder.output(new) < target { - start = new; - count *= 2; - } else { - break; + let mut start = 0; + let mut end = block_len; + let mut count = 1; + loop { + let new = start + count; + if new < end && block_docs[new] < target { + start = new; + count *= 2; + } else { + break; + } + } + end = cmp::min(start + count, end); + + // now do a binary search + let mut count = end - start; + while count > 0 { + let step = count / 2; + let mid = start + step; + let doc = block_docs[mid]; + if doc < target { + start = mid + 1; + count -= step + 1; + } else { + count = step; + } + } + + // `doc` is now >= `target` + let doc = block_docs[start]; + self.cur = start; + + if !self.delete_bitset.is_deleted(doc) { + if doc == target { + return SkipResult::Reached; + } else { + return SkipResult::OverStep; + } } } - end = cmp::min(start + count, end); - - // now do a binary search - let mut count = end - start; - while count > 0 { - let step = count / 2; - let mid = start + step; - let doc = self.block_decoder.output(mid); - if doc < target { - start = mid + 1; - count -= step + 1; - } else { - count = step; - } - } - - // `doc` is now >= `target` - let doc = self.block_decoder.output(start); - self.set_within_block(start); - - if !self.delete_bitset.is_deleted(doc) { - if doc == target { - return SkipResult::Reached; - } else { - return SkipResult::OverStep; - } - } - if self.advance() { SkipResult::OverStep } else { SkipResult::End } } - */ + #[inline] fn doc(&self) -> DocId { - self.block_cursor.docs()[self.cur.0] + let docs = self.block_cursor.docs(); + assert!(self.cur < docs.len(), "Have you forgotten to call `.advance()` at least once before calling .doc()."); + docs[self.cur] } } @@ -312,11 +179,11 @@ impl<'a> HasLen for SegmentPostings<'a> { impl<'a> Postings for SegmentPostings<'a> { fn term_freq(&self) -> u32 { - self.block_cursor.freq_handler().freq(self.cur.0) + self.block_cursor.freq_handler().freq(self.cur) } fn positions(&self) -> &[u32] { - self.block_cursor.freq_handler().positions(self.cur.0) + self.block_cursor.freq_handler().positions(self.cur) } } @@ -359,6 +226,8 @@ impl<'a> BlockSegmentPostings<'a> { self.len = len; } + + /// Returns the array of docs in the current block. pub fn docs(&self) -> &[DocId] { self.block_decoder.output_array() } @@ -402,3 +271,17 @@ impl<'a> BlockSegmentPostings<'a> { } } + +#[cfg(test)] +mod tests { + + use DocSet; + use super::SegmentPostings; + + #[test] + fn test_empty_segment_postings() { + let mut postings = SegmentPostings::empty(); + assert!(!postings.advance()); + assert!(!postings.advance()); + } +} \ No newline at end of file From 835936585f89a60f7d8c5ffe6913be56c919b49b Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sat, 20 May 2017 18:45:41 +0300 Subject: [PATCH 3/7] Don't search whole blocks, but only the remaining part --- src/postings/segment_postings.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index b5a191d3c..3f35a439e 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -41,7 +41,7 @@ impl<'a> SegmentPostings<'a> { delete_bitset: delete_bitset, } } - + /// Returns an empty segment postings object pub fn empty() -> SegmentPostings<'static> { let empty_block_cursor = BlockSegmentPostings::empty(); @@ -78,7 +78,7 @@ impl<'a> DocSet for SegmentPostings<'a> { } } - + fn skip_next(&mut self, target: DocId) -> SkipResult { if !self.advance() { return SkipResult::End; @@ -115,7 +115,7 @@ impl<'a> DocSet for SegmentPostings<'a> { debug_assert!(target >= block_docs[self.cur]); debug_assert!(target <= block_docs[block_len - 1]); - let mut start = 0; + let mut start = self.cur; let mut end = block_len; let mut count = 1; loop { @@ -161,7 +161,7 @@ impl<'a> DocSet for SegmentPostings<'a> { SkipResult::End } } - + #[inline] fn doc(&self) -> DocId { @@ -201,7 +201,7 @@ pub struct BlockSegmentPostings<'a> { } impl<'a> BlockSegmentPostings<'a> { - + pub fn from_data(len: usize, data: &'a [u8], freq_handler: FreqHandler) -> BlockSegmentPostings<'a> { let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK; let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; @@ -231,11 +231,11 @@ impl<'a> BlockSegmentPostings<'a> { pub fn docs(&self) -> &[DocId] { self.block_decoder.output_array() } - + pub fn freq_handler(&self) -> &FreqHandler { &self.freq_handler } - + pub fn advance(&mut self) -> bool { if self.num_binpacked_blocks > 0 { self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset); @@ -256,7 +256,7 @@ impl<'a> BlockSegmentPostings<'a> { } } } - + /// Returns an empty segment postings object pub fn empty() -> BlockSegmentPostings<'static> { BlockSegmentPostings { @@ -269,7 +269,7 @@ impl<'a> BlockSegmentPostings<'a> { len: 0, } } - + } #[cfg(test)] @@ -284,4 +284,4 @@ mod tests { assert!(!postings.advance()); assert!(!postings.advance()); } -} \ No newline at end of file +} From 020779f61b3dfb982c82aec3324948151c2a58eb Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sat, 20 May 2017 20:56:37 +0300 Subject: [PATCH 4/7] Make things faster --- src/fastfield/delete.rs | 1 + src/postings/segment_postings.rs | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 8923437c8..d5ab7cbc9 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -66,6 +66,7 @@ impl DeleteBitSet { } /// Returns true iff the document is deleted. + #[inline] pub fn is_deleted(&self, doc: DocId) -> bool { if self.len == 0 { false diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 3f35a439e..86eaa7a14 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -87,9 +87,9 @@ impl<'a> DocSet for SegmentPostings<'a> { // skip blocks until one that might contain the target loop { // check if we need to go to the next block - let last_doc_in_block = { + let (current_doc, last_doc_in_block) = { let block_docs = self.block_cursor.docs(); - block_docs[block_docs.len() - 1] + (block_docs[self.cur], block_docs[block_docs.len() - 1]) }; if target > last_doc_in_block { if !self.block_cursor.advance() { @@ -97,8 +97,7 @@ impl<'a> DocSet for SegmentPostings<'a> { } self.cur = 0; } else { - let block_docs = self.block_cursor.docs(); - if target < block_docs[self.cur] { + if target < current_doc { // We've overpassed the target after the first `advance` call // or we're at the beginning of a block. // Either way, we're on the first `DocId` greater than `target` @@ -226,12 +225,13 @@ impl<'a> BlockSegmentPostings<'a> { self.len = len; } - /// Returns the array of docs in the current block. + #[inline] pub fn docs(&self) -> &[DocId] { self.block_decoder.output_array() } + #[inline] pub fn freq_handler(&self) -> &FreqHandler { &self.freq_handler } From 3e4606de5debe47535ca1514496a469e8f7e4337 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 21 May 2017 16:31:52 +0900 Subject: [PATCH 5/7] Simplifying, and reordering the members --- src/postings/segment_postings.rs | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 86eaa7a14..f09f18cec 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -14,10 +14,8 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0]; /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// Positions on the other hand, are optionally entirely decoded upfront. pub struct SegmentPostings<'a> { - len: usize, - cur: usize, block_cursor: BlockSegmentPostings<'a>, - cur_block_len: usize, + cur: usize, delete_bitset: DeleteBitSet, } @@ -34,10 +32,8 @@ impl<'a> SegmentPostings<'a> { segment_block_postings: BlockSegmentPostings<'a>, delete_bitset: DeleteBitSet) -> SegmentPostings<'a> { SegmentPostings { - len: segment_block_postings.len, block_cursor: segment_block_postings, cur: NUM_DOCS_PER_BLOCK, // cursor within the block - cur_block_len: 0, delete_bitset: delete_bitset, } } @@ -46,11 +42,9 @@ impl<'a> SegmentPostings<'a> { pub fn empty() -> SegmentPostings<'static> { let empty_block_cursor = BlockSegmentPostings::empty(); SegmentPostings { - len: 0, block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), cur: NUM_DOCS_PER_BLOCK, - cur_block_len: 0, } } } @@ -63,14 +57,12 @@ impl<'a> DocSet for SegmentPostings<'a> { fn advance(&mut self) -> bool { loop { self.cur += 1; - if self.cur >= self.cur_block_len { + if self.cur >= self.block_cursor.block_len() { self.cur = 0; if !self.block_cursor.advance() { - self.cur_block_len = 0; self.cur = NUM_DOCS_PER_BLOCK; return false; } - self.cur_block_len = self.block_cursor.docs().len(); } if !self.delete_bitset.is_deleted(self.doc()) { return true; @@ -110,7 +102,7 @@ impl<'a> DocSet for SegmentPostings<'a> { // we're in the right block now, start with an exponential search let block_docs = self.block_cursor.docs(); let block_len = block_docs.len(); - + debug_assert!(target >= block_docs[self.cur]); debug_assert!(target <= block_docs[block_len - 1]); @@ -172,7 +164,7 @@ impl<'a> DocSet for SegmentPostings<'a> { impl<'a> HasLen for SegmentPostings<'a> { fn len(&self) -> usize { - self.len + self.block_cursor.len } } @@ -190,13 +182,13 @@ impl<'a> Postings for SegmentPostings<'a> { pub struct BlockSegmentPostings<'a> { + block_decoder: BlockDecoder, + len: usize, + doc_offset: DocId, num_binpacked_blocks: usize, num_vint_docs: usize, - block_decoder: BlockDecoder, - freq_handler: FreqHandler, remaining_data: &'a [u8], - doc_offset: DocId, - len: usize, + freq_handler: FreqHandler, } impl<'a> BlockSegmentPostings<'a> { @@ -231,6 +223,10 @@ impl<'a> BlockSegmentPostings<'a> { self.block_decoder.output_array() } + pub fn block_len(&self) -> usize { + self.block_decoder.output_len + } + #[inline] pub fn freq_handler(&self) -> &FreqHandler { &self.freq_handler From 73d54c63793f81d51f78b1f62abf75e4a2625a73 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sun, 21 May 2017 10:44:49 +0300 Subject: [PATCH 6/7] Inline block_len --- src/postings/segment_postings.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index f09f18cec..093a60827 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -223,6 +223,7 @@ impl<'a> BlockSegmentPostings<'a> { self.block_decoder.output_array() } + #[inline] pub fn block_len(&self) -> usize { self.block_decoder.output_len } From 581449a824ada534afdefc44cde1aea124699156 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 21 May 2017 18:54:41 +0900 Subject: [PATCH 7/7] issue/162 Docs and unit tests --- src/core/segment_reader.rs | 29 ++++- src/lib.rs | 50 ++----- src/postings/freq_handler.rs | 1 - src/postings/mod.rs | 14 +- src/postings/segment_postings.rs | 217 +++++++++++++++++++++++++------ 5 files changed, 228 insertions(+), 83 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 03ad7d248..77ee2148d 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -222,7 +222,15 @@ impl SegmentReader { SegmentPostings::from_block_postings(block_postings, delete_bitset) } - pub fn read_block_postings_from_terminfo(&self, term_info: &TermInfo, option: SegmentPostingsOption) -> BlockSegmentPostings { + + /// Returns a block postings given a `term_info`. + /// This method is for an advanced usage only. + /// + /// Most user should prefer using `read_postings` instead. + pub fn read_block_postings_from_terminfo(&self, + term_info: &TermInfo, + option: SegmentPostingsOption) + -> BlockSegmentPostings { let offset = term_info.postings_offset as usize; let postings_data = &self.postings_data[offset..]; let freq_handler = match option { @@ -237,6 +245,25 @@ impl SegmentReader { BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler) } + + /// Resets the block segment to another position of the postings + /// file. + /// + /// This is useful for enumerating through a list of terms, + /// and consuming the associated posting lists while avoiding + /// reallocating a `BlockSegmentPostings`. + /// + /// # Warning + /// + /// This does not reset the positions list. + pub fn reset_block_postings_from_terminfo<'a>(&'a self, + term_info: &TermInfo, + block_postings: &mut BlockSegmentPostings<'a>) { + let offset = term_info.postings_offset as usize; + let postings_data: &'a [u8] = &self.postings_data[offset..]; + block_postings.reset(term_info.doc_freq as usize, postings_data); + } + /// Returns the term info associated with the term. pub fn get_term_info(&self, term: &Term) -> Option { self.terms.get(term.as_slice()) diff --git a/src/lib.rs b/src/lib.rs index ea45de7c9..61ecd4bbc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -390,21 +390,15 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader - .read_postings(&term_abcd, FreqAndPositions) - .is_none()); + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader - .read_postings(&term_a, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader - .read_postings(&term_b, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -430,22 +424,16 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - - assert!(reader - .read_postings(&term_abcd, FreqAndPositions) - .is_none()); + + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader - .read_postings(&term_a, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader - .read_postings(&term_b, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -471,19 +459,13 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader - .read_postings(&term_abcd, FreqAndPositions) - .is_none()); + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { - let mut postings = reader - .read_postings(&term_a, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(!postings.advance()); } { - let mut postings = reader - .read_postings(&term_b, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -491,9 +473,7 @@ mod tests { assert!(!postings.advance()); } { - let mut postings = reader - .read_postings(&term_c, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); @@ -603,13 +583,9 @@ mod tests { let searcher = index.searcher(); let reader = searcher.segment_reader(0); let term_abcd = Term::from_field_text(text_field, "abcd"); - assert!(reader - .read_postings(&term_abcd, FreqAndPositions) - .is_none()); + assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); let term_af = Term::from_field_text(text_field, "af"); - let mut postings = reader - .read_postings(&term_af, FreqAndPositions) - .unwrap(); + let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 627261696..f1e3f256c 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -47,7 +47,6 @@ impl FreqHandler { } } - /// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions. pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler { let positions = read_positions(position_data); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index a338ae8db..8c550dcbe 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -130,11 +130,15 @@ mod tests { } { let term_a = Term::from_field_text(text_field, "abcdef"); - assert!(segment_reader.read_postings(&term_a, FreqAndPositions).is_none()); + assert!(segment_reader + .read_postings(&term_a, FreqAndPositions) + .is_none()); } { let term_a = Term::from_field_text(text_field, "a"); - let mut postings_a = segment_reader.read_postings(&term_a, FreqAndPositions).unwrap(); + let mut postings_a = segment_reader + .read_postings(&term_a, FreqAndPositions) + .unwrap(); assert_eq!(postings_a.len(), 1000); assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 0); @@ -153,7 +157,9 @@ mod tests { } { let term_e = Term::from_field_text(text_field, "e"); - let mut postings_e = segment_reader.read_postings(&term_e, FreqAndPositions).unwrap(); + let mut postings_e = segment_reader + .read_postings(&term_e, FreqAndPositions) + .unwrap(); assert_eq!(postings_e.len(), 1000 - 2); for i in 2u32..1000u32 { assert!(postings_e.advance()); @@ -476,7 +482,7 @@ mod tests { let mut segment_postings = segment_reader .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); - + let mut existing_docs = Vec::new(); segment_postings.advance(); for doc in &docs { diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 093a60827..f76142035 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -3,6 +3,7 @@ use DocId; use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; use std::cmp; use fastfield::DeleteBitSet; +use fst::Streamer; const EMPTY_DATA: [u8; 0] = [0u8; 0]; @@ -20,20 +21,18 @@ pub struct SegmentPostings<'a> { } impl<'a> SegmentPostings<'a> { - - /// Reads a Segment postings from an &[u8] /// /// * `len` - number of document in the posting lists. /// * `data` - data array. The complete data is not necessarily used. /// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions - pub fn from_block_postings( - segment_block_postings: BlockSegmentPostings<'a>, - delete_bitset: DeleteBitSet) -> SegmentPostings<'a> { + pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>, + delete_bitset: DeleteBitSet) + -> SegmentPostings<'a> { SegmentPostings { block_cursor: segment_block_postings, - cur: NUM_DOCS_PER_BLOCK, // cursor within the block + cur: NUM_DOCS_PER_BLOCK, // cursor within the block delete_bitset: delete_bitset, } } @@ -102,7 +101,7 @@ impl<'a> DocSet for SegmentPostings<'a> { // we're in the right block now, start with an exponential search let block_docs = self.block_cursor.docs(); let block_len = block_docs.len(); - + debug_assert!(target >= block_docs[self.cur]); debug_assert!(target <= block_docs[block_len - 1]); @@ -153,18 +152,18 @@ impl<'a> DocSet for SegmentPostings<'a> { } } - #[inline] fn doc(&self) -> DocId { let docs = self.block_cursor.docs(); - assert!(self.cur < docs.len(), "Have you forgotten to call `.advance()` at least once before calling .doc()."); + assert!(self.cur < docs.len(), + "Have you forgotten to call `.advance()` at least once before calling .doc()."); docs[self.cur] } } impl<'a> HasLen for SegmentPostings<'a> { fn len(&self) -> usize { - self.block_cursor.len + self.block_cursor.doc_freq() } } @@ -178,12 +177,16 @@ impl<'a> Postings for SegmentPostings<'a> { } } - - - +/// `BlockSegmentPostings` is a cursor iterating over blocks +/// of documents. +/// +/// # Warning +/// +/// While it is useful for some very specific high-performance +/// use cases, you should prefer using `SegmentPostings` for most usage. pub struct BlockSegmentPostings<'a> { block_decoder: BlockDecoder, - len: usize, + doc_freq: usize, doc_offset: DocId, num_binpacked_blocks: usize, num_vint_docs: usize, @@ -192,10 +195,12 @@ pub struct BlockSegmentPostings<'a> { } impl<'a> BlockSegmentPostings<'a> { - - pub fn from_data(len: usize, data: &'a [u8], freq_handler: FreqHandler) -> BlockSegmentPostings<'a> { - let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK; - let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + pub(crate) fn from_data(doc_freq: usize, + data: &'a [u8], + freq_handler: FreqHandler) + -> BlockSegmentPostings<'a> { + let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; + let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; BlockSegmentPostings { num_binpacked_blocks: num_binpacked_blocks, num_vint_docs: num_vint_docs, @@ -203,54 +208,87 @@ impl<'a> BlockSegmentPostings<'a> { freq_handler: freq_handler, remaining_data: data, doc_offset: 0, - len: len, + doc_freq: doc_freq, } } - pub fn reset(&mut self, len: usize, data: &'a [u8]) { - let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK; - let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + // Resets the block segment postings on another position + // in the postings file. + // + // This is useful for enumerating through a list of terms, + // and consuming the associated posting lists while avoiding + // reallocating a `BlockSegmentPostings`. + // + // # Warning + // + // This does not reset the positions list. + pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) { + let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK; + let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; self.num_binpacked_blocks = num_binpacked_blocks; self.num_vint_docs = num_vint_docs; - self.remaining_data = data; + self.remaining_data = postings_data; self.doc_offset = 0; - self.len = len; + self.doc_freq = doc_freq; + } + + /// Returns the document frequency associated to this block postings. + /// + /// This `doc_freq` is simply the sum of the length of all of the blocks + /// length, and it does not take in account deleted documents. + pub fn doc_freq(&self) -> usize { + self.doc_freq } /// Returns the array of docs in the current block. + /// + /// Before the first call to `.advance()`, the block + /// returned by `.docs()` is empty. #[inline] pub fn docs(&self) -> &[DocId] { self.block_decoder.output_array() } + /// Returns the length of the current block. + /// + /// All blocks have a length of `NUM_DOCS_PER_BLOCK`, + /// except the last block that may have a length + /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` #[inline] - pub fn block_len(&self) -> usize { + fn block_len(&self) -> usize { self.block_decoder.output_len } - #[inline] + + /// Returns a reference to the frequency handler. pub fn freq_handler(&self) -> &FreqHandler { &self.freq_handler } + /// Advance to the next block. + /// + /// Returns false iff there was no remaining blocks. pub fn advance(&mut self) -> bool { if self.num_binpacked_blocks > 0 { - self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset); + self.remaining_data = + self.block_decoder + .uncompress_block_sorted(self.remaining_data, self.doc_offset); self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); self.num_binpacked_blocks -= 1; true - } - else { - if self.num_vint_docs > 0 { - self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, self.num_vint_docs); - self.freq_handler.read_freq_vint(self.remaining_data, self.num_vint_docs); - self.num_vint_docs = 0; - true - } - else { - false - } + } else if self.num_vint_docs > 0 { + self.remaining_data = + self.block_decoder + .uncompress_vint_sorted(self.remaining_data, + self.doc_offset, + self.num_vint_docs); + self.freq_handler + .read_freq_vint(self.remaining_data, self.num_vint_docs); + self.num_vint_docs = 0; + true + } else { + false } } @@ -261,12 +299,23 @@ impl<'a> BlockSegmentPostings<'a> { num_vint_docs: 0, block_decoder: BlockDecoder::new(), freq_handler: FreqHandler::new_without_freq(), - remaining_data: &EMPTY_DATA, + remaining_data: &EMPTY_DATA, doc_offset: 0, - len: 0, + doc_freq: 0, } } +} +impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> { + type Item = &'b [DocId]; + + fn next(&'b mut self) -> Option<&'b [DocId]> { + if self.advance() { + Some(self.docs()) + } else { + None + } + } } #[cfg(test)] @@ -274,11 +323,99 @@ mod tests { use DocSet; use super::SegmentPostings; + use schema::{Document, SchemaBuilder}; + use core::Index; + use schema::INT_INDEXED; + use schema::Term; + use fst::Streamer; + use postings::SegmentPostingsOption; + use common::HasLen; + use super::BlockSegmentPostings; + use schema::FieldValue; #[test] fn test_empty_segment_postings() { let mut postings = SegmentPostings::empty(); assert!(!postings.advance()); assert!(!postings.advance()); + assert_eq!(postings.len(), 0); + } + + #[test] + fn test_empty_block_segment_postings() { + let mut postings = BlockSegmentPostings::empty(); + assert!(!postings.advance()); + assert_eq!(postings.doc_freq(), 0); + } + + #[test] + fn test_block_segment_postings() { + let mut schema_builder = SchemaBuilder::default(); + let int_field = schema_builder.add_u64_field("id", INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + for _ in 0..100_000 { + let doc = doc!(int_field=>0u64); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let segment_reader = searcher.segment_reader(0); + let term = Term::from_field_u64(int_field, 0u64); + let term_info = segment_reader.get_term_info(&term).unwrap(); + let mut block_segments = + segment_reader + .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + let mut offset: u32 = 0u32; + // checking that the block before calling advance is empty + assert!(block_segments.docs().is_empty()); + // checking that the `doc_freq` is correct + assert_eq!(block_segments.doc_freq(), 100_000); + while let Some(block) = block_segments.next() { + for (i, doc) in block.iter().cloned().enumerate() { + assert_eq!(offset + (i as u32), doc); + } + offset += block.len() as u32; + } + } + + + #[test] + fn test_reset_block_segment_postings() { + let mut schema_builder = SchemaBuilder::default(); + let int_field = schema_builder.add_u64_field("id", INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + // create two postings list, one containg even number, + // the other containing odd numbers. + for i in 0..6 { + let doc = doc!(int_field=> (i % 2) as u64); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let segment_reader = searcher.segment_reader(0); + + let mut block_segments; + { + let term = Term::from_field_u64(int_field, 0u64); + let term_info = segment_reader.get_term_info(&term).unwrap(); + block_segments = + segment_reader + .read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq); + } + assert!(block_segments.advance()); + assert!(block_segments.docs() == &[0, 2, 4]); + { + let term = Term::from_field_u64(int_field, 1u64); + let term_info = segment_reader.get_term_info(&term).unwrap(); + segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments); + } + assert!(block_segments.advance()); + assert!(block_segments.docs() == &[1, 3, 5]); } }