From 2be5f08cd6b567806531e8a3f6ce840e48adadce Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 20 May 2017 11:46:40 +0900 Subject: [PATCH] issue/162 Added block iteration API --- src/core/segment_reader.rs | 2 - src/postings/mod.rs | 3 +- src/postings/segment_postings.rs | 295 ++++++++++--------------------- 3 files changed, 90 insertions(+), 210 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 62a9347d8..03ad7d248 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -22,9 +22,7 @@ use postings::SegmentPostingsOption; use postings::{SegmentPostings, BlockSegmentPostings}; use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; use schema::Schema; -use schema::FieldType; use postings::FreqHandler; -use schema::TextIndexingOptions; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 483b7ed46..a338ae8db 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -468,7 +468,6 @@ mod tests { }); } - fn bench_skip_next(p: f32, b: &mut Bencher) { let searcher = INDEX.searcher(); let segment_reader = searcher.segment_reader(0); @@ -479,6 +478,7 @@ mod tests { .unwrap(); let mut existing_docs = Vec::new(); + segment_postings.advance(); for doc in &docs { if *doc >= segment_postings.doc() { existing_docs.push(*doc); @@ -493,7 +493,6 @@ mod tests { .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) .unwrap(); for doc in &existing_docs { - println!("doc {}", doc); if segment_postings.skip_next(*doc) == SkipResult::End { break; } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index f917449b2..b5a191d3c 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -2,14 +2,12 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; use DocId; use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; use std::cmp; -use std::num::Wrapping; use fastfield::DeleteBitSet; const EMPTY_DATA: [u8; 0] = [0u8; 0]; -/* /// `SegmentPostings` represents the inverted list or postings associated to /// a term in a `Segment`. /// @@ -17,142 +15,7 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0]; /// Positions on the other hand, are optionally entirely decoded upfront. pub struct SegmentPostings<'a> { len: usize, - // Removing this makes the code slower - // See https://github.com/tantivy-search/tantivy/issues/89 - block_len: usize, - doc_offset: u32, - block_decoder: BlockDecoder, - freq_handler: FreqHandler, - remaining_data: &'a [u8], - cur: Wrapping, - delete_bitset: DeleteBitSet, -} - -impl<'a> SegmentPostings<'a> { - fn load_next_block(&mut self) { - let num_remaining_docs = self.len - self.cur.0; - if num_remaining_docs >= NUM_DOCS_PER_BLOCK { - self.remaining_data = - self.block_decoder - .uncompress_block_sorted(self.remaining_data, self.doc_offset); - self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); - self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); - self.block_len = NUM_DOCS_PER_BLOCK; - } else { - self.remaining_data = - self.block_decoder - .uncompress_vint_sorted(self.remaining_data, - self.doc_offset, - num_remaining_docs); - self.freq_handler - .read_freq_vint(self.remaining_data, num_remaining_docs); - self.block_len = num_remaining_docs; - } - } - - /// Reads a Segment postings from an &[u8] - /// - /// * `len` - number of document in the posting lists. - /// * `data` - data array. The complete data is not necessarily used. - /// * `freq_handler` - the freq handler is in charge of decoding - /// frequencies and/or positions - pub fn from_data(len: u32, - data: &'a [u8], - delete_bitset: &'a DeleteBitSet, - freq_handler: FreqHandler) - -> SegmentPostings<'a> { - SegmentPostings { - len: len as usize, - block_len: len as usize, - doc_offset: 0, - block_decoder: BlockDecoder::new(), - freq_handler: freq_handler, - remaining_data: data, - cur: Wrapping(usize::max_value()), - delete_bitset: delete_bitset.clone(), - } - } - - /// Returns an empty segment postings object - pub fn empty() -> SegmentPostings<'static> { - SegmentPostings { - len: 0, - block_len: 0, - doc_offset: 0, - block_decoder: BlockDecoder::new(), - freq_handler: FreqHandler::new_without_freq(), - remaining_data: &EMPTY_DATA, - delete_bitset: DeleteBitSet::empty(), - cur: Wrapping(usize::max_value()), - } - } - - - - /// Sets the current position to a location relative - /// to the current block - #[inline] - fn set_within_block(&mut self, inner_pos: usize) { - self.cur = Wrapping(self.cur.0 & !(NUM_DOCS_PER_BLOCK - 1)) + Wrapping(inner_pos) - } -} - - -impl<'a> DocSet for SegmentPostings<'a> { - // goes to the next element. - // next needs to be called a first time to point to the correct element. - #[inline] - fn advance(&mut self) -> bool { - loop { - self.cur += Wrapping(1); - if self.cur.0 >= self.len { - return false; - } - if self.index_within_block() == 0 { - self.load_next_block(); - } - if !self.delete_bitset.is_deleted(self.doc()) { - return true; - } - } - } - - - - #[inline] - fn doc(&self) -> DocId { - self.block_decoder.output(self.index_within_block()) - } -} - -impl<'a> HasLen for SegmentPostings<'a> { - fn len(&self) -> usize { - self.len - } -} - -impl<'a> Postings for SegmentPostings<'a> { - fn term_freq(&self) -> u32 { - self.freq_handler.freq(self.index_within_block()) - } - - fn positions(&self) -> &[u32] { - self.freq_handler.positions(self.index_within_block()) - } -} - -*/ - - - -/// `SegmentPostings` represents the inverted list or postings associated to -/// a term in a `Segment`. -/// -/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. -/// Positions on the other hand, are optionally entirely decoded upfront. -pub struct SegmentPostings<'a> { - len: usize, - cur: Wrapping, + cur: usize, block_cursor: BlockSegmentPostings<'a>, cur_block_len: usize, delete_bitset: DeleteBitSet, @@ -173,7 +36,7 @@ impl<'a> SegmentPostings<'a> { SegmentPostings { len: segment_block_postings.len, block_cursor: segment_block_postings, - cur: Wrapping(usize::max_value()), + cur: NUM_DOCS_PER_BLOCK, // cursor within the block cur_block_len: 0, delete_bitset: delete_bitset, } @@ -186,7 +49,7 @@ impl<'a> SegmentPostings<'a> { len: 0, block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), - cur: Wrapping(usize::max_value()), + cur: NUM_DOCS_PER_BLOCK, cur_block_len: 0, } } @@ -198,15 +61,13 @@ impl<'a> DocSet for SegmentPostings<'a> { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> bool { - loop { - self.cur += Wrapping(1); - assert!(self.cur.0 >= 0); - assert!(self.cur.0 <= self.cur_block_len); - if self.cur.0 == self.cur_block_len { - self.cur = Wrapping(0); + loop { + self.cur += 1; + if self.cur >= self.cur_block_len { + self.cur = 0; if !self.block_cursor.advance() { self.cur_block_len = 0; - self.cur = Wrapping(usize::max_value()); + self.cur = NUM_DOCS_PER_BLOCK; return false; } self.cur_block_len = self.block_cursor.docs().len(); @@ -217,90 +78,96 @@ impl<'a> DocSet for SegmentPostings<'a> { } } - /* + fn skip_next(&mut self, target: DocId) -> SkipResult { if !self.advance() { return SkipResult::End; } - let mut pos = self.index_within_block(); // skip blocks until one that might contain the target loop { // check if we need to go to the next block - if target > self.block_decoder.output(self.block_len - 1) { - self.cur += Wrapping(self.block_len - pos); - self.load_next_block(); - pos = 0; - - // there was no more data - if self.cur.0 == self.len { + let last_doc_in_block = { + let block_docs = self.block_cursor.docs(); + block_docs[block_docs.len() - 1] + }; + if target > last_doc_in_block { + if !self.block_cursor.advance() { return SkipResult::End; } - } else if target < self.block_decoder.output(pos) { - // We've overpassed the target after the first `advance` call - // or we're at the beginning of a block. - // Either way, we're on the first `DocId` greater than `target` - return SkipResult::OverStep; + self.cur = 0; } else { + let block_docs = self.block_cursor.docs(); + if target < block_docs[self.cur] { + // We've overpassed the target after the first `advance` call + // or we're at the beginning of a block. + // Either way, we're on the first `DocId` greater than `target` + return SkipResult::OverStep; + } break; } } + { + // we're in the right block now, start with an exponential search + let block_docs = self.block_cursor.docs(); + let block_len = block_docs.len(); - debug_assert!(target >= self.block_decoder.output(pos)); - debug_assert!(target <= self.block_decoder.output(self.block_len - 1)); + debug_assert!(target >= block_docs[self.cur]); + debug_assert!(target <= block_docs[block_len - 1]); - // we're in the right block now, start with an exponential search - let mut start = pos; - let mut end = self.block_len; - let mut count = 1; - loop { - let new = start + count; - if new < end && self.block_decoder.output(new) < target { - start = new; - count *= 2; - } else { - break; + let mut start = 0; + let mut end = block_len; + let mut count = 1; + loop { + let new = start + count; + if new < end && block_docs[new] < target { + start = new; + count *= 2; + } else { + break; + } + } + end = cmp::min(start + count, end); + + // now do a binary search + let mut count = end - start; + while count > 0 { + let step = count / 2; + let mid = start + step; + let doc = block_docs[mid]; + if doc < target { + start = mid + 1; + count -= step + 1; + } else { + count = step; + } + } + + // `doc` is now >= `target` + let doc = block_docs[start]; + self.cur = start; + + if !self.delete_bitset.is_deleted(doc) { + if doc == target { + return SkipResult::Reached; + } else { + return SkipResult::OverStep; + } } } - end = cmp::min(start + count, end); - - // now do a binary search - let mut count = end - start; - while count > 0 { - let step = count / 2; - let mid = start + step; - let doc = self.block_decoder.output(mid); - if doc < target { - start = mid + 1; - count -= step + 1; - } else { - count = step; - } - } - - // `doc` is now >= `target` - let doc = self.block_decoder.output(start); - self.set_within_block(start); - - if !self.delete_bitset.is_deleted(doc) { - if doc == target { - return SkipResult::Reached; - } else { - return SkipResult::OverStep; - } - } - if self.advance() { SkipResult::OverStep } else { SkipResult::End } } - */ + #[inline] fn doc(&self) -> DocId { - self.block_cursor.docs()[self.cur.0] + let docs = self.block_cursor.docs(); + assert!(self.cur < docs.len(), "Have you forgotten to call `.advance()` at least once before calling .doc()."); + docs[self.cur] } } @@ -312,11 +179,11 @@ impl<'a> HasLen for SegmentPostings<'a> { impl<'a> Postings for SegmentPostings<'a> { fn term_freq(&self) -> u32 { - self.block_cursor.freq_handler().freq(self.cur.0) + self.block_cursor.freq_handler().freq(self.cur) } fn positions(&self) -> &[u32] { - self.block_cursor.freq_handler().positions(self.cur.0) + self.block_cursor.freq_handler().positions(self.cur) } } @@ -359,6 +226,8 @@ impl<'a> BlockSegmentPostings<'a> { self.len = len; } + + /// Returns the array of docs in the current block. pub fn docs(&self) -> &[DocId] { self.block_decoder.output_array() } @@ -402,3 +271,17 @@ impl<'a> BlockSegmentPostings<'a> { } } + +#[cfg(test)] +mod tests { + + use DocSet; + use super::SegmentPostings; + + #[test] + fn test_empty_segment_postings() { + let mut postings = SegmentPostings::empty(); + assert!(!postings.advance()); + assert!(!postings.advance()); + } +} \ No newline at end of file