diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index a0eb36787..05b95e0f2 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -346,7 +346,7 @@ impl IndexWriter { fn drop_sender(&mut self) { let (sender, _receiver) = channel::bounded(1); - mem::replace(&mut self.operation_sender, sender); + self.operation_sender = sender; } /// If there are some merging threads, blocks until they all finish their work and diff --git a/src/postings/block_search.rs b/src/postings/block_search.rs index 010c9dd70..08cd55379 100644 --- a/src/postings/block_search.rs +++ b/src/postings/block_search.rs @@ -87,6 +87,7 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) { (begin, end) } +#[inline(never)] fn galloping(block_docs: &[u32], target: u32) -> usize { let (start, end) = exponential_search(&block_docs, target); start + linear_search(&block_docs[start..end], target) @@ -133,19 +134,14 @@ impl BlockSearcher { /// /// Indeed, if the block is not full, the remaining items are TERMINATED. /// It is surprisingly faster, most likely because of the lack of branch misprediction. - pub(crate) fn search_in_block( - self, - block_docs: &AlignedBuffer, - start: usize, - target: u32, - ) -> usize { + pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize { #[cfg(target_arch = "x86_64")] { if self == BlockSearcher::SSE2 { return sse2::linear_search_sse2_128(block_docs, target); } } - start + galloping(&block_docs.0[start..], target) + galloping(&block_docs.0[..], target) } } @@ -199,12 +195,10 @@ mod tests { assert!(block.len() < COMPRESSION_BLOCK_SIZE); let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE]; output_buffer[..block.len()].copy_from_slice(block); - for i in 0..cursor { - assert_eq!( - block_searcher.search_in_block(&AlignedBuffer(output_buffer), i, target), - cursor - ); - } + assert_eq!( + block_searcher.search_in_block(&AlignedBuffer(output_buffer), target), + cursor + ); } fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) { diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 2d35dfadd..e9136bd97 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -139,12 +139,13 @@ impl BlockSegmentPostings { self.doc_decoder.output_array() } + #[inline(always)] pub(crate) fn docs_aligned(&self) -> &AlignedBuffer { self.doc_decoder.output_aligned() } /// Return the document at index `idx` of the block. - #[inline] + #[inline(always)] pub fn doc(&self, idx: usize) -> u32 { self.doc_decoder.output(idx) } @@ -177,23 +178,12 @@ impl BlockSegmentPostings { /// Position on a block that may contains `target_doc`. /// - /// If the current block last element is greater or equal to `target_doc`, return true. - /// - /// Returns true if a block that has an element greater or equal to the target is found. - /// Returning true does not guarantee that the smallest element of the block is smaller - /// than the target. It only guarantees that the last element is greater or equal. - /// - /// Returns false iff all of the document remaining are smaller than - /// `doc_id`. In that case, all of these document are consumed. - pub fn seek(&mut self, target_doc: DocId) -> bool { - self.skip_reader.seek(target_doc); - self.read_block(); - - // The last block last doc may actually stop before the target. - self.docs() - .last() - .map(|last_doc| *last_doc >= target_doc) - .unwrap_or(false) + /// If all docs are smaller than target, the block loaded may be empty, + /// or be the last an incomplete VInt block. + pub fn seek(&mut self, target_doc: DocId) { + if self.skip_reader.seek(target_doc) { + self.read_block(); + } } fn read_block(&mut self) { @@ -263,6 +253,7 @@ mod tests { use crate::common::HasLen; use crate::core::Index; use crate::docset::{DocSet, TERMINATED}; + use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::postings::Postings; use crate::postings::SegmentPostings; use crate::schema::IndexRecordOption; @@ -373,17 +364,6 @@ mod tests { inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic) } - #[test] - fn test_block_segment_postings_skip() { - for i in 0..4 { - let mut block_postings = build_block_postings(&[3]); - assert_eq!(block_postings.seek(i), true); - assert_eq!(block_postings.seek(i), true); - } - let mut block_postings = build_block_postings(&[3]); - assert_eq!(block_postings.seek(4u32), false); - } - #[test] fn test_block_segment_postings_skip2() { let mut docs = vec![0]; @@ -392,13 +372,13 @@ mod tests { } let mut block_postings = build_block_postings(&docs[..]); for i in vec![0, 424, 10000] { - assert!(block_postings.seek(i)); + block_postings.seek(i); let docs = block_postings.docs(); assert!(docs[0] <= i); assert!(docs.last().cloned().unwrap_or(0u32) >= i); } - assert!(!block_postings.seek(100_000)); - assert!(!block_postings.seek(101_000)); + block_postings.seek(100_000); + assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED); } #[test] diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index e04b8f607..5b4b64b08 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,6 +1,6 @@ use crate::common::HasLen; -use crate::docset::{DocSet, TERMINATED}; +use crate::docset::DocSet; use crate::positions::PositionReader; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; @@ -90,61 +90,43 @@ impl DocSet for SegmentPostings { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> DocId { - self.cur += 1; - if self.cur >= self.block_cursor.block_len() { - if self.block_cursor.advance() { - self.cur = 0; - } else { - self.cur = COMPRESSION_BLOCK_SIZE - 1; - return TERMINATED; - } + if self.cur == COMPRESSION_BLOCK_SIZE - 1 { + self.cur = 0; + self.block_cursor.advance(); + } else { + self.cur += 1; } self.doc() } fn seek(&mut self, target: DocId) -> DocId { - let doc = self.doc(); - if doc >= target { - return doc; - } - - // skip blocks until one that might contain the target - // check if we need to go to the next block - if self - .block_cursor - .docs() - .last() - .map(|&doc| doc < target) - .unwrap_or(true) - { - // We are not in the right block. - if !self.block_cursor.seek(target) { - self.block_cursor.doc_decoder.clear(); - self.cur = 0; - return TERMINATED; - } - self.cur = 0; + if self.doc() == target { + return target; } + self.block_cursor.seek(target); // At this point we are on the block, that might contain our document. - - let cur = self.cur; - let output = self.block_cursor.docs_aligned(); - let new_cur = self.block_searcher.search_in_block(&output, cur, target); - self.cur = new_cur; + + self.cur = self.block_searcher.search_in_block(&output, target); + + // The last block is not full and padded with the value TERMINATED, + // so that we are guaranteed to have at least doc in the block (a real one or the padding) + // that is greater or equal to the target. + debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE); // `doc` is now the first element >= `target` - let doc = output.0[new_cur]; + + // If all docs are smaller than target the current block should be incomplemented and padded + // with the value `TERMINATED`. + // + // After the search, the cursor should point to the first value of TERMINATED. + let doc = output.0[self.cur]; debug_assert!(doc >= target); doc } /// Return the current document's `DocId`. - /// - /// # Panics - /// - /// Will panics if called without having called advance before. #[inline] fn doc(&self) -> DocId { self.block_cursor.doc(self.cur) diff --git a/src/postings/skip.rs b/src/postings/skip.rs index ad133a3f1..7c486d7d3 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -102,7 +102,8 @@ impl SkipReader { self.remaining_docs = doc_freq; } - pub fn doc(&self) -> DocId { + #[inline(always)] + pub(crate) fn last_doc_in_block(&self) -> DocId { self.last_doc_in_block } @@ -158,10 +159,17 @@ impl SkipReader { /// If the target is larger than all documents, the skip_reader /// then advance to the last Variable In block. /// - /// Returns true if the last block is reached. - pub fn seek(&mut self, target: DocId) { - while self.doc() < target { + /// Returns true if the skip reader had to advance, + /// false if it was already positionned on the right block. + pub fn seek(&mut self, target: DocId) -> bool { + if self.last_doc_in_block() >= target { + return false; + } + loop { self.advance(); + if self.last_doc_in_block() >= target { + return true; + } } } @@ -218,7 +226,7 @@ mod tests { IndexRecordOption::WithFreqs, ); assert!(skip_reader.advance()); - assert_eq!(skip_reader.doc(), 1u32); + assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info(), BlockInfo::BitPacked { @@ -228,7 +236,7 @@ mod tests { } ); assert!(skip_reader.advance()); - assert_eq!(skip_reader.doc(), 5u32); + assert_eq!(skip_reader.last_doc_in_block(), 5u32); assert_eq!( skip_reader.block_info(), BlockInfo::BitPacked { @@ -257,7 +265,7 @@ mod tests { IndexRecordOption::Basic, ); assert!(skip_reader.advance()); - assert_eq!(skip_reader.doc(), 1u32); + assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info(), BlockInfo::BitPacked { @@ -267,7 +275,7 @@ mod tests { } ); assert!(skip_reader.advance()); - assert_eq!(skip_reader.doc(), 5u32); + assert_eq!(skip_reader.last_doc_in_block(), 5u32); assert_eq!( skip_reader.block_info(), BlockInfo::BitPacked { @@ -295,7 +303,7 @@ mod tests { IndexRecordOption::Basic, ); assert!(skip_reader.advance()); - assert_eq!(skip_reader.doc(), 1u32); + assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info(), BlockInfo::BitPacked { diff --git a/src/query/intersection.rs b/src/query/intersection.rs index d72ef5866..e8a510f07 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -112,11 +112,6 @@ impl DocSet for Intersection candidate { candidate = left.seek(seek_doc); diff --git a/src/query/union.rs b/src/query/union.rs index 6ca9267ae..fed667b23 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -198,6 +198,18 @@ where // TODO Also implement `count` with deletes efficiently. + fn doc(&self) -> DocId { + self.doc + } + + fn size_hint(&self) -> u32 { + self.docsets + .iter() + .map(|docset| docset.size_hint()) + .max() + .unwrap_or(0u32) + } + fn count_including_deleted(&mut self) -> u32 { if self.doc == TERMINATED { return 0; @@ -219,18 +231,6 @@ where self.cursor = HORIZON_NUM_TINYBITSETS; count } - - fn doc(&self) -> DocId { - self.doc - } - - fn size_hint(&self) -> u32 { - self.docsets - .iter() - .map(|docset| docset.size_hint()) - .max() - .unwrap_or(0u32) - } } impl Scorer for Union