Simplification of the segment postings seek implementation. (#834)

2026-05-26 05:00:41 +00:00 · 2020-05-27 08:49:47 +09:00
parent 7275ebdf3c
commit baf015fc57
7 changed files with 71 additions and 112 deletions
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -346,7 +346,7 @@ impl IndexWriter {

    fn drop_sender(&mut self) {
        let (sender, _receiver) = channel::bounded(1);
-        mem::replace(&mut self.operation_sender, sender);
+        self.operation_sender = sender;
    }

    /// If there are some merging threads, blocks until they all finish their work and
--- a/src/postings/block_search.rs
+++ b/src/postings/block_search.rs
@@ -87,6 +87,7 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    (begin, end)
 }

+#[inline(never)]
 fn galloping(block_docs: &[u32], target: u32) -> usize {
    let (start, end) = exponential_search(&block_docs, target);
    start + linear_search(&block_docs[start..end], target)
@@ -133,19 +134,14 @@ impl BlockSearcher {
    ///
    /// Indeed, if the block is not full, the remaining items are TERMINATED.
    /// It is surprisingly faster, most likely because of the lack of branch misprediction.
-    pub(crate) fn search_in_block(
-        self,
-        block_docs: &AlignedBuffer,
-        start: usize,
-        target: u32,
-    ) -> usize {
+    pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            if self == BlockSearcher::SSE2 {
                return sse2::linear_search_sse2_128(block_docs, target);
            }
        }
-        start + galloping(&block_docs.0[start..], target)
+        galloping(&block_docs.0[..], target)
    }
 }

@@ -199,12 +195,10 @@ mod tests {
        assert!(block.len() < COMPRESSION_BLOCK_SIZE);
        let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
        output_buffer[..block.len()].copy_from_slice(block);
-        for i in 0..cursor {
-            assert_eq!(
-                block_searcher.search_in_block(&AlignedBuffer(output_buffer), i, target),
-                cursor
-            );
-        }
+        assert_eq!(
+            block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
+            cursor
+        );
    }

    fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
--- a/src/postings/block_segment_postings.rs
+++ b/src/postings/block_segment_postings.rs
@@ -139,12 +139,13 @@ impl BlockSegmentPostings {
        self.doc_decoder.output_array()
    }

+    #[inline(always)]
    pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
        self.doc_decoder.output_aligned()
    }

    /// Return the document at index `idx` of the block.
-    #[inline]
+    #[inline(always)]
    pub fn doc(&self, idx: usize) -> u32 {
        self.doc_decoder.output(idx)
    }
@@ -177,23 +178,12 @@ impl BlockSegmentPostings {

    /// Position on a block that may contains `target_doc`.
    ///
-    /// If the current block last element is greater or equal to `target_doc`, return true.
-    ///
-    /// Returns true if a block that has an element greater or equal to the target is found.
-    /// Returning true does not guarantee that the smallest element of the block is smaller
-    /// than the target. It only guarantees that the last element is greater or equal.
-    ///
-    /// Returns false iff all of the document remaining are smaller than
-    /// `doc_id`. In that case, all of these document are consumed.
-    pub fn seek(&mut self, target_doc: DocId) -> bool {
-        self.skip_reader.seek(target_doc);
-        self.read_block();
-
-        // The last block last doc may actually stop before the target.
-        self.docs()
-            .last()
-            .map(|last_doc| *last_doc >= target_doc)
-            .unwrap_or(false)
+    /// If all docs are smaller than target, the block loaded may be empty,
+    /// or be the last an incomplete VInt block.
+    pub fn seek(&mut self, target_doc: DocId) {
+        if self.skip_reader.seek(target_doc) {
+            self.read_block();
+        }
    }

    fn read_block(&mut self) {
@@ -263,6 +253,7 @@ mod tests {
    use crate::common::HasLen;
    use crate::core::Index;
    use crate::docset::{DocSet, TERMINATED};
+    use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
    use crate::postings::postings::Postings;
    use crate::postings::SegmentPostings;
    use crate::schema::IndexRecordOption;
@@ -373,17 +364,6 @@ mod tests {
        inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
    }

-    #[test]
-    fn test_block_segment_postings_skip() {
-        for i in 0..4 {
-            let mut block_postings = build_block_postings(&[3]);
-            assert_eq!(block_postings.seek(i), true);
-            assert_eq!(block_postings.seek(i), true);
-        }
-        let mut block_postings = build_block_postings(&[3]);
-        assert_eq!(block_postings.seek(4u32), false);
-    }
-
    #[test]
    fn test_block_segment_postings_skip2() {
        let mut docs = vec![0];
@@ -392,13 +372,13 @@ mod tests {
        }
        let mut block_postings = build_block_postings(&docs[..]);
        for i in vec![0, 424, 10000] {
-            assert!(block_postings.seek(i));
+            block_postings.seek(i);
            let docs = block_postings.docs();
            assert!(docs[0] <= i);
            assert!(docs.last().cloned().unwrap_or(0u32) >= i);
        }
-        assert!(!block_postings.seek(100_000));
-        assert!(!block_postings.seek(101_000));
+        block_postings.seek(100_000);
+        assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
    }

    #[test]
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -1,6 +1,6 @@
 use crate::common::HasLen;

-use crate::docset::{DocSet, TERMINATED};
+use crate::docset::DocSet;
 use crate::positions::PositionReader;

 use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -90,61 +90,43 @@ impl DocSet for SegmentPostings {
    // next needs to be called a first time to point to the correct element.
    #[inline]
    fn advance(&mut self) -> DocId {
-        self.cur += 1;
-        if self.cur >= self.block_cursor.block_len() {
-            if self.block_cursor.advance() {
-                self.cur = 0;
-            } else {
-                self.cur = COMPRESSION_BLOCK_SIZE - 1;
-                return TERMINATED;
-            }
+        if self.cur == COMPRESSION_BLOCK_SIZE - 1 {
+            self.cur = 0;
+            self.block_cursor.advance();
+        } else {
+            self.cur += 1;
        }
        self.doc()
    }

    fn seek(&mut self, target: DocId) -> DocId {
-        let doc = self.doc();
-        if doc >= target {
-            return doc;
-        }
-
-        // skip blocks until one that might contain the target
-        // check if we need to go to the next block
-        if self
-            .block_cursor
-            .docs()
-            .last()
-            .map(|&doc| doc < target)
-            .unwrap_or(true)
-        {
-            // We are not in the right block.
-            if !self.block_cursor.seek(target) {
-                self.block_cursor.doc_decoder.clear();
-                self.cur = 0;
-                return TERMINATED;
-            }
-            self.cur = 0;
+        if self.doc() == target {
+            return target;
        }
+        self.block_cursor.seek(target);

        // At this point we are on the block, that might contain our document.
-
-        let cur = self.cur;
-
        let output = self.block_cursor.docs_aligned();
-        let new_cur = self.block_searcher.search_in_block(&output, cur, target);
-        self.cur = new_cur;
+
+        self.cur = self.block_searcher.search_in_block(&output, target);
+
+        // The last block is not full and padded with the value TERMINATED,
+        // so that we are guaranteed to have at least doc in the block (a real one or the padding)
+        // that is greater or equal to the target.
+        debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);

        // `doc` is now the first element >= `target`
-        let doc = output.0[new_cur];
+
+        // If all docs are smaller than target the current block should be incomplemented and padded
+        // with the value `TERMINATED`.
+        //
+        // After the search, the cursor should point to the first value of TERMINATED.
+        let doc = output.0[self.cur];
        debug_assert!(doc >= target);
        doc
    }

    /// Return the current document's `DocId`.
-    ///
-    /// # Panics
-    ///
-    /// Will panics if called without having called advance before.
    #[inline]
    fn doc(&self) -> DocId {
        self.block_cursor.doc(self.cur)
--- a/src/postings/skip.rs
+++ b/src/postings/skip.rs
@@ -102,7 +102,8 @@ impl SkipReader {
        self.remaining_docs = doc_freq;
    }

-    pub fn doc(&self) -> DocId {
+    #[inline(always)]
+    pub(crate) fn last_doc_in_block(&self) -> DocId {
        self.last_doc_in_block
    }

@@ -158,10 +159,17 @@ impl SkipReader {
    /// If the target is larger than all documents, the skip_reader
    /// then advance to the last Variable In block.
    ///
-    /// Returns true if the last block is reached.
-    pub fn seek(&mut self, target: DocId) {
-        while self.doc() < target {
+    /// Returns true if the skip reader had to advance,
+    /// false if it was already positionned on the right block.
+    pub fn seek(&mut self, target: DocId) -> bool {
+        if self.last_doc_in_block() >= target {
+            return false;
+        }
+        loop {
            self.advance();
+            if self.last_doc_in_block() >= target {
+                return true;
+            }
        }
    }

@@ -218,7 +226,7 @@ mod tests {
            IndexRecordOption::WithFreqs,
        );
        assert!(skip_reader.advance());
-        assert_eq!(skip_reader.doc(), 1u32);
+        assert_eq!(skip_reader.last_doc_in_block(), 1u32);
        assert_eq!(
            skip_reader.block_info(),
            BlockInfo::BitPacked {
@@ -228,7 +236,7 @@ mod tests {
            }
        );
        assert!(skip_reader.advance());
-        assert_eq!(skip_reader.doc(), 5u32);
+        assert_eq!(skip_reader.last_doc_in_block(), 5u32);
        assert_eq!(
            skip_reader.block_info(),
            BlockInfo::BitPacked {
@@ -257,7 +265,7 @@ mod tests {
            IndexRecordOption::Basic,
        );
        assert!(skip_reader.advance());
-        assert_eq!(skip_reader.doc(), 1u32);
+        assert_eq!(skip_reader.last_doc_in_block(), 1u32);
        assert_eq!(
            skip_reader.block_info(),
            BlockInfo::BitPacked {
@@ -267,7 +275,7 @@ mod tests {
            }
        );
        assert!(skip_reader.advance());
-        assert_eq!(skip_reader.doc(), 5u32);
+        assert_eq!(skip_reader.last_doc_in_block(), 5u32);
        assert_eq!(
            skip_reader.block_info(),
            BlockInfo::BitPacked {
@@ -295,7 +303,7 @@ mod tests {
            IndexRecordOption::Basic,
        );
        assert!(skip_reader.advance());
-        assert_eq!(skip_reader.doc(), 1u32);
+        assert_eq!(skip_reader.last_doc_in_block(), 1u32);
        assert_eq!(
            skip_reader.block_info(),
            BlockInfo::BitPacked {
--- a/src/query/intersection.rs
+++ b/src/query/intersection.rs
@@ -112,11 +112,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
            debug_assert_eq!(left.doc(), right.doc());
            // test the remaining scorers;
            for docset in self.others.iter_mut() {
-                // `candidate_ord` is already at the
-                // right position.
-                //
-                // Calling `skip_next` would advance this docset
-                // and miss it.
                let seek_doc = docset.seek(candidate);
                if seek_doc > candidate {
                    candidate = left.seek(seek_doc);
--- a/src/query/union.rs
+++ b/src/query/union.rs
@@ -198,6 +198,18 @@ where

    // TODO Also implement `count` with deletes efficiently.

+    fn doc(&self) -> DocId {
+        self.doc
+    }
+
+    fn size_hint(&self) -> u32 {
+        self.docsets
+            .iter()
+            .map(|docset| docset.size_hint())
+            .max()
+            .unwrap_or(0u32)
+    }
+
    fn count_including_deleted(&mut self) -> u32 {
        if self.doc == TERMINATED {
            return 0;
@@ -219,18 +231,6 @@ where
        self.cursor = HORIZON_NUM_TINYBITSETS;
        count
    }
-
-    fn doc(&self) -> DocId {
-        self.doc
-    }
-
-    fn size_hint(&self) -> u32 {
-        self.docsets
-            .iter()
-            .map(|docset| docset.size_hint())
-            .max()
-            .unwrap_or(0u32)
-    }
 }

 impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>