Keep buffered union refill out of line

BufferedUnionScorer is the hot path for full union traversal, including (TopDocs, Count) where Count forces all matches to be visited. After the block-wand intersection changes, LLVM started inlining the refill helper into the advance path, which regressed TOP_100_COUNT union queries even though the union algorithm did not change. Force the refill helper out of line so the advance loop stays small and stable while pruning collectors continue to use Block-WAND. Benchmark on search-benchmark-game TOP_100_COUNT union query set (301 queries, sum of per-query medians): - tantivy 0.26: 0.853646s - main before: 0.918605s - this change: 0.841659s
Remove union wrapping for single-terms
2026-07-01 14:50:44 +00:00 · 2026-06-29 19:33:50 +02:00 · 2026-06-29 19:33:50 +02:00
5 changed files with 60 additions and 121 deletions
--- a/src/postings/block_search.rs
+++ b/src/postings/block_search.rs
@@ -1,78 +1,36 @@
 use crate::postings::compression::COMPRESSION_BLOCK_SIZE;

-/// Returns the index of the first element in `arr` that is greater than or
-/// equal to `target`.
+/// Search the first index containing an element greater or equal to
+/// the target.
 ///
-/// This is equivalent to:
-///
-/// ```ignore
-/// arr.iter().take_while(|&&val| val < target).count()
+/// The results should be equivalent to
+/// ```compile_fail
+/// block[..]
+///       .iter()
+///       .take_while(|&&val| val < target)
+///       .count()
 /// ```
+/// 
+/// Note: there is no `start` hint argument in this signature; the search
+/// always begins at index 0 and runs a fixed number of halving steps over
+/// the whole block.
 ///
-/// # Assumptions
+/// # Assumptions
 ///
-/// - `arr` is sorted in nondecreasing order. Values may be repeated; the last block is often padded
-///   with duplicates of its final value.
-/// - `target` is less than or equal to the last element in `arr`, so the result is always a valid
-///   index into the block.
-///
-/// # `K`
-///
-/// `K` is the branching factor. Each reduction probes `K - 1` segment-end
-/// pivots, keeps the matching segment, and finally linearly scans the remaining
-/// range. `K` must be one of `2`, `4`, `8`, `16`, or `32`.
-///
-/// This is the "k-ary search on a sorted array" variant of Schlegel, Gemulla & Lehner,
-/// "k-Ary Search on Modern Processors", DaMoN 2009 (<https://dl.acm.org/doi/10.1145/1565694.1565705>),
-/// specialized to a lower-bound (no equality early-exit) with a linear scan over the final
-/// `< K` elements. We do not use their linearized-tree (`k-ary-lt`) layout, which would require
-/// reordering the block.
-///
-/// The core idea vs a traditional binary search is that we can check multiple numbers in parallel,
-/// which better utilizes the CPU's instruction-level parallelism.
-///
-/// `kary_search::<8>` reduces in three steps: 128 -> 16 -> 2, then a 2-element scan. It could be
-/// done in only two steps (128 -> 16, then scanning all 16 contiguous elements). For that
-/// we need popcount for that to be fast though (TODO).
-#[inline(always)]
-pub fn kary_search<const K: usize>(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
-    const {
-        assert!(
-            matches!(K, 2 | 4 | 8 | 16 | 32),
-            "K must be one of 2, 4, 8, 16, or 32"
-        );
-    };
-
-    let mut base = 0usize;
-    let mut range = COMPRESSION_BLOCK_SIZE;
-
-    loop {
-        let step = range / K;
-        if step == 0 {
-            break;
+/// - The block is sorted. Some elements may appear several times. This is the case at the
+///   end of the last block for instance.
+/// - The target is assumed smaller or equal to the last element of the block.
+pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
+    let mut start = 0;
+    let mut len = arr.len();
+    for _ in 0..7 {
+        len /= 2;
+        let pivot = unsafe { *arr.get_unchecked(start + len - 1) };
+        if pivot < target {
+            start += len;
        }
-        debug_assert_eq!(range % K, 0);
-        // Count how many segment-end pivots are < target (branchless, unrolled).
-        let mut count = 0usize;
-        for i in 1..K {
-            count += (arr[base + i * step - 1] < target) as usize;
-        }
-        base += count * step;
-        range = step;
    }
-
-    // Linear scan over the ≤K remaining elements.
-    let mut count = 0usize;
-    for i in 0..range {
-        count += (arr[base + i] < target) as usize;
-    }
-    base + count
-}
-
-/// entry point used by postings; implemented as an 8-ary branchless search.
-#[inline]
-pub fn search_block(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
-    kary_search::<8>(arr, target)
+    start
 }

 #[cfg(test)]
@@ -81,7 +39,7 @@ mod tests {

    use proptest::prelude::*;

-    use super::{kary_search, search_block};
+    use super::branchless_binary_search;
    use crate::docset::TERMINATED;
    use crate::postings::compression::COMPRESSION_BLOCK_SIZE;

@@ -99,7 +57,7 @@ mod tests {
        assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE);
        let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
        output_buffer[..block.len()].copy_from_slice(block);
-        assert_eq!(search_block(&output_buffer, target), cursor);
+        assert_eq!(branchless_binary_search(&output_buffer, target), cursor);
    }

    fn util_test_search_in_block_all(block: &[u32]) {
@@ -122,44 +80,6 @@ mod tests {
        util_test_search_in_block_all(&v[..]);
    }

-    #[test]
-    fn test_search_in_branchless_binary_search_corner_cases() {
-        let all_same = vec![7u32; COMPRESSION_BLOCK_SIZE];
-        util_test_search_in_block_all(&all_same);
-
-        let repeated_across_pivots: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE)
-            .map(|i| (i / 17) as u32)
-            .collect();
-        util_test_search_in_block_all(&repeated_across_pivots);
-
-        let mut padded_last_block = vec![0u32; COMPRESSION_BLOCK_SIZE];
-        for (i, value) in padded_last_block.iter_mut().enumerate() {
-            *value = if i < COMPRESSION_BLOCK_SIZE / 2 {
-                i as u32
-            } else {
-                TERMINATED
-            };
-        }
-        util_test_search_in_block_all(&padded_last_block);
-    }
-
-    #[test]
-    fn test_kary_search_allowed_branching_factors() {
-        let mut block = [TERMINATED; COMPRESSION_BLOCK_SIZE];
-        for (idx, value) in block.iter_mut().enumerate() {
-            *value = (idx / 3) as u32;
-        }
-
-        for target in [0, 1, 17, block[COMPRESSION_BLOCK_SIZE - 1]] {
-            let expected = search_in_block_trivial_but_slow(&block, target);
-            assert_eq!(kary_search::<2>(&block, target), expected);
-            assert_eq!(kary_search::<4>(&block, target), expected);
-            assert_eq!(kary_search::<8>(&block, target), expected);
-            assert_eq!(kary_search::<16>(&block, target), expected);
-            assert_eq!(kary_search::<32>(&block, target), expected);
-        }
-    }
-
    fn monotonous_block() -> impl Strategy<Value = Vec<u32>> {
        prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| {
            let mut el = 0;
--- a/src/postings/compression/mod.rs
+++ b/src/postings/compression/mod.rs
@@ -158,7 +158,7 @@ impl BlockDecoder {
    /// Uses the padded buffer to enable branchless search.
    #[inline]
    pub(crate) fn seek_within_block(&self, target: u32) -> usize {
-        crate::postings::search_block(&self.output, target)
+        crate::postings::branchless_binary_search(&self.output, target)
    }

    #[inline]
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -2,7 +2,7 @@

 mod block_search;

-pub(crate) use self::block_search::search_block;
+pub(crate) use self::block_search::branchless_binary_search;

 mod block_segment_postings;
 pub(crate) mod compression;
--- a/src/query/boolean_query/boolean_weight.rs
+++ b/src/query/boolean_query/boolean_weight.rs
@@ -91,10 +91,14 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
    num_docs: u32,
 ) -> Box<dyn Scorer> {
    match scorer {
-        SpecializedScorer::TermUnion(term_scorers) => {
-            let union_scorer =
-                BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
-            Box::new(union_scorer)
+        SpecializedScorer::TermUnion(mut term_scorers) => {
+            if term_scorers.len() == 1 {
+                Box::new(term_scorers.pop().unwrap())
+            } else {
+                let union_scorer =
+                    BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
+                Box::new(union_scorer)
+            }
        }
        SpecializedScorer::TermIntersection(term_scorers) => {
            let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
@@ -504,10 +508,15 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
        let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
        let num_docs = reader.num_docs();
        match scorer {
-            SpecializedScorer::TermUnion(term_scorers) => {
-                let mut union_scorer =
-                    BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
-                for_each_scorer(&mut union_scorer, callback);
+            SpecializedScorer::TermUnion(mut term_scorers) => {
+                if term_scorers.len() == 1 {
+                    let mut term_scorer = term_scorers.pop().unwrap();
+                    for_each_scorer(&mut term_scorer, callback);
+                } else {
+                    let mut union_scorer =
+                        BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
+                    for_each_scorer(&mut union_scorer, callback);
+                }
            }
            SpecializedScorer::TermIntersection(term_scorers) => {
                let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
@@ -534,10 +543,15 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];

        match scorer {
-            SpecializedScorer::TermUnion(term_scorers) => {
-                let mut union_scorer =
-                    BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
-                for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
+            SpecializedScorer::TermUnion(mut term_scorers) => {
+                if term_scorers.len() == 1 {
+                    let mut term_scorer = term_scorers.pop().unwrap();
+                    for_each_docset_buffered(&mut term_scorer, &mut buffer, callback);
+                } else {
+                    let mut union_scorer =
+                        BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
+                    for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
+                }
            }
            SpecializedScorer::TermIntersection(term_scorers) => {
                let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
--- a/src/query/union/buffered_union.rs
+++ b/src/query/union/buffered_union.rs
@@ -55,6 +55,11 @@ pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
    num_docs: u32,
 }

+// Keep this helper out-of-line. When LLVM inlines it into
+// `BufferedUnionScorer::advance`, the full traversal path used by combined
+// collectors such as `(TopDocs, Count)` becomes sensitive to unrelated codegen
+// changes and regresses on large unions.
+#[inline(never)]
 fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
    scorers: &mut Vec<TScorer>,
    bitsets: &mut [TinySet; HORIZON_NUM_TINYBITSETS],