block_search: drop unsafe indexing, remove K=64

For K∈{2,4,8,16,32} LLVM proves the index bounds and elides the checks, so get_unchecked buys nothing on the production K=8 path. K=64 was the only value that defeated bounds-check elision (one check in the tail scan) and it was instantiated in tests only — drop it. block_search: cite the k-ary search paper. Document that kary_search is the 'k-ary search on a sorted array' variant from Schlegel, Gemulla & Lehner (DaMoN 2009), specialized to a lower-bound.
10% faster intersections: Use k-ary in block search
2026-06-30 06:10:41 +00:00 · 2026-06-29 10:27:28 +02:00 · 2026-06-29 10:27:18 +02:00
5 changed files with 121 additions and 60 deletions
--- a/src/postings/block_search.rs
+++ b/src/postings/block_search.rs
@@ -1,36 +1,78 @@
 use crate::postings::compression::COMPRESSION_BLOCK_SIZE;

-/// Search the first index containing an element greater or equal to
-/// the target.
+/// Returns the index of the first element in `arr` that is greater than or
+/// equal to `target`.
 ///
-/// The results should be equivalent to
-/// ```compile_fail
-/// block[..]
-//       .iter()
-//       .take_while(|&&val| val < target)
-//       .count()
+/// This is equivalent to:
+///
+/// ```ignore
+/// arr.iter().take_while(|&&val| val < target).count()
 /// ```
-/// 
-/// the `start` argument is just used to hint that the response is
-/// greater than beyond `start`. The implementation may or may not use
-/// it for optimization.
 ///
-/// # Assumption
+/// # Assumptions
 ///
-/// - The block is sorted. Some elements may appear several times. This is the case at the
-///   end of the last block for instance.
-/// - The target is assumed smaller or equal to the last element of the block.
-pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
-    let mut start = 0;
-    let mut len = arr.len();
-    for _ in 0..7 {
-        len /= 2;
-        let pivot = unsafe { *arr.get_unchecked(start + len - 1) };
-        if pivot < target {
-            start += len;
+/// - `arr` is sorted in nondecreasing order. Values may be repeated; the last block is often padded
+///   with duplicates of its final value.
+/// - `target` is less than or equal to the last element in `arr`, so the result is always a valid
+///   index into the block.
+///
+/// # `K`
+///
+/// `K` is the branching factor. Each reduction probes `K - 1` segment-end
+/// pivots, keeps the matching segment, and finally linearly scans the remaining
+/// range. `K` must be one of `2`, `4`, `8`, `16`, or `32`.
+///
+/// This is the "k-ary search on a sorted array" variant of Schlegel, Gemulla & Lehner,
+/// "k-Ary Search on Modern Processors", DaMoN 2009 (<https://dl.acm.org/doi/10.1145/1565694.1565705>),
+/// specialized to a lower-bound (no equality early-exit) with a linear scan over the final
+/// `< K` elements. We do not use their linearized-tree (`k-ary-lt`) layout, which would require
+/// reordering the block.
+///
+/// The core idea vs a traditional binary search is that we can check multiple numbers in parallel,
+/// which better utilizes the CPU's instruction-level parallelism.
+///
+/// `kary_search::<8>` takes three steps: two reductions (128 -> 16 -> 2), then a 2-element scan.
+/// It could be done in only two steps (128 -> 16, then a scan of all 16 contiguous elements),
+/// but that wider scan needs popcount to be fast (TODO).
+#[inline(always)]
+pub fn kary_search<const K: usize>(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
+    const {
+        assert!(
+            matches!(K, 2 | 4 | 8 | 16 | 32),
+            "K must be one of 2, 4, 8, 16, or 32"
+        );
+    };
+
+    let mut base = 0usize;
+    let mut range = COMPRESSION_BLOCK_SIZE;
+
+    loop {
+        let step = range / K;
+        if step == 0 {
+            break;
        }
+        debug_assert_eq!(range % K, 0);
+        // Count how many segment-end pivots are < target (branchless, unrolled).
+        let mut count = 0usize;
+        for i in 1..K {
+            count += (arr[base + i * step - 1] < target) as usize;
+        }
+        base += count * step;
+        range = step;
    }
-    start
+
+    // Linear scan over the remaining `range` (< K) elements.
+    let mut count = 0usize;
+    for i in 0..range {
+        count += (arr[base + i] < target) as usize;
+    }
+    base + count
+}
+
+/// Entry point used by postings; implemented as an 8-ary branchless search.
+#[inline]
+pub fn search_block(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
+    kary_search::<8>(arr, target)
 }

 #[cfg(test)]
@@ -39,7 +81,7 @@ mod tests {

    use proptest::prelude::*;

-    use super::branchless_binary_search;
+    use super::{kary_search, search_block};
    use crate::docset::TERMINATED;
    use crate::postings::compression::COMPRESSION_BLOCK_SIZE;

@@ -57,7 +99,7 @@ mod tests {
        assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE);
        let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
        output_buffer[..block.len()].copy_from_slice(block);
-        assert_eq!(branchless_binary_search(&output_buffer, target), cursor);
+        assert_eq!(search_block(&output_buffer, target), cursor);
    }

    fn util_test_search_in_block_all(block: &[u32]) {
@@ -80,6 +122,44 @@ mod tests {
        util_test_search_in_block_all(&v[..]);
    }

+    #[test]
+    fn test_search_in_branchless_binary_search_corner_cases() {
+        let all_same = vec![7u32; COMPRESSION_BLOCK_SIZE];
+        util_test_search_in_block_all(&all_same);
+
+        let repeated_across_pivots: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE)
+            .map(|i| (i / 17) as u32)
+            .collect();
+        util_test_search_in_block_all(&repeated_across_pivots);
+
+        let mut padded_last_block = vec![0u32; COMPRESSION_BLOCK_SIZE];
+        for (i, value) in padded_last_block.iter_mut().enumerate() {
+            *value = if i < COMPRESSION_BLOCK_SIZE / 2 {
+                i as u32
+            } else {
+                TERMINATED
+            };
+        }
+        util_test_search_in_block_all(&padded_last_block);
+    }
+
+    #[test]
+    fn test_kary_search_allowed_branching_factors() {
+        let mut block = [TERMINATED; COMPRESSION_BLOCK_SIZE];
+        for (idx, value) in block.iter_mut().enumerate() {
+            *value = (idx / 3) as u32;
+        }
+
+        for target in [0, 1, 17, block[COMPRESSION_BLOCK_SIZE - 1]] {
+            let expected = search_in_block_trivial_but_slow(&block, target);
+            assert_eq!(kary_search::<2>(&block, target), expected);
+            assert_eq!(kary_search::<4>(&block, target), expected);
+            assert_eq!(kary_search::<8>(&block, target), expected);
+            assert_eq!(kary_search::<16>(&block, target), expected);
+            assert_eq!(kary_search::<32>(&block, target), expected);
+        }
+    }
+
    fn monotonous_block() -> impl Strategy<Value = Vec<u32>> {
        prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| {
            let mut el = 0;
--- a/src/postings/compression/mod.rs
+++ b/src/postings/compression/mod.rs
@@ -158,7 +158,7 @@ impl BlockDecoder {
    /// Uses the padded buffer to enable branchless search.
    #[inline]
    pub(crate) fn seek_within_block(&self, target: u32) -> usize {
-        crate::postings::branchless_binary_search(&self.output, target)
+        crate::postings::search_block(&self.output, target)
    }

    #[inline]
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -2,7 +2,7 @@

 mod block_search;

-pub(crate) use self::block_search::branchless_binary_search;
+pub(crate) use self::block_search::search_block;

 mod block_segment_postings;
 pub(crate) mod compression;
--- a/src/query/boolean_query/boolean_weight.rs
+++ b/src/query/boolean_query/boolean_weight.rs
@@ -91,14 +91,10 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
    num_docs: u32,
 ) -> Box<dyn Scorer> {
    match scorer {
-        SpecializedScorer::TermUnion(mut term_scorers) => {
-            if term_scorers.len() == 1 {
-                Box::new(term_scorers.pop().unwrap())
-            } else {
-                let union_scorer =
-                    BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
-                Box::new(union_scorer)
-            }
+        SpecializedScorer::TermUnion(term_scorers) => {
+            let union_scorer =
+                BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
+            Box::new(union_scorer)
        }
        SpecializedScorer::TermIntersection(term_scorers) => {
            let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
@@ -508,15 +504,10 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
        let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
        let num_docs = reader.num_docs();
        match scorer {
-            SpecializedScorer::TermUnion(mut term_scorers) => {
-                if term_scorers.len() == 1 {
-                    let mut term_scorer = term_scorers.pop().unwrap();
-                    for_each_scorer(&mut term_scorer, callback);
-                } else {
-                    let mut union_scorer =
-                        BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
-                    for_each_scorer(&mut union_scorer, callback);
-                }
+            SpecializedScorer::TermUnion(term_scorers) => {
+                let mut union_scorer =
+                    BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
+                for_each_scorer(&mut union_scorer, callback);
            }
            SpecializedScorer::TermIntersection(term_scorers) => {
                let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
@@ -543,15 +534,10 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];

        match scorer {
-            SpecializedScorer::TermUnion(mut term_scorers) => {
-                if term_scorers.len() == 1 {
-                    let mut term_scorer = term_scorers.pop().unwrap();
-                    for_each_docset_buffered(&mut term_scorer, &mut buffer, callback);
-                } else {
-                    let mut union_scorer =
-                        BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
-                    for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
-                }
+            SpecializedScorer::TermUnion(term_scorers) => {
+                let mut union_scorer =
+                    BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
+                for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
            }
            SpecializedScorer::TermIntersection(term_scorers) => {
                let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
--- a/src/query/union/buffered_union.rs
+++ b/src/query/union/buffered_union.rs
@@ -55,11 +55,6 @@ pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
    num_docs: u32,
 }

-// Keep this helper out-of-line. When LLVM inlines it into
-// `BufferedUnionScorer::advance`, the full traversal path used by combined
-// collectors such as `(TopDocs, Count)` becomes sensitive to unrelated codegen
-// changes and regresses on large unions.
-#[inline(never)]
 fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
    scorers: &mut Vec<TScorer>,
    bitsets: &mut [TinySet; HORIZON_NUM_TINYBITSETS],