try to use select-nth instead of full sort in segment level agg top-k selection

2026-06-30 14:20:42 +00:00 · 2026-06-29 09:13:21 +00:00
4 changed files with 62 additions and 128 deletions
--- a/src/aggregation/bucket/term_agg/mod.rs
+++ b/src/aggregation/bucket/term_agg/mod.rs
@@ -981,19 +981,27 @@ where
    ) -> crate::Result<IntermediateBucketResult> {
        let mut entries: Vec<(u64, Bucket)> = term_buckets.into_vec();

+        let segment_size = term_req.req.segment_size as usize;
+
+        // select_nth_unstable_by_key(segment_size, ...) places the (k+1)-th element at
+        // entries[segment_size] and guarantees entries[0..segment_size] are the top-k,
+        // unordered. We need this to properly compute term_doc_count_before_cutoff.
        match &term_req.req.order.target {
            OrderTarget::Key => {
                // We rely on the fact, that term ordinals match the order of the strings
                // TODO: We could have a special collector, that keeps only TOP n results at any
                // time.
-                if term_req.req.order.order == Order::Desc {
-                    entries.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.0));
-                } else {
-                    entries.sort_unstable_by_key(|bucket| bucket.0);
+                if entries.len() > segment_size {
+                    if term_req.req.order.order == Order::Desc {
+                        entries
+                            .select_nth_unstable_by_key(segment_size, |b| std::cmp::Reverse(b.0));
+                    } else {
+                        entries.select_nth_unstable_by_key(segment_size, |b| b.0);
+                    }
                }
            }
            OrderTarget::SubAggregation(sub_agg_path) => {
-                // Peek segment-level metric values, sort, then fall through to
+                // Peek segment-level metric values, select top-k, then fall through to
                // `cut_off_buckets`. Like Elasticsearch, we always cut off when ordering
                // by a sub-agg: top-K results are approximate and may differ from the
                // global ordering, especially for non-monotonic metrics like avg/min.
@@ -1003,7 +1011,7 @@ where
                    ))
                })?;
                let (agg_name, agg_prop) = get_agg_name_and_property(sub_agg_path);
-                // Fetch values up-front; otherwise sort would re-compute per comparison
+                // Fetch values up-front; otherwise sort would re-compute per call
                let mut keyed: Vec<(f64, (u64, Bucket))> = entries
                    .into_iter()
                    .map(|bucket| {
@@ -1013,28 +1021,34 @@ where
                        (metric_value, bucket)
                    })
                    .collect();
-                if term_req.req.order.order == Order::Desc {
-                    keyed.sort_unstable_by(|a, b| {
-                        b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)
-                    });
-                } else {
-                    keyed.sort_unstable_by(|a, b| {
-                        a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)
-                    });
+                if keyed.len() > segment_size {
+                    if term_req.req.order.order == Order::Desc {
+                        keyed.select_nth_unstable_by(segment_size, |a, b| {
+                            b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)
+                        });
+                    } else {
+                        keyed.select_nth_unstable_by(segment_size, |a, b| {
+                            a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)
+                        });
+                    }
                }
                entries = keyed.into_iter().map(|(_, e)| e).collect();
            }
            OrderTarget::Count => {
-                if term_req.req.order.order == Order::Desc {
-                    entries.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.1.count));
-                } else {
-                    entries.sort_unstable_by_key(|bucket| bucket.1.count);
+                if entries.len() > segment_size {
+                    if term_req.req.order.order == Order::Desc {
+                        entries.select_nth_unstable_by_key(segment_size, |b| {
+                            std::cmp::Reverse(b.1.count)
+                        });
+                    } else {
+                        entries.select_nth_unstable_by_key(segment_size, |b| b.1.count);
+                    }
                }
            }
        }

        let (term_doc_count_before_cutoff, sum_other_doc_count) =
-            cut_off_buckets(&mut entries, term_req.req.segment_size as usize);
+            cut_off_buckets(&mut entries, segment_size);

        let mut dict: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> = Default::default();
        dict.reserve(entries.len());
--- a/src/postings/block_search.rs
+++ b/src/postings/block_search.rs
@@ -1,78 +1,36 @@
 use crate::postings::compression::COMPRESSION_BLOCK_SIZE;

-/// Returns the index of the first element in `arr` that is greater than or
-/// equal to `target`.
+/// Search the first index containing an element greater or equal to
+/// the target.
 ///
-/// This is equivalent to:
-///
-/// ```ignore
-/// arr.iter().take_while(|&&val| val < target).count()
+/// The results should be equivalent to
+/// ```compile_fail
+/// block[..]
+//       .iter()
+//       .take_while(|&&val| val < target)
+//       .count()
 /// ```
+/// 
+/// the `start` argument is just used to hint that the response is
+/// greater than beyond `start`. The implementation may or may not use
+/// it for optimization.
 ///
-/// # Assumptions
+/// # Assumption
 ///
-/// - `arr` is sorted in nondecreasing order. Values may be repeated; the last block is often padded
-///   with duplicates of its final value.
-/// - `target` is less than or equal to the last element in `arr`, so the result is always a valid
-///   index into the block.
-///
-/// # `K`
-///
-/// `K` is the branching factor. Each reduction probes `K - 1` segment-end
-/// pivots, keeps the matching segment, and finally linearly scans the remaining
-/// range. `K` must be one of `2`, `4`, `8`, `16`, or `32`.
-///
-/// This is the "k-ary search on a sorted array" variant of Schlegel, Gemulla & Lehner,
-/// "k-Ary Search on Modern Processors", DaMoN 2009 (<https://dl.acm.org/doi/10.1145/1565694.1565705>),
-/// specialized to a lower-bound (no equality early-exit) with a linear scan over the final
-/// `< K` elements. We do not use their linearized-tree (`k-ary-lt`) layout, which would require
-/// reordering the block.
-///
-/// The core idea vs a traditional binary search is that we can check multiple numbers in parallel,
-/// which better utilizes the CPU's instruction-level parallelism.
-///
-/// `kary_search::<8>` reduces in three steps: 128 -> 16 -> 2, then a 2-element scan. It could be
-/// done in only two steps (128 -> 16, then scanning all 16 contiguous elements). For that
-/// we need popcount for that to be fast though (TODO).
-#[inline(always)]
-pub fn kary_search<const K: usize>(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
-    const {
-        assert!(
-            matches!(K, 2 | 4 | 8 | 16 | 32),
-            "K must be one of 2, 4, 8, 16, or 32"
-        );
-    };
-
-    let mut base = 0usize;
-    let mut range = COMPRESSION_BLOCK_SIZE;
-
-    loop {
-        let step = range / K;
-        if step == 0 {
-            break;
+/// - The block is sorted. Some elements may appear several times. This is the case at the
+///   end of the last block for instance.
+/// - The target is assumed smaller or equal to the last element of the block.
+pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
+    let mut start = 0;
+    let mut len = arr.len();
+    for _ in 0..7 {
+        len /= 2;
+        let pivot = unsafe { *arr.get_unchecked(start + len - 1) };
+        if pivot < target {
+            start += len;
        }
-        debug_assert_eq!(range % K, 0);
-        // Count how many segment-end pivots are < target (branchless, unrolled).
-        let mut count = 0usize;
-        for i in 1..K {
-            count += (arr[base + i * step - 1] < target) as usize;
-        }
-        base += count * step;
-        range = step;
    }
-
-    // Linear scan over the ≤K remaining elements.
-    let mut count = 0usize;
-    for i in 0..range {
-        count += (arr[base + i] < target) as usize;
-    }
-    base + count
-}
-
-/// entry point used by postings; implemented as an 8-ary branchless search.
-#[inline]
-pub fn search_block(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
-    kary_search::<8>(arr, target)
+    start
 }

 #[cfg(test)]
@@ -81,7 +39,7 @@ mod tests {

    use proptest::prelude::*;

-    use super::{kary_search, search_block};
+    use super::branchless_binary_search;
    use crate::docset::TERMINATED;
    use crate::postings::compression::COMPRESSION_BLOCK_SIZE;

@@ -99,7 +57,7 @@ mod tests {
        assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE);
        let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
        output_buffer[..block.len()].copy_from_slice(block);
-        assert_eq!(search_block(&output_buffer, target), cursor);
+        assert_eq!(branchless_binary_search(&output_buffer, target), cursor);
    }

    fn util_test_search_in_block_all(block: &[u32]) {
@@ -122,44 +80,6 @@ mod tests {
        util_test_search_in_block_all(&v[..]);
    }

-    #[test]
-    fn test_search_in_branchless_binary_search_corner_cases() {
-        let all_same = vec![7u32; COMPRESSION_BLOCK_SIZE];
-        util_test_search_in_block_all(&all_same);
-
-        let repeated_across_pivots: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE)
-            .map(|i| (i / 17) as u32)
-            .collect();
-        util_test_search_in_block_all(&repeated_across_pivots);
-
-        let mut padded_last_block = vec![0u32; COMPRESSION_BLOCK_SIZE];
-        for (i, value) in padded_last_block.iter_mut().enumerate() {
-            *value = if i < COMPRESSION_BLOCK_SIZE / 2 {
-                i as u32
-            } else {
-                TERMINATED
-            };
-        }
-        util_test_search_in_block_all(&padded_last_block);
-    }
-
-    #[test]
-    fn test_kary_search_allowed_branching_factors() {
-        let mut block = [TERMINATED; COMPRESSION_BLOCK_SIZE];
-        for (idx, value) in block.iter_mut().enumerate() {
-            *value = (idx / 3) as u32;
-        }
-
-        for target in [0, 1, 17, block[COMPRESSION_BLOCK_SIZE - 1]] {
-            let expected = search_in_block_trivial_but_slow(&block, target);
-            assert_eq!(kary_search::<2>(&block, target), expected);
-            assert_eq!(kary_search::<4>(&block, target), expected);
-            assert_eq!(kary_search::<8>(&block, target), expected);
-            assert_eq!(kary_search::<16>(&block, target), expected);
-            assert_eq!(kary_search::<32>(&block, target), expected);
-        }
-    }
-
    fn monotonous_block() -> impl Strategy<Value = Vec<u32>> {
        prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| {
            let mut el = 0;
--- a/src/postings/compression/mod.rs
+++ b/src/postings/compression/mod.rs
@@ -158,7 +158,7 @@ impl BlockDecoder {
    /// Uses the padded buffer to enable branchless search.
    #[inline]
    pub(crate) fn seek_within_block(&self, target: u32) -> usize {
-        crate::postings::search_block(&self.output, target)
+        crate::postings::branchless_binary_search(&self.output, target)
    }

    #[inline]
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -2,7 +2,7 @@

 mod block_search;

-pub(crate) use self::block_search::search_block;
+pub(crate) use self::block_search::branchless_binary_search;

 mod block_segment_postings;
 pub(crate) mod compression;