fix term aggregation u32::MAX overflow issue

explain why naive scorer must accumulate scores in WAND order
fix flaky test
2026-06-19 08:50:47 +00:00 · 2026-06-18 17:07:43 +08:00 · 2026-06-17 18:58:58 +08:00 · 2026-06-17 18:58:58 +08:00
8 changed files with 250 additions and 194 deletions
--- a/benches/bool_queries_with_range.rs
+++ b/benches/bool_queries_with_range.rs
@@ -110,31 +110,43 @@ fn main() {
    // Prepare corpora with varying scenarios
    let scenarios = vec![
        (
-            "dense and 0.1% a".to_string(),
-            5_000_000,
-            0.001,
+            "dense and 99% a".to_string(),
+            10_000_000,
+            0.99,
            "dense",
            0,
            9,
        ),
-        ("dense and 1% a".to_string(), 5_000_000, 0.01, "dense", 0, 9),
-        ("dense and 10% a".to_string(), 5_000_000, 0.1, "dense", 0, 9),
        (
-            "sparse and 50% a".to_string(),
-            5_000_000,
+            "dense and 99% a".to_string(),
+            10_000_000,
+            0.99,
+            "dense",
+            990,
+            999,
+        ),
+        (
+            "sparse and 99% a".to_string(),
+            10_000_000,
            0.99,
            "sparse",
            0,
            9,
        ),
+        (
+            "sparse and 99% a".to_string(),
+            10_000_000,
+            0.99,
+            "sparse",
+            9_999_990,
+            9_999_999,
+        ),
    ];

    let mut runner = BenchRunner::new();
-    for (scenario_id, num_docs, p_title_a, num_rand_distribution, range_low, range_high) in
-        scenarios
-    {
+    for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios {
        // Build index for this scenario
-        let bench_index = build_shared_indices(num_docs, p_title_a, num_rand_distribution);
+        let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution);

        // Create benchmark group
        let mut group = runner.new_group();
@@ -146,7 +158,7 @@ fn main() {
        let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"];

        // Define the three terms we want to test with
-        let terms = ["a"];
+        let terms = ["a", "b", "z"];

        // Generate all combinations of terms and field names
        let mut queries = Vec::new();
@@ -191,7 +203,7 @@ fn run_benchmark_tasks(
        bench_index,
        query_str,
        DocSetCollector,
-        "all_results",
+        "all results",
    );

    // Test top 100 by the field (if it's a FAST field)
--- a/src/aggregation/bucket/composite/collector.rs
+++ b/src/aggregation/bucket/composite/collector.rs
@@ -275,7 +275,7 @@ impl SegmentCompositeCollector {
            dict.insert(
                key,
                IntermediateCompositeBucketEntry {
-                    doc_count: agg.count,
+                    doc_count: agg.count as u64,
                    sub_aggregation: sub_aggregation_res,
                },
            );
--- a/src/aggregation/bucket/term_agg/mod.rs
+++ b/src/aggregation/bucket/term_agg/mod.rs
@@ -957,7 +957,7 @@ fn into_intermediate_bucket_entry(
        )?;
    }
    Ok(IntermediateTermBucketEntry {
-        doc_count: bucket.count,
+        doc_count: bucket.count as u64,
        sub_aggregation: sub_aggregation_res,
    })
 }
--- a/src/aggregation/bucket/term_missing_agg.rs
+++ b/src/aggregation/bucket/term_missing_agg.rs
@@ -98,7 +98,7 @@ impl SegmentAggregationCollector for TermMissingAgg {

        let missing_count = &self.missing_count_per_bucket[parent_bucket_id as usize];
        let mut missing_entry = IntermediateTermBucketEntry {
-            doc_count: missing_count.missing_count,
+            doc_count: missing_count.missing_count as u64,
            sub_aggregation: Default::default(),
        };
        if let Some(sub_agg) = &mut self.sub_agg {
--- a/src/aggregation/intermediate_agg_result.rs
+++ b/src/aggregation/intermediate_agg_result.rs
@@ -930,7 +930,7 @@ impl IntermediateRangeBucketEntry {
 #[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateTermBucketEntry {
    /// The number of documents in the bucket.
-    pub doc_count: u32,
+    pub doc_count: u64,
    /// The sub_aggregation in this bucket.
    pub sub_aggregation: IntermediateAggregationResults,
 }
@@ -1240,6 +1240,24 @@ mod tests {
        assert_eq!(tree_left, tree_expected);
    }

+    #[test]
+    fn test_term_bucket_doc_count_no_u32_overflow() {
+        // Two segments each contributing (u32::MAX - 100) docs to the same term. Summing them
+        // overflowed when doc_count was u32.
+        let per_segment = u32::MAX as u64 - 100;
+        let mut entry = IntermediateTermBucketEntry {
+            doc_count: per_segment,
+            sub_aggregation: Default::default(),
+        };
+        entry
+            .merge_fruits(IntermediateTermBucketEntry {
+                doc_count: per_segment,
+                sub_aggregation: Default::default(),
+            })
+            .unwrap();
+        assert_eq!(entry.doc_count, per_segment * 2);
+    }
+
    #[test]
    fn test_merge_fruits_tree_empty() {
        let mut tree_left = get_intermediate_tree_with_ranges(&[
--- a/src/query/boolean_query/block_wand_intersection.rs
+++ b/src/query/boolean_query/block_wand_intersection.rs
@@ -273,8 +273,14 @@ mod tests {
            }

            if all_match {
-                let score: Score =
-                    leader.score() + secondaries.iter_mut().map(|s| s.score()).sum::<Score>();
+                // Accumulate in the same left-to-right order as the WAND implementation
+                // (leader first, then each secondary in turn).  Float addition is not
+                // associative, so `leader + secondaries.sum()` gives a different bit
+                // pattern and can cause spurious nearly_equals failures.
+                let mut score: Score = leader.score();
+                for secondary in secondaries.iter_mut() {
+                    score += secondary.score();
+                }

                if score > limit {
                    heap.push(Float(score));
@@ -417,6 +423,198 @@ mod tests {
        }
    }

+    #[test]
+    fn test_block_wand_intersection_three_scorers_regression() {
+        // Minimal failing case found by proptest (CI run 27557430583, job 81460063906).
+        // Posting list 0 spans docs 0–63 (all present, doc 8 has tf=80, doc 26 tf=4, rest tf=1).
+        // Posting lists 1 and 2 are sparse with varying term freqs, and doc 16/64 appear only
+        // in lists 1/2 but not list 0.  The high tf=80 on doc 8 of list 0 makes the WAND
+        // upper-bound estimation skip documents that the naive intersection would score.
+        let posting_lists: &[&[(DocId, u32)]] = &[
+            &[
+                (0, 1),
+                (1, 1),
+                (2, 1),
+                (3, 1),
+                (4, 1),
+                (5, 1),
+                (6, 1),
+                (7, 1),
+                (8, 80),
+                (9, 1),
+                (10, 1),
+                (11, 1),
+                (12, 1),
+                (13, 1),
+                (14, 1),
+                (15, 1),
+                (17, 1),
+                (18, 1),
+                (19, 1),
+                (20, 1),
+                (21, 1),
+                (22, 1),
+                (23, 1),
+                (24, 1),
+                (25, 1),
+                (26, 4),
+                (27, 1),
+                (28, 1),
+                (29, 1),
+                (30, 1),
+                (31, 1),
+                (32, 1),
+                (33, 1),
+                (34, 1),
+                (35, 1),
+                (36, 1),
+                (37, 1),
+                (38, 1),
+                (39, 1),
+                (40, 1),
+                (41, 1),
+                (42, 1),
+                (43, 1),
+                (44, 1),
+                (45, 1),
+                (46, 1),
+                (47, 1),
+                (48, 1),
+                (49, 1),
+                (50, 1),
+                (51, 1),
+                (52, 1),
+                (53, 1),
+                (54, 1),
+                (55, 1),
+                (56, 1),
+                (57, 1),
+                (58, 1),
+                (59, 1),
+                (60, 1),
+                (61, 1),
+                (62, 1),
+                (63, 1),
+            ],
+            &[
+                (0, 2),
+                (3, 98),
+                (7, 93),
+                (8, 87),
+                (9, 39),
+                (10, 2),
+                (12, 71),
+                (14, 47),
+                (15, 76),
+                (16, 6),
+                (17, 38),
+                (19, 61),
+                (20, 87),
+                (21, 1),
+                (22, 5),
+                (23, 43),
+                (25, 48),
+                (26, 87),
+                (28, 81),
+                (29, 69),
+                (30, 7),
+                (31, 47),
+                (32, 32),
+                (33, 38),
+                (35, 39),
+                (38, 65),
+                (39, 98),
+                (42, 43),
+                (43, 52),
+                (44, 99),
+                (45, 88),
+                (48, 24),
+                (51, 61),
+                (52, 22),
+                (53, 58),
+                (55, 26),
+                (56, 32),
+                (58, 57),
+                (60, 29),
+                (61, 78),
+                (62, 9),
+                (63, 44),
+                (64, 29),
+            ],
+            &[
+                (0, 94),
+                (2, 49),
+                (3, 63),
+                (4, 7),
+                (6, 93),
+                (7, 17),
+                (8, 91),
+                (9, 18),
+                (10, 85),
+                (11, 11),
+                (12, 45),
+                (13, 42),
+                (15, 91),
+                (16, 44),
+                (17, 36),
+                (18, 68),
+                (19, 24),
+                (20, 17),
+                (21, 59),
+                (22, 97),
+                (24, 20),
+                (25, 7),
+                (26, 85),
+                (27, 69),
+                (28, 78),
+                (29, 84),
+                (30, 35),
+                (31, 49),
+                (33, 83),
+                (34, 97),
+                (35, 29),
+                (36, 43),
+                (37, 59),
+                (38, 79),
+                (39, 74),
+                (40, 21),
+                (41, 5),
+                (42, 47),
+                (43, 27),
+                (44, 59),
+                (45, 97),
+                (46, 91),
+                (47, 81),
+                (48, 57),
+                (49, 47),
+                (50, 64),
+                (51, 86),
+                (52, 60),
+                (53, 52),
+                (54, 14),
+                (55, 23),
+                (56, 64),
+                (57, 40),
+                (58, 5),
+                (59, 30),
+                (60, 81),
+                (61, 62),
+                (62, 39),
+                (63, 93),
+                (64, 82),
+            ],
+        ];
+        let fieldnorms: &[u32] = &[
+            624, 668, 725, 670, 851, 169, 537, 627, 200, 757, 51, 272, 835, 89, 750, 63, 272, 406,
+            394, 390, 822, 449, 257, 571, 527, 855, 4, 98, 548, 413, 539, 351, 596, 151, 728, 152,
+            766, 829, 20, 828, 477, 251, 743, 646, 136, 477, 909, 907, 266, 341, 676, 161, 40, 384,
+            347, 707, 42, 397, 482, 814, 801, 528, 465, 410, 171,
+        ];
+        let posting_lists_owned: Vec<Vec<(DocId, u32)>> =
+            posting_lists.iter().map(|pl| pl.to_vec()).collect();
+        test_block_wand_intersection_aux(&posting_lists_owned, fieldnorms);
+    }
+
    #[test]
    fn test_block_wand_intersection_disjoint() {
        // Two posting lists with no overlap — intersection is empty.
--- a/src/query/const_score_query.rs
+++ b/src/query/const_score_query.rs
@@ -1,6 +1,6 @@
 use std::fmt;

-use crate::docset::{SeekDangerResult, COLLECT_BLOCK_BUFFER_LEN};
+use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
 use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
 use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};

@@ -119,10 +119,6 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
        self.docset.seek(target)
    }

-    fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
-        self.docset.seek_danger(target)
-    }
-
    fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
        self.docset.fill_buffer(buffer)
    }
--- a/src/query/range_query/fast_field_range_doc_set.rs
+++ b/src/query/range_query/fast_field_range_doc_set.rs
@@ -3,7 +3,6 @@ use std::ops::RangeInclusive;

 use columnar::Column;

-use crate::docset::SeekDangerResult;
 use crate::{DocId, DocSet, TERMINATED};

 /// Helper to have a cursor over a vec of docids
@@ -185,37 +184,6 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
        doc
    }

-    /// `seek_danger` only needs to answer whether `target` itself matches, so it does a cheap
-    /// point lookup on the column instead of scanning forward to materialize the next match (the
-    /// expensive part of a regular `seek`).
-    fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
-        // Covers `target == TERMINATED` and any target past the last doc: no match is possible.
-        if target >= self.column.num_docs() {
-            return SeekDangerResult::SeekLowerBound(TERMINATED);
-        }
-
-        if self.is_last_seek_distance_large(target) {
-            self.reset_fetch_range();
-        }
-        self.last_seek_pos_opt = Some(target);
-
-        let is_match = self
-            .column
-            .values_for_doc(target)
-            .any(|value| self.value_range.contains(&value));
-        if is_match {
-            // Leave the docset in a valid state positioned on `target`, so `doc()` returns it and a
-            // following `advance()` resumes the scan right after it.
-            self.loaded_docs.get_cleared_data().push(target);
-            self.next_fetch_start = target + 1;
-            SeekDangerResult::Found
-        } else {
-            // `target` is not in the docset. The next match is strictly greater than `target`, so
-            // `target + 1` is a valid lower bound. We may leave the docset in an invalid state.
-            SeekDangerResult::SeekLowerBound(target + 1)
-        }
-    }
-
    fn size_hint(&self) -> u32 {
        // TODO: Implement a better size hint
        self.column.num_docs() / 10
@@ -241,148 +209,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe

 #[cfg(test)]
 mod tests {
-    use std::ops::{Bound, RangeInclusive};
+    use std::ops::Bound;

-    use columnar::Column;
-
-    use super::RangeDocSet;
    use crate::collector::Count;
    use crate::directory::RamDirectory;
-    use crate::docset::{SeekDangerResult, TERMINATED};
    use crate::query::RangeQuery;
-    use crate::{schema, DocSet, Index, IndexBuilder, TantivyDocument, Term};
-
-    /// Builds a single-segment index where doc `i` carries `values_for_doc(i)` in a u64 fast
-    /// field, then returns its column so we can drive a `RangeDocSet` directly.
-    fn build_u64_column(
-        num_docs: usize,
-        values_for_doc: impl Fn(usize) -> Vec<u64>,
-    ) -> Column<u64> {
-        let mut schema_builder = schema::SchemaBuilder::new();
-        let value_field = schema_builder.add_u64_field("value", schema::FAST);
-        let index = Index::create_in_ram(schema_builder.build());
-        {
-            let mut writer = index.writer_for_tests().unwrap();
-            for i in 0..num_docs {
-                let mut doc = TantivyDocument::new();
-                for v in values_for_doc(i) {
-                    doc.add_u64(value_field, v);
-                }
-                writer.add_document(doc).unwrap();
-            }
-            writer.commit().unwrap();
-        }
-        let searcher = index.reader().unwrap().searcher();
-        assert_eq!(searcher.segment_readers().len(), 1);
-        searcher
-            .segment_reader(0)
-            .fast_fields()
-            .u64("value")
-            .unwrap()
-    }
-
-    fn range_docset(
-        value_range: RangeInclusive<u64>,
-        num_docs: usize,
-        values_for_doc: impl Fn(usize) -> Vec<u64>,
-    ) -> RangeDocSet<u64> {
-        RangeDocSet::new(value_range, build_u64_column(num_docs, values_for_doc))
-    }
-
-    #[test]
-    fn seek_danger_found_leaves_valid_state() {
-        // Even docs match the range, odd docs do not.
-        let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);
-
-        // Matching target: `Found`, and the docset is positioned exactly on it.
-        assert_eq!(docset.seek_danger(10), SeekDangerResult::Found);
-        assert_eq!(docset.doc(), 10);
-        // A following advance resumes the scan right after the found doc.
-        assert_eq!(docset.advance(), 12);
-        assert_eq!(docset.doc(), 12);
-    }
-
-    #[test]
-    fn seek_danger_miss_returns_lower_bound() {
-        let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);
-
-        // Odd target does not match: lower bound is strictly greater than the target and never
-        // skips past the next real match (here doc 12, the first even doc after 11).
-        match docset.seek_danger(11) {
-            SeekDangerResult::SeekLowerBound(lower_bound) => {
-                assert!(lower_bound > 11);
-                assert!(lower_bound <= 12);
-            }
-            SeekDangerResult::Found => panic!("11 should not match"),
-        }
-        // After a miss we may be in an invalid state; another seek_danger recovers it.
-        assert_eq!(docset.seek_danger(12), SeekDangerResult::Found);
-        assert_eq!(docset.doc(), 12);
-    }
-
-    #[test]
-    fn seek_danger_terminated_and_out_of_bounds() {
-        let mut docset = range_docset(0..=0, 10, |i| vec![(i % 2) as u64]);
-        assert_eq!(
-            docset.seek_danger(TERMINATED),
-            SeekDangerResult::SeekLowerBound(TERMINATED)
-        );
-        // A target past the last doc has no possible match either.
-        assert_eq!(
-            docset.seek_danger(10),
-            SeekDangerResult::SeekLowerBound(TERMINATED)
-        );
-    }
-
-    #[test]
-    fn seek_danger_multivalued() {
-        // Doc `i` holds values [i, i+1]; the range {5} matches docs 4 and 5.
-        let mut docset = range_docset(5..=5, 20, |i| vec![i as u64, i as u64 + 1]);
-
-        assert_eq!(docset.seek_danger(4), SeekDangerResult::Found);
-        assert_eq!(docset.doc(), 4);
-        assert_eq!(docset.advance(), 5);
-        // No further match after doc 5.
-        assert_eq!(docset.advance(), TERMINATED);
-    }
-
-    #[test]
-    fn seek_danger_matches_seek() {
-        // Cross-check seek_danger against the true next match for every target, on a column with a
-        // few sparse matches.
-        let matches = [3u32, 7, 50, 51, 99];
-        let num_docs = 100;
-        let values_for_doc = |i: usize| {
-            vec![if matches.contains(&(i as u32)) {
-                1u64
-            } else {
-                0u64
-            }]
-        };
-
-        for target in 0..num_docs as u32 {
-            // The first matching doc greater than or equal to `target`, i.e. what `seek` returns.
-            let expected = matches
-                .iter()
-                .copied()
-                .find(|&m| m >= target)
-                .unwrap_or(TERMINATED);
-
-            let mut danger = range_docset(1..=1, num_docs, values_for_doc);
-            match danger.seek_danger(target) {
-                SeekDangerResult::Found => {
-                    assert_eq!(expected, target, "target {target} reported Found");
-                    assert_eq!(danger.doc(), target);
-                }
-                SeekDangerResult::SeekLowerBound(lower_bound) => {
-                    assert_ne!(expected, target, "target {target} should have been Found");
-                    assert!(lower_bound > target);
-                    // The lower bound must never skip past the true next match.
-                    assert!(lower_bound <= expected);
-                }
-            }
-        }
-    }
+    use crate::{schema, IndexBuilder, TantivyDocument, Term};

    #[test]
    fn range_query_fast_optional_field_minimum() {
Author	SHA1	Message	Date
Pascal Seitz	1e859fd78d	fix term aggregation u32::MAX overflow issue	2026-06-18 17:07:43 +08:00
Pascal Seitz	f451fa938f	explain why naive scorer must accumulate scores in WAND order	2026-06-17 18:58:58 +08:00
Pascal Seitz	2a82dd6f64	fix flaky test	2026-06-17 18:58:58 +08:00