Compare commits

..

3 Commits

Author SHA1 Message Date
Pascal Seitz
1e859fd78d fix term aggregation u32::MAX overflow issue 2026-06-18 17:07:43 +08:00
Pascal Seitz
f451fa938f explain why naive scorer must accumulate scores in WAND order 2026-06-17 18:58:58 +08:00
Pascal Seitz
2a82dd6f64 fix flaky test 2026-06-17 18:58:58 +08:00
8 changed files with 250 additions and 194 deletions

View File

@@ -110,31 +110,43 @@ fn main() {
// Prepare corpora with varying scenarios
let scenarios = vec![
(
"dense and 0.1% a".to_string(),
5_000_000,
0.001,
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense",
0,
9,
),
("dense and 1% a".to_string(), 5_000_000, 0.01, "dense", 0, 9),
("dense and 10% a".to_string(), 5_000_000, 0.1, "dense", 0, 9),
(
"sparse and 50% a".to_string(),
5_000_000,
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense",
990,
999,
),
(
"sparse and 99% a".to_string(),
10_000_000,
0.99,
"sparse",
0,
9,
),
(
"sparse and 99% a".to_string(),
10_000_000,
0.99,
"sparse",
9_999_990,
9_999_999,
),
];
let mut runner = BenchRunner::new();
for (scenario_id, num_docs, p_title_a, num_rand_distribution, range_low, range_high) in
scenarios
{
for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios {
// Build index for this scenario
let bench_index = build_shared_indices(num_docs, p_title_a, num_rand_distribution);
let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution);
// Create benchmark group
let mut group = runner.new_group();
@@ -146,7 +158,7 @@ fn main() {
let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"];
// Define the three terms we want to test with
let terms = ["a"];
let terms = ["a", "b", "z"];
// Generate all combinations of terms and field names
let mut queries = Vec::new();
@@ -191,7 +203,7 @@ fn run_benchmark_tasks(
bench_index,
query_str,
DocSetCollector,
"all_results",
"all results",
);
// Test top 100 by the field (if it's a FAST field)

View File

@@ -275,7 +275,7 @@ impl SegmentCompositeCollector {
dict.insert(
key,
IntermediateCompositeBucketEntry {
doc_count: agg.count,
doc_count: agg.count as u64,
sub_aggregation: sub_aggregation_res,
},
);

View File

@@ -957,7 +957,7 @@ fn into_intermediate_bucket_entry(
)?;
}
Ok(IntermediateTermBucketEntry {
doc_count: bucket.count,
doc_count: bucket.count as u64,
sub_aggregation: sub_aggregation_res,
})
}

View File

@@ -98,7 +98,7 @@ impl SegmentAggregationCollector for TermMissingAgg {
let missing_count = &self.missing_count_per_bucket[parent_bucket_id as usize];
let mut missing_entry = IntermediateTermBucketEntry {
doc_count: missing_count.missing_count,
doc_count: missing_count.missing_count as u64,
sub_aggregation: Default::default(),
};
if let Some(sub_agg) = &mut self.sub_agg {

View File

@@ -930,7 +930,7 @@ impl IntermediateRangeBucketEntry {
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateTermBucketEntry {
/// The number of documents in the bucket.
pub doc_count: u32,
pub doc_count: u64,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
}
@@ -1240,6 +1240,24 @@ mod tests {
assert_eq!(tree_left, tree_expected);
}
#[test]
fn test_term_bucket_doc_count_no_u32_overflow() {
// Two segments each contributing (u32::MAX - 100) docs to the same term. Summing them
// overflowed when doc_count was u32.
let per_segment = u32::MAX as u64 - 100;
let mut entry = IntermediateTermBucketEntry {
doc_count: per_segment,
sub_aggregation: Default::default(),
};
entry
.merge_fruits(IntermediateTermBucketEntry {
doc_count: per_segment,
sub_aggregation: Default::default(),
})
.unwrap();
assert_eq!(entry.doc_count, per_segment * 2);
}
#[test]
fn test_merge_fruits_tree_empty() {
let mut tree_left = get_intermediate_tree_with_ranges(&[

View File

@@ -273,8 +273,14 @@ mod tests {
}
if all_match {
let score: Score =
leader.score() + secondaries.iter_mut().map(|s| s.score()).sum::<Score>();
// Accumulate in the same left-to-right order as the WAND implementation
// (leader first, then each secondary in turn). Float addition is not
// associative, so `leader + secondaries.sum()` gives a different bit
// pattern and can cause spurious nearly_equals failures.
let mut score: Score = leader.score();
for secondary in secondaries.iter_mut() {
score += secondary.score();
}
if score > limit {
heap.push(Float(score));
@@ -417,6 +423,198 @@ mod tests {
}
}
#[test]
fn test_block_wand_intersection_three_scorers_regression() {
// Minimal failing case found by proptest (CI run 27557430583, job 81460063906).
// Posting list 0 spans docs 063 (all present, doc 8 has tf=80, doc 26 tf=4, rest tf=1).
// Posting lists 1 and 2 are sparse with varying term freqs, and doc 16/64 appear only
// in lists 1/2 but not list 0. The high tf=80 on doc 8 of list 0 makes the WAND
// upper-bound estimation skip documents that the naive intersection would score.
let posting_lists: &[&[(DocId, u32)]] = &[
&[
(0, 1),
(1, 1),
(2, 1),
(3, 1),
(4, 1),
(5, 1),
(6, 1),
(7, 1),
(8, 80),
(9, 1),
(10, 1),
(11, 1),
(12, 1),
(13, 1),
(14, 1),
(15, 1),
(17, 1),
(18, 1),
(19, 1),
(20, 1),
(21, 1),
(22, 1),
(23, 1),
(24, 1),
(25, 1),
(26, 4),
(27, 1),
(28, 1),
(29, 1),
(30, 1),
(31, 1),
(32, 1),
(33, 1),
(34, 1),
(35, 1),
(36, 1),
(37, 1),
(38, 1),
(39, 1),
(40, 1),
(41, 1),
(42, 1),
(43, 1),
(44, 1),
(45, 1),
(46, 1),
(47, 1),
(48, 1),
(49, 1),
(50, 1),
(51, 1),
(52, 1),
(53, 1),
(54, 1),
(55, 1),
(56, 1),
(57, 1),
(58, 1),
(59, 1),
(60, 1),
(61, 1),
(62, 1),
(63, 1),
],
&[
(0, 2),
(3, 98),
(7, 93),
(8, 87),
(9, 39),
(10, 2),
(12, 71),
(14, 47),
(15, 76),
(16, 6),
(17, 38),
(19, 61),
(20, 87),
(21, 1),
(22, 5),
(23, 43),
(25, 48),
(26, 87),
(28, 81),
(29, 69),
(30, 7),
(31, 47),
(32, 32),
(33, 38),
(35, 39),
(38, 65),
(39, 98),
(42, 43),
(43, 52),
(44, 99),
(45, 88),
(48, 24),
(51, 61),
(52, 22),
(53, 58),
(55, 26),
(56, 32),
(58, 57),
(60, 29),
(61, 78),
(62, 9),
(63, 44),
(64, 29),
],
&[
(0, 94),
(2, 49),
(3, 63),
(4, 7),
(6, 93),
(7, 17),
(8, 91),
(9, 18),
(10, 85),
(11, 11),
(12, 45),
(13, 42),
(15, 91),
(16, 44),
(17, 36),
(18, 68),
(19, 24),
(20, 17),
(21, 59),
(22, 97),
(24, 20),
(25, 7),
(26, 85),
(27, 69),
(28, 78),
(29, 84),
(30, 35),
(31, 49),
(33, 83),
(34, 97),
(35, 29),
(36, 43),
(37, 59),
(38, 79),
(39, 74),
(40, 21),
(41, 5),
(42, 47),
(43, 27),
(44, 59),
(45, 97),
(46, 91),
(47, 81),
(48, 57),
(49, 47),
(50, 64),
(51, 86),
(52, 60),
(53, 52),
(54, 14),
(55, 23),
(56, 64),
(57, 40),
(58, 5),
(59, 30),
(60, 81),
(61, 62),
(62, 39),
(63, 93),
(64, 82),
],
];
let fieldnorms: &[u32] = &[
624, 668, 725, 670, 851, 169, 537, 627, 200, 757, 51, 272, 835, 89, 750, 63, 272, 406,
394, 390, 822, 449, 257, 571, 527, 855, 4, 98, 548, 413, 539, 351, 596, 151, 728, 152,
766, 829, 20, 828, 477, 251, 743, 646, 136, 477, 909, 907, 266, 341, 676, 161, 40, 384,
347, 707, 42, 397, 482, 814, 801, 528, 465, 410, 171,
];
let posting_lists_owned: Vec<Vec<(DocId, u32)>> =
posting_lists.iter().map(|pl| pl.to_vec()).collect();
test_block_wand_intersection_aux(&posting_lists_owned, fieldnorms);
}
#[test]
fn test_block_wand_intersection_disjoint() {
// Two posting lists with no overlap — intersection is empty.

View File

@@ -1,6 +1,6 @@
use std::fmt;
use crate::docset::{SeekDangerResult, COLLECT_BLOCK_BUFFER_LEN};
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
@@ -119,10 +119,6 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
self.docset.seek(target)
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
self.docset.seek_danger(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
self.docset.fill_buffer(buffer)
}

View File

@@ -3,7 +3,6 @@ use std::ops::RangeInclusive;
use columnar::Column;
use crate::docset::SeekDangerResult;
use crate::{DocId, DocSet, TERMINATED};
/// Helper to have a cursor over a vec of docids
@@ -185,37 +184,6 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
doc
}
/// `seek_danger` only needs to answer whether `target` itself matches, so it does a cheap
/// point lookup on the column instead of scanning forward to materialize the next match (the
/// expensive part of a regular `seek`).
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
// Covers `target == TERMINATED` and any target past the last doc: no match is possible.
if target >= self.column.num_docs() {
return SeekDangerResult::SeekLowerBound(TERMINATED);
}
if self.is_last_seek_distance_large(target) {
self.reset_fetch_range();
}
self.last_seek_pos_opt = Some(target);
let is_match = self
.column
.values_for_doc(target)
.any(|value| self.value_range.contains(&value));
if is_match {
// Leave the docset in a valid state positioned on `target`, so `doc()` returns it and a
// following `advance()` resumes the scan right after it.
self.loaded_docs.get_cleared_data().push(target);
self.next_fetch_start = target + 1;
SeekDangerResult::Found
} else {
// `target` is not in the docset. The next match is strictly greater than `target`, so
// `target + 1` is a valid lower bound. We may leave the docset in an invalid state.
SeekDangerResult::SeekLowerBound(target + 1)
}
}
fn size_hint(&self) -> u32 {
// TODO: Implement a better size hint
self.column.num_docs() / 10
@@ -241,148 +209,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
#[cfg(test)]
mod tests {
use std::ops::{Bound, RangeInclusive};
use std::ops::Bound;
use columnar::Column;
use super::RangeDocSet;
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::docset::{SeekDangerResult, TERMINATED};
use crate::query::RangeQuery;
use crate::{schema, DocSet, Index, IndexBuilder, TantivyDocument, Term};
/// Builds a single-segment index where doc `i` carries `values_for_doc(i)` in a u64 fast
/// field, then returns its column so we can drive a `RangeDocSet` directly.
fn build_u64_column(
num_docs: usize,
values_for_doc: impl Fn(usize) -> Vec<u64>,
) -> Column<u64> {
let mut schema_builder = schema::SchemaBuilder::new();
let value_field = schema_builder.add_u64_field("value", schema::FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests().unwrap();
for i in 0..num_docs {
let mut doc = TantivyDocument::new();
for v in values_for_doc(i) {
doc.add_u64(value_field, v);
}
writer.add_document(doc).unwrap();
}
writer.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.segment_readers().len(), 1);
searcher
.segment_reader(0)
.fast_fields()
.u64("value")
.unwrap()
}
fn range_docset(
value_range: RangeInclusive<u64>,
num_docs: usize,
values_for_doc: impl Fn(usize) -> Vec<u64>,
) -> RangeDocSet<u64> {
RangeDocSet::new(value_range, build_u64_column(num_docs, values_for_doc))
}
#[test]
fn seek_danger_found_leaves_valid_state() {
// Even docs match the range, odd docs do not.
let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);
// Matching target: `Found`, and the docset is positioned exactly on it.
assert_eq!(docset.seek_danger(10), SeekDangerResult::Found);
assert_eq!(docset.doc(), 10);
// A following advance resumes the scan right after the found doc.
assert_eq!(docset.advance(), 12);
assert_eq!(docset.doc(), 12);
}
#[test]
fn seek_danger_miss_returns_lower_bound() {
let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);
// Odd target does not match: lower bound is strictly greater than the target and never
// skips past the next real match (here doc 12, the first even doc after 11).
match docset.seek_danger(11) {
SeekDangerResult::SeekLowerBound(lower_bound) => {
assert!(lower_bound > 11);
assert!(lower_bound <= 12);
}
SeekDangerResult::Found => panic!("11 should not match"),
}
// After a miss we may be in an invalid state; another seek_danger recovers it.
assert_eq!(docset.seek_danger(12), SeekDangerResult::Found);
assert_eq!(docset.doc(), 12);
}
#[test]
fn seek_danger_terminated_and_out_of_bounds() {
let mut docset = range_docset(0..=0, 10, |i| vec![(i % 2) as u64]);
assert_eq!(
docset.seek_danger(TERMINATED),
SeekDangerResult::SeekLowerBound(TERMINATED)
);
// A target past the last doc has no possible match either.
assert_eq!(
docset.seek_danger(10),
SeekDangerResult::SeekLowerBound(TERMINATED)
);
}
#[test]
fn seek_danger_multivalued() {
// Doc `i` holds values [i, i+1]; the range {5} matches docs 4 and 5.
let mut docset = range_docset(5..=5, 20, |i| vec![i as u64, i as u64 + 1]);
assert_eq!(docset.seek_danger(4), SeekDangerResult::Found);
assert_eq!(docset.doc(), 4);
assert_eq!(docset.advance(), 5);
// No further match after doc 5.
assert_eq!(docset.advance(), TERMINATED);
}
#[test]
fn seek_danger_matches_seek() {
// Cross-check seek_danger against the true next match for every target, on a column with a
// few sparse matches.
let matches = [3u32, 7, 50, 51, 99];
let num_docs = 100;
let values_for_doc = |i: usize| {
vec![if matches.contains(&(i as u32)) {
1u64
} else {
0u64
}]
};
for target in 0..num_docs as u32 {
// The first matching doc greater than or equal to `target`, i.e. what `seek` returns.
let expected = matches
.iter()
.copied()
.find(|&m| m >= target)
.unwrap_or(TERMINATED);
let mut danger = range_docset(1..=1, num_docs, values_for_doc);
match danger.seek_danger(target) {
SeekDangerResult::Found => {
assert_eq!(expected, target, "target {target} reported Found");
assert_eq!(danger.doc(), target);
}
SeekDangerResult::SeekLowerBound(lower_bound) => {
assert_ne!(expected, target, "target {target} should have been Found");
assert!(lower_bound > target);
// The lower bound must never skip past the true next match.
assert!(lower_bound <= expected);
}
}
}
}
use crate::{schema, IndexBuilder, TantivyDocument, Term};
#[test]
fn range_query_fast_optional_field_minimum() {