Files
tantivy/src/query/size_hint.rs
PSeitz 33835b6a01 Add DocSet::cost() (#2707)
* query: add DocSet cost hint and use it for intersection ordering

- Add DocSet::cost()
- Use cost() instead of size_hint() to order scorers in intersect_scorers

This isolates cost-related changes without the new seek APIs from
PR #2538

* add comments

---------

Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>
2025-10-13 16:25:49 +02:00

142 lines
5.0 KiB
Rust

/// Estimates how many documents are shared by all of the given docsets, knowing only
/// their sizes.
///
/// The first docset size seeds the estimate. Every further docset scales it by the
/// fraction of the segment that docset covers (`size / max_docs`), boosted by a
/// co-location factor that accounts for terms not being independently distributed.
///
/// # Arguments
/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set).
/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the
///   segment.
///
/// # Returns
/// The estimated number of documents in the intersection. The estimate never exceeds the
/// smallest docset size, and is 0 when `max_docs` is 0 or when no docset is provided.
pub fn estimate_intersection<I>(mut docset_sizes: I, max_docs: u32) -> u32
where I: Iterator<Item = u32> {
    // An empty segment cannot produce hits; bail out before dividing by `max_docs`.
    if max_docs == 0 {
        return 0;
    }
    let first_size = match docset_sizes.next() {
        Some(size) => f64::from(size),
        // Nothing to intersect.
        None => return 0,
    };
    let total_docs = f64::from(max_docs);
    // Fold state: (running estimate, smallest size seen so far, co-location factor).
    // The factor starts at 1.3 and is lowered by 0.1 before each additional set is applied,
    // bottoming out at 1.0, so that the boost does not compound into an overestimate.
    let (estimate, smallest_size, _) = docset_sizes.fold(
        (first_size, first_size, 1.3_f64),
        |(estimate, smallest_size, factor), size| {
            let factor = (factor - 0.1).max(1.0);
            let size = f64::from(size);
            // Under a random distribution, `size / total_docs` is the probability that a
            // document of the running intersection is also part of this docset.
            let scaled = estimate * ((size / total_docs) * factor);
            (scaled, smallest_size.min(size), factor)
        },
    );
    // An intersection can never be larger than its smallest member.
    estimate.round().min(smallest_size) as u32
}
/// Computes the estimated number of documents in the union of multiple docsets
/// given their sizes.
///
/// The estimate is `max_docs * (1 - P(document is in none of the sets))`, where each set is
/// treated as independent and its hit probability is `size / max_docs`, damped by a
/// co-location factor.
///
/// # Arguments
/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set).
/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the
///   segment.
///
/// # Returns
/// The estimated number of documents in the union, never more than `max_docs`.
/// Returns 0 when `max_docs` is 0 or when no docset is provided.
pub fn estimate_union<I>(docset_sizes: I, max_docs: u32) -> u32
where I: Iterator<Item = u32> {
    // An empty segment cannot produce hits. Returning early also keeps the arithmetic below
    // free of divisions by zero (which would otherwise produce inf/NaN intermediates).
    if max_docs == 0u32 {
        return 0u32;
    }
    // Terms tend to be not really randomly distributed.
    // This factor is used to adjust the estimate.
    // Unlike intersection, the co-location reduces the estimate.
    let co_loc_factor: f64 = 0.8;
    let max_docs_f64 = f64::from(max_docs);
    // The approach for union is to compute the probability of a document not being in any of the
    // sets
    let mut not_in_any_set_prob: f64 = 1.0;
    // Assuming random distribution of terms, the probability of a document being in the
    // union is the complement of the probability of it not being in any of the sets.
    for size in docset_sizes {
        // A size larger than `max_docs` (e.g. an over-estimated size hint) must not yield a
        // probability above 1.0: `1.0 - prob_in_set` would turn negative, and an even number
        // of such sets would flip the sign back and make the union look smaller than it is.
        let prob_in_set = ((f64::from(size) / max_docs_f64) * co_loc_factor).min(1.0);
        not_in_any_set_prob *= 1.0 - prob_in_set;
    }
    let union_estimate = (max_docs_f64 * (1.0 - not_in_any_set_prob)).round();
    union_estimate.min(max_docs_f64) as u32
}
#[cfg(test)]
mod tests {
    use super::*;

    /// 500 * (1000 / 10_000 * 1.2) = 60.
    #[test]
    fn test_estimate_intersection_small1() {
        let estimate = estimate_intersection(vec![500u32, 1000].into_iter(), 10_000);
        assert_eq!(estimate, 60);
    }

    /// Adding a third set applies a lower co-location factor (1.1): 60 * (0.15 * 1.1) ≈ 10.
    #[test]
    fn test_estimate_intersection_small2() {
        let estimate = estimate_intersection(vec![500u32, 1000, 1500].into_iter(), 10_000);
        assert_eq!(estimate, 10);
    }

    #[test]
    fn test_estimate_intersection_large_values() {
        let sizes: Vec<u32> = vec![100_000, 50_000, 30_000];
        let estimate = estimate_intersection(sizes.into_iter(), 1_000_000);
        assert_eq!(estimate, 198);
    }

    #[test]
    fn test_estimate_union_small() {
        let estimate = estimate_union(vec![500u32, 1000, 1500].into_iter(), 10_000);
        assert_eq!(estimate, 2228);
    }

    #[test]
    fn test_estimate_union_large_values() {
        let sizes: Vec<u32> = vec![100_000, 50_000, 30_000];
        let estimate = estimate_union(sizes.into_iter(), 1_000_000);
        assert_eq!(estimate, 137_997);
    }

    /// Ten large sets: the estimate must stay well-defined and reasonable.
    #[test]
    fn test_estimate_intersection_large() {
        let sizes = std::iter::repeat(4_000_000u32).take(10);
        let estimate = estimate_intersection(sizes, 5_000_000);
        assert_eq!(estimate, 708_670);
    }

    /// A hundred large sets: the repeated scaling must shrink the estimate to zero
    /// rather than overflow.
    #[test]
    fn test_estimate_intersection_overflow_safety() {
        let sizes = std::iter::repeat(4_000_000u32).take(100);
        let estimate = estimate_intersection(sizes, 5_000_000);
        assert_eq!(estimate, 0);
    }

    /// A hundred sets whose sizes add up to far more than `max_docs`: the union estimate
    /// must stay below `max_docs` rather than overflow.
    #[test]
    fn test_estimate_union_overflow_safety() {
        let sizes = std::iter::repeat(1_000_000u32).take(100);
        let estimate = estimate_union(sizes, 20_000_000);
        assert_eq!(estimate, 19_662_594);
    }
}