mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-23 02:40:44 +00:00
Compare commits
7 Commits
faster_uni
...
trinity.po
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4031d97bac | ||
|
|
799f7b4646 | ||
|
|
fc88d80726 | ||
|
|
6a684e7c38 | ||
|
|
94fe52cc67 | ||
|
|
2ff39f6f7f | ||
|
|
1d06328cb3 |
@@ -241,6 +241,28 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::column_values::u64_based::tests::create_and_validate;
|
||||
|
||||
// A block boundary where a high run ends and a low run begins: y0 ≈ 2^32, y511 ≈ 0.
|
||||
// This large jump used to cause an overflow which made us render all values on 64 bits
|
||||
// when 32 was enough.
|
||||
fn large_descending_jump_vals() -> Vec<u64> {
|
||||
let high_start: u64 = 4_294_967_039; // ≈ 2^32 - 257
|
||||
(0u64..256)
|
||||
.map(|i| high_start + i)
|
||||
.chain(0u64..256)
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_blockwise_linear_large_descending_jump_uses_at_most_32bit() {
|
||||
let vals = large_descending_jump_vals();
|
||||
let (_, actual_rate) =
|
||||
create_and_validate::<BlockwiseLinearCodec>(&vals, "large descending jump").unwrap();
|
||||
assert!(
|
||||
actual_rate <= 0.6,
|
||||
"compression rate {actual_rate:.3} is too high (bug: 64-bit residuals)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets_simple() {
|
||||
create_and_validate::<BlockwiseLinearCodec>(
|
||||
|
||||
@@ -37,7 +37,7 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
|
||||
} else {
|
||||
y0.wrapping_sub(y1)
|
||||
};
|
||||
if abs_dy >= 1 << 32 {
|
||||
if abs_dy >= 1 << 31 {
|
||||
// This is outside of the realm we handle.
|
||||
// Let's just bail.
|
||||
return 0u64;
|
||||
|
||||
@@ -299,6 +299,12 @@ impl AggregationVariants {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_sum(&self) -> Option<&SumAggregation> {
|
||||
match &self {
|
||||
AggregationVariants::Sum(sum) => Some(sum),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -377,7 +377,22 @@ impl IntermediateMetricResult {
|
||||
MetricResult::ExtendedStats(intermediate_stats.finalize())
|
||||
}
|
||||
IntermediateMetricResult::Sum(intermediate_sum) => {
|
||||
MetricResult::Sum(intermediate_sum.finalize().into())
|
||||
// By default match Elasticsearch: empty / all-missing sum
|
||||
// buckets serialize as `"value": 0`, not `"value": null`.
|
||||
// The non-ES `none_if_no_match` flag on `SumAggregation`
|
||||
// opts into SQL-style `null` for downstream consumers.
|
||||
let none_if_no_match = req
|
||||
.agg
|
||||
.as_sum()
|
||||
.and_then(|sum| sum.none_if_no_match)
|
||||
.unwrap_or(false);
|
||||
let value = intermediate_sum.finalize();
|
||||
if none_if_no_match {
|
||||
MetricResult::Sum(value.into())
|
||||
} else {
|
||||
let value = Some(value.unwrap_or(0.0));
|
||||
MetricResult::Sum(value.into())
|
||||
}
|
||||
}
|
||||
IntermediateMetricResult::Percentiles(percentiles) => MetricResult::Percentiles(
|
||||
percentiles
|
||||
|
||||
@@ -27,6 +27,16 @@ pub struct SumAggregation {
|
||||
/// { "field": "my_numbers", "missing": "10.0" }
|
||||
#[serde(default, deserialize_with = "deserialize_option_f64")]
|
||||
pub missing: Option<f64>,
|
||||
/// Non-Elasticsearch extension. When `Some(true)`, the serialized result
|
||||
/// returns `"value": null` if no values were collected (all documents had
|
||||
/// missing/NULL values for the field), matching the behavior of `min`,
|
||||
/// `max`, and `avg`. When `None` or `Some(false)` (the default) the
|
||||
/// result returns `"value": 0`, matching Elasticsearch.
|
||||
///
|
||||
/// Intended for SQL-style consumers where `SUM` of zero rows is `NULL`
|
||||
/// and must be distinguishable from a bucket that genuinely sums to `0`.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub none_if_no_match: Option<bool>,
|
||||
}
|
||||
|
||||
impl SumAggregation {
|
||||
@@ -35,6 +45,7 @@ impl SumAggregation {
|
||||
Self {
|
||||
field: field_name,
|
||||
missing: None,
|
||||
none_if_no_match: None,
|
||||
}
|
||||
}
|
||||
/// Returns the field name the aggregation is computed on.
|
||||
@@ -59,8 +70,104 @@ impl IntermediateSum {
|
||||
pub fn merge_fruits(&mut self, other: IntermediateSum) {
|
||||
self.stats.merge_fruits(other.stats);
|
||||
}
|
||||
/// Computes the final minimum value.
|
||||
/// Computes the final sum value.
|
||||
///
|
||||
/// Returns `None` when no values were collected, matching the Rust-side
|
||||
/// behavior of `IntermediateMin`, `IntermediateMax`, and
|
||||
/// `IntermediateAvg`. The Elasticsearch-vs-SQL choice for the
|
||||
/// user-visible result is made at the boundary in
|
||||
/// [`IntermediateMetricResult::into_final_metric_result`]: by default
|
||||
/// `None` is coerced to `Some(0.0)` to match Elasticsearch
|
||||
/// (`"value": 0`), and the [`SumAggregation::none_if_no_match`] flag
|
||||
/// opts out of that coercion for SQL-style consumers.
|
||||
pub fn finalize(&self) -> Option<f64> {
|
||||
Some(self.stats.finalize().sum)
|
||||
let stats = self.stats.finalize();
|
||||
if stats.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(stats.sum)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_sum_finalize_returns_none_when_no_values() {
|
||||
// Default IntermediateSum has count=0 — finalize should return None,
|
||||
// matching MIN/MAX/AVG behavior for all-NULL groups.
|
||||
let sum = IntermediateSum::default();
|
||||
assert_eq!(sum.finalize(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_finalize_returns_value_when_has_values() {
|
||||
let mut sum = IntermediateSum::default();
|
||||
// Merge in a result that has actual values
|
||||
let stats = IntermediateStats {
|
||||
count: 3,
|
||||
sum: 42.0,
|
||||
min: 10.0,
|
||||
max: 20.0,
|
||||
..Default::default()
|
||||
};
|
||||
let other = IntermediateSum::from_stats(stats);
|
||||
sum.merge_fruits(other);
|
||||
assert_eq!(sum.finalize(), Some(42.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_merge_two_empty_still_none() {
|
||||
let mut a = IntermediateSum::default();
|
||||
let b = IntermediateSum::default();
|
||||
a.merge_fruits(b);
|
||||
assert_eq!(a.finalize(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_aggregation_empty_index_default_matches_es() -> crate::Result<()> {
|
||||
use serde_json::json;
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||
|
||||
// Empty index — sum has no values to collect.
|
||||
let values: Vec<Vec<&str>> = vec![];
|
||||
let index = get_test_index_from_terms(false, &values)?;
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"score_sum": { "sum": { "field": "score" } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// Default: match Elasticsearch — empty sum serializes as 0, not null.
|
||||
assert_eq!(res["score_sum"]["value"], 0.0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_aggregation_empty_index_none_if_no_match_opt_in() -> crate::Result<()> {
|
||||
use serde_json::json;
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||
|
||||
let values: Vec<Vec<&str>> = vec![];
|
||||
let index = get_test_index_from_terms(false, &values)?;
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"score_sum": { "sum": { "field": "score", "none_if_no_match": true } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// Opt-in non-ES extension — empty sum serializes as null.
|
||||
assert!(
|
||||
res["score_sum"]["value"].is_null(),
|
||||
"expected null, got {:?}",
|
||||
res["score_sum"]["value"]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -287,6 +287,33 @@ impl BlockSegmentPostings {
|
||||
doc
|
||||
}
|
||||
|
||||
/// Returns the number of documents with a doc id strictly smaller than `target`
|
||||
/// (i.e. the *rank* of `target` in this posting list).
|
||||
///
|
||||
/// This jumps to the block that may contain `target` through the skip list, so no
|
||||
/// skipped block is decoded; a single block is then decoded to locate `target`
|
||||
/// within it. The cost is therefore `O(number_of_skip_list_entries)` plus one block
|
||||
/// decode, rather than `O(doc_freq)`.
|
||||
///
|
||||
/// Like [`Self::seek`], the underlying cursor only ever moves forward. This method
|
||||
/// must be called with **non-decreasing** `target` values (galloping); calling it
|
||||
/// with a `target` smaller than a previous one yields an incorrect result. `target`
|
||||
/// must be a valid doc id (i.e. `target <= TERMINATED`), exactly as for `seek`.
|
||||
///
|
||||
/// Edge cases: returns `0` when `target` is smaller than every doc id, and
|
||||
/// `doc_freq()` when `target` is larger than every doc id.
|
||||
pub fn rank(&mut self, target: DocId) -> u32 {
|
||||
if self.doc_freq == 0 {
|
||||
return 0;
|
||||
}
|
||||
// `within` = number of docs in the landed block with a doc id < target.
|
||||
let within = self.seek(target);
|
||||
// `remaining_docs` counts the landed block and everything after it, so the
|
||||
// difference is the number of docs in all blocks strictly before it.
|
||||
let docs_before_block = self.doc_freq - self.skip_reader.remaining_docs();
|
||||
docs_before_block + within as u32
|
||||
}
|
||||
|
||||
pub(crate) fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
@@ -568,4 +595,38 @@ mod tests {
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_rank() -> crate::Result<()> {
|
||||
// ~8 blocks worth of docs so the skip list is actually exercised.
|
||||
let docs: Vec<DocId> = (0..1000u32).map(|i| i * 3).collect();
|
||||
let mut block_postings = build_block_postings(&docs[..])?;
|
||||
let doc_freq = block_postings.doc_freq();
|
||||
|
||||
// rank(target) must equal the number of docs strictly below target.
|
||||
// Targets are queried in non-decreasing order, as the API requires.
|
||||
// `target` values must be a valid doc id (<= TERMINATED) and non-decreasing.
|
||||
let targets = [
|
||||
0u32, 1, 2, 3, 4, 299, 300, 301, 1500, 2996, 2997, 3000, 10_000,
|
||||
];
|
||||
for &target in &targets {
|
||||
let expected = docs.iter().filter(|&&d| d < target).count() as u32;
|
||||
assert_eq!(
|
||||
block_postings.rank(target),
|
||||
expected,
|
||||
"rank({target}) mismatch"
|
||||
);
|
||||
}
|
||||
|
||||
// Edge cases: below the first doc -> 0, above the last doc -> doc_freq.
|
||||
let mut fresh = build_block_postings(&docs[..])?;
|
||||
assert_eq!(fresh.rank(0), 0);
|
||||
let mut fresh = build_block_postings(&docs[..])?;
|
||||
assert_eq!(fresh.rank(1_000_000), doc_freq);
|
||||
|
||||
// Empty postings: rank is always 0.
|
||||
let mut empty = BlockSegmentPostings::empty();
|
||||
assert_eq!(empty.rank(42), 0);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,6 +187,12 @@ impl SkipReader {
|
||||
self.last_doc_in_block
|
||||
}
|
||||
|
||||
/// Number of docs from the start of the current block to the end of the postings
|
||||
/// (i.e. the current block plus every block after it).
|
||||
pub(crate) fn remaining_docs(&self) -> u32 {
|
||||
self.remaining_docs
|
||||
}
|
||||
|
||||
pub fn position_offset(&self) -> u64 {
|
||||
self.position_offset
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user