add missing parameter for stats,min,max,count,sum,avg (#2151)

* add missing parameter for stats,min,max,count,sum,avg add missing parameter for stats,min,max,count,sum,avg closes #1913 partially #1789 * Apply suggestions from code review Co-authored-by: Paul Masurel <paul@quickwit.io> --------- Co-authored-by: Paul Masurel <paul@quickwit.io>
2026-01-07 17:42:55 +00:00 · 2023-08-28 08:59:51 +02:00
parent 267dfe58d7
commit 73cb71762f
10 changed files with 389 additions and 34 deletions
--- a/src/aggregation/agg_req_with_accessor.rs
+++ b/src/aggregation/agg_req_with_accessor.rs
@@ -112,12 +112,24 @@ impl AggregationWithAccessor {
                    fallback_type,
                )?
            }
-            Average(AverageAggregation { field: field_name })
-            | Count(CountAggregation { field: field_name })
-            | Max(MaxAggregation { field: field_name })
-            | Min(MinAggregation { field: field_name })
-            | Stats(StatsAggregation { field: field_name })
-            | Sum(SumAggregation { field: field_name }) => {
+            Average(AverageAggregation {
+                field: field_name, ..
+            })
+            | Count(CountAggregation {
+                field: field_name, ..
+            })
+            | Max(MaxAggregation {
+                field: field_name, ..
+            })
+            | Min(MinAggregation {
+                field: field_name, ..
+            })
+            | Stats(StatsAggregation {
+                field: field_name, ..
+            })
+            | Sum(SumAggregation {
+                field: field_name, ..
+            }) => {
                let (accessor, field_type) =
                    get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;

--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -1455,6 +1455,47 @@ mod tests {

        Ok(())
    }
+    #[test]
+    fn terms_empty_json() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        // => Segment with empty json
+        index_writer.add_document(doc!()).unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with json, but no field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"different_field": "blue"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with one document containing partially_empty and one empty document
+        index_writer
+            .add_document(doc!(json => json!({"partially_empty": "blue"})))
+            .unwrap();
+        index_writer.add_document(doc!())?;
+        index_writer.commit().unwrap();
+
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "my_texts": {
+                "terms": {
+                    "field": "json.partially_empty"
+                },
+            }
+        }))
+        .unwrap();
+
+        let res = exec_request_with_query(agg_req, &index, None)?;
+
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "blue");
+        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
+        assert_eq!(res["my_texts"]["buckets"][1], serde_json::Value::Null);
+        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
+        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
+
+        Ok(())
+    }

    #[test]
    fn terms_aggregation_bytes() -> crate::Result<()> {
@@ -1492,6 +1533,7 @@ mod tests {

        Ok(())
    }
+
    #[test]
    fn terms_aggregation_missing_multi_value() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
--- a/src/aggregation/metric/average.rs
+++ b/src/aggregation/metric/average.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct AverageAggregation {
    /// The field name to compute the average on.
    pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }

 impl AverageAggregation {
    /// Creates a new [`AverageAggregation`] instance from a field name.
    pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
    }
    /// Returns the field name the aggregation is computed on.
    pub fn field_name(&self) -> &str {
--- a/src/aggregation/metric/count.rs
+++ b/src/aggregation/metric/count.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct CountAggregation {
    /// The field name to compute the count on.
    pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }

 impl CountAggregation {
    /// Creates a new [`CountAggregation`] instance from a field name.
    pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
    }
    /// Returns the field name the aggregation is computed on.
    pub fn field_name(&self) -> &str {
--- a/src/aggregation/metric/max.rs
+++ b/src/aggregation/metric/max.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct MaxAggregation {
    /// The field name to compute the maximum on.
    pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }

 impl MaxAggregation {
    /// Creates a new [`MaxAggregation`] instance from a field name.
    pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
    }
    /// Returns the field name the aggregation is computed on.
    pub fn field_name(&self) -> &str {
@@ -56,3 +65,55 @@ impl IntermediateMax {
        self.stats.finalize().max
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::aggregation::agg_req::Aggregations;
+    use crate::aggregation::tests::exec_request_with_query;
+    use crate::schema::{Schema, FAST};
+    use crate::Index;
+
+    #[test]
+    fn test_max_agg_with_missing() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        // => Segment with empty json
+        index_writer.add_document(doc!()).unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with json, but no field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"different_field": "blue"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with one document containing partially_empty and one empty document
+        index_writer
+            .add_document(doc!(json => json!({"partially_empty": 10.0})))
+            .unwrap();
+        index_writer.add_document(doc!())?;
+        index_writer.commit().unwrap();
+
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "my_stats": {
+                "max": {
+                    "field": "json.partially_empty",
+                    "missing": 100.0,
+                }
+            }
+        }))
+        .unwrap();
+
+        let res = exec_request_with_query(agg_req, &index, None)?;
+
+        assert_eq!(
+            res["my_stats"],
+            json!({
+                "value": 100.0,
+            })
+        );
+
+        Ok(())
+    }
+}
--- a/src/aggregation/metric/min.rs
+++ b/src/aggregation/metric/min.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct MinAggregation {
    /// The field name to compute the minimum on.
    pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }

 impl MinAggregation {
    /// Creates a new [`MinAggregation`] instance from a field name.
    pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
    }
    /// Returns the field name the aggregation is computed on.
    pub fn field_name(&self) -> &str {
--- a/src/aggregation/metric/percentiles.rs
+++ b/src/aggregation/metric/percentiles.rs
@@ -80,6 +80,12 @@ pub struct PercentilesAggregationReq {
    /// Whether to return the percentiles as a hash map
    #[serde(default = "default_as_true")]
    pub keyed: bool,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub missing: Option<f64>,
 }
 fn default_percentiles() -> &'static [f64] {
    &[1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
@@ -95,6 +101,7 @@ impl PercentilesAggregationReq {
            field: field_name,
            percents: None,
            keyed: default_as_true(),
+            missing: None,
        }
    }
    /// Returns the field name the aggregation is computed on.
--- a/src/aggregation/metric/stats.rs
+++ b/src/aggregation/metric/stats.rs
@@ -5,11 +5,11 @@ use super::*;
 use crate::aggregation::agg_req_with_accessor::{
    AggregationWithAccessor, AggregationsWithAccessor,
 };
-use crate::aggregation::f64_from_fastfield_u64;
 use crate::aggregation::intermediate_agg_result::{
    IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
 };
 use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
+use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64};
 use crate::{DocId, TantivyError};

 /// A multi-value metric aggregation that computes a collection of statistics on numeric values that
@@ -29,12 +29,21 @@ use crate::{DocId, TantivyError};
 pub struct StatsAggregation {
    /// The field name to compute the stats on.
    pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }

 impl StatsAggregation {
    /// Creates a new [`StatsAggregation`] instance from a field name.
    pub fn from_field_name(field_name: String) -> Self {
-        StatsAggregation { field: field_name }
+        StatsAggregation {
+            field: field_name,
+            missing: None,
+        }
    }
    /// Returns the field name the aggregation is computed on.
    pub fn field_name(&self) -> &str {
@@ -153,6 +162,7 @@ pub(crate) enum SegmentStatsType {

 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct SegmentStatsCollector {
+    missing: Option<u64>,
    field_type: ColumnType,
    pub(crate) collecting_for: SegmentStatsType,
    pub(crate) stats: IntermediateStats,
@@ -165,12 +175,15 @@ impl SegmentStatsCollector {
        field_type: ColumnType,
        collecting_for: SegmentStatsType,
        accessor_idx: usize,
+        missing: Option<f64>,
    ) -> Self {
+        let missing = missing.and_then(|val| f64_to_fastfield_u64(val, &field_type));
        Self {
            field_type,
            collecting_for,
            stats: IntermediateStats::default(),
            accessor_idx,
+            missing,
            val_cache: Default::default(),
        }
    }
@@ -180,10 +193,17 @@ impl SegmentStatsCollector {
        docs: &[DocId],
        agg_accessor: &mut AggregationWithAccessor,
    ) {
-        agg_accessor
-            .column_block_accessor
-            .fetch_block(docs, &agg_accessor.accessor);
-
+        if let Some(missing) = self.missing.as_ref() {
+            agg_accessor.column_block_accessor.fetch_block_with_missing(
+                docs,
+                &agg_accessor.accessor,
+                *missing,
+            );
+        } else {
+            agg_accessor
+                .column_block_accessor
+                .fetch_block(docs, &agg_accessor.accessor);
+        }
        for val in agg_accessor.column_block_accessor.iter_vals() {
            let val1 = f64_from_fastfield_u64(val, &self.field_type);
            self.stats.collect(val1);
@@ -234,10 +254,22 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
        agg_with_accessor: &mut AggregationsWithAccessor,
    ) -> crate::Result<()> {
        let field = &agg_with_accessor.aggs.values[self.accessor_idx].accessor;
-
-        for val in field.values_for_doc(doc) {
-            let val1 = f64_from_fastfield_u64(val, &self.field_type);
-            self.stats.collect(val1);
+        if let Some(missing) = self.missing {
+            let mut has_val = false;
+            for val in field.values_for_doc(doc) {
+                let val1 = f64_from_fastfield_u64(val, &self.field_type);
+                self.stats.collect(val1);
+                has_val = true;
+            }
+            if !has_val {
+                self.stats
+                    .collect(f64_from_fastfield_u64(missing, &self.field_type));
+            }
+        } else {
+            for val in field.values_for_doc(doc) {
+                let val1 = f64_from_fastfield_u64(val, &self.field_type);
+                self.stats.collect(val1);
+            }
        }

        Ok(())
@@ -262,11 +294,13 @@ mod tests {

    use crate::aggregation::agg_req::{Aggregation, Aggregations};
    use crate::aggregation::agg_result::AggregationResults;
-    use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values};
+    use crate::aggregation::tests::{
+        exec_request_with_query, get_test_index_2_segments, get_test_index_from_values,
+    };
    use crate::aggregation::AggregationCollector;
    use crate::query::{AllQuery, TermQuery};
-    use crate::schema::IndexRecordOption;
-    use crate::Term;
+    use crate::schema::{IndexRecordOption, Schema, FAST};
+    use crate::{Index, Term};

    #[test]
    fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -453,4 +487,159 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn test_stats_json() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        // => Segment with empty json
+        index_writer.add_document(doc!()).unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with json, but no field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"different_field": "blue"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with one document containing partially_empty and one empty document
+        index_writer
+            .add_document(doc!(json => json!({"partially_empty": 10.0})))
+            .unwrap();
+        index_writer.add_document(doc!())?;
+        index_writer.commit().unwrap();
+
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "my_stats": {
+                "stats": {
+                    "field": "json.partially_empty"
+                },
+            }
+        }))
+        .unwrap();
+
+        let res = exec_request_with_query(agg_req, &index, None)?;
+
+        assert_eq!(
+            res["my_stats"],
+            json!({
+                "avg":  10.0,
+                "count": 1,
+                "max": 10.0,
+                "min": 10.0,
+                "sum": 10.0
+            })
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_stats_json_missing() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        // => Segment with empty json
+        index_writer.add_document(doc!()).unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with json, but no field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"different_field": "blue"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with one document containing partially_empty and one empty document
+        index_writer
+            .add_document(doc!(json => json!({"partially_empty": 10.0})))
+            .unwrap();
+        index_writer.add_document(doc!())?;
+        index_writer.commit().unwrap();
+
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "my_stats": {
+                "stats": {
+                    "field": "json.partially_empty",
+                    "missing": 0.0
+                },
+            }
+        }))
+        .unwrap();
+
+        let res = exec_request_with_query(agg_req, &index, None)?;
+
+        assert_eq!(
+            res["my_stats"],
+            json!({
+                "avg":  2.5,
+                "count": 4,
+                "max": 10.0,
+                "min": 0.0,
+                "sum": 10.0
+            })
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_stats_json_missing_sub_agg() -> crate::Result<()> {
+        // This test verifies the `collect` method (in contrast to `collect_block`), which is
+        // called when the sub-aggregations are flushed.
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("texts", FAST);
+        let score_field_f64 = schema_builder.add_f64_field("score", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+
+        {
+            let mut index_writer = index.writer_for_tests()?;
+            // writing the segment
+            index_writer.add_document(doc!(
+                score_field_f64 => 10.0f64,
+                text_field => "a"
+            ))?;
+
+            index_writer.add_document(doc!(text_field => "a"))?;
+
+            index_writer.commit()?;
+        }
+
+        let agg_req: Aggregations = {
+            serde_json::from_value(json!({
+                "range_with_stats": {
+                    "terms": {
+                        "field": "texts"
+                    },
+                    "aggs": {
+                        "my_stats": {
+                            "stats": {
+                                "field": "score",
+                                "missing": 0.0
+                            }
+                        }
+                    }
+                }
+            }))
+            .unwrap()
+        };
+
+        let res = exec_request_with_query(agg_req, &index, None)?;
+
+        assert_eq!(
+            res["range_with_stats"]["buckets"][0]["my_stats"]["count"],
+            2
+        );
+        assert_eq!(
+            res["range_with_stats"]["buckets"][0]["my_stats"]["min"],
+            0.0
+        );
+        assert_eq!(
+            res["range_with_stats"]["buckets"][0]["my_stats"]["avg"],
+            5.0
+        );
+
+        Ok(())
+    }
 }
--- a/src/aggregation/metric/sum.rs
+++ b/src/aggregation/metric/sum.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct SumAggregation {
    /// The field name to compute the minimum on.
    pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }

 impl SumAggregation {
    /// Creates a new [`SumAggregation`] instance from a field name.
    pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
    }
    /// Returns the field name the aggregation is computed on.
    pub fn field_name(&self) -> &str {
--- a/src/aggregation/segment_agg_result.rs
+++ b/src/aggregation/segment_agg_result.rs
@@ -105,35 +105,43 @@ pub(crate) fn build_single_agg_segment_collector(
            req.field_type,
            accessor_idx,
        )?)),
-        Average(AverageAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
-            req.field_type,
-            SegmentStatsType::Average,
-            accessor_idx,
-        ))),
-        Count(CountAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
+        Average(AverageAggregation { missing, .. }) => {
+            Ok(Box::new(SegmentStatsCollector::from_req(
+                req.field_type,
+                SegmentStatsType::Average,
+                accessor_idx,
+                *missing,
+            )))
+        }
+        Count(CountAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
            req.field_type,
            SegmentStatsType::Count,
            accessor_idx,
+            *missing,
        ))),
-        Max(MaxAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
+        Max(MaxAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
            req.field_type,
            SegmentStatsType::Max,
            accessor_idx,
+            *missing,
        ))),
-        Min(MinAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
+        Min(MinAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
            req.field_type,
            SegmentStatsType::Min,
            accessor_idx,
+            *missing,
        ))),
-        Stats(StatsAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
+        Stats(StatsAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
            req.field_type,
            SegmentStatsType::Stats,
            accessor_idx,
+            *missing,
        ))),
-        Sum(SumAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
+        Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
            req.field_type,
            SegmentStatsType::Sum,
            accessor_idx,
+            *missing,
        ))),
        Percentiles(percentiles_req) => Ok(Box::new(
            SegmentPercentilesCollector::from_req_and_validate(