Merge pull request #1340 from PSeitz/term_agg

fix collecting term_dict field names
2026-06-02 08:30:41 +00:00 · 2022-04-18 08:21:27 +02:00
parent d832cfcfd8 ec69875d15
commit c7c3eab256
3 changed files with 38 additions and 19 deletions
--- a/src/aggregation/agg_req.rs
+++ b/src/aggregation/agg_req.rs
@@ -187,6 +187,7 @@ impl BucketAggregation {
        if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
            term_dict_field_names.insert(terms.field.to_string());
        }
+        term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
    }
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        self.bucket_agg.get_fast_field_names(fast_field_names);
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -37,8 +37,8 @@ use crate::DocId;
 /// ## Per bucket document count error
 /// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will include
 /// doc_count_error_upper_bound, which is an upper bound to the error on the doc_count returned by
-/// each segment. It’s the sum of the size of the largest bucket on each shard that didn’t fit into
-/// shard_size.
+/// each segment. It’s the sum of the size of the largest bucket on each segment that didn’t fit
+/// into segment_size.
 ///
 /// Result type is [BucketResult](crate::aggregation::agg_result::BucketResult) with
 /// [TermBucketEntry](crate::aggregation::agg_result::BucketEntry) on the
@@ -131,7 +131,7 @@ pub(crate) struct TermsAggregationInternal {
    /// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will
    /// include doc_count_error_upper_bound, which is an upper bound to the error on the
    /// doc_count returned by each shard. It’s the sum of the size of the largest bucket on
-    /// each segment that didn’t fit into `shard_size`.
+    /// each segment that didn’t fit into `segment_size`.
    pub show_term_doc_count_error: bool,

    /// The get more accurate results, we fetch more than `size` from each segment.
@@ -524,7 +524,6 @@ mod tests {
                bucket_agg: BucketAggregationType::Terms(TermsAggregation {
                    field: "string_id".to_string(),
                    size: Some(2),
-                    shard_size: Some(2),
                    min_doc_count: Some(3),
                    ..Default::default()
                }),
@@ -554,10 +553,8 @@ mod tests {
    #[test]
    fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
        let terms_per_segment = vec![
-            vec!["terma", "terma", "termb", "termb", "termb", "termc"], /* termc doesn't make it
-                                                                         * from this segment */
-            vec!["terma", "terma", "termb", "termc", "termc"], /* termb doesn't make it from
-                                                                * this segment */
+            vec!["terma", "terma", "termb", "termb", "termb", "termc"],
+            vec!["terma", "terma", "termb", "termc", "termc"],
        ];

        let index = get_test_index_from_terms(false, &terms_per_segment)?;
@@ -576,8 +573,8 @@ mod tests {
        .into_iter()
        .collect();

+        // The query matches only "terma", but min_doc_count still returns all terms.
        let res = exec_request_with_query(agg_req, &index, Some(("string_id", "terma")))?;
-        println!("{}", &serde_json::to_string_pretty(&res).unwrap());

        assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
@@ -618,7 +615,6 @@ mod tests {
        .collect();

        let res = exec_request(agg_req, &index)?;
-        println!("{}", &serde_json::to_string_pretty(&res).unwrap());

        assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -313,8 +313,11 @@ mod tests {
    use super::bucket::RangeAggregation;
    use super::collector::AggregationCollector;
    use super::metric::AverageAggregation;
-    use crate::aggregation::agg_req::{BucketAggregationType, MetricAggregation};
+    use crate::aggregation::agg_req::{
+        get_term_dict_field_names, BucketAggregationType, MetricAggregation,
+    };
    use crate::aggregation::agg_result::AggregationResults;
+    use crate::aggregation::bucket::TermsAggregation;
    use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
    use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
    use crate::aggregation::DistributedAggregationCollector;
@@ -628,8 +631,10 @@ mod tests {
            .set_indexing_options(
                TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
            )
+            .set_fast()
            .set_stored();
        let text_field = schema_builder.add_text_field("text", text_fieldtype);
+        schema_builder.add_text_field("dummy_text", STRING);
        let score_fieldtype =
            crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
        let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
@@ -834,10 +839,21 @@ mod tests {
            IndexRecordOption::Basic,
        );

-        let sub_agg_req: Aggregations =
-            vec![("average_in_range".to_string(), get_avg_req("score"))]
-                .into_iter()
-                .collect();
+        let sub_agg_req: Aggregations = vec![
+            ("average_in_range".to_string(), get_avg_req("score")),
+            (
+                "term_agg".to_string(),
+                Aggregation::Bucket(BucketAggregation {
+                    bucket_agg: BucketAggregationType::Terms(TermsAggregation {
+                        field: "text".to_string(),
+                        ..Default::default()
+                    }),
+                    sub_aggregation: Default::default(),
+                }),
+            ),
+        ]
+        .into_iter()
+        .collect();
        let agg_req: Aggregations = if use_elastic_json_req {
            let elasticsearch_compatible_json_req = r#"
 {
@@ -853,7 +869,8 @@ mod tests {
      ]
    },
    "aggs": {
-      "average_in_range": { "avg": { "field": "score" } }
+      "average_in_range": { "avg": { "field": "score" } },
+      "term_agg": { "terms": { "field": "text" } }
    }
  },
  "rangei64": {
@@ -868,7 +885,8 @@ mod tests {
      ]
    },
    "aggs": {
-      "average_in_range": { "avg": { "field": "score" } }
+      "average_in_range": { "avg": { "field": "score" } },
+      "term_agg": { "terms": { "field": "text" } }
    }
  },
  "average": {
@@ -886,7 +904,8 @@ mod tests {
      ]
    },
    "aggs": {
-      "average_in_range": { "avg": { "field": "score" } }
+      "average_in_range": { "avg": { "field": "score" } },
+      "term_agg": { "terms": { "field": "text" } }
    }
  }
 }
@@ -945,6 +964,9 @@ mod tests {
            agg_req
        };

+        let field_names = get_term_dict_field_names(&agg_req);
+        assert_eq!(field_names, vec!["text".to_string()].into_iter().collect());
+
        let agg_res: AggregationResults = if use_distributed_collector {
            let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());

@@ -1085,7 +1107,7 @@ mod tests {
            searcher.search(&AllQuery, &collector).unwrap_err()
        };

-        let agg_res = avg_on_field("text");
+        let agg_res = avg_on_field("dummy_text");
        assert_eq!(
            format!("{:?}", agg_res),
            r#"InvalidArgument("Only fast fields of type f64, u64, i64 are supported, but got Str ")"#