use serde_json::Value; use crate::aggregation::agg_req::{Aggregation, Aggregations}; use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::collector::AggregationCollector; use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults; use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms}; use crate::aggregation::DistributedAggregationCollector; use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::query::{AllQuery, TermQuery}; use crate::schema::{IndexRecordOption, Schema, FAST}; use crate::{Index, IndexWriter, Term}; // The following tests ensure that each bucket aggregation type correctly functions as a // sub-aggregation of another bucket aggregation in two scenarios: // 1) The parent has more buckets than the child sub-aggregation // 2) The child sub-aggregation has more buckets than the parent // // These scenarios exercise the bucket id mapping and sub-aggregation routing logic. #[test] fn test_terms_as_subagg_parent_more_vs_child_more() -> crate::Result<()> { let index = get_test_index_2_segments(false)?; // Case A: parent has more buckets than child // Parent: range with 4 buckets // Child: terms on text -> 2 buckets let agg_parent_more: Aggregations = serde_json::from_value(json!({ "parent_range": { "range": { "field": "score", "ranges": [ {"to": 3.0}, {"from": 3.0, "to": 7.0}, {"from": 7.0, "to": 20.0}, {"from": 20.0} ] }, "aggs": { "child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}} } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?; // Exact expected structure and counts assert_eq!( res["parent_range"]["buckets"], json!([ { "key": "*-3", "doc_count": 1, "to": 3.0, "child_terms": { "buckets": [ {"doc_count": 1, "key": "cool"} ], "sum_other_doc_count": 0 } }, { "key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0, "child_terms": { "buckets": [ {"doc_count": 2, "key": "cool"}, {"doc_count": 1, "key": "nohit"} ], "sum_other_doc_count": 0 } }, { "key": "7-20", "doc_count": 3, "from": 7.0, "to": 20.0, "child_terms": { "buckets": [ {"doc_count": 3, "key": "cool"} ], "sum_other_doc_count": 0 } }, { "key": "20-*", "doc_count": 2, "from": 20.0, "child_terms": { "buckets": [ {"doc_count": 1, "key": "cool"}, {"doc_count": 1, "key": "nohit"} ], "sum_other_doc_count": 0 } } ]) ); // Case B: child has more buckets than parent // Parent: histogram on score with large interval -> 1 bucket // Child: terms on text -> 2 buckets (cool/nohit) let agg_child_more: Aggregations = serde_json::from_value(json!({ "parent_hist": { "histogram": {"field": "score", "interval": 100.0}, "aggs": { "child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}} } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?; assert_eq!( res["parent_hist"], json!({ "buckets": [ { "key": 0.0, "doc_count": 9, "child_terms": { "buckets": [ {"doc_count": 7, "key": "cool"}, {"doc_count": 2, "key": "nohit"} ], "sum_other_doc_count": 0 } } ] }) ); Ok(()) } #[test] fn test_range_as_subagg_parent_more_vs_child_more() -> crate::Result<()> { let index = get_test_index_2_segments(false)?; // Case A: parent has more buckets than child // Parent: range with 5 buckets // Child: coarse range with 3 buckets let agg_parent_more: Aggregations = serde_json::from_value(json!({ "parent_range": { "range": { "field": "score", "ranges": [ {"to": 3.0}, {"from": 3.0, "to": 7.0}, {"from": 7.0, "to": 11.0}, {"from": 11.0, "to": 20.0}, {"from": 20.0} ] }, "aggs": { "child_range": { "range": { "field": "score", "ranges": [ {"to": 3.0}, {"from": 3.0, "to": 20.0} ] } } } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?; assert_eq!( res["parent_range"]["buckets"], json!([ {"key": "*-3", "doc_count": 1, "to": 3.0, "child_range": {"buckets": [ {"key": "*-3", "doc_count": 1, "to": 3.0}, {"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0}, {"key": "20-*", "doc_count": 0, "from": 20.0} ]} }, {"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0, "child_range": {"buckets": [ {"key": "*-3", "doc_count": 0, "to": 3.0}, {"key": "3-20", "doc_count": 3, "from": 3.0, "to": 20.0}, {"key": "20-*", "doc_count": 0, "from": 20.0} ]} }, {"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0, "child_range": {"buckets": [ {"key": "*-3", "doc_count": 0, "to": 3.0}, {"key": "3-20", "doc_count": 1, "from": 3.0, "to": 20.0}, {"key": "20-*", "doc_count": 0, "from": 20.0} ]} }, {"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0, "child_range": {"buckets": [ {"key": "*-3", "doc_count": 0, "to": 3.0}, {"key": "3-20", "doc_count": 2, "from": 3.0, "to": 20.0}, {"key": "20-*", "doc_count": 0, "from": 20.0} ]} }, {"key": "20-*", "doc_count": 2, "from": 20.0, "child_range": {"buckets": [ {"key": "*-3", "doc_count": 0, "to": 3.0}, {"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0}, {"key": "20-*", "doc_count": 2, "from": 20.0} ]} } ]) ); // Case B: child has more buckets than parent // Parent: terms on text (2 buckets) // Child: range with 4 buckets let agg_child_more: Aggregations = serde_json::from_value(json!({ "parent_terms": { "terms": {"field": "text"}, "aggs": { "child_range": { "range": { "field": "score", "ranges": [ {"to": 3.0}, {"from": 3.0, "to": 7.0}, {"from": 7.0, "to": 20.0} ] } } } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?; assert_eq!( res["parent_terms"], json!({ "buckets": [ { "key": "cool", "doc_count": 7, "child_range": { "buckets": [ {"key": "*-3", "doc_count": 1, "to": 3.0}, {"key": "3-7", "doc_count": 2, "from": 3.0, "to": 7.0}, {"key": "7-20", "doc_count": 3, "from": 7.0, "to": 20.0}, {"key": "20-*", "doc_count": 1, "from": 20.0} ] } }, { "key": "nohit", "doc_count": 2, "child_range": { "buckets": [ {"key": "*-3", "doc_count": 0, "to": 3.0}, {"key": "3-7", "doc_count": 1, "from": 3.0, "to": 7.0}, {"key": "7-20", "doc_count": 0, "from": 7.0, "to": 20.0}, {"key": "20-*", "doc_count": 1, "from": 20.0} ] } } ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 }) ); Ok(()) } #[test] fn test_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> { let index = get_test_index_2_segments(false)?; // Case A: parent has more buckets than child // Parent: range with several ranges // Child: histogram with large interval (single bucket per parent) let agg_parent_more: Aggregations = serde_json::from_value(json!({ "parent_range": { "range": { "field": "score", "ranges": [ {"to": 3.0}, {"from": 3.0, "to": 7.0}, {"from": 7.0, "to": 11.0}, {"from": 11.0, "to": 20.0}, {"from": 20.0} ] }, "aggs": { "child_hist": {"histogram": {"field": "score", "interval": 100.0}} } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?; assert_eq!( res["parent_range"]["buckets"], json!([ {"key": "*-3", "doc_count": 1, "to": 3.0, "child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]} }, {"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0, "child_hist": {"buckets": [ {"key": 0.0, "doc_count": 3} ]} }, {"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0, "child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]} }, {"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0, "child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]} }, {"key": "20-*", "doc_count": 2, "from": 20.0, "child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]} } ]) ); // Case B: child has more buckets than parent // Parent: terms on text -> 2 buckets // Child: histogram with small interval -> multiple buckets including empties let agg_child_more: Aggregations = serde_json::from_value(json!({ "parent_terms": { "terms": {"field": "text"}, "aggs": { "child_hist": {"histogram": {"field": "score", "interval": 10.0}} } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?; assert_eq!( res["parent_terms"], json!({ "buckets": [ { "key": "cool", "doc_count": 7, "child_hist": { "buckets": [ {"key": 0.0, "doc_count": 4}, {"key": 10.0, "doc_count": 2}, {"key": 20.0, "doc_count": 0}, {"key": 30.0, "doc_count": 0}, {"key": 40.0, "doc_count": 1} ] } }, { "key": "nohit", "doc_count": 2, "child_hist": { "buckets": [ {"key": 0.0, "doc_count": 1}, {"key": 10.0, "doc_count": 0}, {"key": 20.0, "doc_count": 0}, {"key": 30.0, "doc_count": 0}, {"key": 40.0, "doc_count": 1} ] } } ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 }) ); Ok(()) } #[test] fn test_date_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> { let index = get_test_index_2_segments(false)?; // Case A: parent has more buckets than child // Parent: range with several buckets // Child: date_histogram with 30d -> single bucket per parent let agg_parent_more: Aggregations = serde_json::from_value(json!({ "parent_range": { "range": { "field": "score", "ranges": [ {"to": 3.0}, {"from": 3.0, "to": 7.0}, {"from": 7.0, "to": 11.0}, {"from": 11.0, "to": 20.0}, {"from": 20.0} ] }, "aggs": { "child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "30d"}} } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?; let buckets = res["parent_range"]["buckets"].as_array().unwrap(); // Verify each parent bucket has exactly one child date bucket with matching doc_count for bucket in buckets { let parent_count = bucket["doc_count"].as_u64().unwrap(); let child_buckets = bucket["child_date_hist"]["buckets"].as_array().unwrap(); assert_eq!(child_buckets.len(), 1); assert_eq!(child_buckets[0]["doc_count"], parent_count); } // Case B: child has more buckets than parent // Parent: terms on text (2 buckets) // Child: date_histogram with 1d -> multiple buckets let agg_child_more: Aggregations = serde_json::from_value(json!({ "parent_terms": { "terms": {"field": "text"}, "aggs": { "child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "1d"}} } } })) .unwrap(); let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?; let buckets = res["parent_terms"]["buckets"].as_array().unwrap(); // cool bucket assert_eq!(buckets[0]["key"], "cool"); let cool_buckets = buckets[0]["child_date_hist"]["buckets"].as_array().unwrap(); assert_eq!(cool_buckets.len(), 3); assert_eq!(cool_buckets[0]["doc_count"], 1); // day 0 assert_eq!(cool_buckets[1]["doc_count"], 4); // day 1 assert_eq!(cool_buckets[2]["doc_count"], 2); // day 2 // nohit bucket assert_eq!(buckets[1]["key"], "nohit"); let nohit_buckets = buckets[1]["child_date_hist"]["buckets"].as_array().unwrap(); assert_eq!(nohit_buckets.len(), 2); assert_eq!(nohit_buckets[0]["doc_count"], 1); // day 1 assert_eq!(nohit_buckets[1]["doc_count"], 1); // day 2 Ok(()) } fn get_avg_req(field_name: &str) -> Aggregation { serde_json::from_value(json!({ "avg": { "field": field_name, } })) .unwrap() } fn get_collector(agg_req: Aggregations) -> AggregationCollector { AggregationCollector::from_aggs(agg_req, Default::default()) } // *** EVERY BUCKET-TYPE SHOULD BE TESTED HERE *** // Note: The flushng part of these tests are outdated, since the buffering change after converting // the collection into one collector per request instead of per bucket. // // However they are useful as they test a complex aggregation requests. fn test_aggregation_flushing( merge_segments: bool, use_distributed_collector: bool, ) -> crate::Result<()> { let mut values_and_terms = (0..80) .map(|val| vec![(val as f64, "terma".to_string())]) .collect::>(); values_and_terms.last_mut().unwrap()[0].1 = "termb".to_string(); let index = get_test_index_from_values_and_terms(merge_segments, &values_and_terms)?; let reader = index.reader()?; assert_eq!(COLLECT_BLOCK_BUFFER_LEN, 64); // In the tree we cache documents of COLLECT_BLOCK_BUFFER_LEN before passing them down as one // block. // // Build a request so that on the first level we have one full cache, which is then flushed. // The same cache should have some residue docs at the end, which are flushed (Range 0-70) // -> 70 docs // // The second level should also have some residue docs in the cache that are flushed at the // end. // // A second bucket on the first level should have the cache unfilled // let elasticsearch_compatible_json_req = r#" let elasticsearch_compatible_json = json!( { "bucketsL1": { "range": { "field": "score", "ranges": [ { "to": 3.0f64 }, { "from": 3.0f64, "to": 70.0f64 }, { "from": 70.0f64 } ] }, "aggs": { "bucketsL2": { "range": { "field": "score", "ranges": [ { "to": 30.0f64 }, { "from": 30.0f64, "to": 70.0f64 }, { "from": 70.0f64 } ] } } } }, "top_hits_test":{ "terms": { "field": "string_id" }, "aggs": { "bucketsL2": { "top_hits": { "size": 2, "sort": [ { "score": "asc" } ], "docvalue_fields": ["score"] } } } }, "histogram_test":{ "histogram": { "field": "score", "interval": 70.0, "offset": 3.0 }, "aggs": { "bucketsL2": { "histogram": { "field": "score", "interval": 70.0 } } } }, "term_agg_test":{ "terms": { "field": "string_id" }, "aggs": { "bucketsL2": { "histogram": { "field": "score", "interval": 70.0 } } } }, "cardinality_string_id":{ "cardinality": { "field": "string_id" } }, "cardinality_score":{ "cardinality": { "field": "score" } } }); let agg_req: Aggregations = serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap()) .unwrap(); let agg_res: AggregationResults = if use_distributed_collector { let collector = DistributedAggregationCollector::from_aggs(agg_req.clone(), Default::default()); let searcher = reader.searcher(); let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap(); // Test postcard roundtrip serialization let intermediate_agg_result_bytes = postcard::to_allocvec(&intermediate_agg_result).expect( "Postcard Serialization failed, flatten etc. is not supported in the intermediate \ result", ); let intermediate_agg_result: IntermediateAggregationResults = postcard::from_bytes(&intermediate_agg_result_bytes) .expect("Post deserialization failed"); intermediate_agg_result .into_final_result(agg_req, Default::default()) .unwrap() } else { let collector = get_collector(agg_req); let searcher = reader.searcher(); searcher.search(&AllQuery, &collector).unwrap() }; let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?; assert_eq!(res["bucketsL1"]["buckets"][0]["doc_count"], 3); assert_eq!( res["bucketsL1"]["buckets"][0]["bucketsL2"]["buckets"][0]["doc_count"], 3 ); assert_eq!(res["bucketsL1"]["buckets"][1]["key"], "3-70"); assert_eq!(res["bucketsL1"]["buckets"][1]["doc_count"], 70 - 3); assert_eq!( res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][0]["doc_count"], 27 ); assert_eq!( res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][1]["doc_count"], 40 ); assert_eq!( res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][2]["doc_count"], 0 ); assert_eq!( res["bucketsL1"]["buckets"][2]["bucketsL2"]["buckets"][2]["doc_count"], 80 - 70 ); assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 80 - 70); assert_eq!( res["term_agg_test"], json!( { "buckets": [ { "bucketsL2": { "buckets": [ { "doc_count": 70, "key": 0.0 }, { "doc_count": 9, "key": 70.0 } ] }, "doc_count": 79, "key": "terma" }, { "bucketsL2": { "buckets": [ { "doc_count": 1, "key": 70.0 } ] }, "doc_count": 1, "key": "termb" } ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 } ) ); assert_eq!(res["cardinality_string_id"]["value"], 2.0); assert_eq!(res["cardinality_score"]["value"], 80.0); Ok(()) } #[test] fn test_aggregation_flushing_variants() { test_aggregation_flushing(false, false).unwrap(); test_aggregation_flushing(false, true).unwrap(); test_aggregation_flushing(true, false).unwrap(); test_aggregation_flushing(true, true).unwrap(); } #[test] fn test_aggregation_level1_simple() -> crate::Result<()> { let index = get_test_index_2_segments(true)?; let reader = index.reader()?; let text_field = reader.searcher().schema().get_field("text").unwrap(); let term_query = TermQuery::new( Term::from_field_text(text_field, "cool"), IndexRecordOption::Basic, ); let range_agg = |field_name: &str| -> Aggregation { serde_json::from_value(json!({ "range": { "field": field_name, "ranges": [ { "from": 3.0f64, "to": 7.0f64 }, { "from": 7.0f64, "to": 20.0f64 } ] } })) .unwrap() }; let agg_req_1: Aggregations = vec![ ("average".to_string(), get_avg_req("score")), ("range".to_string(), range_agg("score")), ] .into_iter() .collect(); let collector = get_collector(agg_req_1); let searcher = reader.searcher(); let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap(); let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?; assert_eq!(res["average"]["value"], 12.142857142857142); assert_eq!( res["range"]["buckets"], json!( [ { "key": "*-3", "doc_count": 1, "to": 3.0 }, { "key": "3-7", "doc_count": 2, "from": 3.0, "to": 7.0 }, { "key": "7-20", "doc_count": 3, "from": 7.0, "to": 20.0 }, { "key": "20-*", "doc_count": 1, "from": 20.0 } ]) ); Ok(()) } #[test] fn test_aggregation_level1() -> crate::Result<()> { let index = get_test_index_2_segments(true)?; let reader = index.reader()?; let text_field = reader.searcher().schema().get_field("text").unwrap(); let term_query = TermQuery::new( Term::from_field_text(text_field, "cool"), IndexRecordOption::Basic, ); let range_agg = |field_name: &str| -> Aggregation { serde_json::from_value(json!({ "range": { "field": field_name, "ranges": [ { "from": 3.0f64, "to": 7.0f64 }, { "from": 7.0f64, "to": 20.0f64 } ] } })) .unwrap() }; let agg_req_1: Aggregations = vec![ ("average_i64".to_string(), get_avg_req("score_i64")), ("average_f64".to_string(), get_avg_req("score_f64")), ("average".to_string(), get_avg_req("score")), ("range".to_string(), range_agg("score")), ("rangef64".to_string(), range_agg("score_f64")), ("rangei64".to_string(), range_agg("score_i64")), ] .into_iter() .collect(); let collector = get_collector(agg_req_1); let searcher = reader.searcher(); let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap(); let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?; assert_eq!(res["average"]["value"], 12.142857142857142); assert_eq!(res["average_f64"]["value"], 12.214285714285714); assert_eq!(res["average_i64"]["value"], 12.142857142857142); assert_eq!( res["range"]["buckets"], json!( [ { "key": "*-3", "doc_count": 1, "to": 3.0 }, { "key": "3-7", "doc_count": 2, "from": 3.0, "to": 7.0 }, { "key": "7-20", "doc_count": 3, "from": 7.0, "to": 20.0 }, { "key": "20-*", "doc_count": 1, "from": 20.0 } ]) ); Ok(()) } fn test_aggregation_level2( merge_segments: bool, use_distributed_collector: bool, ) -> crate::Result<()> { let index = get_test_index_2_segments(merge_segments)?; let reader = index.reader()?; let text_field = reader.searcher().schema().get_field("text").unwrap(); let term_query = TermQuery::new( Term::from_field_text(text_field, "cool"), IndexRecordOption::Basic, ); let query_with_no_hits = TermQuery::new( Term::from_field_text(text_field, "thistermdoesnotexist"), IndexRecordOption::Basic, ); let elasticsearch_compatible_json_req = r#" { "rangef64": { "range": { "field": "score_f64", "ranges": [ { "to": 3.0 }, { "from": 3.0, "to": 7.0 }, { "from": 7.0, "to": 19.0 }, { "from": 19.0, "to": 20.0 }, { "from": 20.0 } ] }, "aggs": { "average_in_range": { "avg": { "field": "score" } }, "term_agg": { "terms": { "field": "text" } } } }, "rangei64": { "range": { "field": "score_i64", "ranges": [ { "to": 3.0 }, { "from": 3.0, "to": 7.0 }, { "from": 7.0, "to": 19.0 }, { "from": 19.0, "to": 20.0 }, { "from": 20.0 } ] }, "aggs": { "average_in_range": { "avg": { "field": "score" } }, "term_agg": { "terms": { "field": "text" } } } }, "average": { "avg": { "field": "score" } }, "range": { "range": { "field": "score", "ranges": [ { "to": 3.0 }, { "from": 3.0, "to": 7.0 }, { "from": 7.0, "to": 19.0 }, { "from": 19.0, "to": 20.0 }, { "from": 20.0 } ] }, "aggs": { "average_in_range": { "avg": { "field": "score" } }, "term_agg": { "terms": { "field": "text" } } } } } "#; let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap(); let agg_res: AggregationResults = if use_distributed_collector { let collector = DistributedAggregationCollector::from_aggs(agg_req.clone(), Default::default()); let searcher = reader.searcher(); let res = searcher.search(&term_query, &collector).unwrap(); res.into_final_result(agg_req.clone(), Default::default()) .unwrap() } else { let collector = get_collector(agg_req.clone()); let searcher = reader.searcher(); searcher.search(&term_query, &collector).unwrap() }; let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?; assert_eq!(res["range"]["buckets"][1]["key"], "3-7"); assert_eq!(res["range"]["buckets"][1]["doc_count"], 2u64); assert_eq!(res["rangef64"]["buckets"][1]["doc_count"], 2u64); assert_eq!(res["rangei64"]["buckets"][1]["doc_count"], 2u64); assert_eq!(res["average"]["value"], 12.142857142857142f64); assert_eq!(res["range"]["buckets"][2]["key"], "7-19"); assert_eq!(res["range"]["buckets"][2]["doc_count"], 3u64); assert_eq!(res["rangef64"]["buckets"][2]["doc_count"], 3u64); assert_eq!(res["rangei64"]["buckets"][2]["doc_count"], 3u64); assert_eq!(res["rangei64"]["buckets"][5], serde_json::Value::Null); assert_eq!(res["range"]["buckets"][4]["key"], "20-*"); assert_eq!(res["range"]["buckets"][4]["doc_count"], 1u64); assert_eq!(res["rangef64"]["buckets"][4]["doc_count"], 1u64); assert_eq!(res["rangei64"]["buckets"][4]["doc_count"], 1u64); assert_eq!(res["range"]["buckets"][3]["key"], "19-20"); assert_eq!(res["range"]["buckets"][3]["doc_count"], 0u64); assert_eq!(res["rangef64"]["buckets"][3]["doc_count"], 0u64); assert_eq!(res["rangei64"]["buckets"][3]["doc_count"], 0u64); assert_eq!( res["range"]["buckets"][3]["average_in_range"]["value"], serde_json::Value::Null ); assert_eq!( res["range"]["buckets"][4]["average_in_range"]["value"], 44.0f64 ); assert_eq!( res["rangef64"]["buckets"][4]["average_in_range"]["value"], 44.0f64 ); assert_eq!( res["rangei64"]["buckets"][4]["average_in_range"]["value"], 44.0f64 ); assert_eq!( res["range"]["7-19"]["average_in_range"]["value"], res["rangef64"]["7-19"]["average_in_range"]["value"] ); assert_eq!( res["range"]["7-19"]["average_in_range"]["value"], res["rangei64"]["7-19"]["average_in_range"]["value"] ); // Test empty result set let collector = get_collector(agg_req); let searcher = reader.searcher(); searcher.search(&query_with_no_hits, &collector).unwrap(); Ok(()) } #[test] fn test_aggregation_level2_multi_segments() -> crate::Result<()> { test_aggregation_level2(false, false) } #[test] fn test_aggregation_level2_single_segment() -> crate::Result<()> { test_aggregation_level2(true, false) } #[test] fn test_aggregation_level2_multi_segments_distributed_collector() -> crate::Result<()> { test_aggregation_level2(false, true) } #[test] fn test_aggregation_level2_single_segment_distributed_collector() -> crate::Result<()> { test_aggregation_level2(true, true) } #[test] fn test_aggregation_invalid_requests() -> crate::Result<()> { let index = get_test_index_2_segments(false)?; let reader = index.reader()?; let avg_on_field = |field_name: &str| { let agg_req_1: Aggregations = serde_json::from_value(json!({ "average": { "avg": { "field": field_name, }, } })) .unwrap(); let collector = get_collector(agg_req_1); let searcher = reader.searcher(); searcher.search(&AllQuery, &collector) }; let agg_res = avg_on_field("dummy_text").unwrap_err(); assert_eq!( format!("{agg_res:?}"), r#"InvalidArgument("Field \"dummy_text\" is not configured as fast field")"# ); let agg_req_1: Result = serde_json::from_value(json!({ "average": { "avg": { "fieldd": "a", }, } })); assert_eq!(agg_req_1.is_err(), true); assert_eq!(agg_req_1.unwrap_err().to_string(), "missing field `field`"); let agg_req_1: Result = serde_json::from_value(json!({ "average": { "doesnotmatchanyagg": { "field": "a", }, } })); assert_eq!(agg_req_1.is_err(), true); // TODO: This should list valid values assert!(agg_req_1 .unwrap_err() .to_string() .contains("unknown variant `doesnotmatchanyagg`, expected one of")); // TODO: This should return an error // let agg_res = avg_on_field("not_exist_field").unwrap_err(); // assert_eq!( // format!("{:?}", agg_res), // r#"InvalidArgument("No fast field found for field: not_exist_field")"# //); // TODO: This should return an error // let agg_res = avg_on_field("ip_addr").unwrap_err(); // assert_eq!( // format!("{:?}", agg_res), // r#"InvalidArgument("No fast field found for field: ip_addr")"# //); Ok(()) } #[test] fn test_aggregation_on_json_object() { let mut schema_builder = Schema::builder(); let json = schema_builder.add_json_field("json", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); index_writer .add_document(doc!(json => json!({"color": "red"}))) .unwrap(); index_writer .add_document(doc!(json => json!({"color": "red"}))) .unwrap(); index_writer .add_document(doc!(json => json!({"color": "blue"}))) .unwrap(); index_writer.commit().unwrap(); let reader = index.reader().unwrap(); let searcher = reader.searcher(); let agg: Aggregations = serde_json::from_value(json!({ "jsonagg": { "terms": { "field": "json.color", } } })) .unwrap(); let aggregation_collector = get_collector(agg); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); assert_eq!( &aggregation_res_json, &serde_json::json!({ "jsonagg": { "buckets": [ {"doc_count": 2, "key": "red"}, {"doc_count": 1, "key": "blue"}, ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 } }) ); } #[test] fn test_aggregation_on_nested_json_object() { let mut schema_builder = Schema::builder(); let json = schema_builder.add_json_field("json.blub", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); index_writer .add_document(doc!(json => json!({"color.dot": "red", "color": {"nested":"red"} }))) .unwrap(); index_writer .add_document(doc!(json => json!({"color.dot": "blue", "color": {"nested":"blue"} }))) .unwrap(); index_writer .add_document(doc!(json => json!({"color.dot": "blue", "color": {"nested":"blue"} }))) .unwrap(); index_writer.commit().unwrap(); let reader = index.reader().unwrap(); let searcher = reader.searcher(); let agg: Aggregations = serde_json::from_value(json!({ "jsonagg1": { "terms": { "field": "json\\.blub.color\\.dot", } }, "jsonagg2": { "terms": { "field": "json\\.blub.color.nested", } } })) .unwrap(); let aggregation_collector = get_collector(agg); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); assert_eq!( &aggregation_res_json, &serde_json::json!({ "jsonagg1": { "buckets": [ {"doc_count": 2, "key": "blue"}, {"doc_count": 1, "key": "red"} ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 }, "jsonagg2": { "buckets": [ {"doc_count": 2, "key": "blue"}, {"doc_count": 1, "key": "red"} ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 } }) ); } #[test] fn test_aggregation_on_json_object_empty_columns() { let mut schema_builder = Schema::builder(); let json = schema_builder.add_json_field("json", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); // => Empty column when accessing color index_writer .add_document(doc!(json => json!({"price": 10.0}))) .unwrap(); index_writer.commit().unwrap(); // => Empty column when accessing price index_writer .add_document(doc!(json => json!({"color": "blue"}))) .unwrap(); index_writer.commit().unwrap(); // => Non Empty columns index_writer .add_document(doc!(json => json!({"color": "red", "price": 10.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"color": "red", "price": 10.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"color": "green", "price": 20.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"color": "green", "price": 20.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"color": "green", "price": 20.0}))) .unwrap(); index_writer.commit().unwrap(); let reader = index.reader().unwrap(); let searcher = reader.searcher(); let agg: Aggregations = serde_json::from_value(json!({ "jsonagg": { "terms": { "field": "json.color", } } })) .unwrap(); let aggregation_collector = get_collector(agg); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); assert_eq!( &aggregation_res_json, &serde_json::json!({ "jsonagg": { "buckets": [ {"doc_count": 3, "key": "green"}, {"doc_count": 2, "key": "red"}, {"doc_count": 1, "key": "blue"} ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 } }) ); let agg_req_str = r#" { "jsonagg": { "aggs": { "min_price": { "min": { "field": "json.price" } } }, "terms": { "field": "json.color", "order": { "min_price": "desc" } } } } "#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); let aggregation_collector = get_collector(agg); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); assert_eq!( &aggregation_res_json, &serde_json::json!( { "jsonagg": { "buckets": [ { "key": "green", "doc_count": 3, "min_price": { "value": 20.0 } }, { "key": "red", "doc_count": 2, "min_price": { "value": 10.0 } }, { "key": "blue", "doc_count": 1, "min_price": { "value": null } } ], "sum_other_doc_count": 0 } } ) ); } #[test] fn test_aggregation_on_json_object_mixed_types() { let mut schema_builder = Schema::builder(); let json = schema_builder.add_json_field("json", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); // => Segment with all values numeric index_writer .add_document(doc!(json => json!({"mixed_type": 10.0, "mixed_price": 10.0}))) .unwrap(); index_writer.commit().unwrap(); // => Segment with all values text index_writer .add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0}))) .unwrap(); index_writer.commit().unwrap(); // => Segment with all boolean index_writer .add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"}))) .unwrap(); index_writer.commit().unwrap(); // => Segment with mixed values index_writer .add_document(doc!(json => json!({"mixed_type": "red", "mixed_price": 1.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"mixed_type": "red", "mixed_price": 1.0}))) .unwrap(); index_writer .add_document(doc!(json => json!({"mixed_type": -20.5, "mixed_price": -20.5}))) .unwrap(); index_writer .add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"}))) .unwrap(); index_writer.commit().unwrap(); // All bucket types let agg_req_str = r#" { "termagg": { "terms": { "field": "json.mixed_type", "order": { "min_price": "desc" } }, "aggs": { "min_price": { "min": { "field": "json.mixed_price" } } } }, "rangeagg": { "range": { "field": "json.mixed_type", "ranges": [ { "to": 3.0 }, { "from": 19.0, "to": 20.0 }, { "from": 20.0 } ] }, "aggs": { "average_in_range": { "avg": { "field": "json.mixed_type" } } } } } "#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); let aggregation_collector = get_collector(agg); let reader = index.reader().unwrap(); let searcher = reader.searcher(); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); use pretty_assertions::assert_eq; assert_eq!( &aggregation_res_json, &serde_json::json!({ "rangeagg": { "buckets": [ { "average_in_range": { "value": -20.5 }, "doc_count": 1, "key": "*-3", "to": 3.0 }, { "average_in_range": { "value": 10.0 }, "doc_count": 1, "from": 3.0, "key": "3-19", "to": 19.0 }, { "average_in_range": { "value": null }, "doc_count": 0, "from": 19.0, "key": "19-20", "to": 20.0 }, { "average_in_range": { "value": null }, "doc_count": 0, "from": 20.0, "key": "20-*" } ] }, "termagg": { "buckets": [ { "doc_count": 1, "key": 10, "min_price": { "value": 10.0 } }, { "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } }, { "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } }, { "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } }, { "doc_count": 2, "key": 1, "key_as_string": "true", "min_price": { "value": null } }, ], "sum_other_doc_count": 0 } } ) ); } #[test] fn test_aggregation_on_json_object_mixed_numerical_segments() { let mut schema_builder = Schema::builder(); let json = schema_builder.add_json_field("json", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); // => Segment with all values f64 numeric index_writer .add_document(doc!(json => json!({"mixed_price": 10.5}))) .unwrap(); // Gets converted to f64! index_writer .add_document(doc!(json => json!({"mixed_price": 10}))) .unwrap(); index_writer.commit().unwrap(); // => Segment with all values i64 numeric index_writer .add_document(doc!(json => json!({"mixed_price": 10}))) .unwrap(); index_writer.commit().unwrap(); index_writer.commit().unwrap(); // All bucket types let agg_req_str = r#" { "termagg": { "terms": { "field": "json.mixed_price" } } } "#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); let aggregation_collector = get_collector(agg); let reader = index.reader().unwrap(); let searcher = reader.searcher(); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); use pretty_assertions::assert_eq; assert_eq!( &aggregation_res_json, &serde_json::json!({ "termagg": { "buckets": [ { "doc_count": 2, "key": 10}, { "doc_count": 1, "key": 10.5}, ], "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0 } } ) ); } #[test] fn test_aggregation_invalid_field_returns_error() { // Test that aggregations return an error when given an invalid field name // with strict_field_validation enabled let index = get_test_index_2_segments(false).unwrap(); let reader = index.reader().unwrap(); let searcher = reader.searcher(); // Test with a field that doesn't exist at all let agg_req_str = r#" { "date_histogram_test": { "date_histogram": { "field": "not_valid_field", "fixed_interval": "30d" } } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); // Enable strict field validation let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); match result { Err(crate::TantivyError::FieldNotFound(field_name)) => { assert_eq!(field_name, "not_valid_field"); } _ => panic!("Expected FieldNotFound error, got: {:?}", result), } // Test with histogram aggregation on invalid field let agg_req_str = r#" { "histogram_test": { "histogram": { "field": "invalid_histogram_field", "interval": 10.0 } } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); match result { Err(crate::TantivyError::FieldNotFound(field_name)) => { assert_eq!(field_name, "invalid_histogram_field"); } _ => panic!("Expected FieldNotFound error, got: {:?}", result), } // Test with terms aggregation on invalid field let agg_req_str = r#" { "terms_test": { "terms": { "field": "invalid_terms_field" } } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); match result { Err(crate::TantivyError::FieldNotFound(field_name)) => { assert_eq!(field_name, "invalid_terms_field"); } _ => panic!("Expected FieldNotFound error, got: {:?}", result), } // Test with avg metric aggregation on invalid field let agg_req_str = r#" { "avg_test": { "avg": { "field": "invalid_avg_field" } } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); match result { Err(crate::TantivyError::FieldNotFound(field_name)) => { assert_eq!(field_name, "invalid_avg_field"); } _ => panic!("Expected FieldNotFound error, got: {:?}", result), } // Test with range aggregation on invalid field let agg_req_str = r#" { "range_test": { "range": { "field": "invalid_range_field", "ranges": [ { "to": 10.0 }, { "from": 10.0, "to": 20.0 }, { "from": 20.0 } ] } } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); match result { Err(crate::TantivyError::FieldNotFound(field_name)) => { assert_eq!(field_name, "invalid_range_field"); } _ => panic!("Expected FieldNotFound error, got: {:?}", result), } }