From bc607a921ba0a326f5ac4239adb288321d61e97e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 4 May 2022 18:51:18 +0800 Subject: [PATCH 1/3] add alias shard_size split_size for quickwit improve some docs --- src/aggregation/bucket/histogram/histogram.rs | 25 ++++++++++ src/aggregation/bucket/term_agg.rs | 46 +++++++++++++++++++ src/aggregation/mod.rs | 3 +- src/schema/text_options.rs | 5 ++ 4 files changed, 78 insertions(+), 1 deletion(-) diff --git a/src/aggregation/bucket/histogram/histogram.rs b/src/aggregation/bucket/histogram/histogram.rs index 79015072e..a2a4a87e5 100644 --- a/src/aggregation/bucket/histogram/histogram.rs +++ b/src/aggregation/bucket/histogram/histogram.rs @@ -1364,4 +1364,29 @@ mod tests { Ok(()) } + + #[test] + fn histogram_invalid_request() -> crate::Result<()> { + let index = get_test_index_2_segments(true)?; + + let agg_req: Aggregations = vec![( + "histogram".to_string(), + Aggregation::Bucket(BucketAggregation { + bucket_agg: BucketAggregationType::Histogram(HistogramAggregation { + field: "score_f64".to_string(), + interval: 0.0, + ..Default::default() + }), + sub_aggregation: Default::default(), + }), + )] + .into_iter() + .collect(); + + let agg_res = exec_request(agg_req, &index); + + assert!(agg_res.is_err()); + + Ok(()) + } } diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 3323e09bc..af23352f1 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -81,6 +81,7 @@ pub struct TermsAggregation { /// /// Should never be smaller than size. #[serde(skip_serializing_if = "Option::is_none", default)] + #[serde(alias = "split_size")] pub shard_size: Option, /// The get more accurate results, we fetch more than `size` from each segment. @@ -1210,6 +1211,51 @@ mod tests { .unwrap(); assert_eq!(agg_req, agg_req_deser); + let elasticsearch_compatible_json = json!( + { + "term_agg_test":{ + "terms": { + "field": "string_id", + "split_size": 2u64, + } + } + }); + + // test alias shard_size, split_size + let agg_req: Aggregations = vec![( + "term_agg_test".to_string(), + Aggregation::Bucket(BucketAggregation { + bucket_agg: BucketAggregationType::Terms(TermsAggregation { + field: "string_id".to_string(), + shard_size: Some(2), + ..Default::default() + }), + sub_aggregation: Default::default(), + }), + )] + .into_iter() + .collect(); + + let agg_req_deser: Aggregations = + serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap()) + .unwrap(); + assert_eq!(agg_req, agg_req_deser); + + let elasticsearch_compatible_json = json!( + { + "term_agg_test":{ + "terms": { + "field": "string_id", + "shard_size": 2u64, + } + } + }); + + let agg_req_deser: Aggregations = + serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap()) + .unwrap(); + assert_eq!(agg_req, agg_req_deser); + Ok(()) } } diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index 193a94d04..ac7fc606a 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -20,7 +20,8 @@ //! //! #### Limitations //! -//! Currently aggregations work only on single value fast fields of type u64, f64 and i64. +//! Currently aggregations work only on single value fast fields of type u64, f64, i64 and on +//! string fast fields. //! //! # JSON Format //! Aggregations request and result structures de/serialize into elasticsearch compatible JSON. diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index b164ada31..14728154a 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -42,6 +42,11 @@ impl TextOptions { /// Text fast fields will have the term ids stored in the fast field. /// The fast field will be a multivalued fast field. /// + /// The effective cardinality depends on the tokenizer. When creating fast fields on text + /// fields it is recommended to use the "raw" tokenizer, since it will store the original text + /// unchanged. The "default" tokenizer will store the terms as lower case and this will be + /// reflected in the dictionary. + /// /// The original text can be retrieved via `ord_to_term` from the dictionary. #[must_use] pub fn set_fast(mut self) -> TextOptions { From d11a8cce26f58eb3360c54581ab883ad9d9e69fc Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 5 May 2022 17:33:33 +0800 Subject: [PATCH 2/3] minor docs fix --- src/aggregation/bucket/term_agg.rs | 6 +++--- src/aggregation/intermediate_agg_result.rs | 2 ++ src/aggregation/mod.rs | 4 ++-- src/fastfield/writer.rs | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index af23352f1..8199dfd05 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -97,11 +97,11 @@ pub struct TermsAggregation { /// doc_count returned by each shard. It’s the sum of the size of the largest bucket on /// each segment that didn’t fit into `shard_size`. /// - /// Defaults to true when ordering by counts desc. + /// Defaults to true when ordering by count desc. #[serde(skip_serializing_if = "Option::is_none", default)] pub show_term_doc_count_error: Option, - /// Filter all terms than are lower `min_doc_count`. Defaults to 1. + /// Filter all terms that are lower than `min_doc_count`. Defaults to 1. /// /// **Expensive**: When set to 0, this will return all terms in the field. #[serde(skip_serializing_if = "Option::is_none", default)] @@ -144,7 +144,7 @@ pub(crate) struct TermsAggregationInternal { /// Increasing this value is will increase the cost for more accuracy. pub segment_size: u32, - /// Filter all terms than are lower `min_doc_count`. Defaults to 1. + /// Filter all terms that are lower than `min_doc_count`. Defaults to 1. /// /// *Expensive*: When set to 0, this will return all terms in the field. pub min_doc_count: u64, diff --git a/src/aggregation/intermediate_agg_result.rs b/src/aggregation/intermediate_agg_result.rs index 936caf38a..9bde00707 100644 --- a/src/aggregation/intermediate_agg_result.rs +++ b/src/aggregation/intermediate_agg_result.rs @@ -24,7 +24,9 @@ use crate::aggregation::bucket::TermsAggregationInternal; /// intermediate results. #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct IntermediateAggregationResults { + #[serde(skip_serializing_if = "Option::is_none")] pub(crate) metrics: Option>, + #[serde(skip_serializing_if = "Option::is_none")] pub(crate) buckets: Option>, } diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index ac7fc606a..37fa05c0f 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -20,8 +20,8 @@ //! //! #### Limitations //! -//! Currently aggregations work only on single value fast fields of type u64, f64, i64 and on -//! string fast fields. +//! Currently aggregations work only on single value fast fields of type u64, f64, i64 and +//! fast fields on text fields. //! //! # JSON Format //! Aggregations request and result structures de/serialize into elasticsearch compatible JSON. diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index a28bf732c..90f1916e6 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -300,7 +300,7 @@ impl IntFastFieldWriter { /// If the document has more than one value for the given field, /// only the first one is taken in account. /// - /// Values for string fast fields are skipped. + /// Values on text fast fields are skipped. pub fn add_document(&mut self, doc: &Document) { match doc.get_first(self.field) { Some(v) => { From d77e8de36a03400c72e88f70c7213f70f7e08e3b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 6 May 2022 17:52:18 +0800 Subject: [PATCH 3/3] flip alias variable name --- src/aggregation/bucket/term_agg.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 8199dfd05..c9833c885 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -81,8 +81,8 @@ pub struct TermsAggregation { /// /// Should never be smaller than size. #[serde(skip_serializing_if = "Option::is_none", default)] - #[serde(alias = "split_size")] - pub shard_size: Option, + #[serde(alias = "shard_size")] + pub split_size: Option, /// The get more accurate results, we fetch more than `size` from each segment. /// @@ -573,7 +573,7 @@ mod tests { bucket_agg: BucketAggregationType::Terms(TermsAggregation { field: "string_id".to_string(), size: Some(2), - shard_size: Some(2), + split_size: Some(2), ..Default::default() }), sub_aggregation: Default::default(), @@ -1227,7 +1227,7 @@ mod tests { Aggregation::Bucket(BucketAggregation { bucket_agg: BucketAggregationType::Terms(TermsAggregation { field: "string_id".to_string(), - shard_size: Some(2), + split_size: Some(2), ..Default::default() }), sub_aggregation: Default::default(),