Merge pull request #1358 from quickwit-oss/fix_docs

add alias shard_size to split_size for quickwit
2026-06-05 01:50:42 +00:00 · 2022-05-06 18:41:34 +08:00
parent 1273f33338 d77e8de36a
commit aab65490c9
6 changed files with 86 additions and 7 deletions
--- a/src/aggregation/bucket/histogram/histogram.rs
+++ b/src/aggregation/bucket/histogram/histogram.rs
@@ -1364,4 +1364,29 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn histogram_invalid_request() -> crate::Result<()> {
+        let index = get_test_index_2_segments(true)?;
+
+        let agg_req: Aggregations = vec![(
+            "histogram".to_string(),
+            Aggregation::Bucket(BucketAggregation {
+                bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
+                    field: "score_f64".to_string(),
+                    interval: 0.0,
+                    ..Default::default()
+                }),
+                sub_aggregation: Default::default(),
+            }),
+        )]
+        .into_iter()
+        .collect();
+
+        let agg_res = exec_request(agg_req, &index);
+
+        assert!(agg_res.is_err());
+
+        Ok(())
+    }
 }
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -81,7 +81,8 @@ pub struct TermsAggregation {
    ///
    /// Should never be smaller than size.
    #[serde(skip_serializing_if = "Option::is_none", default)]
-    pub shard_size: Option<u32>,
+    #[serde(alias = "shard_size")]
+    pub split_size: Option<u32>,

    /// The get more accurate results, we fetch more than `size` from each segment.
    ///
@@ -96,11 +97,11 @@ pub struct TermsAggregation {
    /// doc_count returned by each shard. It’s the sum of the size of the largest bucket on
    /// each segment that didn’t fit into `shard_size`.
    ///
-    /// Defaults to true when ordering by counts desc.
+    /// Defaults to true when ordering by count desc.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub show_term_doc_count_error: Option<bool>,

-    /// Filter all terms than are lower `min_doc_count`. Defaults to 1.
+    /// Filter all terms that are lower than `min_doc_count`. Defaults to 1.
    ///
    /// **Expensive**: When set to 0, this will return all terms in the field.
    #[serde(skip_serializing_if = "Option::is_none", default)]
@@ -143,7 +144,7 @@ pub(crate) struct TermsAggregationInternal {
    /// Increasing this value is will increase the cost for more accuracy.
    pub segment_size: u32,

-    /// Filter all terms than are lower `min_doc_count`. Defaults to 1.
+    /// Filter all terms that are lower than `min_doc_count`. Defaults to 1.
    ///
    /// *Expensive*: When set to 0, this will return all terms in the field.
    pub min_doc_count: u64,
@@ -572,7 +573,7 @@ mod tests {
                bucket_agg: BucketAggregationType::Terms(TermsAggregation {
                    field: "string_id".to_string(),
                    size: Some(2),
-                    shard_size: Some(2),
+                    split_size: Some(2),
                    ..Default::default()
                }),
                sub_aggregation: Default::default(),
@@ -1210,6 +1211,51 @@ mod tests {
                .unwrap();
        assert_eq!(agg_req, agg_req_deser);

+        let elasticsearch_compatible_json = json!(
+        {
+        "term_agg_test":{
+            "terms": {
+                "field": "string_id",
+                "split_size": 2u64,
+            }
+        }
+        });
+
+        // test alias shard_size, split_size
+        let agg_req: Aggregations = vec![(
+            "term_agg_test".to_string(),
+            Aggregation::Bucket(BucketAggregation {
+                bucket_agg: BucketAggregationType::Terms(TermsAggregation {
+                    field: "string_id".to_string(),
+                    split_size: Some(2),
+                    ..Default::default()
+                }),
+                sub_aggregation: Default::default(),
+            }),
+        )]
+        .into_iter()
+        .collect();
+
+        let agg_req_deser: Aggregations =
+            serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
+                .unwrap();
+        assert_eq!(agg_req, agg_req_deser);
+
+        let elasticsearch_compatible_json = json!(
+        {
+        "term_agg_test":{
+            "terms": {
+                "field": "string_id",
+                "shard_size": 2u64,
+            }
+        }
+        });
+
+        let agg_req_deser: Aggregations =
+            serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
+                .unwrap();
+        assert_eq!(agg_req, agg_req_deser);
+
        Ok(())
    }
 }
--- a/src/aggregation/intermediate_agg_result.rs
+++ b/src/aggregation/intermediate_agg_result.rs
@@ -24,7 +24,9 @@ use crate::aggregation::bucket::TermsAggregationInternal;
 /// intermediate results.
 #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateAggregationResults {
+    #[serde(skip_serializing_if = "Option::is_none")]
    pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
    pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
 }

--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -20,7 +20,8 @@
 //!
 //! #### Limitations
 //!
-//! Currently aggregations work only on single value fast fields of type u64, f64 and i64.
+//! Currently aggregations work only on single value fast fields of type u64, f64, i64 and
+//! fast fields on text fields.
 //!
 //! # JSON Format
 //! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -300,7 +300,7 @@ impl IntFastFieldWriter {
    /// If the document has more than one value for the given field,
    /// only the first one is taken in account.
    ///
-    /// Values for string fast fields are skipped.
+    /// Values on text fast fields are skipped.
    pub fn add_document(&mut self, doc: &Document) {
        match doc.get_first(self.field) {
            Some(v) => {
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -42,6 +42,11 @@ impl TextOptions {
    /// Text fast fields will have the term ids stored in the fast field.
    /// The fast field will be a multivalued fast field.
    ///
+    /// The effective cardinality depends on the tokenizer. When creating fast fields on text
+    /// fields it is recommended to use the "raw" tokenizer, since it will store the original text
+    /// unchanged. The "default" tokenizer will store the terms as lower case and this will be
+    /// reflected in the dictionary.
+    ///
    /// The original text can be retrieved via `ord_to_term` from the dictionary.
    #[must_use]
    pub fn set_fast(mut self) -> TextOptions {