add term aggregation clarification

2026-06-01 08:00:41 +00:00 · 2022-10-14 16:12:19 +08:00
parent 4b4c231bba
commit 952b048341
6 changed files with 72 additions and 18 deletions
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -54,7 +54,7 @@ pub use self::serialize::{
 /// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted data.
 pub enum FastFieldCodecType {
    /// Bitpack all values in the value range. The number of bits is defined by the amplitude
-    /// column.max_value()-column.min_value()
+    /// `column.max_value() - column.min_value()`
    Bitpacked = 1,
    /// Linear interpolation puts a line between the first and last value and then bitpacks the
    /// values by the offset from the line. The number of bits is defined by the max deviation from
--- a/fastfield_codecs/src/serialize.rs
+++ b/fastfield_codecs/src/serialize.rs
@@ -40,7 +40,7 @@ use crate::{

 /// The normalized header gives some parameters after applying the following
 /// normalization of the vector:
-/// val -> (val - min_value) / gcd
+/// `val -> (val - min_value) / gcd`
 ///
 /// By design, after normalization, `min_value = 0` and `gcd = 1`.
 #[derive(Debug, Copy, Clone)]
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -17,7 +17,11 @@ use crate::fastfield::MultiValuedFastFieldReader;
 use crate::schema::Type;
 use crate::{DocId, TantivyError};

-/// Creates a bucket for every unique term
+/// Creates a bucket for every unique term and counts the number of occurrences.
+/// Note that `doc_count` in the response buckets equals the term count here.
+///
+/// If the text is untokenized and single valued, there is exactly one term per document, so the
+/// term count is in fact the doc count.
 ///
 /// ### Terminology
 /// Shard parameters are supposed to be equivalent to elasticsearch shard parameter.
@@ -64,6 +68,25 @@ use crate::{DocId, TantivyError};
 ///     }
 /// }
 /// ```
+///
+/// # Response JSON Format
+/// ```json
+/// {
+///     ...
+///     "aggregations": {
+///         "genres": {
+///             "doc_count_error_upper_bound": 0,
+///             "sum_other_doc_count": 0,
+///             "buckets": [
+///                 { "key": "drumnbass", "doc_count": 6 },
+///                 { "key": "reggae", "doc_count": 4 },
+///                 { "key": "jazz", "doc_count": 2 }
+///             ]
+///         }
+///     }
+/// }
+/// ```
+
 #[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
 pub struct TermsAggregation {
    /// The field to aggregate on.
@@ -1206,11 +1229,43 @@ mod tests {
        .collect();

        let res = exec_request_with_query(agg_req, &index, None);
+
        assert!(res.is_err());

        Ok(())
    }

+    #[test]
+    fn terms_aggregation_multi_token_per_doc() -> crate::Result<()> {
+        let terms = vec!["Hello Hello", "Hallo Hallo"];
+        // Each value repeats its token, so a term occurs more than once within a single value.
+        let index = get_test_index_from_terms(true, &[terms])?;
+
+        let agg_req: Aggregations = vec![(
+            "my_texts".to_string(),
+            Aggregation::Bucket(BucketAggregation {
+                bucket_agg: BucketAggregationType::Terms(TermsAggregation {
+                    field: "text_id".to_string(),
+                    min_doc_count: Some(0),
+                    ..Default::default()
+                }),
+                sub_aggregation: Default::default(),
+            }),
+        )]
+        .into_iter()
+        .collect();
+
+        let res = exec_request_with_query(agg_req, &index, None).unwrap();
+        // Expect `doc_count` 2 per key, i.e. term occurrences; keys are asserted in lowercase.
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "hello");
+        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
+
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "hallo");
+        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
+
+        Ok(())
+    }
+
    #[test]
    fn test_json_format() -> crate::Result<()> {
        let agg_req: Aggregations = vec![(
--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -10,21 +10,19 @@
 //!
 //! There are two categories: [Metrics](metric) and [Buckets](bucket).
 //!
-//! # Usage
-//!
+//! ## Prerequisite
+//! Currently aggregations work only on [fast fields](`crate::fastfield`): single value fast fields
+//! of type `u64`, `f64`, `i64` and fast fields on text fields.
 //!
+//! ## Usage
 //! To use aggregations, build an aggregation request by constructing
 //! [`Aggregations`](agg_req::Aggregations).
 //! Create an [`AggregationCollector`] from this request. `AggregationCollector` implements the
 //! [`Collector`](crate::collector::Collector) trait and can be passed as collector into
 //! [`Searcher::search()`](crate::Searcher::search).
 //!
-//! #### Limitations
 //!
-//! Currently aggregations work only on single value fast fields of type `u64`, `f64`, `i64` and
-//! fast fields on text fields.
-//!
-//! # JSON Format
+//! ## JSON Format
 //! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
 //!
 //! ```verbatim
@@ -35,7 +33,7 @@
 //! let json_response_string: String = &serde_json::to_string(&agg_res)?;
 //! ```
 //!
-//! # Supported Aggregations
+//! ## Supported Aggregations
 //! - [Bucket](bucket)
 //!     - [Histogram](bucket::HistogramAggregation)
 //!     - [Range](bucket::RangeAggregation)
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -7,16 +7,15 @@
 //! It is designed for the fast random access of some document
 //! fields given a document id.
 //!
-//! `FastField` are useful when a field is required for all or most of
-//! the `DocSet` : for instance for scoring, grouping, filtering, or faceting.
+//! Fast fields are useful when a field is required for all or most of
+//! the `DocSet`: for instance for scoring, grouping, aggregation, filtering, or faceting.
 //!
 //!
-//! Fields have to be declared as `FAST` in the  schema.
-//! Currently supported fields are: u64, i64, f64 and bytes.
+//! Fields have to be declared as `FAST` in the schema.
+//! Currently supported fields are: u64, i64, f64, bytes and text.
 //!
-//! u64, i64 and f64 fields are stored in a bit-packed fashion so that
-//! their memory usage is directly linear with the amplitude of the
-//! values stored.
+//! Fast fields are stored with [different codecs](fastfield_codecs). The best codec is detected
+//! automatically when serializing.
 //!
 //! Read access performance is comparable to that of an array lookup.

--- a/src/schema/flags.rs
+++ b/src/schema/flags.rs
@@ -37,6 +37,8 @@ pub struct FastFlag;
 ///
 /// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering
 /// or collection should be mark as fast fields.
+///
+/// See [fast fields](`crate::fastfield`).
 pub const FAST: SchemaFlagList<FastFlag, ()> = SchemaFlagList {
    head: FastFlag,
    tail: (),