diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 91b6ff534..c8f631fc8 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -54,7 +54,7 @@ pub use self::serialize::{ /// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted data. pub enum FastFieldCodecType { /// Bitpack all values in the value range. The number of bits is defined by the amplitude - /// column.max_value()-column.min_value() + /// `column.max_value() - column.min_value()` Bitpacked = 1, /// Linear interpolation puts a line between the first and last value and then bitpacks the /// values by the offset from the line. The number of bits is defined by the max deviation from diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 5a6c790d8..4bb2b295b 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -40,7 +40,7 @@ use crate::{ /// The normalized header gives some parameters after applying the following /// normalization of the vector: -/// val -> (val - min_value) / gcd +/// `val -> (val - min_value) / gcd` /// /// By design, after normalization, `min_value = 0` and `gcd = 1`. #[derive(Debug, Copy, Clone)] diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 9f68a80d7..10469f9c5 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -17,7 +17,11 @@ use crate::fastfield::MultiValuedFastFieldReader; use crate::schema::Type; use crate::{DocId, TantivyError}; -/// Creates a bucket for every unique term +/// Creates a bucket for every unique term and counts the number of occurrences. +/// Note that doc_count in the response buckets equals term count here. +/// +/// If the text is untokenized and single value, that means one term per document and therefore it +/// is in fact doc count. 
/// /// ### Terminology /// Shard parameters are supposed to be equivalent to elasticsearch shard parameter. @@ -64,6 +68,25 @@ use crate::{DocId, TantivyError}; /// } /// } /// ``` +/// +/// # Response JSON Format +/// ```json +/// { +/// ... +/// "aggregations": { +/// "genres": { +/// "doc_count_error_upper_bound": 0, +/// "sum_other_doc_count": 0, +/// "buckets": [ +/// { "key": "drumnbass", "doc_count": 6 }, +/// { "key": "raggae", "doc_count": 4 }, +/// { "key": "jazz", "doc_count": 2 } +/// ] +/// } +/// } +/// } +/// ``` + #[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] pub struct TermsAggregation { /// The field to aggregate on. @@ -1206,11 +1229,43 @@ mod tests { .collect(); let res = exec_request_with_query(agg_req, &index, None); + assert!(res.is_err()); Ok(()) } + #[test] + fn terms_aggregation_multi_token_per_doc() -> crate::Result<()> { + let terms = vec!["Hello Hello", "Hallo Hallo"]; + + let index = get_test_index_from_terms(true, &[terms])?; + + let agg_req: Aggregations = vec![( + "my_texts".to_string(), + Aggregation::Bucket(BucketAggregation { + bucket_agg: BucketAggregationType::Terms(TermsAggregation { + field: "text_id".to_string(), + min_doc_count: Some(0), + ..Default::default() + }), + sub_aggregation: Default::default(), + }), + )] + .into_iter() + .collect(); + + let res = exec_request_with_query(agg_req, &index, None).unwrap(); + + assert_eq!(res["my_texts"]["buckets"][0]["key"], "hello"); + assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2); + + assert_eq!(res["my_texts"]["buckets"][1]["key"], "hallo"); + assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2); + + Ok(()) + } + #[test] fn test_json_format() -> crate::Result<()> { let agg_req: Aggregations = vec![( diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index c5183fa06..876522f7d 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -10,21 +10,19 @@ //! //! 
There are two categories: [Metrics](metric) and [Buckets](bucket). //! -//! # Usage -//! +//! ## Prerequisite +//! Currently aggregations work only on [fast fields](`crate::fastfield`): single value fast fields +//! of type `u64`, `f64`, `i64` and fast fields on text fields. //! +//! ## Usage //! To use aggregations, build an aggregation request by constructing //! [`Aggregations`](agg_req::Aggregations). //! Create an [`AggregationCollector`] from this request. `AggregationCollector` implements the //! [`Collector`](crate::collector::Collector) trait and can be passed as collector into //! [`Searcher::search()`](crate::Searcher::search). //! -//! #### Limitations //! -//! Currently aggregations work only on single value fast fields of type `u64`, `f64`, `i64` and -//! fast fields on text fields. -//! -//! # JSON Format +//! ## JSON Format //! Aggregations request and result structures de/serialize into elasticsearch compatible JSON. //! //! ```verbatim @@ -35,7 +33,7 @@ //! let json_response_string: String = &serde_json::to_string(&agg_res)?; //! ``` //! -//! # Supported Aggregations +//! ## Supported Aggregations //! - [Bucket](bucket) //! - [Histogram](bucket::HistogramAggregation) //! - [Range](bucket::RangeAggregation) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 7a43f9ed4..cf65cb169 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -7,16 +7,15 @@ //! It is designed for the fast random access of some document //! fields given a document id. //! -//! `FastField` are useful when a field is required for all or most of -//! the `DocSet` : for instance for scoring, grouping, filtering, or faceting. +//! Fast fields are useful when a field is required for all or most of +//! the `DocSet`: for instance for scoring, grouping, aggregation, filtering, or faceting. //! //! -//! Fields have to be declared as `FAST` in the schema. -//! Currently supported fields are: u64, i64, f64 and bytes. +//! 
Fields have to be declared as `FAST` in the schema. +//! Currently supported fields are: u64, i64, f64, bytes and text. //! -//! u64, i64 and f64 fields are stored in a bit-packed fashion so that -//! their memory usage is directly linear with the amplitude of the -//! values stored. +//! Fast fields are stored with [different codecs](fastfield_codecs). The best codec is detected +//! automatically, when serializing. //! //! Read access performance is comparable to that of an array lookup. diff --git a/src/schema/flags.rs b/src/schema/flags.rs index 3fa73e7a6..7c7238908 100644 --- a/src/schema/flags.rs +++ b/src/schema/flags.rs @@ -37,6 +37,8 @@ pub struct FastFlag; /// /// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering /// or collection should be mark as fast fields. +/// +/// See [fast fields](`crate::fastfield`). pub const FAST: SchemaFlagList = SchemaFlagList { head: FastFlag, tail: (),