Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-04 00:02:55 +00:00
chore!: drop JSON support on intermediate agg result (#1992)
* chore!: drop JSON support on intermediate agg result

  Add support for other formats by removing skip_serialize and untagged.
  JSON support is broken anyway due to its lack of f64::INF etc. handling.

* Update src/aggregation/intermediate_agg_result.rs

  Co-authored-by: Paul Masurel <paul@quickwit.io>

* move from impl

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
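A minimal sketch of the f64::INF problem the message refers to (not part of the commit; only the serde_json crate is assumed):

    // serde_json has no representation for non-finite floats: it serializes
    // them as `null`, so an intermediate result containing f64::INFINITY is
    // silently corrupted and can no longer be deserialized as an f64.
    fn main() {
        let json = serde_json::to_string(&f64::INFINITY).unwrap();
        assert_eq!(json, "null"); // the value is lost on the way out
        assert!(serde_json::from_str::<f64>(&json).is_err()); // and on the way back
    }
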
@@ -4,7 +4,6 @@ use crate::aggregation::agg_req::{Aggregation, Aggregations};
 use crate::aggregation::agg_result::AggregationResults;
 use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
 use crate::aggregation::collector::AggregationCollector;
-use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
 use crate::aggregation::segment_agg_result::AggregationLimits;
 use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
 use crate::aggregation::DistributedAggregationCollector;

@@ -421,9 +420,6 @@ fn test_aggregation_level2(
 
         let searcher = reader.searcher();
         let res = searcher.search(&term_query, &collector).unwrap();
-        // Test de/serialization roundtrip on intermediate_agg_result
-        let res: IntermediateAggregationResults =
-            serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
         res.into_final_result(agg_req.clone(), &Default::default())
             .unwrap()
     } else {

@@ -166,7 +166,7 @@ impl SegmentRangeBucketEntry {
         };
 
         Ok(IntermediateRangeBucketEntry {
-            key: self.key,
+            key: self.key.into(),
             doc_count: self.doc_count,
             sub_aggregation: sub_aggregation_res,
             from: self.from,

@@ -9,14 +9,14 @@ use crate::aggregation::agg_limits::MemoryConsumption;
 use crate::aggregation::agg_req_with_accessor::{
     AggregationWithAccessor, AggregationsWithAccessor,
 };
+use crate::aggregation::f64_from_fastfield_u64;
 use crate::aggregation::intermediate_agg_result::{
     IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
-    IntermediateTermBucketEntry, IntermediateTermBucketResult,
+    IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
 };
 use crate::aggregation::segment_agg_result::{
     build_segment_agg_collector, SegmentAggregationCollector,
 };
-use crate::aggregation::{f64_from_fastfield_u64, Key};
 use crate::error::DataCorruption;
 use crate::TantivyError;
 
@@ -30,10 +30,6 @@ use crate::TantivyError;
 /// Term aggregations work only on [fast fields](`crate::fastfield`) of type `u64`, `f64`, `i64` and
 /// text.
 ///
-/// ### Terminology
-/// Shard parameters are supposed to be equivalent to elasticsearch shard parameter.
-/// Since they are
-///
 /// ## Document count error
 /// To improve performance, results from one segment are cut off at `segment_size`. On a index with
 /// a single segment this is fine. When combining results of multiple segments, terms that

@@ -402,7 +398,7 @@ impl SegmentTermCollector {
             cut_off_buckets(&mut entries, self.req.segment_size as usize)
         };
 
-        let mut dict: FxHashMap<Key, IntermediateTermBucketEntry> = Default::default();
+        let mut dict: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> = Default::default();
         dict.reserve(entries.len());
 
         let mut into_intermediate_bucket_entry =

@@ -453,7 +449,7 @@ impl SegmentTermCollector {
 
             let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
 
-            dict.insert(Key::Str(buffer.to_string()), intermediate_entry);
+            dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
         }
         if self.req.min_doc_count == 0 {
             // TODO: Handle rev streaming for descending sorting by keys

@@ -463,7 +459,7 @@ impl SegmentTermCollector {
                     break;
                 }
 
-                let key = Key::Str(
+                let key = IntermediateKey::Str(
                     std::str::from_utf8(key)
                         .map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?
                         .to_string(),

@@ -475,7 +471,7 @@ impl SegmentTermCollector {
             for (val, doc_count) in entries {
                 let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
                 let val = f64_from_fastfield_u64(val, &self.field_type);
-                dict.insert(Key::F64(val), intermediate_entry);
+                dict.insert(IntermediateKey::F64(val), intermediate_entry);
             }
         };
 
@@ -3,12 +3,12 @@
 //! indices.
 
 use std::cmp::Ordering;
+use std::hash::Hash;
 
 use columnar::ColumnType;
 use itertools::Itertools;
 use rustc_hash::FxHashMap;
-use serde::ser::SerializeSeq;
-use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use serde::{Deserialize, Serialize};
 
 use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
 use super::agg_result::{AggregationResult, BucketResult, MetricResult, RangeBucketEntry};

@@ -29,11 +29,52 @@ use crate::TantivyError;
 
 /// Contains the intermediate aggregation result, which is optimized to be merged with other
 /// intermediate results.
+///
+/// Notice: This struct should not be de/serialized via JSON format.
 #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateAggregationResults {
     pub(crate) aggs_res: VecWithNames<IntermediateAggregationResult>,
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd, PartialEq)]
+/// The key to identify a bucket.
+/// This might seem redundant with `Key`, but the point is to have a different
+/// Serialize implementation.
+pub enum IntermediateKey {
+    /// String key
+    Str(String),
+    /// `f64` key
+    F64(f64),
+}
+impl From<Key> for IntermediateKey {
+    fn from(value: Key) -> Self {
+        match value {
+            Key::Str(s) => Self::Str(s),
+            Key::F64(f) => Self::F64(f),
+        }
+    }
+}
+impl From<IntermediateKey> for Key {
+    fn from(value: IntermediateKey) -> Self {
+        match value {
+            IntermediateKey::Str(s) => Self::Str(s),
+            IntermediateKey::F64(f) => Self::F64(f),
+        }
+    }
+}
+
+impl Eq for IntermediateKey {}
+
+impl std::hash::Hash for IntermediateKey {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        core::mem::discriminant(self).hash(state);
+        match self {
+            IntermediateKey::Str(text) => text.hash(state),
+            IntermediateKey::F64(val) => val.to_bits().hash(state),
+        }
+    }
+}
+
 impl IntermediateAggregationResults {
     /// Add a result
     pub fn push(&mut self, key: String, value: IntermediateAggregationResult) {

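A note on the manual Eq/Hash impl added above: `f64` is only PartialEq (NaN != NaN), so neither trait can be derived once the enum carries an F64 variant. A minimal standalone sketch of the same to_bits() trick, independent of tantivy:

    use std::collections::HashMap;

    // Hashing the IEEE 754 bit pattern makes a float usable as a map key:
    // equal finite values share the same bits and land in the same bucket.
    fn main() {
        let mut counts: HashMap<u64, u64> = HashMap::new();
        *counts.entry(5.0_f64.to_bits()).or_insert(0) += 1;
        *counts.entry((2.5_f64 + 2.5).to_bits()).or_insert(0) += 1;
        assert_eq!(counts[&5.0_f64.to_bits()], 2);
    }
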
@@ -387,7 +428,7 @@ impl IntermediateBucketResult {
                 IntermediateBucketResult::Terms(term_res_left),
                 IntermediateBucketResult::Terms(term_res_right),
             ) => {
-                merge_key_maps(&mut term_res_left.entries, term_res_right.entries)?;
+                merge_maps(&mut term_res_left.entries, term_res_right.entries)?;
                 term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
                 term_res_left.doc_count_error_upper_bound +=
                     term_res_right.doc_count_error_upper_bound;

@@ -397,7 +438,7 @@ impl IntermediateBucketResult {
                 IntermediateBucketResult::Range(range_res_left),
                 IntermediateBucketResult::Range(range_res_right),
             ) => {
-                merge_serialized_key_maps(&mut range_res_left.buckets, range_res_right.buckets)?;
+                merge_maps(&mut range_res_left.buckets, range_res_right.buckets)?;
             }
             (
                 IntermediateBucketResult::Histogram {

@@ -451,39 +492,11 @@ pub struct IntermediateRangeBucketResult {
 #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
 /// Term aggregation including error counts
 pub struct IntermediateTermBucketResult {
-    #[serde(
-        serialize_with = "serialize_entries",
-        deserialize_with = "deserialize_entries"
-    )]
-    pub(crate) entries: FxHashMap<Key, IntermediateTermBucketEntry>,
+    pub(crate) entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry>,
     pub(crate) sum_other_doc_count: u64,
     pub(crate) doc_count_error_upper_bound: u64,
 }
 
-// Serialize into a Vec to circument the JSON limitation, where keys can't be numbers
-fn serialize_entries<S>(
-    entries: &FxHashMap<Key, IntermediateTermBucketEntry>,
-    serializer: S,
-) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-{
-    let mut seq = serializer.serialize_seq(Some(entries.len()))?;
-    for (k, v) in entries {
-        seq.serialize_element(&(k, v))?;
-    }
-    seq.end()
-}
-
-fn deserialize_entries<'de, D>(
-    deserializer: D,
-) -> Result<FxHashMap<Key, IntermediateTermBucketEntry>, D::Error>
-where D: Deserializer<'de> {
-    let vec_entries: Vec<(Key, IntermediateTermBucketEntry)> =
-        Deserialize::deserialize(deserializer)?;
-    Ok(vec_entries.into_iter().collect())
-}
-
 impl IntermediateTermBucketResult {
     pub(crate) fn into_final_result(
         self,

@@ -499,7 +512,7 @@ impl IntermediateTermBucketResult {
             .map(|(key, entry)| {
                 Ok(BucketEntry {
                     key_as_string: None,
-                    key,
+                    key: key.into(),
                     doc_count: entry.doc_count,
                     sub_aggregation: entry
                         .sub_aggregation

@@ -577,25 +590,9 @@ trait MergeFruits {
     fn merge_fruits(&mut self, other: Self) -> crate::Result<()>;
 }
 
-fn merge_serialized_key_maps<V: MergeFruits + Clone>(
-    entries_left: &mut FxHashMap<SerializedKey, V>,
-    mut entries_right: FxHashMap<SerializedKey, V>,
-) -> crate::Result<()> {
-    for (name, entry_left) in entries_left.iter_mut() {
-        if let Some(entry_right) = entries_right.remove(name) {
-            entry_left.merge_fruits(entry_right)?;
-        }
-    }
-
-    for (key, res) in entries_right.into_iter() {
-        entries_left.entry(key).or_insert(res);
-    }
-    Ok(())
-}
-
-fn merge_key_maps<V: MergeFruits + Clone>(
-    entries_left: &mut FxHashMap<Key, V>,
-    mut entries_right: FxHashMap<Key, V>,
+fn merge_maps<V: MergeFruits + Clone, T: Eq + PartialEq + Hash>(
+    entries_left: &mut FxHashMap<T, V>,
+    mut entries_right: FxHashMap<T, V>,
 ) -> crate::Result<()> {
     for (name, entry_left) in entries_left.iter_mut() {
         if let Some(entry_right) = entries_right.remove(name) {

@@ -652,17 +649,15 @@ impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
 /// sub_aggregations.
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateRangeBucketEntry {
-    /// The unique the bucket is identified.
-    pub key: Key,
+    /// The unique key the bucket is identified with.
+    pub key: IntermediateKey,
     /// The number of documents in the bucket.
     pub doc_count: u64,
     /// The sub_aggregation in this bucket.
     pub sub_aggregation: IntermediateAggregationResults,
     /// The from range of the bucket. Equals `f64::MIN` when `None`.
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub from: Option<f64>,
     /// The to range of the bucket. Equals `f64::MAX` when `None`.
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub to: Option<f64>,
 }
 
@@ -675,7 +670,7 @@ impl IntermediateRangeBucketEntry {
         limits: &AggregationLimits,
     ) -> crate::Result<RangeBucketEntry> {
         let mut range_bucket_entry = RangeBucketEntry {
-            key: self.key,
+            key: self.key.into(),
             doc_count: self.doc_count,
             sub_aggregation: self
                 .sub_aggregation

@@ -752,7 +747,7 @@ mod tests {
         buckets.insert(
             key.to_string(),
             IntermediateRangeBucketEntry {
-                key: Key::Str(key.to_string()),
+                key: IntermediateKey::Str(key.to_string()),
                 doc_count: *doc_count,
                 sub_aggregation: Default::default(),
                 from: None,

@@ -783,7 +778,7 @@ mod tests {
         buckets.insert(
             key.to_string(),
             IntermediateRangeBucketEntry {
-                key: Key::Str(key.to_string()),
+                key: IntermediateKey::Str(key.to_string()),
                 doc_count: *doc_count,
                 from: None,
                 to: None,

@@ -866,26 +861,4 @@ mod tests {
 
         assert_eq!(tree_left, orig);
     }
-
-    #[test]
-    fn test_term_bucket_json_roundtrip() {
-        let term_buckets = IntermediateTermBucketResult {
-            entries: vec![(
-                Key::F64(5.0),
-                IntermediateTermBucketEntry {
-                    doc_count: 10,
-                    sub_aggregation: Default::default(),
-                },
-            )]
-            .into_iter()
-            .collect(),
-            sum_other_doc_count: 0,
-            doc_count_error_upper_bound: 0,
-        };
-
-        let term_buckets_round: IntermediateTermBucketResult =
-            serde_json::from_str(&serde_json::to_string(&term_buckets).unwrap()).unwrap();
-
-        assert_eq!(term_buckets, term_buckets_round);
-    }
 }

@@ -24,6 +24,10 @@
 //! ## JSON Format
 //! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
 //!
+//! Notice: Intermediate aggregation results should not be de/serialized via JSON format.
+//! See compatibility tests here: https://github.com/PSeitz/test_serde_formats
+//! TLDR: use ciborium.
+//!
 //! ```verbatim
 //! let agg_req: Aggregations = serde_json::from_str(json_request_string).unwrap();
 //! let collector = AggregationCollector::from_aggs(agg_req, None);

@@ -151,6 +155,8 @@ pub use error::AggregationError;
 use itertools::Itertools;
 use serde::{Deserialize, Serialize};
 
+use self::intermediate_agg_result::IntermediateKey;
+
 /// Represents an associative array `(key => values)` in a very efficient manner.
 #[derive(Clone, PartialEq, Serialize, Deserialize)]
 pub(crate) struct VecWithNames<T: Clone> {

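For the "TLDR: use ciborium" note in the module docs above, a minimal sketch of the recommended roundtrip (assuming the ciborium crate; not part of this commit):

    // CBOR has native encodings for +/-infinity and NaN, so the values that
    // break the JSON roundtrip survive a CBOR roundtrip unchanged.
    fn main() {
        let mut bytes = Vec::new();
        ciborium::ser::into_writer(&f64::INFINITY, &mut bytes).unwrap();
        let back: f64 = ciborium::de::from_reader(bytes.as_slice()).unwrap();
        assert!(back.is_infinite() && back.is_sign_positive());
    }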