Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-04 00:02:55 +00:00
chore!: drop JSON support on intermediate agg result (#1992)
* chore!: drop JSON support on intermediate agg result

  Add support for other formats by removing skip_serialize and untagged.
  JSON support is broken anyway due to its lack of f64::INF etc. handling.

* Update src/aggregation/intermediate_agg_result.rs

  Co-authored-by: Paul Masurel <paul@quickwit.io>

* move from impl

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
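A minimal sketch of the f64::INF problem the message refers to (not part of the commit; only the serde_json crate is assumed):

    // serde_json has no representation for non-finite floats: it serializes
    // them as `null`, so an intermediate result containing f64::INFINITY is
    // silently corrupted and can no longer be deserialized as an f64.
    fn main() {
        let json = serde_json::to_string(&f64::INFINITY).unwrap();
        assert_eq!(json, "null"); // the value is lost on the way out
        assert!(serde_json::from_str::<f64>(&json).is_err()); // and on the way back
    }
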
@@ -4,7 +4,6 @@ use crate::aggregation::agg_req::{Aggregation, Aggregations};
 use crate::aggregation::agg_result::AggregationResults;
 use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
 use crate::aggregation::collector::AggregationCollector;
-use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
 use crate::aggregation::segment_agg_result::AggregationLimits;
 use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
 use crate::aggregation::DistributedAggregationCollector;

@@ -421,9 +420,6 @@ fn test_aggregation_level2(
 
         let searcher = reader.searcher();
         let res = searcher.search(&term_query, &collector).unwrap();
-        // Test de/serialization roundtrip on intermediate_agg_result
-        let res: IntermediateAggregationResults =
-            serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
         res.into_final_result(agg_req.clone(), &Default::default())
             .unwrap()
     } else {

@@ -166,7 +166,7 @@ impl SegmentRangeBucketEntry {
         };
 
         Ok(IntermediateRangeBucketEntry {
-            key: self.key,
+            key: self.key.into(),
             doc_count: self.doc_count,
             sub_aggregation: sub_aggregation_res,
             from: self.from,

@@ -9,14 +9,14 @@ use crate::aggregation::agg_limits::MemoryConsumption;
 use crate::aggregation::agg_req_with_accessor::{
     AggregationWithAccessor, AggregationsWithAccessor,
 };
+use crate::aggregation::f64_from_fastfield_u64;
 use crate::aggregation::intermediate_agg_result::{
     IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
-    IntermediateTermBucketEntry, IntermediateTermBucketResult,
+    IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
 };
 use crate::aggregation::segment_agg_result::{
     build_segment_agg_collector, SegmentAggregationCollector,
 };
-use crate::aggregation::{f64_from_fastfield_u64, Key};
 use crate::error::DataCorruption;
 use crate::TantivyError;
 
@@ -30,10 +30,6 @@ use crate::TantivyError;
 /// Term aggregations work only on [fast fields](`crate::fastfield`) of type `u64`, `f64`, `i64` and
 /// text.
 ///
-/// ### Terminology
-/// Shard parameters are supposed to be equivalent to elasticsearch shard parameter.
-/// Since they are
-///
 /// ## Document count error
 /// To improve performance, results from one segment are cut off at `segment_size`. On a index with
 /// a single segment this is fine. When combining results of multiple segments, terms that

@@ -402,7 +398,7 @@ impl SegmentTermCollector {
             cut_off_buckets(&mut entries, self.req.segment_size as usize)
         };
 
-        let mut dict: FxHashMap<Key, IntermediateTermBucketEntry> = Default::default();
+        let mut dict: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> = Default::default();
         dict.reserve(entries.len());
 
         let mut into_intermediate_bucket_entry =

@@ -453,7 +449,7 @@ impl SegmentTermCollector {
 
             let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
 
-            dict.insert(Key::Str(buffer.to_string()), intermediate_entry);
+            dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
         }
         if self.req.min_doc_count == 0 {
             // TODO: Handle rev streaming for descending sorting by keys

@@ -463,7 +459,7 @@ impl SegmentTermCollector {
                     break;
                 }
 
-                let key = Key::Str(
+                let key = IntermediateKey::Str(
                     std::str::from_utf8(key)
                         .map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?
                         .to_string(),

@@ -475,7 +471,7 @@ impl SegmentTermCollector {
             for (val, doc_count) in entries {
                 let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
                 let val = f64_from_fastfield_u64(val, &self.field_type);
-                dict.insert(Key::F64(val), intermediate_entry);
+                dict.insert(IntermediateKey::F64(val), intermediate_entry);
             }
         };
 
@@ -3,12 +3,12 @@
 //! indices.
 
 use std::cmp::Ordering;
+use std::hash::Hash;
 
 use columnar::ColumnType;
 use itertools::Itertools;
 use rustc_hash::FxHashMap;
-use serde::ser::SerializeSeq;
-use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use serde::{Deserialize, Serialize};
 
 use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
 use super::agg_result::{AggregationResult, BucketResult, MetricResult, RangeBucketEntry};

@@ -29,11 +29,52 @@ use crate::TantivyError;
 
 /// Contains the intermediate aggregation result, which is optimized to be merged with other
 /// intermediate results.
+///
+/// Notice: This struct should not be de/serialized via JSON format.
 #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateAggregationResults {
     pub(crate) aggs_res: VecWithNames<IntermediateAggregationResult>,
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd, PartialEq)]
+/// The key to identify a bucket.
+/// This might seem redundant with `Key`, but the point is to have a different
+/// Serialize implementation.
+pub enum IntermediateKey {
+    /// String key
+    Str(String),
+    /// `f64` key
+    F64(f64),
+}
+impl From<Key> for IntermediateKey {
+    fn from(value: Key) -> Self {
+        match value {
+            Key::Str(s) => Self::Str(s),
+            Key::F64(f) => Self::F64(f),
+        }
+    }
+}
+impl From<IntermediateKey> for Key {
+    fn from(value: IntermediateKey) -> Self {
+        match value {
+            IntermediateKey::Str(s) => Self::Str(s),
+            IntermediateKey::F64(f) => Self::F64(f),
+        }
+    }
+}
+
+impl Eq for IntermediateKey {}
+
+impl std::hash::Hash for IntermediateKey {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        core::mem::discriminant(self).hash(state);
+        match self {
+            IntermediateKey::Str(text) => text.hash(state),
+            IntermediateKey::F64(val) => val.to_bits().hash(state),
+        }
+    }
+}
+
 impl IntermediateAggregationResults {
     /// Add a result
     pub fn push(&mut self, key: String, value: IntermediateAggregationResult) {

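A note on the manual Eq/Hash impl added above: `f64` is only PartialEq (NaN != NaN), so neither trait can be derived once the enum carries an F64 variant. A minimal standalone sketch of the same to_bits() trick, independent of tantivy:

    use std::collections::HashMap;

    // Hashing the IEEE 754 bit pattern makes a float usable as a map key:
    // equal finite values share the same bits and land in the same bucket.
    fn main() {
        let mut counts: HashMap<u64, u64> = HashMap::new();
        *counts.entry(5.0_f64.to_bits()).or_insert(0) += 1;
        *counts.entry((2.5_f64 + 2.5).to_bits()).or_insert(0) += 1;
        assert_eq!(counts[&5.0_f64.to_bits()], 2);
    }
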
@@ -387,7 +428,7 @@ impl IntermediateBucketResult {
                 IntermediateBucketResult::Terms(term_res_left),
                 IntermediateBucketResult::Terms(term_res_right),
             ) => {
-                merge_key_maps(&mut term_res_left.entries, term_res_right.entries)?;
+                merge_maps(&mut term_res_left.entries, term_res_right.entries)?;
                 term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
                 term_res_left.doc_count_error_upper_bound +=
                     term_res_right.doc_count_error_upper_bound;

@@ -397,7 +438,7 @@ impl IntermediateBucketResult {
                 IntermediateBucketResult::Range(range_res_left),
                 IntermediateBucketResult::Range(range_res_right),
             ) => {
-                merge_serialized_key_maps(&mut range_res_left.buckets, range_res_right.buckets)?;
+                merge_maps(&mut range_res_left.buckets, range_res_right.buckets)?;
             }
             (
                 IntermediateBucketResult::Histogram {

@@ -451,39 +492,11 @@ pub struct IntermediateRangeBucketResult {
 #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
 /// Term aggregation including error counts
 pub struct IntermediateTermBucketResult {
-    #[serde(
-        serialize_with = "serialize_entries",
-        deserialize_with = "deserialize_entries"
-    )]
-    pub(crate) entries: FxHashMap<Key, IntermediateTermBucketEntry>,
+    pub(crate) entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry>,
     pub(crate) sum_other_doc_count: u64,
     pub(crate) doc_count_error_upper_bound: u64,
 }
 
-// Serialize into a Vec to circument the JSON limitation, where keys can't be numbers
-fn serialize_entries<S>(
-    entries: &FxHashMap<Key, IntermediateTermBucketEntry>,
-    serializer: S,
-) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-{
-    let mut seq = serializer.serialize_seq(Some(entries.len()))?;
-    for (k, v) in entries {
-        seq.serialize_element(&(k, v))?;
-    }
-    seq.end()
-}
-
-fn deserialize_entries<'de, D>(
-    deserializer: D,
-) -> Result<FxHashMap<Key, IntermediateTermBucketEntry>, D::Error>
-where D: Deserializer<'de> {
-    let vec_entries: Vec<(Key, IntermediateTermBucketEntry)> =
-        Deserialize::deserialize(deserializer)?;
-    Ok(vec_entries.into_iter().collect())
-}
-
 impl IntermediateTermBucketResult {
     pub(crate) fn into_final_result(
         self,

@@ -499,7 +512,7 @@ impl IntermediateTermBucketResult {
             .map(|(key, entry)| {
                 Ok(BucketEntry {
                     key_as_string: None,
-                    key,
+                    key: key.into(),
                     doc_count: entry.doc_count,
                     sub_aggregation: entry
                         .sub_aggregation

@@ -577,25 +590,9 @@ trait MergeFruits {
     fn merge_fruits(&mut self, other: Self) -> crate::Result<()>;
 }
 
-fn merge_serialized_key_maps<V: MergeFruits + Clone>(
-    entries_left: &mut FxHashMap<SerializedKey, V>,
-    mut entries_right: FxHashMap<SerializedKey, V>,
-) -> crate::Result<()> {
-    for (name, entry_left) in entries_left.iter_mut() {
-        if let Some(entry_right) = entries_right.remove(name) {
-            entry_left.merge_fruits(entry_right)?;
-        }
-    }
-
-    for (key, res) in entries_right.into_iter() {
-        entries_left.entry(key).or_insert(res);
-    }
-    Ok(())
-}
-
-fn merge_key_maps<V: MergeFruits + Clone>(
-    entries_left: &mut FxHashMap<Key, V>,
-    mut entries_right: FxHashMap<Key, V>,
+fn merge_maps<V: MergeFruits + Clone, T: Eq + PartialEq + Hash>(
+    entries_left: &mut FxHashMap<T, V>,
+    mut entries_right: FxHashMap<T, V>,
 ) -> crate::Result<()> {
     for (name, entry_left) in entries_left.iter_mut() {
         if let Some(entry_right) = entries_right.remove(name) {

@@ -652,17 +649,15 @@ impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
 /// sub_aggregations.
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateRangeBucketEntry {
-    /// The unique the bucket is identified.
-    pub key: Key,
+    /// The unique key the bucket is identified with.
+    pub key: IntermediateKey,
     /// The number of documents in the bucket.
     pub doc_count: u64,
     /// The sub_aggregation in this bucket.
     pub sub_aggregation: IntermediateAggregationResults,
     /// The from range of the bucket. Equals `f64::MIN` when `None`.
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub from: Option<f64>,
     /// The to range of the bucket. Equals `f64::MAX` when `None`.
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub to: Option<f64>,
 }
 
@@ -675,7 +670,7 @@ impl IntermediateRangeBucketEntry {
         limits: &AggregationLimits,
     ) -> crate::Result<RangeBucketEntry> {
         let mut range_bucket_entry = RangeBucketEntry {
-            key: self.key,
+            key: self.key.into(),
             doc_count: self.doc_count,
             sub_aggregation: self
                 .sub_aggregation

@@ -752,7 +747,7 @@ mod tests {
         buckets.insert(
             key.to_string(),
             IntermediateRangeBucketEntry {
-                key: Key::Str(key.to_string()),
+                key: IntermediateKey::Str(key.to_string()),
                 doc_count: *doc_count,
                 sub_aggregation: Default::default(),
                 from: None,

@@ -783,7 +778,7 @@ mod tests {
         buckets.insert(
             key.to_string(),
             IntermediateRangeBucketEntry {
-                key: Key::Str(key.to_string()),
+                key: IntermediateKey::Str(key.to_string()),
                 doc_count: *doc_count,
                 from: None,
                 to: None,

@@ -866,26 +861,4 @@ mod tests {
 
         assert_eq!(tree_left, orig);
     }
-
-    #[test]
-    fn test_term_bucket_json_roundtrip() {
-        let term_buckets = IntermediateTermBucketResult {
-            entries: vec![(
-                Key::F64(5.0),
-                IntermediateTermBucketEntry {
-                    doc_count: 10,
-                    sub_aggregation: Default::default(),
-                },
-            )]
-            .into_iter()
-            .collect(),
-            sum_other_doc_count: 0,
-            doc_count_error_upper_bound: 0,
-        };
-
-        let term_buckets_round: IntermediateTermBucketResult =
-            serde_json::from_str(&serde_json::to_string(&term_buckets).unwrap()).unwrap();
-
-        assert_eq!(term_buckets, term_buckets_round);
-    }
 }

@@ -24,6 +24,10 @@
 //! ## JSON Format
 //! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
 //!
+//! Notice: Intermediate aggregation results should not be de/serialized via JSON format.
+//! See compatibility tests here: https://github.com/PSeitz/test_serde_formats
+//! TLDR: use ciborium.
+//!
 //! ```verbatim
 //! let agg_req: Aggregations = serde_json::from_str(json_request_string).unwrap();
 //! let collector = AggregationCollector::from_aggs(agg_req, None);

@@ -151,6 +155,8 @@ pub use error::AggregationError;
 use itertools::Itertools;
 use serde::{Deserialize, Serialize};
 
+use self::intermediate_agg_result::IntermediateKey;
+
 /// Represents an associative array `(key => values)` in a very efficient manner.
 #[derive(Clone, PartialEq, Serialize, Deserialize)]
 pub(crate) struct VecWithNames<T: Clone> {

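For the "TLDR: use ciborium" note in the module docs above, a minimal sketch of the recommended roundtrip (assuming the ciborium crate; not part of this commit):

    // CBOR has native encodings for +/-infinity and NaN, so the values that
    // break the JSON roundtrip survive a CBOR roundtrip unchanged.
    fn main() {
        let mut bytes = Vec::new();
        ciborium::ser::into_writer(&f64::INFINITY, &mut bytes).unwrap();
        let back: f64 = ciborium::de::from_reader(bytes.as_slice()).unwrap();
        assert!(back.is_infinite() && back.is_sign_positive());
    }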