mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-22 18:19:58 +00:00
fix merge intermediate aggregation results (#2719)
Previously the merging relied on the order of the results, which is invalid since https://github.com/quickwit-oss/tantivy/pull/2035. This bug is only hit in specific scenarios, when the aggregation collectors are built in a different order on different segments. Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>
This commit is contained in:
@@ -781,13 +781,20 @@ mod tests {
|
||||
use time::{Date, Month};
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults,
|
||||
};
|
||||
use crate::aggregation::tests::{
|
||||
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
|
||||
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
||||
};
|
||||
use crate::aggregation::AggregationLimitsGuard;
|
||||
use crate::aggregation::{AggregationLimitsGuard, DistributedAggregationCollector};
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{
|
||||
IndexRecordOption, IntoIpv6Addr, Schema, TextFieldIndexing, TextOptions, FAST, STORED,
|
||||
STRING, TEXT,
|
||||
};
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
#[test]
|
||||
@@ -2274,4 +2281,127 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggs_hosts_and_tags_merge_on_mixed_order_request() -> crate::Result<()> {
|
||||
// This test ensures that merging of aggregation results works correctly
|
||||
// even if the order of the aggregation requests is different and
|
||||
// running on different indexes with the same data.
|
||||
let build_index = || -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let host = schema_builder.add_text_field("host", FAST);
|
||||
let tags = schema_builder.add_text_field("tags", FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer(50_000_000).unwrap();
|
||||
|
||||
// --- Ingest documents (batch #1) ---
|
||||
writer.add_document(doc!(
|
||||
host => "192.168.0.10",
|
||||
tags => "nice",
|
||||
))?;
|
||||
writer.add_document(doc!(
|
||||
host => "192.168.0.1",
|
||||
tags => "nice",
|
||||
))?;
|
||||
writer.add_document(doc!(
|
||||
host => "192.168.0.11",
|
||||
tags => "nice",
|
||||
))?;
|
||||
writer.add_document(doc!(
|
||||
host => "192.168.0.10",
|
||||
tags => "nice",
|
||||
tags => "cool",
|
||||
))?;
|
||||
writer.add_document(doc!(
|
||||
host => "192.168.0.1",
|
||||
tags => "nice",
|
||||
tags => "cool",
|
||||
))?;
|
||||
|
||||
writer.commit()?;
|
||||
|
||||
// --- Ingest documents (batch #2) ---
|
||||
writer.add_document(doc!())?;
|
||||
writer.add_document(doc!())?;
|
||||
writer.add_document(doc!(
|
||||
host => "192.168.0.10",
|
||||
))?;
|
||||
writer.add_document(doc!(
|
||||
host => "192.168.0.10",
|
||||
))?;
|
||||
writer.add_document(doc!())?;
|
||||
|
||||
writer.commit()?;
|
||||
Ok(index)
|
||||
};
|
||||
let index = build_index()?;
|
||||
let index2 = build_index()?;
|
||||
|
||||
let search = |idx: &Index,
|
||||
agg_req: &Aggregations|
|
||||
-> crate::Result<IntermediateAggregationResults> {
|
||||
let collector = DistributedAggregationCollector::from_aggs(
|
||||
agg_req.clone(),
|
||||
AggregationLimitsGuard::default(),
|
||||
);
|
||||
let reader = idx.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = searcher.search(&AllQuery, &collector)?;
|
||||
Ok(agg_res)
|
||||
};
|
||||
|
||||
// --- Aggregations: terms on host and tags ---
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"hosts": { "terms": { "field": "host" } },
|
||||
"tags": { "terms": { "field": "tags" } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let mut agg_res = search(&index, &agg_req)?;
|
||||
|
||||
// --- Aggregations: terms on host and tags ---
|
||||
let mut agg_req2: Aggregations = Aggregations::with_capacity(20);
|
||||
agg_req2.insert(
|
||||
"tags".to_string(),
|
||||
serde_json::from_value(json!({ "terms": { "field": "tags" } }))?,
|
||||
);
|
||||
agg_req2.insert(
|
||||
"hosts".to_string(),
|
||||
serde_json::from_value(json!({ "terms": { "field": "host" } }))?,
|
||||
);
|
||||
// make sure the order of the aggregation request is different
|
||||
assert_ne!(agg_req.keys().next(), agg_req2.keys().next());
|
||||
|
||||
let agg_res2 = search(&index2, &agg_req2)?;
|
||||
|
||||
agg_res.merge_fruits(agg_res2).unwrap();
|
||||
let agg_json = serde_json::to_value(
|
||||
&agg_res.into_final_result(agg_req2, AggregationLimitsGuard::default())?,
|
||||
)?;
|
||||
|
||||
// hosts:
|
||||
let hosts = &agg_json["hosts"]["buckets"];
|
||||
assert_eq!(hosts[0]["key"], "192.168.0.10");
|
||||
assert_eq!(hosts[0]["doc_count"], 8);
|
||||
assert_eq!(hosts[1]["key"], "192.168.0.1");
|
||||
assert_eq!(hosts[1]["doc_count"], 4);
|
||||
assert_eq!(hosts[2]["key"], "192.168.0.11");
|
||||
assert_eq!(hosts[2]["doc_count"], 2);
|
||||
// Implementation currently reports error bounds/other count; ensure zero.
|
||||
assert_eq!(agg_json["hosts"]["doc_count_error_upper_bound"], 0);
|
||||
assert_eq!(agg_json["hosts"]["sum_other_doc_count"], 0);
|
||||
|
||||
// tags:
|
||||
let tags_buckets = &agg_json["tags"]["buckets"];
|
||||
assert_eq!(tags_buckets[0]["key"], "nice");
|
||||
assert_eq!(tags_buckets[0]["doc_count"], 10);
|
||||
assert_eq!(tags_buckets[1]["key"], "cool");
|
||||
assert_eq!(tags_buckets[1]["doc_count"], 4);
|
||||
assert_eq!(agg_json["tags"]["doc_count_error_upper_bound"], 0);
|
||||
assert_eq!(agg_json["tags"]["sum_other_doc_count"], 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,12 +179,17 @@ impl IntermediateAggregationResults {
|
||||
}
|
||||
|
||||
/// Merge another intermediate aggregation result into this result.
|
||||
///
|
||||
/// The order of the values need to be the same on both results. This is ensured when the same
|
||||
/// (key values) are present on the underlying `VecWithNames` struct.
|
||||
pub fn merge_fruits(&mut self, other: IntermediateAggregationResults) -> crate::Result<()> {
|
||||
for (left, right) in self.aggs_res.values_mut().zip(other.aggs_res.into_values()) {
|
||||
left.merge_fruits(right)?;
|
||||
pub fn merge_fruits(&mut self, mut other: IntermediateAggregationResults) -> crate::Result<()> {
|
||||
for (key, left) in self.aggs_res.iter_mut() {
|
||||
if let Some(key) = other.aggs_res.remove(key) {
|
||||
left.merge_fruits(key)?;
|
||||
}
|
||||
}
|
||||
// Move remainder of other aggs_res into self.
|
||||
// Note: Currently we don't expect this to happen, as we create empty intermediate results
|
||||
// via [IntermediateAggregationResults::empty_from_req].
|
||||
for (key, value) in other.aggs_res {
|
||||
self.aggs_res.insert(key, value);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user