mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 10:02:55 +00:00
split subaggcache into two trait impls
This commit is contained in:
@@ -10,9 +10,9 @@ use crate::aggregation::accessor_helpers::{
|
||||
};
|
||||
use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||
use crate::aggregation::bucket::{
|
||||
build_segment_range_collector, FilterAggReqData, HistogramAggReqData, HistogramBounds,
|
||||
IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData, SegmentFilterCollector,
|
||||
SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
|
||||
build_segment_filter_collector, build_segment_range_collector, FilterAggReqData,
|
||||
HistogramAggReqData, HistogramBounds, IncludeExcludeParam, MissingTermAggReqData,
|
||||
RangeAggReqData, SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
|
||||
TermsAggregationInternal,
|
||||
};
|
||||
use crate::aggregation::metric::{
|
||||
@@ -416,9 +416,7 @@ pub(crate) fn build_segment_agg_collector(
|
||||
req, node,
|
||||
)?)),
|
||||
AggKind::Range => Ok(build_segment_range_collector(req, node)?),
|
||||
AggKind::Filter => Ok(Box::new(SegmentFilterCollector::from_req_and_validate(
|
||||
req, node,
|
||||
)?)),
|
||||
AggKind::Filter => build_segment_filter_collector(req, node),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -733,6 +731,7 @@ fn build_nodes(
|
||||
segment_reader: reader.clone(),
|
||||
evaluator,
|
||||
matching_docs_buffer,
|
||||
is_top_level,
|
||||
});
|
||||
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
||||
Ok(vec![AggRefNode {
|
||||
|
||||
@@ -6,7 +6,9 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use crate::aggregation::agg_data::{
|
||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||
};
|
||||
use crate::aggregation::cached_sub_aggs::CachedSubAggs;
|
||||
use crate::aggregation::cached_sub_aggs::{
|
||||
CachedSubAggs, HighCardSubAggCache, LowCardSubAggCache, SubAggCache,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||
};
|
||||
@@ -406,6 +408,8 @@ pub struct FilterAggReqData {
|
||||
pub evaluator: DocumentQueryEvaluator,
|
||||
/// Reusable buffer for matching documents to minimize allocations during collection
|
||||
pub matching_docs_buffer: Vec<DocId>,
|
||||
/// True if this filter aggregation is at the top level of the aggregation tree (not nested).
|
||||
pub is_top_level: bool,
|
||||
}
|
||||
|
||||
impl FilterAggReqData {
|
||||
@@ -415,6 +419,7 @@ impl FilterAggReqData {
|
||||
+ std::mem::size_of::<SegmentReader>()
|
||||
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
|
||||
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
|
||||
+ std::mem::size_of::<bool>()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -498,17 +503,17 @@ struct DocCount {
|
||||
}
|
||||
|
||||
/// Segment collector for filter aggregation
|
||||
pub struct SegmentFilterCollector {
|
||||
pub struct SegmentFilterCollector<C: SubAggCache> {
|
||||
/// Document counts per parent bucket
|
||||
parent_buckets: Vec<DocCount>,
|
||||
/// Sub-aggregation collectors
|
||||
sub_aggregations: Option<CachedSubAggs<true>>,
|
||||
sub_aggregations: Option<CachedSubAggs<C>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Accessor index for this filter aggregation (to access FilterAggReqData)
|
||||
accessor_idx: usize,
|
||||
}
|
||||
|
||||
impl SegmentFilterCollector {
|
||||
impl<C: SubAggCache> SegmentFilterCollector<C> {
|
||||
/// Create a new filter segment collector following the new agg_data pattern
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
@@ -531,7 +536,27 @@ impl SegmentFilterCollector {
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for SegmentFilterCollector {
|
||||
pub(crate) fn build_segment_filter_collector(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
let is_top_level = req.per_request.filter_req_data[node.idx_in_req_data]
|
||||
.as_ref()
|
||||
.expect("filter_req_data slot is empty")
|
||||
.is_top_level;
|
||||
|
||||
if is_top_level {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<LowCardSubAggCache>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
} else {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<HighCardSubAggCache>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: SubAggCache> Debug for SegmentFilterCollector<C> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentFilterCollector")
|
||||
.field("buckets", &self.parent_buckets)
|
||||
@@ -541,7 +566,7 @@ impl Debug for SegmentFilterCollector {
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAggregationCollector for SegmentFilterCollector {
|
||||
impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::aggregation::agg_data::{
|
||||
};
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::agg_result::BucketEntry;
|
||||
use crate::aggregation::cached_sub_aggs::CachedSubAggs;
|
||||
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||
IntermediateHistogramBucketEntry,
|
||||
@@ -258,7 +258,7 @@ pub(crate) struct SegmentHistogramBucketEntry {
|
||||
impl SegmentHistogramBucketEntry {
|
||||
pub(crate) fn into_intermediate_bucket_entry(
|
||||
self,
|
||||
sub_aggregation: &mut Option<CachedSubAggs>,
|
||||
sub_aggregation: &mut Option<HighCardCachedSubAggs>,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<IntermediateHistogramBucketEntry> {
|
||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||
@@ -291,7 +291,7 @@ pub struct SegmentHistogramCollector {
|
||||
/// The buckets containing the aggregation data.
|
||||
/// One Histogram bucket per parent bucket id.
|
||||
parent_buckets: Vec<HistogramBuckets>,
|
||||
sub_agg: Option<CachedSubAggs>,
|
||||
sub_agg: Option<HighCardCachedSubAggs>,
|
||||
accessor_idx: usize,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
}
|
||||
|
||||
@@ -9,7 +9,9 @@ use crate::aggregation::agg_data::{
|
||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||
};
|
||||
use crate::aggregation::agg_limits::AggregationLimitsGuard;
|
||||
use crate::aggregation::cached_sub_aggs::CachedSubAggs;
|
||||
use crate::aggregation::cached_sub_aggs::{
|
||||
CachedSubAggs, HighCardSubAggCache, LowCardCachedSubAggs, LowCardSubAggCache, SubAggCache,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||
IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
|
||||
@@ -153,13 +155,13 @@ pub(crate) struct SegmentRangeAndBucketEntry {
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
pub struct SegmentRangeCollector<const LOWCARD: bool = false> {
|
||||
pub struct SegmentRangeCollector<C: SubAggCache> {
|
||||
/// The buckets containing the aggregation data.
|
||||
/// One for each ParentBucketId
|
||||
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
|
||||
column_type: ColumnType,
|
||||
pub(crate) accessor_idx: usize,
|
||||
sub_agg: Option<CachedSubAggs<LOWCARD>>,
|
||||
sub_agg: Option<CachedSubAggs<C>>,
|
||||
/// Here things get a bit weird. We need to assign unique bucket ids across all
|
||||
/// parent buckets. So we keep track of the next available bucket id here.
|
||||
/// This allows a kind of flattening of the bucket ids across all parent buckets.
|
||||
@@ -176,7 +178,7 @@ pub struct SegmentRangeCollector<const LOWCARD: bool = false> {
|
||||
limits: AggregationLimitsGuard,
|
||||
}
|
||||
|
||||
impl<const LOWCARD: bool> Debug for SegmentRangeCollector<LOWCARD> {
|
||||
impl<C: SubAggCache> Debug for SegmentRangeCollector<C> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeCollector")
|
||||
.field("parent_buckets_len", &self.parent_buckets.len())
|
||||
@@ -227,7 +229,7 @@ impl SegmentRangeBucketEntry {
|
||||
}
|
||||
}
|
||||
|
||||
impl<const LOWCARD: bool> SegmentAggregationCollector for SegmentRangeCollector<LOWCARD> {
|
||||
impl<C: SubAggCache> SegmentAggregationCollector for SegmentRangeCollector<C> {
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
@@ -348,8 +350,8 @@ pub(crate) fn build_segment_range_collector(
|
||||
};
|
||||
|
||||
if is_low_card {
|
||||
Ok(Box::new(SegmentRangeCollector {
|
||||
sub_agg: sub_agg.map(CachedSubAggs::<true>::new),
|
||||
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggCache> {
|
||||
sub_agg: sub_agg.map(LowCardCachedSubAggs::new),
|
||||
column_type: field_type,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
@@ -357,8 +359,8 @@ pub(crate) fn build_segment_range_collector(
|
||||
limits: agg_data.context.limits.clone(),
|
||||
}))
|
||||
} else {
|
||||
Ok(Box::new(SegmentRangeCollector {
|
||||
sub_agg: sub_agg.map(CachedSubAggs::<false>::new),
|
||||
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggCache> {
|
||||
sub_agg: sub_agg.map(CachedSubAggs::new),
|
||||
column_type: field_type,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
@@ -368,7 +370,7 @@ pub(crate) fn build_segment_range_collector(
|
||||
}
|
||||
}
|
||||
|
||||
impl<const LOWCARD: bool> SegmentRangeCollector<LOWCARD> {
|
||||
impl<C: SubAggCache> SegmentRangeCollector<C> {
|
||||
pub(crate) fn create_new_buckets(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
@@ -552,7 +554,7 @@ mod tests {
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
field_type: ColumnType,
|
||||
) -> SegmentRangeCollector {
|
||||
) -> SegmentRangeCollector<HighCardSubAggCache> {
|
||||
let req = RangeAggregation {
|
||||
field: "dummy".to_string(),
|
||||
ranges,
|
||||
|
||||
@@ -17,7 +17,9 @@ use crate::aggregation::agg_data::{
|
||||
};
|
||||
use crate::aggregation::agg_limits::MemoryConsumption;
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::cached_sub_aggs::CachedSubAggs;
|
||||
use crate::aggregation::cached_sub_aggs::{
|
||||
CachedSubAggs, HighCardSubAggCache, LowCardCachedSubAggs, LowCardSubAggCache, SubAggCache,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
|
||||
@@ -389,7 +391,7 @@ pub(crate) fn build_segment_term_collector(
|
||||
// Decide which bucket storage is best suited for this aggregation.
|
||||
if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
|
||||
let term_buckets = VecTermBucketsNoAgg::new(max_term_id + 1, &mut bucket_id_provider);
|
||||
let collector: SegmentTermCollector<_, true> = SegmentTermCollector {
|
||||
let collector: SegmentTermCollector<_, HighCardSubAggCache> = SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg: None,
|
||||
bucket_id_provider,
|
||||
@@ -399,8 +401,8 @@ pub(crate) fn build_segment_term_collector(
|
||||
Ok(Box::new(collector))
|
||||
} else if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC {
|
||||
let term_buckets = VecTermBuckets::new(max_term_id + 1, &mut bucket_id_provider);
|
||||
let sub_agg = sub_agg_collector.map(CachedSubAggs::<false>::new);
|
||||
let collector: SegmentTermCollector<_, false> = SegmentTermCollector {
|
||||
let sub_agg = sub_agg_collector.map(LowCardCachedSubAggs::new);
|
||||
let collector: SegmentTermCollector<_, LowCardSubAggCache> = SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
@@ -412,26 +414,28 @@ pub(crate) fn build_segment_term_collector(
|
||||
let term_buckets: PagedTermMap =
|
||||
PagedTermMap::new(max_term_id + 1, &mut bucket_id_provider);
|
||||
// Build sub-aggregation blueprint (flat pairs)
|
||||
let sub_agg = sub_agg_collector.map(CachedSubAggs::<false>::new);
|
||||
let collector: SegmentTermCollector<PagedTermMap, false> = SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
let sub_agg = sub_agg_collector.map(CachedSubAggs::new);
|
||||
let collector: SegmentTermCollector<PagedTermMap, HighCardSubAggCache> =
|
||||
SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
Ok(Box::new(collector))
|
||||
} else {
|
||||
let term_buckets: HashMapTermBuckets = HashMapTermBuckets::default();
|
||||
// Build sub-aggregation blueprint (flat pairs)
|
||||
let sub_agg = sub_agg_collector.map(CachedSubAggs::<false>::new);
|
||||
let collector: SegmentTermCollector<HashMapTermBuckets, false> = SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
let sub_agg = sub_agg_collector.map(CachedSubAggs::new);
|
||||
let collector: SegmentTermCollector<HashMapTermBuckets, HighCardSubAggCache> =
|
||||
SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
Ok(Box::new(collector))
|
||||
}
|
||||
}
|
||||
@@ -754,10 +758,10 @@ impl TermAggregationMap for VecTermBuckets {
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
#[derive(Debug)]
|
||||
struct SegmentTermCollector<TermMap: TermAggregationMap, const LOWCARD: bool = false> {
|
||||
struct SegmentTermCollector<TermMap: TermAggregationMap, C: SubAggCache> {
|
||||
/// The buckets containing the aggregation data.
|
||||
parent_buckets: Vec<TermMap>,
|
||||
sub_agg: Option<CachedSubAggs<LOWCARD>>,
|
||||
sub_agg: Option<CachedSubAggs<C>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
max_term_id: u64,
|
||||
terms_req_data: TermsAggReqData,
|
||||
@@ -768,8 +772,8 @@ pub(crate) fn get_agg_name_and_property(name: &str) -> (&str, &str) {
|
||||
(agg_name, agg_property)
|
||||
}
|
||||
|
||||
impl<TermMap: TermAggregationMap, const LOWCARD: bool> SegmentAggregationCollector
|
||||
for SegmentTermCollector<TermMap, LOWCARD>
|
||||
impl<TermMap: TermAggregationMap, C: SubAggCache> SegmentAggregationCollector
|
||||
for SegmentTermCollector<TermMap, C>
|
||||
{
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
@@ -901,8 +905,10 @@ fn extract_missing_value<T>(
|
||||
Some((key, bucket))
|
||||
}
|
||||
|
||||
impl<TermMap, const LOWCARD: bool> SegmentTermCollector<TermMap, LOWCARD>
|
||||
where TermMap: TermAggregationMap
|
||||
impl<TermMap, C> SegmentTermCollector<TermMap, C>
|
||||
where
|
||||
TermMap: TermAggregationMap,
|
||||
C: SubAggCache,
|
||||
{
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
self.parent_buckets
|
||||
@@ -914,7 +920,7 @@ where TermMap: TermAggregationMap
|
||||
#[inline]
|
||||
pub(crate) fn into_intermediate_bucket_result(
|
||||
term_req: &TermsAggReqData,
|
||||
sub_agg: &mut Option<CachedSubAggs<LOWCARD>>,
|
||||
sub_agg: &mut Option<CachedSubAggs<C>>,
|
||||
term_buckets: TermMap,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
@@ -959,7 +965,7 @@ where TermMap: TermAggregationMap
|
||||
|
||||
let into_intermediate_bucket_entry =
|
||||
|bucket: Bucket,
|
||||
sub_agg: &mut Option<CachedSubAggs<LOWCARD>>|
|
||||
sub_agg: &mut Option<CachedSubAggs<C>>|
|
||||
-> crate::Result<IntermediateTermBucketEntry> {
|
||||
if let Some(sub_agg) = sub_agg {
|
||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||
@@ -1115,13 +1121,13 @@ where TermMap: TermAggregationMap
|
||||
}
|
||||
}
|
||||
|
||||
impl<TermMap: TermAggregationMap, const LOWCARD: bool> SegmentTermCollector<TermMap, LOWCARD> {
|
||||
impl<TermMap: TermAggregationMap, C: SubAggCache> SegmentTermCollector<TermMap, C> {
|
||||
#[inline]
|
||||
fn collect_terms_with_docs(
|
||||
iter: impl Iterator<Item = (crate::DocId, u64)>,
|
||||
term_buckets: &mut TermMap,
|
||||
bucket_id_provider: &mut BucketIdProvider,
|
||||
sub_agg: &mut CachedSubAggs<LOWCARD>,
|
||||
sub_agg: &mut CachedSubAggs<C>,
|
||||
) {
|
||||
for (doc, term_id) in iter {
|
||||
let bucket_id = term_buckets.term_entry(term_id, bucket_id_provider);
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::aggregation::agg_data::{
|
||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||
};
|
||||
use crate::aggregation::bucket::term_agg::TermsAggregation;
|
||||
use crate::aggregation::cached_sub_aggs::CachedSubAggs;
|
||||
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
|
||||
@@ -47,7 +47,7 @@ struct MissingCount {
|
||||
#[derive(Default, Debug)]
|
||||
pub struct TermMissingAgg {
|
||||
accessor_idx: usize,
|
||||
sub_agg: Option<CachedSubAggs>,
|
||||
sub_agg: Option<HighCardCachedSubAggs>,
|
||||
/// Idx = parent bucket id, Value = missing count for that bucket
|
||||
missing_count_per_bucket: Vec<MissingCount>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use std::fmt::Debug;
|
||||
|
||||
use super::segment_agg_result::SegmentAggregationCollector;
|
||||
use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
||||
use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
|
||||
use crate::aggregation::BucketId;
|
||||
use crate::DocId;
|
||||
|
||||
#[derive(Debug)]
|
||||
/// A cache for sub-aggregations, storing doc ids per bucket id.
|
||||
/// Depending on the cardinality of the parent aggregation, we use different
|
||||
/// storage strategies.
|
||||
@@ -22,68 +23,49 @@ use crate::DocId;
|
||||
/// TODO: consider using a more advanced data structure for high cardinality
|
||||
/// aggregations.
|
||||
/// What this datastructure does in general is to group docs by bucket id.
|
||||
pub(crate) struct CachedSubAggs<const LOWCARD: bool = false> {
|
||||
/// Only used when LOWCARD is true.
|
||||
/// Cache doc ids per bucket for sub-aggregations.
|
||||
///
|
||||
/// The outer Vec is indexed by BucketId.
|
||||
per_bucket_docs: Vec<Vec<DocId>>,
|
||||
/// Only used when LOWCARD is false.
|
||||
///
|
||||
/// This weird partitioning is used to do some cheap grouping on the bucket ids.
|
||||
/// bucket ids are dense, e.g. when we don't detect the cardinality as low cardinality,
|
||||
/// but there are just 16 bucket ids, each bucket id will go to its own partition.
|
||||
///
|
||||
/// We want to keep this cheap, because high cardinality aggregations can have a lot of
|
||||
/// buckets, and they may be nothing to group.
|
||||
partitions: [PartitionEntry; NUM_PARTITIONS],
|
||||
pub(crate) sub_agg_collector: Box<dyn SegmentAggregationCollector>,
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct CachedSubAggs<C: SubAggCache> {
|
||||
cache: C,
|
||||
sub_agg_collector: Box<dyn SegmentAggregationCollector>,
|
||||
num_docs: usize,
|
||||
}
|
||||
|
||||
pub type LowCardCachedSubAggs = CachedSubAggs<LowCardSubAggCache>;
|
||||
pub type HighCardCachedSubAggs = CachedSubAggs<HighCardSubAggCache>;
|
||||
|
||||
const FLUSH_THRESHOLD: usize = 2048;
|
||||
const NUM_PARTITIONS: usize = 16;
|
||||
|
||||
impl<const LOWCARD: bool> CachedSubAggs<LOWCARD> {
|
||||
/// A trait for caching sub-aggregation doc ids per bucket id.
|
||||
/// Different implementations can be used depending on the cardinality
|
||||
/// of the parent aggregation.
|
||||
pub trait SubAggCache: Debug {
|
||||
fn new() -> Self;
|
||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId);
|
||||
fn flush_local(
|
||||
&mut self,
|
||||
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
force: bool,
|
||||
) -> crate::Result<()>;
|
||||
}
|
||||
|
||||
impl<Backend: SubAggCache + Debug> CachedSubAggs<Backend> {
|
||||
pub fn new(sub_agg: Box<dyn SegmentAggregationCollector>) -> Self {
|
||||
Self {
|
||||
cache: Backend::new(),
|
||||
sub_agg_collector: sub_agg,
|
||||
num_docs: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_sub_agg_collector(&mut self) -> &mut Box<dyn SegmentAggregationCollector> {
|
||||
&mut self.sub_agg_collector
|
||||
}
|
||||
|
||||
pub fn new(sub_agg: Box<dyn SegmentAggregationCollector>) -> Self {
|
||||
Self {
|
||||
per_bucket_docs: Vec::new(),
|
||||
num_docs: 0,
|
||||
sub_agg_collector: sub_agg,
|
||||
partitions: core::array::from_fn(|_| PartitionEntry::default()),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn clear(&mut self) {
|
||||
for v in &mut self.per_bucket_docs {
|
||||
v.clear();
|
||||
}
|
||||
for partition in &mut self.partitions {
|
||||
partition.clear();
|
||||
}
|
||||
self.num_docs = 0;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
||||
if LOWCARD {
|
||||
// TODO: We could flush single buckets here
|
||||
let idx = bucket_id as usize;
|
||||
if self.per_bucket_docs.len() <= idx {
|
||||
self.per_bucket_docs.resize_with(idx + 1, Vec::new);
|
||||
}
|
||||
self.per_bucket_docs[idx].push(doc_id);
|
||||
} else {
|
||||
let idx = bucket_id % NUM_PARTITIONS as u32;
|
||||
let slot = &mut self.partitions[idx as usize];
|
||||
slot.bucket_ids.push(bucket_id);
|
||||
slot.docs.push(doc_id);
|
||||
}
|
||||
self.cache.push(bucket_id, doc_id);
|
||||
self.num_docs += 1;
|
||||
}
|
||||
|
||||
@@ -94,82 +76,45 @@ impl<const LOWCARD: bool> CachedSubAggs<LOWCARD> {
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
if self.num_docs >= FLUSH_THRESHOLD {
|
||||
self.flush_local(agg_data, false)?;
|
||||
self.cache
|
||||
.flush_local(&mut self.sub_agg_collector, agg_data, false)?;
|
||||
self.num_docs = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Note: this does _not_ flush the sub aggregations
|
||||
fn flush_local(
|
||||
&mut self,
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
force: bool,
|
||||
) -> crate::Result<()> {
|
||||
if LOWCARD {
|
||||
// Pre-aggregated: call collect per bucket.
|
||||
let max_bucket = (self.per_bucket_docs.len() as BucketId).saturating_sub(1);
|
||||
self.sub_agg_collector
|
||||
.prepare_max_bucket(max_bucket, agg_data)?;
|
||||
// The threshold above which we flush buckets individually.
|
||||
// Note: We need to make sure that we don't lock ourselves into a situation where we hit
|
||||
// the FLUSH_THRESHOLD, but never flush any buckets. (except the final flush)
|
||||
let mut bucket_treshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
|
||||
const _: () = {
|
||||
// MAX_NUM_TERMS_FOR_VEC == LOWCARD threshold
|
||||
let bucket_treshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
|
||||
assert!(
|
||||
bucket_treshold > 0,
|
||||
"Bucket threshold must be greater than 0"
|
||||
);
|
||||
};
|
||||
if force {
|
||||
bucket_treshold = 0;
|
||||
}
|
||||
for (bucket_id, docs) in self
|
||||
.per_bucket_docs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, docs)| docs.len() > bucket_treshold)
|
||||
{
|
||||
self.sub_agg_collector
|
||||
.collect(bucket_id as BucketId, docs, agg_data)?;
|
||||
}
|
||||
} else {
|
||||
let mut max_bucket = 0u32;
|
||||
for partition in &self.partitions {
|
||||
if let Some(&local_max) = partition.bucket_ids.iter().max() {
|
||||
max_bucket = max_bucket.max(local_max);
|
||||
}
|
||||
}
|
||||
|
||||
self.sub_agg_collector
|
||||
.prepare_max_bucket(max_bucket, agg_data)?;
|
||||
|
||||
for slot in &self.partitions {
|
||||
if !slot.bucket_ids.is_empty() {
|
||||
// Reduce dynamic dispatch overhead by collecting a full partition in one call.
|
||||
self.sub_agg_collector.collect_multiple(
|
||||
&slot.bucket_ids,
|
||||
&slot.docs,
|
||||
agg_data,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
self.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Note: this _does_ flush the sub aggregations
|
||||
/// Note: this _does_ flush the sub aggregations.
|
||||
pub fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||
if self.num_docs != 0 {
|
||||
self.flush_local(agg_data, true)?;
|
||||
self.cache
|
||||
.flush_local(&mut self.sub_agg_collector, agg_data, true)?;
|
||||
self.num_docs = 0;
|
||||
}
|
||||
self.sub_agg_collector.flush(agg_data)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct HighCardSubAggCache {
|
||||
/// This weird partitioning is used to do some cheap grouping on the bucket ids.
|
||||
/// bucket ids are dense, e.g. when we don't detect the cardinality as low cardinality,
|
||||
/// but there are just 16 bucket ids, each bucket id will go to its own partition.
|
||||
///
|
||||
/// We want to keep this cheap, because high cardinality aggregations can have a lot of
|
||||
/// buckets, and there may be nothing to group.
|
||||
partitions: [PartitionEntry; NUM_PARTITIONS],
|
||||
}
|
||||
|
||||
impl HighCardSubAggCache {
|
||||
#[inline]
|
||||
fn clear(&mut self) {
|
||||
for partition in &mut self.partitions {
|
||||
partition.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct PartitionEntry {
|
||||
bucket_ids: Vec<BucketId>,
|
||||
@@ -183,3 +128,114 @@ impl PartitionEntry {
|
||||
self.docs.clear();
|
||||
}
|
||||
}
|
||||
|
||||
impl SubAggCache for HighCardSubAggCache {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
partitions: core::array::from_fn(|_| PartitionEntry::default()),
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
||||
let idx = bucket_id % NUM_PARTITIONS as u32;
|
||||
let slot = &mut self.partitions[idx as usize];
|
||||
slot.bucket_ids.push(bucket_id);
|
||||
slot.docs.push(doc_id);
|
||||
}
|
||||
|
||||
fn flush_local(
|
||||
&mut self,
|
||||
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
_force: bool,
|
||||
) -> crate::Result<()> {
|
||||
let mut max_bucket = 0u32;
|
||||
for partition in &self.partitions {
|
||||
if let Some(&local_max) = partition.bucket_ids.iter().max() {
|
||||
max_bucket = max_bucket.max(local_max);
|
||||
}
|
||||
}
|
||||
|
||||
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
|
||||
|
||||
for slot in &self.partitions {
|
||||
if !slot.bucket_ids.is_empty() {
|
||||
// Reduce dynamic dispatch overhead by collecting a full partition in one call.
|
||||
sub_agg.collect_multiple(&slot.bucket_ids, &slot.docs, agg_data)?;
|
||||
}
|
||||
}
|
||||
|
||||
self.clear();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LowCardSubAggCache {
|
||||
/// Cache doc ids per bucket for sub-aggregations.
|
||||
///
|
||||
/// The outer Vec is indexed by BucketId.
|
||||
per_bucket_docs: Vec<Vec<DocId>>,
|
||||
}
|
||||
|
||||
impl LowCardSubAggCache {
|
||||
#[inline]
|
||||
fn clear(&mut self) {
|
||||
for v in &mut self.per_bucket_docs {
|
||||
v.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SubAggCache for LowCardSubAggCache {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
per_bucket_docs: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
||||
let idx = bucket_id as usize;
|
||||
if self.per_bucket_docs.len() <= idx {
|
||||
self.per_bucket_docs.resize_with(idx + 1, Vec::new);
|
||||
}
|
||||
self.per_bucket_docs[idx].push(doc_id);
|
||||
}
|
||||
|
||||
fn flush_local(
|
||||
&mut self,
|
||||
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
force: bool,
|
||||
) -> crate::Result<()> {
|
||||
// Pre-aggregated: call collect per bucket.
|
||||
let max_bucket = (self.per_bucket_docs.len() as BucketId).saturating_sub(1);
|
||||
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
|
||||
// The threshold above which we flush buckets individually.
|
||||
// Note: We need to make sure that we don't lock ourselves into a situation where we hit
|
||||
// the FLUSH_THRESHOLD, but never flush any buckets. (except the final flush)
|
||||
let mut bucket_treshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
|
||||
const _: () = {
|
||||
// MAX_NUM_TERMS_FOR_VEC == LOWCARD threshold
|
||||
let bucket_treshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
|
||||
assert!(
|
||||
bucket_treshold > 0,
|
||||
"Bucket threshold must be greater than 0"
|
||||
);
|
||||
};
|
||||
if force {
|
||||
bucket_treshold = 0;
|
||||
}
|
||||
for (bucket_id, docs) in self
|
||||
.per_bucket_docs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, docs)| docs.len() > bucket_treshold)
|
||||
{
|
||||
sub_agg.collect(bucket_id as BucketId, docs, agg_data)?;
|
||||
}
|
||||
|
||||
self.clear();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::agg_req::Aggregations;
|
||||
use super::agg_result::AggregationResults;
|
||||
use super::cached_sub_aggs::CachedSubAggs;
|
||||
use super::cached_sub_aggs::LowCardCachedSubAggs;
|
||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::AggContextParams;
|
||||
// group buffering strategy is chosen explicitly by callers; no need to hash-group on the fly.
|
||||
@@ -136,7 +136,7 @@ fn merge_fruits(
|
||||
/// `AggregationSegmentCollector` does the aggregation collection on a segment.
|
||||
pub struct AggregationSegmentCollector {
|
||||
aggs_with_accessor: AggregationsSegmentCtx,
|
||||
agg_collector: CachedSubAggs<true>,
|
||||
agg_collector: LowCardCachedSubAggs,
|
||||
error: Option<TantivyError>,
|
||||
}
|
||||
|
||||
@@ -151,7 +151,8 @@ impl AggregationSegmentCollector {
|
||||
) -> crate::Result<Self> {
|
||||
let mut agg_data =
|
||||
build_aggregations_data_from_req(agg, reader, segment_ordinal, context.clone())?;
|
||||
let mut result = CachedSubAggs::new(build_segment_agg_collectors_root(&mut agg_data)?);
|
||||
let mut result =
|
||||
LowCardCachedSubAggs::new(build_segment_agg_collectors_root(&mut agg_data)?);
|
||||
result
|
||||
.get_sub_agg_collector()
|
||||
.prepare_max_bucket(0, &agg_data)?; // prepare for bucket zero
|
||||
|
||||
Reference in New Issue
Block a user