diff --git a/src/aggregation/agg_data.rs b/src/aggregation/agg_data.rs
index 9db6c6372..946c5a5a0 100644
--- a/src/aggregation/agg_data.rs
+++ b/src/aggregation/agg_data.rs
@@ -868,7 +868,7 @@ fn build_terms_or_cardinality_nodes(
         });
     }
 
-    // Add one node per accessor to mirror previous behavior and allow per-type missing handling.
+    // Add one node per accessor
     for (accessor, column_type) in column_and_types {
         let missing_value_for_accessor = if use_special_missing_agg {
             None
diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs
index 0e966135c..99a156902 100644
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -333,6 +333,10 @@ impl TermsAggregationInternal {
     }
 }
 
+/// The maximum number of terms for which a Vec-backed bucket storage is used.
+/// TODO: Benchmark to validate the threshold
+pub const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
+
 /// Build a concrete `SegmentTermCollector` with either a Vec- or HashMap-backed
 /// bucket storage, depending on the column type and aggregation level.
 pub(crate) fn build_segment_term_collector(
@@ -366,16 +370,11 @@ pub(crate) fn build_segment_term_collector(
     // Build sub-aggregation blueprint if there are children.
     let has_sub_aggregations = !node.children.is_empty();
 
-    // Decide whether to use a Vec-backed or HashMap-backed bucket storage.
-
     // TODO: A better metric instead of is_top_level would be the number of buckets expected.
     // E.g. If term agg is not top level, but the parent is a bucket agg with less than 10 buckets,
     // we can still use Vec.
     let is_top_level = terms_req_data.is_top_level;
 
-    // TODO: Benchmark to validate the threshold
-    const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
-
     // Let's see if we can use a vec to aggregate our data
     // instead of a hashmap.
     let col_max_value = terms_req_data.accessor.max_value();
@@ -389,8 +388,7 @@ pub(crate) fn build_segment_term_collector(
     };
     let mut bucket_id_provider = BucketIdProvider::default();
 
-    // - use a Vec instead of a hashmap for our aggregation.
-
+    // Decide which bucket storage is best suited for this aggregation.
     if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
        let term_buckets = VecTermBucketsNoAgg::new(max_term_id + 1, &mut bucket_id_provider);
        let collector: SegmentTermCollector<_, true> = SegmentTermCollector {
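// A minimal sketch (not part of the patch) of the bucket-storage decision made in
// `build_segment_term_collector` above: Vec-backed storage is only chosen for a
// top-level terms aggregation with a small, dense term id space and no
// sub-aggregations; everything else falls back to a HashMap. `BucketStorage` and
// `choose_storage` are illustrative stand-ins, not types from the crate.

const MAX_NUM_TERMS_FOR_VEC: u64 = 100;

#[derive(Debug, PartialEq)]
enum BucketStorage {
    /// Dense, small term id space: index directly into a Vec with one slot per term id.
    Vec { len: u64 },
    /// Sparse or large term id space, or sub-aggregations present: HashMap keyed by term id.
    HashMap,
}

fn choose_storage(is_top_level: bool, max_term_id: u64, has_sub_aggregations: bool) -> BucketStorage {
    if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
        // max_term_id is the largest possible id, hence max_term_id + 1 slots.
        BucketStorage::Vec { len: max_term_id + 1 }
    } else {
        BucketStorage::HashMap
    }
}

fn main() {
    // 42 distinct term ids at the top level, no sub-aggregations: Vec-backed.
    assert_eq!(choose_storage(true, 42, false), BucketStorage::Vec { len: 43 });
    // Nested, sub-aggregations present, or too many terms: HashMap-backed.
    assert_eq!(choose_storage(false, 42, false), BucketStorage::HashMap);
    assert_eq!(choose_storage(true, 10_000, true), BucketStorage::HashMap);
}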
diff --git a/src/aggregation/cached_sub_aggs.rs b/src/aggregation/cached_sub_aggs.rs
index f133c0273..94ea93fea 100644
--- a/src/aggregation/cached_sub_aggs.rs
+++ b/src/aggregation/cached_sub_aggs.rs
@@ -1,5 +1,6 @@
 use super::segment_agg_result::SegmentAggregationCollector;
 use crate::aggregation::agg_data::AggregationsSegmentCtx;
+use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
 use crate::aggregation::BucketId;
 use crate::DocId;
 
@@ -28,9 +29,13 @@ pub(crate) struct CachedSubAggs {
     /// The outer Vec is indexed by BucketId.
     per_bucket_docs: Vec<Vec<DocId>>,
     /// Only used when LOWCARD is false.
-    /// For higher cardinalities we use a partitioned approach to store
     ///
-    /// partitioned Vec<(BucketId, DocId)> pairs to improve grouping locality.
+    /// This partitioning is used to do some cheap grouping on the bucket ids.
+    /// Bucket ids are dense, so even when we don't detect the cardinality as low
+    /// but there are e.g. just 16 bucket ids, each bucket id ends up in its own partition.
+    ///
+    /// We want to keep this cheap, because high-cardinality aggregations can have a lot of
+    /// buckets, and there may be nothing to group.
     partitions: [PartitionEntry; NUM_PARTITIONS],
     pub(crate) sub_agg_collector: Box<dyn SegmentAggregationCollector>,
     num_docs: usize,
@@ -110,7 +115,12 @@ impl CachedSubAggs {
         // the FLUSH_THRESHOLD, but never flush any buckets. (except the final flush)
         let mut bucket_treshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
         const _: () = {
-            assert!(std::mem::size_of::<u64>() == 8, "expected 64-bit u64");
+            // MAX_NUM_TERMS_FOR_VEC is the LOWCARD threshold: the flush threshold must stay > 0 at that bucket count.
+            let bucket_threshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
+            assert!(
+                bucket_threshold > 0,
+                "Bucket threshold must be greater than 0"
+            );
         };
         if force {
             bucket_treshold = 0;
diff --git a/src/aggregation/metric/top_hits.rs b/src/aggregation/metric/top_hits.rs
index 02b6be41b..1adf41bbb 100644
--- a/src/aggregation/metric/top_hits.rs
+++ b/src/aggregation/metric/top_hits.rs
@@ -20,7 +20,6 @@
 use crate::collector::sort_key::ReverseComparator;
 use crate::collector::TopNComputer;
 use crate::schema::OwnedValue;
 use crate::{DocAddress, DocId, SegmentOrdinal};
-// duplicate import removed; already imported above
 /// Contains all information required by the TopHitsSegmentCollector to perform the
 /// top_hits aggregation on a segment.
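// A rough sketch (not part of the patch) of the partitioned buffering described in the
// `partitions` doc comment above. The modulo-based routing, the field layout of
// `PartitionEntry`, and NUM_PARTITIONS = 16 are assumptions for illustration only,
// not the actual implementation in cached_sub_aggs.rs.

type BucketId = u32;
type DocId = u32;

const NUM_PARTITIONS: usize = 16;

#[derive(Default)]
struct PartitionEntry {
    docs: Vec<(BucketId, DocId)>,
}

#[derive(Default)]
struct PartitionedDocs {
    partitions: [PartitionEntry; NUM_PARTITIONS],
}

impl PartitionedDocs {
    /// Route each (bucket, doc) pair to a partition derived from its bucket id.
    /// Because bucket ids are dense, a small number of distinct buckets means each
    /// bucket gets its own partition, which yields cheap, approximate grouping
    /// without sorting or hashing the full stream.
    fn push(&mut self, bucket: BucketId, doc: DocId) {
        let partition = bucket as usize % NUM_PARTITIONS;
        self.partitions[partition].docs.push((bucket, doc));
    }
}

fn main() {
    let mut buffered = PartitionedDocs::default();
    for doc in 0..8u32 {
        // Only 4 distinct bucket ids, so each bucket lands in its own partition.
        buffered.push(doc % 4, doc);
    }
    for (idx, partition) in buffered.partitions.iter().enumerate() {
        if !partition.docs.is_empty() {
            println!("partition {idx}: {:?}", partition.docs);
        }
    }
}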