add comment

This commit is contained in:
Pascal Seitz
2025-12-12 16:09:58 +08:00
parent 15913446b8
commit 71dc08424c
4 changed files with 19 additions and 12 deletions


@@ -868,7 +868,7 @@ fn build_terms_or_cardinality_nodes(
});
}
// Add one node per accessor to mirror previous behavior and allow per-type missing handling.
// Add one node per accessor
for (accessor, column_type) in column_and_types {
let missing_value_for_accessor = if use_special_missing_agg {
None

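A minimal sketch of the pattern the hunk above describes, with invented types (ColumnType, AggNode, and build_nodes are illustrations, not tantivy's API): one aggregation node is built per column accessor, so each column type can resolve its own missing value, while a dedicated "special missing" node takes over when use_special_missing_agg is set.

#[derive(Debug)]
enum ColumnType {
    Str,
    F64,
}

#[derive(Debug)]
struct AggNode {
    column_type: ColumnType,
    // None when a dedicated "special missing" node handles missing docs.
    missing_value: Option<u64>,
}

fn build_nodes(column_and_types: Vec<ColumnType>, use_special_missing_agg: bool) -> Vec<AggNode> {
    column_and_types
        .into_iter()
        .map(|column_type| AggNode {
            column_type,
            // Placeholder encoding; per-type resolution would go here.
            missing_value: if use_special_missing_agg { None } else { Some(0) },
        })
        .collect()
}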

@@ -333,6 +333,10 @@ impl TermsAggregationInternal {
}
}
/// The maximum number of terms for which a Vec-backed bucket storage is used.
/// TODO: Benchmark to validate the threshold
pub const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
/// Build a concrete `SegmentTermCollector` with either a Vec- or HashMap-backed
/// bucket storage, depending on the column type and aggregation level.
pub(crate) fn build_segment_term_collector(
@@ -366,16 +370,11 @@ pub(crate) fn build_segment_term_collector(
// Build sub-aggregation blueprint if there are children.
let has_sub_aggregations = !node.children.is_empty();
// Decide whether to use a Vec-backed or HashMap-backed bucket storage.
// TODO: A better metric than is_top_level would be the expected number of buckets.
// E.g. if the term agg is not top level, but the parent is a bucket agg with fewer
// than 10 buckets, we can still use a Vec.
let is_top_level = terms_req_data.is_top_level;
// TODO: Benchmark to validate the threshold
const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
// Let's see if we can use a vec to aggregate our data
// instead of a hashmap.
let col_max_value = terms_req_data.accessor.max_value();
@@ -389,8 +388,7 @@ pub(crate) fn build_segment_term_collector(
};
let mut bucket_id_provider = BucketIdProvider::default();
// - use a Vec instead of a hashmap for our aggregation.
// Decide which bucket storage is best suited for this aggregation.
if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
let term_buckets = VecTermBucketsNoAgg::new(max_term_id + 1, &mut bucket_id_provider);
let collector: SegmentTermCollector<_, true> = SegmentTermCollector {

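A minimal sketch of the dispatch decided in the hunks above, with invented types (TermBuckets is not tantivy's actual collector): a top level aggregation without sub-aggregations whose dense term id space stays below MAX_NUM_TERMS_FOR_VEC counts into a Vec indexed by term id; everything else falls back to a HashMap.

use std::collections::HashMap;

enum TermBuckets {
    Dense(Vec<u64>),           // index == term id
    Sparse(HashMap<u64, u64>), // term id -> count
}

impl TermBuckets {
    fn new(max_term_id: u64, is_top_level: bool, has_sub_aggregations: bool) -> Self {
        const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
        if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
            // Dense, bounded id space: one counter slot per possible term id.
            TermBuckets::Dense(vec![0; (max_term_id + 1) as usize])
        } else {
            // Sparse or large id space: pay for hashing, save on memory.
            TermBuckets::Sparse(HashMap::new())
        }
    }

    fn collect(&mut self, term_id: u64) {
        match self {
            TermBuckets::Dense(counts) => counts[term_id as usize] += 1,
            TermBuckets::Sparse(map) => *map.entry(term_id).or_insert(0) += 1,
        }
    }
}

The Vec variant skips hashing entirely and bumps one counter per hit; the max_term_id check keeps its allocation bounded, which is what the col_max_value lookup in the hunk above guards.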

@@ -1,5 +1,6 @@
use super::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
use crate::aggregation::BucketId;
use crate::DocId;
@@ -28,9 +29,13 @@ pub(crate) struct CachedSubAggs<const LOWCARD: bool = false> {
/// The outer Vec is indexed by BucketId.
per_bucket_docs: Vec<Vec<DocId>>,
/// Only used when LOWCARD is false.
/// For higher cardinalities we store partitioned Vec<(BucketId, DocId)> pairs
/// to improve grouping locality.
///
/// The partitioning does some cheap grouping on the bucket ids. Bucket ids are
/// dense, so even when we don't detect the cardinality as low but there are
/// just 16 bucket ids, each bucket id will go to its own partition.
///
/// We want to keep this cheap, because high cardinality aggregations can have
/// a lot of buckets, and there may be nothing to group.
partitions: [PartitionEntry; NUM_PARTITIONS],
pub(crate) sub_agg_collector: Box<dyn SegmentAggregationCollector>,
num_docs: usize,
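A sketch of the partitioning idea behind the partitions field, under assumed values (NUM_PARTITIONS = 16 and the u32 id types are guesses, not the crate's actual definitions): each (bucket_id, doc_id) pair is routed by the low bits of its bucket id, which pre-groups dense ids almost for free.

const NUM_PARTITIONS: usize = 16;

#[derive(Default)]
struct PartitionEntry {
    pairs: Vec<(u32, u32)>, // (BucketId, DocId)
}

struct Partitions {
    partitions: [PartitionEntry; NUM_PARTITIONS],
}

impl Partitions {
    fn new() -> Self {
        Partitions {
            partitions: std::array::from_fn(|_| PartitionEntry::default()),
        }
    }

    fn push(&mut self, bucket_id: u32, doc_id: u32) {
        // NUM_PARTITIONS is a power of two, so routing is a single AND.
        // With <= 16 distinct dense bucket ids, each id gets its own partition.
        let slot = (bucket_id as usize) & (NUM_PARTITIONS - 1);
        self.partitions[slot].pairs.push((bucket_id, doc_id));
    }
}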
@@ -110,7 +115,12 @@ impl<const LOWCARD: bool> CachedSubAggs<LOWCARD> {
// the FLUSH_THRESHOLD, but never flush any buckets (except the final flush).
let mut bucket_threshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
const _: () = {
// MAX_NUM_TERMS_FOR_VEC == LOWCARD threshold
let bucket_threshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
assert!(
bucket_threshold > 0,
"Bucket threshold must be greater than 0"
);
};
if force {
bucket_threshold = 0;

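The const _: () block in the hunk above is a compile-time guard: assert! works during const evaluation, so the build fails if a future change to the constants ever drives the low cardinality threshold to zero. A self-contained sketch of the pattern, with an assumed FLUSH_THRESHOLD value:

const FLUSH_THRESHOLD: usize = 1_000; // assumed value, not tantivy's
const MAX_NUM_TERMS_FOR_VEC: u64 = 100;

// Runs during const evaluation: a failing assert! aborts the build
// instead of panicking at runtime.
const _: () = {
    let bucket_threshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
    assert!(bucket_threshold > 0, "Bucket threshold must be greater than 0");
};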

@@ -20,7 +20,6 @@ use crate::collector::sort_key::ReverseComparator;
use crate::collector::TopNComputer;
use crate::schema::OwnedValue;
use crate::{DocAddress, DocId, SegmentOrdinal};
/// Contains all information required by the TopHitsSegmentCollector to perform the
/// top_hits aggregation on a segment.