add comment

This commit is contained in:
Pascal Seitz
2025-12-12 16:09:58 +08:00
parent 15913446b8
commit 71dc08424c
4 changed files with 19 additions and 12 deletions


@@ -868,7 +868,7 @@ fn build_terms_or_cardinality_nodes(
});
}
// Add one node per accessor to mirror previous behavior and allow per-type missing handling.
// Add one node per accessor
for (accessor, column_type) in column_and_types {
let missing_value_for_accessor = if use_special_missing_agg {
None

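A minimal sketch of the pattern the hunk above describes, with invented types (ColumnType, AggNode, and build_nodes are illustrations, not tantivy's API): one aggregation node is built per column accessor, so each column type can resolve its own missing value, while a dedicated "special missing" node takes over when use_special_missing_agg is set.

#[derive(Debug)]
enum ColumnType {
    Str,
    F64,
}

#[derive(Debug)]
struct AggNode {
    column_type: ColumnType,
    // None when a dedicated "special missing" node handles missing docs.
    missing_value: Option<u64>,
}

fn build_nodes(column_and_types: Vec<ColumnType>, use_special_missing_agg: bool) -> Vec<AggNode> {
    column_and_types
        .into_iter()
        .map(|column_type| AggNode {
            column_type,
            // Placeholder encoding; per-type resolution would go here.
            missing_value: if use_special_missing_agg { None } else { Some(0) },
        })
        .collect()
}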

@@ -333,6 +333,10 @@ impl TermsAggregationInternal {
}
}
/// The maximum number of terms for which a Vec-backed bucket storage is used.
/// TODO: Benchmark to validate the threshold
pub const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
/// Build a concrete `SegmentTermCollector` with either a Vec- or HashMap-backed
/// bucket storage, depending on the column type and aggregation level.
pub(crate) fn build_segment_term_collector(
@@ -366,16 +370,11 @@ pub(crate) fn build_segment_term_collector(
// Build sub-aggregation blueprint if there are children.
let has_sub_aggregations = !node.children.is_empty();
// Decide whether to use a Vec-backed or HashMap-backed bucket storage.
// TODO: A better metric than is_top_level would be the expected number of buckets.
// E.g. if the term agg is not top level, but the parent is a bucket agg with fewer
// than 10 buckets, we can still use a Vec.
let is_top_level = terms_req_data.is_top_level;
// TODO: Benchmark to validate the threshold
const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
// Let's see if we can use a vec to aggregate our data
// instead of a hashmap.
let col_max_value = terms_req_data.accessor.max_value();
@@ -389,8 +388,7 @@ pub(crate) fn build_segment_term_collector(
};
let mut bucket_id_provider = BucketIdProvider::default();
// - use a Vec instead of a hashmap for our aggregation.
// Decide which bucket storage is best suited for this aggregation.
if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
let term_buckets = VecTermBucketsNoAgg::new(max_term_id + 1, &mut bucket_id_provider);
let collector: SegmentTermCollector<_, true> = SegmentTermCollector {

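A minimal sketch of the dispatch decided in the hunks above, with invented types (TermBuckets is not tantivy's actual collector): a top level aggregation without sub-aggregations whose dense term id space stays below MAX_NUM_TERMS_FOR_VEC counts into a Vec indexed by term id; everything else falls back to a HashMap.

use std::collections::HashMap;

enum TermBuckets {
    Dense(Vec<u64>),           // index == term id
    Sparse(HashMap<u64, u64>), // term id -> count
}

impl TermBuckets {
    fn new(max_term_id: u64, is_top_level: bool, has_sub_aggregations: bool) -> Self {
        const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
        if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
            // Dense, bounded id space: one counter slot per possible term id.
            TermBuckets::Dense(vec![0; (max_term_id + 1) as usize])
        } else {
            // Sparse or large id space: pay for hashing, save on memory.
            TermBuckets::Sparse(HashMap::new())
        }
    }

    fn collect(&mut self, term_id: u64) {
        match self {
            TermBuckets::Dense(counts) => counts[term_id as usize] += 1,
            TermBuckets::Sparse(map) => *map.entry(term_id).or_insert(0) += 1,
        }
    }
}

The Vec variant skips hashing entirely and bumps one counter per hit; the max_term_id check keeps its allocation bounded, which is what the col_max_value lookup in the hunk above guards.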

@@ -1,5 +1,6 @@
use super::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
use crate::aggregation::BucketId;
use crate::DocId;
@@ -28,9 +29,13 @@ pub(crate) struct CachedSubAggs<const LOWCARD: bool = false> {
/// The outer Vec is indexed by BucketId.
per_bucket_docs: Vec<Vec<DocId>>,
/// Only used when LOWCARD is false.
/// For higher cardinalities we store partitioned Vec<(BucketId, DocId)> pairs
/// to improve grouping locality.
///
/// The partitioning does some cheap grouping on the bucket ids. Bucket ids are
/// dense, so even when we don't detect the cardinality as low but there are
/// just 16 bucket ids, each bucket id will go to its own partition.
///
/// We want to keep this cheap, because high cardinality aggregations can have
/// a lot of buckets, and there may be nothing to group.
partitions: [PartitionEntry; NUM_PARTITIONS],
pub(crate) sub_agg_collector: Box<dyn SegmentAggregationCollector>,
num_docs: usize,
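A sketch of the partitioning idea behind the partitions field, under assumed values (NUM_PARTITIONS = 16 and the u32 id types are guesses, not the crate's actual definitions): each (bucket_id, doc_id) pair is routed by the low bits of its bucket id, which pre-groups dense ids almost for free.

const NUM_PARTITIONS: usize = 16;

#[derive(Default)]
struct PartitionEntry {
    pairs: Vec<(u32, u32)>, // (BucketId, DocId)
}

struct Partitions {
    partitions: [PartitionEntry; NUM_PARTITIONS],
}

impl Partitions {
    fn new() -> Self {
        Partitions {
            partitions: std::array::from_fn(|_| PartitionEntry::default()),
        }
    }

    fn push(&mut self, bucket_id: u32, doc_id: u32) {
        // NUM_PARTITIONS is a power of two, so routing is a single AND.
        // With <= 16 distinct dense bucket ids, each id gets its own partition.
        let slot = (bucket_id as usize) & (NUM_PARTITIONS - 1);
        self.partitions[slot].pairs.push((bucket_id, doc_id));
    }
}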
@@ -110,7 +115,12 @@ impl<const LOWCARD: bool> CachedSubAggs<LOWCARD> {
// the FLUSH_THRESHOLD, but never flush any buckets (except the final flush).
let mut bucket_threshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
const _: () = {
// MAX_NUM_TERMS_FOR_VEC == LOWCARD threshold
let bucket_threshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
assert!(
bucket_threshold > 0,
"Bucket threshold must be greater than 0"
);
};
if force {
bucket_threshold = 0;

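The const _: () block in the hunk above is a compile-time guard: assert! works during const evaluation, so the build fails if a future change to the constants ever drives the low cardinality threshold to zero. A self-contained sketch of the pattern, with an assumed FLUSH_THRESHOLD value:

const FLUSH_THRESHOLD: usize = 1_000; // assumed value, not tantivy's
const MAX_NUM_TERMS_FOR_VEC: u64 = 100;

// Runs during const evaluation: a failing assert! aborts the build
// instead of panicking at runtime.
const _: () = {
    let bucket_threshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
    assert!(bucket_threshold > 0, "Bucket threshold must be greater than 0");
};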

@@ -20,7 +20,6 @@ use crate::collector::sort_key::ReverseComparator;
use crate::collector::TopNComputer;
use crate::schema::OwnedValue;
use crate::{DocAddress, DocId, SegmentOrdinal};
/// Contains all information required by the TopHitsSegmentCollector to perform the
/// top_hits aggregation on a segment.