mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
add comment
This commit is contained in:
@@ -868,7 +868,7 @@ fn build_terms_or_cardinality_nodes(
|
||||
});
|
||||
}
|
||||
|
||||
// Add one node per accessor to mirror previous behavior and allow per-type missing handling.
|
||||
// Add one node per accessor
|
||||
for (accessor, column_type) in column_and_types {
|
||||
let missing_value_for_accessor = if use_special_missing_agg {
|
||||
None
|
||||
|
||||
@@ -333,6 +333,10 @@ impl TermsAggregationInternal {
|
||||
}
|
||||
}
|
||||
|
||||
/// The treshold for maximum number of terms to use a Vec-backed bucket storage.
|
||||
/// TODO: Benchmark to validate the threshold
|
||||
pub const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
|
||||
|
||||
/// Build a concrete `SegmentTermCollector` with either a Vec- or HashMap-backed
|
||||
/// bucket storage, depending on the column type and aggregation level.
|
||||
pub(crate) fn build_segment_term_collector(
|
||||
@@ -366,16 +370,11 @@ pub(crate) fn build_segment_term_collector(
|
||||
// Build sub-aggregation blueprint if there are children.
|
||||
let has_sub_aggregations = !node.children.is_empty();
|
||||
|
||||
// Decide whether to use a Vec-backed or HashMap-backed bucket storage.
|
||||
|
||||
// TODO: A better metric instead of is_top_level would be the number of buckets expected.
|
||||
// E.g. If term agg is not top level, but the parent is a bucket agg with less than 10 buckets,
|
||||
// we can still use Vec.
|
||||
let is_top_level = terms_req_data.is_top_level;
|
||||
|
||||
// TODO: Benchmark to validate the threshold
|
||||
const MAX_NUM_TERMS_FOR_VEC: u64 = 100;
|
||||
|
||||
// Let's see if we can use a vec to aggregate our data
|
||||
// instead of a hashmap.
|
||||
let col_max_value = terms_req_data.accessor.max_value();
|
||||
@@ -389,8 +388,7 @@ pub(crate) fn build_segment_term_collector(
|
||||
};
|
||||
|
||||
let mut bucket_id_provider = BucketIdProvider::default();
|
||||
// - use a Vec instead of a hashmap for our aggregation.
|
||||
|
||||
// Decide which bucket storage is best suited for this aggregation.
|
||||
if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
|
||||
let term_buckets = VecTermBucketsNoAgg::new(max_term_id + 1, &mut bucket_id_provider);
|
||||
let collector: SegmentTermCollector<_, true> = SegmentTermCollector {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use super::segment_agg_result::SegmentAggregationCollector;
|
||||
use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
||||
use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
|
||||
use crate::aggregation::BucketId;
|
||||
use crate::DocId;
|
||||
|
||||
@@ -28,9 +29,13 @@ pub(crate) struct CachedSubAggs<const LOWCARD: bool = false> {
|
||||
/// The outer Vec is indexed by BucketId.
|
||||
per_bucket_docs: Vec<Vec<DocId>>,
|
||||
/// Only used when LOWCARD is false.
|
||||
/// For higher cardinalities we use a partitioned approach to store
|
||||
///
|
||||
/// partitioned Vec<(BucketId, DocId)> pairs to improve grouping locality.
|
||||
/// This weird partitioning is used to do some cheap grouping on the bucket ids.
|
||||
/// bucket ids are dense, e.g. when we don't detect the cardinality as low cardinality,
|
||||
/// but there are just 16 bucket ids, each bucket id will go to its own partition.
|
||||
///
|
||||
/// We want to keep this cheap, because high cardinality aggregations can have a lot of
|
||||
/// buckets, and they may be nothing to group.
|
||||
partitions: [PartitionEntry; NUM_PARTITIONS],
|
||||
pub(crate) sub_agg_collector: Box<dyn SegmentAggregationCollector>,
|
||||
num_docs: usize,
|
||||
@@ -110,7 +115,12 @@ impl<const LOWCARD: bool> CachedSubAggs<LOWCARD> {
|
||||
// the FLUSH_THRESHOLD, but never flush any buckets. (except the final flush)
|
||||
let mut bucket_treshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
|
||||
const _: () = {
|
||||
assert!(std::mem::size_of::<u64>() == 8, "expected 64-bit u64");
|
||||
// MAX_NUM_TERMS_FOR_VEC == LOWCARD threshold
|
||||
let bucket_treshold = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
|
||||
assert!(
|
||||
bucket_treshold > 0,
|
||||
"Bucket threshold must be greater than 0"
|
||||
);
|
||||
};
|
||||
if force {
|
||||
bucket_treshold = 0;
|
||||
|
||||
@@ -20,7 +20,6 @@ use crate::collector::sort_key::ReverseComparator;
|
||||
use crate::collector::TopNComputer;
|
||||
use crate::schema::OwnedValue;
|
||||
use crate::{DocAddress, DocId, SegmentOrdinal};
|
||||
// duplicate import removed; already imported above
|
||||
|
||||
/// Contains all information required by the TopHitsSegmentCollector to perform the
|
||||
/// top_hits aggregation on a segment.
|
||||
|
||||
Reference in New Issue
Block a user