mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-31 15:40:40 +00:00
Compare commits
1 Commits
agg_cleanu
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cb474cc0f6 |
2
.github/workflows/scorecard.yml
vendored
2
.github/workflows/scorecard.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
# Upload the results as artifacts.
|
||||
- name: 'Upload artifact'
|
||||
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
|
||||
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
|
||||
@@ -79,12 +79,13 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
||||
register!(group, composite_histogram_calendar);
|
||||
|
||||
register!(group, cardinality_agg);
|
||||
register!(group, cardinality_agg_high_card);
|
||||
register!(group, cardinality_agg_low_card);
|
||||
register!(group, terms_status_with_cardinality_agg);
|
||||
register!(group, terms_100_buckets_with_cardinality_agg);
|
||||
register!(group, terms_many_with_single_term_order_by_card);
|
||||
register!(group, terms_many_with_single_term_2_order_by_card);
|
||||
register!(group, terms_many_with_single_term_order_by_cardinality_agg);
|
||||
register!(
|
||||
group,
|
||||
terms_many_with_nested_terms_double_order_by_cardinality_agg
|
||||
);
|
||||
|
||||
register!(group, range_agg);
|
||||
register!(group, range_agg_with_avg_sub_agg);
|
||||
@@ -172,32 +173,6 @@ fn cardinality_agg(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
// Full-scan cardinality on a near-1M-cardinality string field.
|
||||
// Hits the dense (PagedBitset) path: every doc has a unique term,
|
||||
// so the bucket promotes from FxHashSet shortly into the scan.
|
||||
fn cardinality_agg_high_card(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "text_all_unique_terms"
|
||||
},
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
// Full-scan cardinality on a tiny-cardinality string field (7 distinct
|
||||
// values). Stays on the FxHashSet path — the promotion threshold is
|
||||
// never crossed. Validates no regression on the sparse path.
|
||||
fn cardinality_agg_low_card(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "text_few_terms_status"
|
||||
},
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_status_with_cardinality_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
@@ -230,7 +205,7 @@ fn terms_100_buckets_with_cardinality_agg(index: &Index) {
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_many_with_single_term_order_by_card(index: &Index) {
|
||||
fn terms_many_with_single_term_order_by_cardinality_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_many_terms" },
|
||||
@@ -242,7 +217,7 @@ fn terms_many_with_single_term_order_by_card(index: &Index) {
|
||||
},
|
||||
"aggs": {
|
||||
"cardinality": {
|
||||
"cardinality": { "field": "text_few_terms" }
|
||||
"cardinality": { "field": "text_many_terms" }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -255,20 +230,22 @@ fn terms_many_with_single_term_order_by_card(index: &Index) {
|
||||
// Two-level terms ordered by cardinality at each level: a high-card outer terms
|
||||
// (text_many_terms) ordered by a cardinality sub-agg, with a nested low-card terms
|
||||
// (text_few_terms_status) also ordered by a cardinality sub-agg, plus an avg.
|
||||
fn terms_many_with_single_term_2_order_by_card(index: &Index) {
|
||||
fn terms_many_with_nested_terms_double_order_by_cardinality_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"by_ip": {
|
||||
"terms": {
|
||||
"field": "text_many_terms",
|
||||
"order": { "card_few_terms": "desc" }
|
||||
"size": 50,
|
||||
"order": { "distinct_path": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"card_few_terms": {
|
||||
"distinct_path": {
|
||||
"cardinality": { "field": "text_few_terms" }
|
||||
},
|
||||
"nested_terms": {
|
||||
"by_asn": {
|
||||
"terms": {
|
||||
"field": " single_term",
|
||||
"size": 10,
|
||||
"order": { "distinct_path2": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
|
||||
@@ -20,8 +20,8 @@ use crate::aggregation::metric::{
|
||||
build_segment_stats_collector, AverageAggregation, CardinalityAggReqData,
|
||||
CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation, MaxAggregation,
|
||||
MetricAggReqData, MinAggregation, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
|
||||
SegmentPercentilesCollector, StatsAggregation, StatsType, SumAggregation, TermOrdSet,
|
||||
TopHitsAggReqData, TopHitsSegmentCollector, BITSET_MAX_TERM_ORD,
|
||||
SegmentPercentilesCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData,
|
||||
TopHitsSegmentCollector,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
GenericSegmentAggregationResultsCollector, SegmentAggregationCollector,
|
||||
@@ -41,7 +41,7 @@ pub struct AggregationsSegmentCtx {
|
||||
|
||||
impl AggregationsSegmentCtx {
|
||||
pub(crate) fn push_term_req_data(&mut self, data: TermsAggReqData) -> usize {
|
||||
self.per_request.term_req_data.push(data);
|
||||
self.per_request.term_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.term_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_cardinality_req_data(&mut self, data: CardinalityAggReqData) -> usize {
|
||||
@@ -61,25 +61,31 @@ impl AggregationsSegmentCtx {
|
||||
self.per_request.missing_term_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_histogram_req_data(&mut self, data: HistogramAggReqData) -> usize {
|
||||
self.per_request.histogram_req_data.push(data);
|
||||
self.per_request
|
||||
.histogram_req_data
|
||||
.push(Some(Box::new(data)));
|
||||
self.per_request.histogram_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_range_req_data(&mut self, data: RangeAggReqData) -> usize {
|
||||
self.per_request.range_req_data.push(data);
|
||||
self.per_request.range_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.range_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_filter_req_data(&mut self, data: FilterAggReqData) -> usize {
|
||||
self.per_request.filter_req_data.push(data);
|
||||
self.per_request.filter_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.filter_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_composite_req_data(&mut self, data: CompositeAggReqData) -> usize {
|
||||
self.per_request.composite_req_data.push(data);
|
||||
self.per_request
|
||||
.composite_req_data
|
||||
.push(Some(Box::new(data)));
|
||||
self.per_request.composite_req_data.len() - 1
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_term_req_data(&self, idx: usize) -> &TermsAggReqData {
|
||||
&self.per_request.term_req_data[idx]
|
||||
self.per_request.term_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("term_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_cardinality_req_data(&self, idx: usize) -> &CardinalityAggReqData {
|
||||
@@ -97,6 +103,116 @@ impl AggregationsSegmentCtx {
|
||||
pub(crate) fn get_missing_term_req_data(&self, idx: usize) -> &MissingTermAggReqData {
|
||||
&self.per_request.missing_term_req_data[idx]
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_histogram_req_data(&self, idx: usize) -> &HistogramAggReqData {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_range_req_data(&self, idx: usize) -> &RangeAggReqData {
|
||||
self.per_request.range_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_composite_req_data(&self, idx: usize) -> &CompositeAggReqData {
|
||||
self.per_request.composite_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
// ---------- mutable getters ----------
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
|
||||
&mut self.per_request.stats_metric_req_data[idx]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_cardinality_req_data_mut(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
) -> &mut CardinalityAggReqData {
|
||||
&mut self.per_request.cardinality_req_data[idx]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.as_deref_mut()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
// ---------- take / put (terms, histogram, range) ----------
|
||||
|
||||
/// Move out the boxed Histogram request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.take()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Histogram request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_histogram_req_data(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
value: Box<HistogramAggReqData>,
|
||||
) {
|
||||
debug_assert!(self.per_request.histogram_req_data[idx].is_none());
|
||||
self.per_request.histogram_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the boxed Range request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_range_req_data(&mut self, idx: usize) -> Box<RangeAggReqData> {
|
||||
self.per_request.range_req_data[idx]
|
||||
.take()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Range request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_range_req_data(&mut self, idx: usize, value: Box<RangeAggReqData>) {
|
||||
debug_assert!(self.per_request.range_req_data[idx].is_none());
|
||||
self.per_request.range_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the boxed Filter request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_filter_req_data(&mut self, idx: usize) -> Box<FilterAggReqData> {
|
||||
self.per_request.filter_req_data[idx]
|
||||
.take()
|
||||
.expect("filter_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Filter request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_filter_req_data(&mut self, idx: usize, value: Box<FilterAggReqData>) {
|
||||
debug_assert!(self.per_request.filter_req_data[idx].is_none());
|
||||
self.per_request.filter_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the Composite request at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn take_composite_req_data(&mut self, idx: usize) -> Box<CompositeAggReqData> {
|
||||
self.per_request.composite_req_data[idx]
|
||||
.take()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Composite request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_composite_req_data(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
value: Box<CompositeAggReqData>,
|
||||
) {
|
||||
debug_assert!(self.per_request.composite_req_data[idx].is_none());
|
||||
self.per_request.composite_req_data[idx] = Some(value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Each type of aggregation has its own request data struct. This struct holds
|
||||
@@ -107,14 +223,15 @@ impl AggregationsSegmentCtx {
|
||||
/// for a node with [AggKind::Terms]).
|
||||
#[derive(Default)]
|
||||
pub struct PerRequestAggSegCtx {
|
||||
// Box for cheap take/put - Only necessary for bucket aggs that have sub-aggregations
|
||||
/// TermsAggReqData contains the request data for a terms aggregation.
|
||||
pub term_req_data: Vec<TermsAggReqData>,
|
||||
pub term_req_data: Vec<Option<Box<TermsAggReqData>>>,
|
||||
/// HistogramAggReqData contains the request data for a histogram aggregation.
|
||||
pub histogram_req_data: Vec<HistogramAggReqData>,
|
||||
pub histogram_req_data: Vec<Option<Box<HistogramAggReqData>>>,
|
||||
/// RangeAggReqData contains the request data for a range aggregation.
|
||||
pub range_req_data: Vec<RangeAggReqData>,
|
||||
pub range_req_data: Vec<Option<Box<RangeAggReqData>>>,
|
||||
/// FilterAggReqData contains the request data for a filter aggregation.
|
||||
pub filter_req_data: Vec<FilterAggReqData>,
|
||||
pub filter_req_data: Vec<Option<Box<FilterAggReqData>>>,
|
||||
/// Shared by avg, min, max, sum, stats, extended_stats, count
|
||||
pub stats_metric_req_data: Vec<MetricAggReqData>,
|
||||
/// CardinalityAggReqData contains the request data for a cardinality aggregation.
|
||||
@@ -124,7 +241,7 @@ pub struct PerRequestAggSegCtx {
|
||||
/// MissingTermAggReqData contains the request data for a missing term aggregation.
|
||||
pub missing_term_req_data: Vec<MissingTermAggReqData>,
|
||||
/// CompositeAggReqData contains the request data for a composite aggregation.
|
||||
pub composite_req_data: Vec<CompositeAggReqData>,
|
||||
pub composite_req_data: Vec<Option<Box<CompositeAggReqData>>>,
|
||||
|
||||
/// Request tree used to build collectors.
|
||||
pub agg_tree: Vec<AggRefNode>,
|
||||
@@ -135,22 +252,22 @@ impl PerRequestAggSegCtx {
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
self.term_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.histogram_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.range_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.filter_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.stats_metric_req_data
|
||||
@@ -175,7 +292,7 @@ impl PerRequestAggSegCtx {
|
||||
+ self
|
||||
.composite_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().map(|d| d.get_memory_consumption()).unwrap_or(0))
|
||||
.sum::<usize>()
|
||||
+ self.agg_tree.len() * std::mem::size_of::<AggRefNode>()
|
||||
}
|
||||
@@ -184,16 +301,40 @@ impl PerRequestAggSegCtx {
|
||||
let idx = node.idx_in_req_data;
|
||||
let kind = node.kind;
|
||||
match kind {
|
||||
AggKind::Terms => self.term_req_data[idx].name.as_str(),
|
||||
AggKind::Terms => self.term_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("term_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Cardinality => &self.cardinality_req_data[idx].name,
|
||||
AggKind::StatsKind(_) => &self.stats_metric_req_data[idx].name,
|
||||
AggKind::TopHits => &self.top_hits_req_data[idx].name,
|
||||
AggKind::MissingTerm => &self.missing_term_req_data[idx].name,
|
||||
AggKind::Histogram => self.histogram_req_data[idx].name.as_str(),
|
||||
AggKind::DateHistogram => self.histogram_req_data[idx].name.as_str(),
|
||||
AggKind::Range => self.range_req_data[idx].name.as_str(),
|
||||
AggKind::Filter => self.filter_req_data[idx].name.as_str(),
|
||||
AggKind::Composite => self.composite_req_data[idx].name.as_str(),
|
||||
AggKind::Histogram => self.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::DateHistogram => self.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Range => self.range_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Filter => self.filter_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("filter_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Composite => self.composite_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,39 +412,13 @@ pub(crate) fn build_segment_agg_collector(
|
||||
Ok(Box::new(TermMissingAgg::new(req, node)?))
|
||||
}
|
||||
AggKind::Cardinality => {
|
||||
let req_data = req.get_cardinality_req_data(node.idx_in_req_data);
|
||||
// For str columns, choose the per-bucket entries representation
|
||||
// based on the segment's column.max_value():
|
||||
// * small (< BITSET_MAX_TERM_ORD): `BitSet`, pre-allocated, no promotion machinery.
|
||||
// * large: `TermOrdSet` (sparse FxHashSet that promotes to a paged bitset).
|
||||
// For non-str columns the `entries` field is unused (values go
|
||||
// straight into the HLL sketch); we still pick `TermOrdSet`
|
||||
// because its empty Sparse(FxHashSet) costs nothing.
|
||||
let is_str = req_data.column_type == ColumnType::Str;
|
||||
let max_term_ord_inclusive = if is_str {
|
||||
req_data.accessor.max_value()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let collector: Box<dyn SegmentAggregationCollector> =
|
||||
if is_str && max_term_ord_inclusive < BITSET_MAX_TERM_ORD {
|
||||
Box::new(SegmentCardinalityCollector::<BitSet>::from_req(
|
||||
req_data.column_type,
|
||||
node.idx_in_req_data,
|
||||
req_data.accessor.clone(),
|
||||
req_data.missing_value_for_accessor,
|
||||
max_term_ord_inclusive,
|
||||
))
|
||||
} else {
|
||||
Box::new(SegmentCardinalityCollector::<TermOrdSet>::from_req(
|
||||
req_data.column_type,
|
||||
node.idx_in_req_data,
|
||||
req_data.accessor.clone(),
|
||||
req_data.missing_value_for_accessor,
|
||||
max_term_ord_inclusive,
|
||||
))
|
||||
};
|
||||
Ok(collector)
|
||||
let req_data = &mut req.get_cardinality_req_data_mut(node.idx_in_req_data);
|
||||
Ok(Box::new(SegmentCardinalityCollector::from_req(
|
||||
req_data.column_type,
|
||||
node.idx_in_req_data,
|
||||
req_data.accessor.clone(),
|
||||
req_data.missing_value_for_accessor,
|
||||
)))
|
||||
}
|
||||
AggKind::StatsKind(stats_type) => {
|
||||
let req_data = &mut req.per_request.stats_metric_req_data[node.idx_in_req_data];
|
||||
@@ -318,7 +433,7 @@ pub(crate) fn build_segment_agg_collector(
|
||||
SegmentExtendedStatsCollector::from_req(req_data, sigma),
|
||||
)),
|
||||
StatsType::Percentiles => {
|
||||
let req_data = req.get_metric_req_data(node.idx_in_req_data);
|
||||
let req_data = req.get_metric_req_data_mut(node.idx_in_req_data);
|
||||
Ok(Box::new(
|
||||
SegmentPercentilesCollector::from_req_and_validate(
|
||||
req_data.field_type,
|
||||
@@ -658,18 +773,23 @@ fn build_nodes(
|
||||
let schema = reader.schema();
|
||||
let tokenizers = &data.context.tokenizers;
|
||||
let query = filter_req.parse_query(schema, tokenizers)?;
|
||||
let evaluator =
|
||||
std::rc::Rc::new(crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
||||
query,
|
||||
schema.clone(),
|
||||
reader,
|
||||
)?);
|
||||
let evaluator = crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
||||
query,
|
||||
schema.clone(),
|
||||
reader,
|
||||
)?;
|
||||
|
||||
// Pre-allocate buffer for batch filtering
|
||||
let max_doc = reader.max_doc();
|
||||
let buffer_capacity = crate::docset::COLLECT_BLOCK_BUFFER_LEN.min(max_doc as usize);
|
||||
let matching_docs_buffer = Vec::with_capacity(buffer_capacity);
|
||||
|
||||
let idx_in_req_data = data.push_filter_req_data(FilterAggReqData {
|
||||
name: agg_name.to_string(),
|
||||
req: filter_req.clone(),
|
||||
segment_reader: reader.clone(),
|
||||
evaluator,
|
||||
matching_docs_buffer,
|
||||
is_top_level,
|
||||
});
|
||||
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
||||
@@ -886,20 +1006,10 @@ fn build_terms_or_cardinality_nodes(
|
||||
(idx_in_req_data, AggKind::Terms)
|
||||
}
|
||||
TermsOrCardinalityRequest::Cardinality(ref req) => {
|
||||
// `str_dict_column` is computed once per field; for JSON paths
|
||||
// with mixed types it's `Some` even on the numeric req_data.
|
||||
// Cardinality only consults it for the str column path, so
|
||||
// gate by column_type to avoid driving non-str collectors
|
||||
// through the coupon-cache path.
|
||||
let str_dict_column_for_req = if column_type == ColumnType::Str {
|
||||
str_dict_column.clone()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let idx_in_req_data = data.push_cardinality_req_data(CardinalityAggReqData {
|
||||
accessor,
|
||||
column_type,
|
||||
str_dict_column: str_dict_column_for_req,
|
||||
str_dict_column: str_dict_column.clone(),
|
||||
missing_value_for_accessor,
|
||||
name: agg_name.to_string(),
|
||||
req: req.clone(),
|
||||
|
||||
@@ -16,7 +16,6 @@ use crate::{SegmentReader, TantivyError};
|
||||
|
||||
/// Contains all information required by the SegmentCompositeCollector to perform the
|
||||
/// composite aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeAggReqData {
|
||||
/// The name of the aggregation.
|
||||
pub name: String,
|
||||
@@ -35,7 +34,6 @@ impl CompositeAggReqData {
|
||||
}
|
||||
|
||||
/// Accessors for a single column in a composite source.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeAccessor {
|
||||
/// The fast field column
|
||||
pub column: Column<u64>,
|
||||
@@ -50,7 +48,6 @@ pub struct CompositeAccessor {
|
||||
}
|
||||
|
||||
/// Accessors to all the columns that belong to the field of a composite source.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeSourceAccessors {
|
||||
/// The accessors for this source
|
||||
pub accessors: Vec<CompositeAccessor>,
|
||||
@@ -361,7 +358,7 @@ impl PrecomputedDateInterval {
|
||||
///
|
||||
/// Some column types (term, IP) might not have an exact representation of the
|
||||
/// specified after key
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug)]
|
||||
pub enum PrecomputedAfterKey {
|
||||
/// The after key could be exactly represented in the column space.
|
||||
Exact(u64),
|
||||
|
||||
@@ -118,7 +118,7 @@ impl InternalValueRepr {
|
||||
pub struct SegmentCompositeCollector {
|
||||
/// One DynArrayHeapMap per parent bucket.
|
||||
parent_buckets: Vec<DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>>,
|
||||
req_data: CompositeAggReqData,
|
||||
accessor_idx: usize,
|
||||
sub_agg: Option<BufferedSubAggs<HighCardSubAggBuffer>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Number of sources, needed when creating new DynArrayHeapMaps.
|
||||
@@ -132,7 +132,10 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data
|
||||
.get_composite_req_data(self.accessor_idx)
|
||||
.name
|
||||
.clone();
|
||||
|
||||
let buckets = self.add_intermediate_bucket_result(agg_data, parent_bucket_id)?;
|
||||
results.push(
|
||||
@@ -150,11 +153,12 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let composite_agg_data = agg_data.take_composite_req_data(self.accessor_idx);
|
||||
|
||||
for doc in docs {
|
||||
let mut visitor = CompositeKeyVisitor {
|
||||
doc_id: *doc,
|
||||
composite_agg_data: &self.req_data,
|
||||
composite_agg_data: &composite_agg_data,
|
||||
buckets: &mut self.parent_buckets[parent_bucket_id as usize],
|
||||
sub_agg: &mut self.sub_agg,
|
||||
bucket_id_provider: &mut self.bucket_id_provider,
|
||||
@@ -162,6 +166,7 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
};
|
||||
visitor.visit(0, true)?;
|
||||
}
|
||||
agg_data.put_back_composite_req_data(self.accessor_idx, composite_agg_data);
|
||||
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
@@ -216,13 +221,7 @@ impl SegmentCompositeCollector {
|
||||
req_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Self> {
|
||||
let composite_req_data =
|
||||
req_data.per_request.composite_req_data[node.idx_in_req_data].clone();
|
||||
validate_req(&composite_req_data)?;
|
||||
req_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(composite_req_data.get_memory_consumption() as u64)?;
|
||||
validate_req(req_data, node.idx_in_req_data)?;
|
||||
|
||||
let has_sub_aggregations = !node.children.is_empty();
|
||||
let sub_agg = if has_sub_aggregations {
|
||||
@@ -232,11 +231,12 @@ impl SegmentCompositeCollector {
|
||||
None
|
||||
};
|
||||
|
||||
let composite_req_data = req_data.get_composite_req_data(node.idx_in_req_data);
|
||||
let num_sources = composite_req_data.req.sources.len();
|
||||
|
||||
Ok(SegmentCompositeCollector {
|
||||
parent_buckets: vec![DynArrayHeapMap::try_new(num_sources)?],
|
||||
req_data: composite_req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
sub_agg,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
num_sources,
|
||||
@@ -258,7 +258,7 @@ impl SegmentCompositeCollector {
|
||||
let mut dict: FxHashMap<Vec<CompositeIntermediateKey>, IntermediateCompositeBucketEntry> =
|
||||
Default::default();
|
||||
dict.reserve(heap_map.size());
|
||||
let composite_data = &self.req_data;
|
||||
let composite_data = agg_data.get_composite_req_data(self.accessor_idx);
|
||||
for (key_internal_repr, agg) in heap_map.into_iter() {
|
||||
let key = resolve_key(&key_internal_repr, composite_data)?;
|
||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||
@@ -298,7 +298,8 @@ impl SegmentCompositeCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_req(composite_data: &CompositeAggReqData) -> crate::Result<()> {
|
||||
fn validate_req(req_data: &mut AggregationsSegmentCtx, accessor_idx: usize) -> crate::Result<()> {
|
||||
let composite_data = req_data.get_composite_req_data(accessor_idx);
|
||||
let req = &composite_data.req;
|
||||
if req.sources.is_empty() {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::fmt::Debug;
|
||||
use std::rc::Rc;
|
||||
|
||||
use common::BitSet;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
@@ -397,7 +396,6 @@ impl PartialEq for FilterAggregation {
|
||||
|
||||
/// Request data for filter aggregation
|
||||
/// This struct holds the per-segment data needed to execute a filter aggregation
|
||||
#[derive(Clone)]
|
||||
pub struct FilterAggReqData {
|
||||
/// The name of the filter aggregation
|
||||
pub name: String,
|
||||
@@ -405,20 +403,22 @@ pub struct FilterAggReqData {
|
||||
pub req: FilterAggregation,
|
||||
/// The segment reader
|
||||
pub segment_reader: SegmentReader,
|
||||
/// Document evaluator for the filter query (precomputed BitSet).
|
||||
/// Wrapped in `Rc` so cloning the request data does not duplicate the (potentially large)
|
||||
/// underlying BitSet.
|
||||
pub evaluator: Rc<DocumentQueryEvaluator>,
|
||||
/// Document evaluator for the filter query (precomputed BitSet)
|
||||
/// This is built once when the request data is created
|
||||
pub evaluator: DocumentQueryEvaluator,
|
||||
/// Reusable buffer for matching documents to minimize allocations during collection
|
||||
pub matching_docs_buffer: Vec<DocId>,
|
||||
/// True if this filter aggregation is at the top level of the aggregation tree (not nested).
|
||||
pub is_top_level: bool,
|
||||
}
|
||||
|
||||
impl FilterAggReqData {
|
||||
pub(crate) fn get_memory_consumption(&self) -> usize {
|
||||
// Estimate: name + segment reader reference + bitset
|
||||
// Estimate: name + segment reader reference + bitset + buffer capacity
|
||||
self.name.len()
|
||||
+ std::mem::size_of::<SegmentReader>()
|
||||
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
|
||||
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
|
||||
+ std::mem::size_of::<bool>()
|
||||
}
|
||||
}
|
||||
@@ -509,10 +509,8 @@ pub struct SegmentFilterCollector<B: SubAggBuffer> {
|
||||
/// Sub-aggregation collectors
|
||||
sub_aggregations: Option<BufferedSubAggs<B>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Per-segment filter request data, owned by this collector.
|
||||
req_data: FilterAggReqData,
|
||||
/// Reusable buffer for matching documents to minimize allocations during collection.
|
||||
matching_docs_buffer: Vec<DocId>,
|
||||
/// Accessor index for this filter aggregation (to access FilterAggReqData)
|
||||
accessor_idx: usize,
|
||||
}
|
||||
|
||||
impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
@@ -520,7 +518,6 @@ impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
req_data: FilterAggReqData,
|
||||
) -> crate::Result<Self> {
|
||||
// Build sub-aggregation collectors if any
|
||||
let sub_agg_collector = if !node.children.is_empty() {
|
||||
@@ -530,15 +527,11 @@ impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
};
|
||||
let sub_agg_collector = sub_agg_collector.map(BufferedSubAggs::new);
|
||||
|
||||
let max_doc = req_data.segment_reader.max_doc();
|
||||
let buffer_capacity = crate::docset::COLLECT_BLOCK_BUFFER_LEN.min(max_doc as usize);
|
||||
|
||||
Ok(SegmentFilterCollector {
|
||||
parent_buckets: Vec::new(),
|
||||
sub_aggregations: sub_agg_collector,
|
||||
req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
matching_docs_buffer: Vec::with_capacity(buffer_capacity),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -547,23 +540,18 @@ pub(crate) fn build_segment_filter_collector(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
let req_data = req.per_request.filter_req_data[node.idx_in_req_data].clone();
|
||||
req.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let is_top_level = req_data.is_top_level;
|
||||
let is_top_level = req.per_request.filter_req_data[node.idx_in_req_data]
|
||||
.as_ref()
|
||||
.expect("filter_req_data slot is empty")
|
||||
.is_top_level;
|
||||
|
||||
if is_top_level {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<LowCardSubAggBuffer>::from_req_and_validate(
|
||||
req, node, req_data,
|
||||
)?,
|
||||
SegmentFilterCollector::<LowCardSubAggBuffer>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
} else {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<HighCardSubAggBuffer>::from_req_and_validate(
|
||||
req, node, req_data,
|
||||
)?,
|
||||
SegmentFilterCollector::<HighCardSubAggBuffer>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -573,7 +561,7 @@ impl<B: SubAggBuffer> Debug for SegmentFilterCollector<B> {
|
||||
f.debug_struct("SegmentFilterCollector")
|
||||
.field("buckets", &self.parent_buckets)
|
||||
.field("has_sub_aggs", &self.sub_aggregations.is_some())
|
||||
.field("name", &self.req_data.name)
|
||||
.field("accessor_idx", &self.accessor_idx)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -610,7 +598,11 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B>
|
||||
};
|
||||
|
||||
// Get the name of this filter aggregation
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data.per_request.filter_req_data[self.accessor_idx]
|
||||
.as_ref()
|
||||
.expect("filter_req_data slot is empty")
|
||||
.name
|
||||
.clone();
|
||||
|
||||
results.push(
|
||||
name,
|
||||
@@ -631,24 +623,27 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B>
|
||||
}
|
||||
|
||||
let mut bucket = self.parent_buckets[parent_bucket_id as usize];
|
||||
// Take the request data to avoid borrow checker issues with sub-aggregations
|
||||
let mut req = agg_data.take_filter_req_data(self.accessor_idx);
|
||||
|
||||
// Use batch filtering with O(1) BitSet lookups
|
||||
self.matching_docs_buffer.clear();
|
||||
self.req_data
|
||||
.evaluator
|
||||
.filter_batch(docs, &mut self.matching_docs_buffer);
|
||||
req.matching_docs_buffer.clear();
|
||||
req.evaluator
|
||||
.filter_batch(docs, &mut req.matching_docs_buffer);
|
||||
|
||||
bucket.doc_count += self.matching_docs_buffer.len() as u64;
|
||||
bucket.doc_count += req.matching_docs_buffer.len() as u64;
|
||||
|
||||
// Batch process sub-aggregations if we have matches
|
||||
if !self.matching_docs_buffer.is_empty() {
|
||||
if !req.matching_docs_buffer.is_empty() {
|
||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||
for &doc_id in &self.matching_docs_buffer {
|
||||
for &doc_id in &req.matching_docs_buffer {
|
||||
sub_aggs.push(bucket.bucket_id, doc_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Put the request data back
|
||||
agg_data.put_back_filter_req_data(self.accessor_idx, req);
|
||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||
sub_aggs.check_flush_local(agg_data)?;
|
||||
}
|
||||
|
||||
@@ -21,7 +21,6 @@ use crate::TantivyError;
|
||||
|
||||
/// Contains all information required by the SegmentHistogramCollector to perform the
|
||||
/// histogram or date_histogram aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HistogramAggReqData {
|
||||
/// The column accessor to access the fast field values.
|
||||
pub accessor: Column<u64>,
|
||||
@@ -298,7 +297,7 @@ pub struct SegmentHistogramCollector {
|
||||
/// One Histogram bucket per parent bucket id.
|
||||
parent_buckets: Vec<HistogramBuckets>,
|
||||
sub_agg: Option<HighCardBufferedSubAggs>,
|
||||
req_data: HistogramAggReqData,
|
||||
accessor_idx: usize,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
}
|
||||
|
||||
@@ -309,7 +308,10 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data
|
||||
.get_histogram_req_data(self.accessor_idx)
|
||||
.name
|
||||
.clone();
|
||||
// TODO: avoid prepare_max_bucket here and handle empty buckets.
|
||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
||||
let histogram = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
||||
@@ -326,10 +328,10 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let req = agg_data.take_histogram_req_data(self.accessor_idx);
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize].buckets;
|
||||
|
||||
let req = &self.req_data;
|
||||
let bounds = req.bounds;
|
||||
let interval = req.req.interval;
|
||||
let offset = req.offset;
|
||||
@@ -359,6 +361,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
agg_data.put_back_histogram_req_data(self.accessor_idx, req);
|
||||
|
||||
let mem_delta = self.get_memory_consumption(parent_bucket_id) - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
@@ -424,7 +427,10 @@ impl SegmentHistogramCollector {
|
||||
}
|
||||
buckets.sort_unstable_by(|b1, b2| b1.key.total_cmp(&b2.key));
|
||||
|
||||
let is_date_agg = self.req_data.field_type == ColumnType::DateTime;
|
||||
let is_date_agg = agg_data
|
||||
.get_histogram_req_data(self.accessor_idx)
|
||||
.field_type
|
||||
== ColumnType::DateTime;
|
||||
Ok(IntermediateBucketResult::Histogram {
|
||||
buckets,
|
||||
is_date_agg,
|
||||
@@ -440,7 +446,7 @@ impl SegmentHistogramCollector {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut req_data = agg_data.per_request.histogram_req_data[node.idx_in_req_data].clone();
|
||||
let req_data = agg_data.get_histogram_req_data_mut(node.idx_in_req_data);
|
||||
req_data.req.validate()?;
|
||||
if req_data.field_type == ColumnType::DateTime && !req_data.is_date_histogram {
|
||||
req_data.req.normalize_date_time();
|
||||
@@ -450,16 +456,12 @@ impl SegmentHistogramCollector {
|
||||
max: f64::MAX,
|
||||
});
|
||||
req_data.offset = req_data.req.offset.unwrap_or(0.0);
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let sub_agg = sub_agg.map(BufferedSubAggs::new);
|
||||
|
||||
Ok(Self {
|
||||
parent_buckets: Default::default(),
|
||||
sub_agg,
|
||||
req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -23,7 +23,6 @@ use crate::TantivyError;
|
||||
|
||||
/// Contains all information required by the SegmentRangeCollector to perform the
|
||||
/// range aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RangeAggReqData {
|
||||
/// The column accessor to access the fast field values.
|
||||
pub accessor: Column<u64>,
|
||||
@@ -162,7 +161,7 @@ pub struct SegmentRangeCollector<B: SubAggBuffer> {
|
||||
/// One for each ParentBucketId
|
||||
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
|
||||
column_type: ColumnType,
|
||||
pub(crate) req_data: RangeAggReqData,
|
||||
pub(crate) accessor_idx: usize,
|
||||
sub_agg: Option<BufferedSubAggs<B>>,
|
||||
/// Here things get a bit weird. We need to assign unique bucket ids across all
|
||||
/// parent buckets. So we keep track of the next available bucket id here.
|
||||
@@ -185,7 +184,7 @@ impl<B: SubAggBuffer> Debug for SegmentRangeCollector<B> {
|
||||
f.debug_struct("SegmentRangeCollector")
|
||||
.field("parent_buckets_len", &self.parent_buckets.len())
|
||||
.field("column_type", &self.column_type)
|
||||
.field("name", &self.req_data.name)
|
||||
.field("accessor_idx", &self.accessor_idx)
|
||||
.field("has_sub_agg", &self.sub_agg.is_some())
|
||||
.finish()
|
||||
}
|
||||
@@ -240,7 +239,10 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
) -> crate::Result<()> {
|
||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
||||
let field_type = self.column_type;
|
||||
let name = self.req_data.name.to_string();
|
||||
let name = agg_data
|
||||
.get_range_req_data(self.accessor_idx)
|
||||
.name
|
||||
.to_string();
|
||||
|
||||
let buckets = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
||||
|
||||
@@ -279,15 +281,17 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let req = agg_data.take_range_req_data(self.accessor_idx);
|
||||
|
||||
agg_data
|
||||
.column_block_accessor
|
||||
.fetch_block(docs, &self.req_data.accessor);
|
||||
.fetch_block(docs, &req.accessor);
|
||||
|
||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize];
|
||||
|
||||
for (doc, val) in agg_data
|
||||
.column_block_accessor
|
||||
.iter_docid_vals(docs, &self.req_data.accessor)
|
||||
.iter_docid_vals(docs, &req.accessor)
|
||||
{
|
||||
let bucket_pos = get_bucket_pos(val, buckets);
|
||||
let bucket = &mut buckets[bucket_pos];
|
||||
@@ -297,6 +301,7 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
}
|
||||
}
|
||||
|
||||
agg_data.put_back_range_req_data(self.accessor_idx, req);
|
||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
@@ -314,10 +319,10 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
fn prepare_max_bucket(
|
||||
&mut self,
|
||||
max_bucket: BucketId,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
while self.parent_buckets.len() <= max_bucket as usize {
|
||||
let new_buckets = self.create_new_buckets()?;
|
||||
let new_buckets = self.create_new_buckets(agg_data)?;
|
||||
self.parent_buckets.push(new_buckets);
|
||||
}
|
||||
|
||||
@@ -341,11 +346,8 @@ pub(crate) fn build_segment_range_collector(
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
let req_data = agg_data.per_request.range_req_data[node.idx_in_req_data].clone();
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let accessor_idx = node.idx_in_req_data;
|
||||
let req_data = agg_data.get_range_req_data(node.idx_in_req_data);
|
||||
let field_type = req_data.field_type;
|
||||
|
||||
// TODO: A better metric instead of is_top_level would be the number of buckets expected.
|
||||
@@ -363,7 +365,7 @@ pub(crate) fn build_segment_range_collector(
|
||||
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggBuffer> {
|
||||
sub_agg: sub_agg.map(LowCardBufferedSubAggs::new),
|
||||
column_type: field_type,
|
||||
req_data,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
limits: agg_data.context.limits.clone(),
|
||||
@@ -372,7 +374,7 @@ pub(crate) fn build_segment_range_collector(
|
||||
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggBuffer> {
|
||||
sub_agg: sub_agg.map(BufferedSubAggs::new),
|
||||
column_type: field_type,
|
||||
req_data,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
limits: agg_data.context.limits.clone(),
|
||||
@@ -381,9 +383,12 @@ pub(crate) fn build_segment_range_collector(
|
||||
}
|
||||
|
||||
impl<B: SubAggBuffer> SegmentRangeCollector<B> {
|
||||
pub(crate) fn create_new_buckets(&mut self) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
|
||||
pub(crate) fn create_new_buckets(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
|
||||
let field_type = self.column_type;
|
||||
let req_data = &self.req_data;
|
||||
let req_data = agg_data.get_range_req_data(self.accessor_idx);
|
||||
// The range input on the request is f64.
|
||||
// We need to convert to u64 ranges, because we read the values as u64.
|
||||
// The mapping from the conversion is monotonic so ordering is preserved.
|
||||
@@ -558,16 +563,17 @@ mod tests {
|
||||
get_test_index_with_num_docs,
|
||||
};
|
||||
|
||||
pub fn build_test_buckets(
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
field_type: ColumnType,
|
||||
) -> Vec<SegmentRangeAndBucketEntry> {
|
||||
) -> SegmentRangeCollector<HighCardSubAggBuffer> {
|
||||
let req = RangeAggregation {
|
||||
field: "dummy".to_string(),
|
||||
ranges,
|
||||
..Default::default()
|
||||
};
|
||||
extend_validate_ranges(&req.ranges, &field_type)
|
||||
// Build buckets directly as in from_req_and_validate without AggregationsData
|
||||
let buckets: Vec<_> = extend_validate_ranges(&req.ranges, &field_type)
|
||||
.expect("unexpected error in extend_validate_ranges")
|
||||
.iter()
|
||||
.map(|range| {
|
||||
@@ -598,7 +604,16 @@ mod tests {
|
||||
},
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
.collect();
|
||||
|
||||
SegmentRangeCollector {
|
||||
parent_buckets: vec![buckets],
|
||||
column_type: field_type,
|
||||
accessor_idx: 0,
|
||||
sub_agg: None,
|
||||
bucket_id_provider: Default::default(),
|
||||
limits: AggregationLimitsGuard::default(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -842,9 +857,9 @@ mod tests {
|
||||
#[test]
|
||||
fn bucket_test_extend_range_hole() {
|
||||
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
let parent_buckets = vec![build_test_buckets(buckets, ColumnType::F64)];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
@@ -865,9 +880,9 @@ mod tests {
|
||||
(10f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
];
|
||||
let parent_buckets = vec![build_test_buckets(buckets, ColumnType::F64)];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
@@ -880,18 +895,18 @@ mod tests {
|
||||
#[test]
|
||||
fn bucket_range_test_negative_vals() {
|
||||
let buckets = vec![(-10f64..-1f64).into()];
|
||||
let parent_buckets = vec![build_test_buckets(buckets, ColumnType::F64)];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
|
||||
}
|
||||
#[test]
|
||||
fn bucket_range_test_positive_vals() {
|
||||
let buckets = vec![(0f64..10f64).into()];
|
||||
let parent_buckets = vec![build_test_buckets(buckets, ColumnType::F64)];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
|
||||
}
|
||||
@@ -899,8 +914,8 @@ mod tests {
|
||||
#[test]
|
||||
fn range_binary_search_test_u64() {
|
||||
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
|
||||
let parent_buckets = vec![build_test_buckets(ranges, ColumnType::U64)];
|
||||
let search = |val: u64| get_bucket_pos(val, &parent_buckets[0]);
|
||||
let collector = get_collector_from_ranges(ranges, ColumnType::U64);
|
||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9), 0);
|
||||
@@ -945,8 +960,8 @@ mod tests {
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
|
||||
let parent_buckets = vec![build_test_buckets(ranges, ColumnType::F64)];
|
||||
let search = |val: u64| get_bucket_pos(val, &parent_buckets[0]);
|
||||
let collector = get_collector_from_ranges(ranges, ColumnType::F64);
|
||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9f64.to_u64()), 0);
|
||||
|
||||
@@ -4,7 +4,6 @@ use std::io;
|
||||
|
||||
use columnar::column_values::CompactSpaceU64Accessor;
|
||||
use columnar::{Column, ColumnType, Dictionary, StrColumn};
|
||||
use common::{BitSet, TinySet};
|
||||
use datasketches::hll::{Coupon, HllSketch, HllType, HllUnion};
|
||||
use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet};
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
@@ -21,12 +20,6 @@ use crate::TantivyError;
|
||||
/// 2^11 = 2048 registers, giving ~2.3% relative error and ~1KB per sketch (Hll4).
|
||||
const LG_K: u8 = 11;
|
||||
|
||||
/// Promote FxHashSet<u64> -> PagedBitset at ~3% density (`len * 32 >
|
||||
/// dict_num_terms`). Past this point the bitset (~`dict_num_terms / 7.5`
|
||||
/// bytes) is smaller than the hashset (~10 B/entry minimum) and avoids
|
||||
/// the per-insert hash.
|
||||
const PROMOTION_RATIO: u64 = 32;
|
||||
|
||||
/// # Cardinality
|
||||
///
|
||||
/// The cardinality aggregation allows for computing an estimate
|
||||
@@ -184,263 +177,9 @@ impl CouponCache {
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// PagedBitset: a sparse bitset indexed by term_ord.
|
||||
//
|
||||
// Used as the dense alternative to FxHashSet<u64> once a string
|
||||
// cardinality bucket has accumulated enough unique term ordinals.
|
||||
// Memory is bounded to (touched pages) * (page bytes), not
|
||||
// (max_term_ord / 8).
|
||||
//
|
||||
// Page geometry mirrors `PagedTermMap` in `term_agg.rs`: 1024 ords
|
||||
// per page, lazy `Vec<Option<Box<Page>>>` directory.
|
||||
// =================================================================
|
||||
const BITSET_PAGE_SHIFT: u32 = 10;
|
||||
const BITSET_PAGE_BITS: u64 = 1u64 << BITSET_PAGE_SHIFT; // 1024
|
||||
const BITSET_PAGE_MASK: u64 = BITSET_PAGE_BITS - 1;
|
||||
const BITSET_WORDS_PER_PAGE: usize = (BITSET_PAGE_BITS / 64) as usize; // 16
|
||||
|
||||
#[derive(Clone)]
|
||||
struct PagedBitsetPage {
|
||||
words: [TinySet; BITSET_WORDS_PER_PAGE],
|
||||
}
|
||||
|
||||
impl PagedBitsetPage {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
words: [TinySet::empty(); BITSET_WORDS_PER_PAGE],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct PagedBitset {
|
||||
pages: Vec<Option<Box<PagedBitsetPage>>>,
|
||||
/// Cached number of set bits, maintained on insert.
|
||||
count: u64,
|
||||
}
|
||||
|
||||
impl PagedBitset {
|
||||
/// Allocates a directory big enough to hold ords up to and including
|
||||
/// `max_term_ord`. Pages are allocated lazily on first set.
|
||||
fn with_max_term_ord(max_term_ord: u64) -> Self {
|
||||
let max_page_idx = (max_term_ord >> BITSET_PAGE_SHIFT) as usize;
|
||||
let num_pages = max_page_idx + 1;
|
||||
Self {
|
||||
pages: vec![None; num_pages],
|
||||
count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn insert(&mut self, term_ord: u64) {
|
||||
let page_idx = (term_ord >> BITSET_PAGE_SHIFT) as usize;
|
||||
let intra = term_ord & BITSET_PAGE_MASK;
|
||||
let word_idx = (intra >> 6) as usize;
|
||||
let bit_idx = (intra & 63) as u32;
|
||||
|
||||
let page = match &mut self.pages[page_idx] {
|
||||
Some(p) => p,
|
||||
None => {
|
||||
self.pages[page_idx] = Some(Box::new(PagedBitsetPage::new()));
|
||||
self.pages[page_idx].as_mut().unwrap()
|
||||
}
|
||||
};
|
||||
if page.words[word_idx].insert_mut(bit_idx) {
|
||||
self.count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of set bits. O(1).
|
||||
#[inline]
|
||||
fn len(&self) -> u64 {
|
||||
self.count
|
||||
}
|
||||
|
||||
/// Iterate set ords in ascending order.
|
||||
fn iter_sorted(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
self.pages
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(page_idx, page_opt)| page_opt.as_ref().map(|p| (page_idx, p)))
|
||||
.flat_map(|(page_idx, page)| {
|
||||
let page_base_ord = (page_idx as u64) << BITSET_PAGE_SHIFT;
|
||||
page.words
|
||||
.iter()
|
||||
.enumerate()
|
||||
.flat_map(move |(word_idx, &word)| {
|
||||
let word_base_ord = page_base_ord + (word_idx as u64) * 64;
|
||||
word.into_iter()
|
||||
.map(move |bit| word_base_ord + u64::from(bit))
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Threshold below which we use `BitSet` instead of `TermOrdSet`.
|
||||
///
|
||||
/// Both `BitSet` and `FxHashSet<u64>` have the same 32-byte struct, so the comparison is heap only:
|
||||
/// * `BitSet` at T=256: 5 `TinySet` words covering 258 bits (with the missing-value sentinel) =
|
||||
/// 40 bytes.
|
||||
/// * `FxHashSet<u64>` after one insert: 4-bucket hashbrown table ≈ 56 bytes
|
||||
pub(crate) const BITSET_MAX_TERM_ORD: u64 = 256;
|
||||
|
||||
// =================================================================
|
||||
// TermOrdAccumulator: per-bucket abstraction over the entries set.
|
||||
//
|
||||
// Implementations:
|
||||
// - `BitSet` (from `common`): used when `column.max_value()` is small (< BITSET_MAX_TERM_ORD).
|
||||
// Pre-allocated, no promotion.
|
||||
// - `TermOrdSet`: adaptive, starts as FxHashSet and promotes to a paged bitset when occupancy
|
||||
// crosses the density threshold (only if promotion is enabled — typically gated on top-level
|
||||
// aggregation).
|
||||
//
|
||||
// The trait lets `SegmentCardinalityCollector` be generic over the choice
|
||||
// so the hot collect() loop monomorphizes to a direct call (no enum
|
||||
// dispatch per insert).
|
||||
// =================================================================
|
||||
pub(crate) trait TermOrdAccumulator: Sized {
|
||||
/// Construct an empty accumulator.
|
||||
/// `max_term_ord_inclusive` is the largest term_ord that may be
|
||||
/// inserted (used to size pre-allocated bitsets and the dense bitset
|
||||
/// on promotion).
|
||||
fn new(max_term_ord_inclusive: u64) -> Self;
|
||||
fn insert(&mut self, term_ord: u64);
|
||||
/// Bulk insert. Implementations may override to hoist any inner
|
||||
/// dispatch outside the loop. Default loops `insert`.
|
||||
#[inline]
|
||||
fn extend_from_iter<I: IntoIterator<Item = u64>>(&mut self, ords: I) {
|
||||
for ord in ords {
|
||||
self.insert(ord);
|
||||
}
|
||||
}
|
||||
/// Hook called once per ingested block. Adaptive impls use this to
|
||||
/// decide on sparse->dense promotion.
|
||||
fn maybe_compact(&mut self) {}
|
||||
fn len(&self) -> usize;
|
||||
fn iter_ords(&self) -> impl Iterator<Item = u64> + '_;
|
||||
}
|
||||
|
||||
impl TermOrdAccumulator for BitSet {
|
||||
#[inline]
|
||||
fn new(max_term_ord_inclusive: u64) -> Self {
|
||||
// `BitSet::with_max_value(M)` accepts ords in [0, M).
|
||||
// We need ords up to and including `max_term_ord_inclusive`, plus
|
||||
// the missing-value sentinel `column.max_value() + 1`.
|
||||
BitSet::with_max_value((max_term_ord_inclusive + 2) as u32)
|
||||
}
|
||||
#[inline]
|
||||
fn insert(&mut self, term_ord: u64) {
|
||||
BitSet::insert(self, term_ord as u32);
|
||||
}
|
||||
#[inline]
|
||||
fn len(&self) -> usize {
|
||||
BitSet::len(self)
|
||||
}
|
||||
fn iter_ords(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
// `BitSet` itself doesn't expose iteration, but
|
||||
// `BitSet::tinyset(bucket)` does. Walk per-bucket and yield each
|
||||
// set bit. The capacity is `max_value()`; iterating to
|
||||
// `div_ceil(64)` covers every possible ord exactly once.
|
||||
let num_buckets = self.max_value().div_ceil(64);
|
||||
(0..num_buckets).flat_map(move |bucket| {
|
||||
let chunk_base = u64::from(bucket) * 64;
|
||||
self.tinyset(bucket)
|
||||
.into_iter()
|
||||
.map(move |bit| chunk_base + u64::from(bit))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// TermOrdSet: adaptive sparse->dense accumulator.
|
||||
//
|
||||
// Starts as an FxHashSet (cheap when few ords are seen). When occupancy
|
||||
// crosses `len * PROMOTION_RATIO > max_term_ord_inclusive`, drains into
|
||||
// a `PagedBitset` and continues dense. Promotion is one-way.
|
||||
// =================================================================
|
||||
pub(crate) struct TermOrdSet {
|
||||
inner: TermOrdSetInner,
|
||||
/// Largest term_ord that may be inserted. Used for both sizing the
|
||||
/// dense bitset on promotion and as the promotion-threshold reference.
|
||||
max_term_ord_inclusive: u64,
|
||||
}
|
||||
|
||||
enum TermOrdSetInner {
|
||||
Sparse(FxHashSet<u64>),
|
||||
Dense(PagedBitset),
|
||||
}
|
||||
|
||||
impl TermOrdAccumulator for TermOrdSet {
|
||||
fn new(max_term_ord_inclusive: u64) -> Self {
|
||||
Self {
|
||||
inner: TermOrdSetInner::Sparse(FxHashSet::default()),
|
||||
max_term_ord_inclusive,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn insert(&mut self, term_ord: u64) {
|
||||
match &mut self.inner {
|
||||
TermOrdSetInner::Sparse(set) => {
|
||||
set.insert(term_ord);
|
||||
}
|
||||
TermOrdSetInner::Dense(bitset) => bitset.insert(term_ord),
|
||||
}
|
||||
}
|
||||
|
||||
/// Hoist the Sparse/Dense match outside the per-ord loop so that a
|
||||
/// block of inserts dispatches once.
|
||||
fn extend_from_iter<I: IntoIterator<Item = u64>>(&mut self, ords: I) {
|
||||
match &mut self.inner {
|
||||
TermOrdSetInner::Sparse(set) => {
|
||||
for ord in ords {
|
||||
set.insert(ord);
|
||||
}
|
||||
}
|
||||
TermOrdSetInner::Dense(bitset) => {
|
||||
for ord in ords {
|
||||
bitset.insert(ord);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn maybe_compact(&mut self) {
|
||||
let TermOrdSetInner::Sparse(set) = &mut self.inner else {
|
||||
return;
|
||||
};
|
||||
if set.len() as u64 * PROMOTION_RATIO <= self.max_term_ord_inclusive {
|
||||
return;
|
||||
}
|
||||
// Size for ord <= max_term_ord_inclusive plus the missing sentinel
|
||||
// (column.max_value() + 1, which may equal max_term_ord_inclusive
|
||||
// when the column references every dictionary term).
|
||||
let mut bitset = PagedBitset::with_max_term_ord(self.max_term_ord_inclusive + 1);
|
||||
let set = std::mem::take(set);
|
||||
for ord in set {
|
||||
bitset.insert(ord);
|
||||
}
|
||||
self.inner = TermOrdSetInner::Dense(bitset);
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
match &self.inner {
|
||||
TermOrdSetInner::Sparse(set) => set.len(),
|
||||
TermOrdSetInner::Dense(bitset) => bitset.len() as usize,
|
||||
}
|
||||
}
|
||||
|
||||
fn iter_ords(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
match &self.inner {
|
||||
TermOrdSetInner::Sparse(set) => itertools::Either::Left(set.iter().copied()),
|
||||
TermOrdSetInner::Dense(bitset) => itertools::Either::Right(bitset.iter_sorted()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct SegmentCardinalityCollector<S: TermOrdAccumulator> {
|
||||
pub(crate) struct SegmentCardinalityCollector {
|
||||
/// Buckets are Some(_) until they get consumed by into_intermediate_results().
|
||||
buckets: Vec<Option<SegmentCardinalityCollectorBucket<S>>>,
|
||||
buckets: Vec<Option<SegmentCardinalityCollectorBucket>>,
|
||||
accessor_idx: usize,
|
||||
/// The column accessor to access the fast field values.
|
||||
accessor: Column<u64>,
|
||||
@@ -449,13 +188,9 @@ pub(crate) struct SegmentCardinalityCollector<S: TermOrdAccumulator> {
|
||||
/// The missing value normalized to the internal u64 representation of the field type.
|
||||
missing_value_for_accessor: Option<u64>,
|
||||
coupon_cache: Option<CouponCache>,
|
||||
/// Largest term_ord that may be inserted into a bucket. For str columns
|
||||
/// this is `accessor.max_value()`; for non-str columns this is unused
|
||||
/// (no inserts go into `entries`) and set to 0.
|
||||
max_term_ord_inclusive: u64,
|
||||
}
|
||||
|
||||
impl<S: TermOrdAccumulator> Debug for SegmentCardinalityCollector<S> {
|
||||
impl Debug for SegmentCardinalityCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentCardinalityCollector")
|
||||
.field("column_type", &self.column_type)
|
||||
@@ -467,21 +202,16 @@ impl<S: TermOrdAccumulator> Debug for SegmentCardinalityCollector<S> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-bucket state. Shape depends on column kind: str columns dedup
|
||||
/// term ords and only build the HLL sketch at finalization (saves the
|
||||
/// ~96 B `CardinalityCollector` per bucket during collect); numeric/IpAddr
|
||||
/// columns feed the sketch directly during collect.
|
||||
pub(crate) enum SegmentCardinalityCollectorBucket<S: TermOrdAccumulator> {
|
||||
Str(S),
|
||||
Numeric(CardinalityCollector),
|
||||
pub(crate) struct SegmentCardinalityCollectorBucket {
|
||||
cardinality: CardinalityCollector,
|
||||
entries: FxHashSet<u64>,
|
||||
}
|
||||
impl<S: TermOrdAccumulator> SegmentCardinalityCollectorBucket<S> {
|
||||
impl SegmentCardinalityCollectorBucket {
|
||||
#[inline(always)]
|
||||
pub fn new(column_type: ColumnType, max_term_ord_inclusive: u64) -> Self {
|
||||
if column_type == ColumnType::Str {
|
||||
Self::Str(S::new(max_term_ord_inclusive))
|
||||
} else {
|
||||
Self::Numeric(CardinalityCollector::new(column_type as u8))
|
||||
pub fn new(column_type: ColumnType) -> Self {
|
||||
Self {
|
||||
cardinality: CardinalityCollector::new(column_type as u8),
|
||||
entries: FxHashSet::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -492,57 +222,37 @@ impl<S: TermOrdAccumulator> SegmentCardinalityCollectorBucket<S> {
|
||||
//
|
||||
// If the column is str, then the values are dictionary encoded
|
||||
// and have not been added to the sketch yet.
|
||||
// We need to resolves the term ords accumulated in the str entries
|
||||
// with the coupon cache, and append the results to a fresh sketch.
|
||||
// We need to resolves the term ords accumulated in self.entries
|
||||
// with the coupon cache, and append the results to the sketch.
|
||||
fn into_intermediate_metric_result(
|
||||
self,
|
||||
mut self,
|
||||
coupon_cache_opt: Option<&CouponCache>,
|
||||
) -> crate::Result<IntermediateMetricResult> {
|
||||
let cardinality = match self {
|
||||
Self::Str(entries) => {
|
||||
let mut cardinality = CardinalityCollector::new(ColumnType::Str as u8);
|
||||
if let Some(coupon_cache) = coupon_cache_opt {
|
||||
// Sketch must be empty for str columns: coupons are appended here
|
||||
// from the term_ord set (and not directly during collection).
|
||||
assert!(cardinality.sketch.is_empty());
|
||||
append_to_sketch(&entries, coupon_cache, &mut cardinality);
|
||||
}
|
||||
cardinality
|
||||
}
|
||||
Self::Numeric(cardinality) => cardinality,
|
||||
};
|
||||
Ok(IntermediateMetricResult::Cardinality(cardinality))
|
||||
if let Some(coupon_cache) = coupon_cache_opt {
|
||||
assert!(self.cardinality.sketch.is_empty());
|
||||
append_to_sketch(&self.entries, coupon_cache, &mut self.cardinality);
|
||||
}
|
||||
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a coupon cache from the given buckets, dictionary, and optional missing value.
|
||||
/// Returns a mapping from term_ord to the hash (coupon) of the associated term.
|
||||
fn build_coupon_cache<S: TermOrdAccumulator>(
|
||||
buckets: &[Option<SegmentCardinalityCollectorBucket<S>>],
|
||||
fn build_coupon_cache(
|
||||
buckets: &[Option<SegmentCardinalityCollectorBucket>],
|
||||
dictionary: &Dictionary,
|
||||
missing_value_opt: Option<&Key>,
|
||||
) -> io::Result<CouponCache> {
|
||||
// Caller restricts this to str cardinality collectors, so every
|
||||
// present bucket must be the `Str` variant. Pass 1 validates and
|
||||
// computes the capacity hint; pass 2 inserts.
|
||||
let mut max_bucket_len = 0usize;
|
||||
let term_ords_capacity: usize = buckets
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|bucket| bucket.entries.len())
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
* 2;
|
||||
let mut term_ords_set = FxHashSet::with_capacity_and_hasher(term_ords_capacity, FxBuildHasher);
|
||||
for bucket in buckets.iter().flatten() {
|
||||
match bucket {
|
||||
SegmentCardinalityCollectorBucket::Str(entries) => {
|
||||
max_bucket_len = max_bucket_len.max(entries.len());
|
||||
}
|
||||
SegmentCardinalityCollectorBucket::Numeric(_) => {
|
||||
return Err(io::Error::other(
|
||||
"build_coupon_cache invoked with a non-str bucket",
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut term_ords_set = FxHashSet::with_capacity_and_hasher(max_bucket_len * 2, FxBuildHasher);
|
||||
for bucket in buckets.iter().flatten() {
|
||||
if let SegmentCardinalityCollectorBucket::Str(entries) = bucket {
|
||||
term_ords_set.extend(entries.iter_ords());
|
||||
}
|
||||
term_ords_set.extend(bucket.entries.iter().copied());
|
||||
}
|
||||
let mut term_ords: Vec<u64> = term_ords_set.into_iter().collect();
|
||||
term_ords.sort_unstable();
|
||||
@@ -574,8 +284,8 @@ fn build_coupon_cache<S: TermOrdAccumulator>(
|
||||
Ok(CouponCache::new(term_ords, coupons, missing_coupon_opt))
|
||||
}
|
||||
|
||||
fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
term_ords: &S,
|
||||
fn append_to_sketch(
|
||||
term_ords: &FxHashSet<u64>,
|
||||
coupon_cache: &CouponCache,
|
||||
sketch: &mut CardinalityCollector,
|
||||
) {
|
||||
@@ -584,7 +294,7 @@ fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
coupon_map,
|
||||
missing_coupon_opt,
|
||||
} => {
|
||||
for term_ord in term_ords.iter_ords() {
|
||||
for &term_ord in term_ords {
|
||||
if let Some(coupon) = coupon_map
|
||||
.get(term_ord as usize)
|
||||
.copied()
|
||||
@@ -598,8 +308,8 @@ fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
coupon_map,
|
||||
missing_coupon_opt,
|
||||
} => {
|
||||
for term_ord in term_ords.iter_ords() {
|
||||
if let Some(coupon) = coupon_map.get(&term_ord).copied().or(*missing_coupon_opt) {
|
||||
for term_ord in term_ords {
|
||||
if let Some(coupon) = coupon_map.get(term_ord).copied().or(*missing_coupon_opt) {
|
||||
sketch.insert_coupon(coupon);
|
||||
}
|
||||
}
|
||||
@@ -607,13 +317,12 @@ fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: TermOrdAccumulator> SegmentCardinalityCollector<S> {
|
||||
impl SegmentCardinalityCollector {
|
||||
pub fn from_req(
|
||||
column_type: ColumnType,
|
||||
accessor_idx: usize,
|
||||
accessor: Column<u64>,
|
||||
missing_value_for_accessor: Option<u64>,
|
||||
max_term_ord_inclusive: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
buckets: Vec::new(),
|
||||
@@ -622,7 +331,6 @@ impl<S: TermOrdAccumulator> SegmentCardinalityCollector<S> {
|
||||
accessor,
|
||||
missing_value_for_accessor,
|
||||
coupon_cache: None,
|
||||
max_term_ord_inclusive,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -639,9 +347,7 @@ impl<S: TermOrdAccumulator> SegmentCardinalityCollector<S> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: TermOrdAccumulator + 'static> SegmentAggregationCollector
|
||||
for SegmentCardinalityCollector<S>
|
||||
{
|
||||
impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
@@ -696,41 +402,31 @@ impl<S: TermOrdAccumulator + 'static> SegmentAggregationCollector
|
||||
));
|
||||
};
|
||||
let col_block_accessor = &agg_data.column_block_accessor;
|
||||
match bucket {
|
||||
SegmentCardinalityCollectorBucket::Str(entries) => {
|
||||
// Promotion check runs on the pre-block state: the first call
|
||||
// sees an empty set (no-op), and the last block of inserts
|
||||
// doesn't trigger a promotion of a set we won't grow further.
|
||||
// The trait dispatches once per block (via `extend_from_iter`)
|
||||
// for adaptive variants and inlines to a tight loop for the
|
||||
// BitSet path.
|
||||
entries.maybe_compact();
|
||||
entries.extend_from_iter(col_block_accessor.iter_vals());
|
||||
if self.column_type == ColumnType::Str {
|
||||
for term_ord in col_block_accessor.iter_vals() {
|
||||
bucket.entries.insert(term_ord);
|
||||
}
|
||||
SegmentCardinalityCollectorBucket::Numeric(cardinality) => {
|
||||
if self.column_type == ColumnType::IpAddr {
|
||||
let compact_space_accessor = self
|
||||
.accessor
|
||||
.values
|
||||
.clone()
|
||||
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||
.map_err(|_| {
|
||||
TantivyError::AggregationError(
|
||||
crate::aggregation::AggregationError::InternalError(
|
||||
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
|
||||
.to_string(),
|
||||
),
|
||||
)
|
||||
})?;
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||
cardinality.insert(val);
|
||||
}
|
||||
} else {
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
cardinality.insert(val);
|
||||
}
|
||||
}
|
||||
} else if self.column_type == ColumnType::IpAddr {
|
||||
let compact_space_accessor = self
|
||||
.accessor
|
||||
.values
|
||||
.clone()
|
||||
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||
.map_err(|_| {
|
||||
TantivyError::AggregationError(
|
||||
crate::aggregation::AggregationError::InternalError(
|
||||
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
|
||||
.to_string(),
|
||||
),
|
||||
)
|
||||
})?;
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||
bucket.cardinality.insert(val);
|
||||
}
|
||||
} else {
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
bucket.cardinality.insert(val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -743,13 +439,8 @@ impl<S: TermOrdAccumulator + 'static> SegmentAggregationCollector
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
if max_bucket as usize >= self.buckets.len() {
|
||||
let column_type = self.column_type;
|
||||
let max_term_ord_inclusive = self.max_term_ord_inclusive;
|
||||
self.buckets.resize_with(max_bucket as usize + 1, || {
|
||||
Some(SegmentCardinalityCollectorBucket::<S>::new(
|
||||
column_type,
|
||||
max_term_ord_inclusive,
|
||||
))
|
||||
Some(SegmentCardinalityCollectorBucket::new(self.column_type))
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
@@ -767,14 +458,13 @@ impl<S: TermOrdAccumulator + 'static> SegmentAggregationCollector
|
||||
return None;
|
||||
}
|
||||
let bucket = self.buckets.get(bucket_id as usize)?.as_ref()?;
|
||||
// For string columns the sketch isn't built until finalization; the
|
||||
// term_ord set's len is the exact distinct count. For numeric columns
|
||||
// the sketch is populated during collect.
|
||||
match bucket {
|
||||
SegmentCardinalityCollectorBucket::Str(entries) => Some(entries.len() as f64),
|
||||
SegmentCardinalityCollectorBucket::Numeric(cardinality) => {
|
||||
Some(cardinality.sketch.estimate().trunc())
|
||||
}
|
||||
// For string columns the HLL sketch is empty until materialization; entries holds
|
||||
// the deduplicated term ordinals seen, which is the exact distinct count.
|
||||
// For numeric columns the sketch is populated during collect.
|
||||
if self.column_type == ColumnType::Str {
|
||||
Some(bucket.entries.len() as f64)
|
||||
} else {
|
||||
Some(bucket.cardinality.sketch.estimate().trunc())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -924,134 +614,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build a single-segment string-cardinality index with 32 unique terms.
|
||||
/// `column.max_value() = 31` is well below `BITSET_MAX_TERM_ORD`,
|
||||
/// so the bucket exercises the `BitSet` path end to end.
|
||||
#[test]
|
||||
fn cardinality_aggregation_test_str_bitset() -> crate::Result<()> {
|
||||
let terms: Vec<String> = (0..32).map(|i| format!("term_{i}")).collect();
|
||||
let term_refs: Vec<Vec<&str>> = terms.iter().map(|t| vec![t.as_str()]).collect::<Vec<_>>();
|
||||
// single segment so we have a single dictionary of 32 terms.
|
||||
let index = get_test_index_from_terms(true, &term_refs)?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"cardinality": {
|
||||
"cardinality": { "field": "string_id" }
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["cardinality"]["value"], 32.0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// `BitSet` path with a `missing` parameter: the column-level missing
|
||||
/// sentinel (`column.max_value() + 1`) flows into the bitset, the
|
||||
/// dict lookup filter at finalization drops it, and the missing
|
||||
/// coupon is applied separately.
|
||||
#[test]
|
||||
fn cardinality_aggregation_test_str_bitset_with_missing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let name_field = schema_builder.add_text_field("name", STRING | FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
for i in 0..16 {
|
||||
let term = format!("t{i:02}");
|
||||
writer.add_document(doc!(name_field => term)).unwrap();
|
||||
}
|
||||
// One empty doc, exercising the missing sentinel.
|
||||
writer.add_document(doc!()).unwrap();
|
||||
writer.commit().unwrap();
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "name",
|
||||
"missing": "MISSING_SENTINEL_KEY",
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index).unwrap();
|
||||
// 16 distinct real terms + 1 distinct "missing" value = 17.
|
||||
assert_eq!(res["cardinality"]["value"], 17.0);
|
||||
}
|
||||
|
||||
/// Unit-test the PagedBitset itself: cross-page inserts produce sorted
|
||||
/// iteration, len() matches the inserted set, and duplicates are
|
||||
/// idempotent.
|
||||
#[test]
|
||||
fn paged_bitset_basic() {
|
||||
use super::PagedBitset;
|
||||
// Span several pages: BITSET_PAGE_BITS = 1024, so ords > 1024 land
|
||||
// on the second page, > 2048 on the third, etc.
|
||||
let ords = [0u64, 1, 63, 64, 1023, 1024, 1025, 4096, 4097, 9999, 10_000];
|
||||
let max_ord = *ords.iter().max().unwrap();
|
||||
let mut bitset = PagedBitset::with_max_term_ord(max_ord);
|
||||
for &ord in &ords {
|
||||
bitset.insert(ord);
|
||||
// Idempotent: inserting again must not increase count.
|
||||
bitset.insert(ord);
|
||||
}
|
||||
assert_eq!(bitset.len(), ords.len() as u64);
|
||||
let collected: Vec<u64> = bitset.iter_sorted().collect();
|
||||
let mut expected: Vec<u64> = ords.to_vec();
|
||||
expected.sort_unstable();
|
||||
assert_eq!(collected, expected);
|
||||
}
|
||||
|
||||
/// Unit-test `TermOrdSet`: starts Sparse, promotes to Dense on
|
||||
/// `maybe_compact` once the density threshold is crossed, and
|
||||
/// `iter_ords()` yields the same set in either state. Ords spanning
|
||||
/// multiple paged-bitset pages exercise the Dense iter ordering.
|
||||
#[test]
|
||||
fn term_ord_set_promotes_on_maybe_compact() {
|
||||
use super::{TermOrdAccumulator, TermOrdSet, PROMOTION_RATIO};
|
||||
// Pick max so promotion needs few inserts: len * RATIO > max with
|
||||
// RATIO=32 and max=64 trips at len=3 (3*32=96 > 64).
|
||||
let max_term_ord = 64u64;
|
||||
let mut set = <TermOrdSet as TermOrdAccumulator>::new(max_term_ord);
|
||||
// Two inserts: should stay Sparse after maybe_compact (2 * RATIO = 64, not > 64).
|
||||
set.insert(0);
|
||||
set.insert(7);
|
||||
set.maybe_compact();
|
||||
assert_eq!(set.len(), 2);
|
||||
|
||||
// Third insert promotes on next maybe_compact.
|
||||
set.insert(20);
|
||||
assert_eq!(set.len(), 3);
|
||||
// Sanity check: at len=3, 3 * PROMOTION_RATIO = 96 > 64.
|
||||
assert!(3u64 * PROMOTION_RATIO > max_term_ord);
|
||||
set.maybe_compact();
|
||||
|
||||
// Post-promotion: extending continues to work.
|
||||
set.insert(15);
|
||||
set.insert(15); // dup
|
||||
assert_eq!(set.len(), 4);
|
||||
|
||||
let mut collected: Vec<u64> = set.iter_ords().collect();
|
||||
collected.sort_unstable();
|
||||
assert_eq!(collected, vec![0, 7, 15, 20]);
|
||||
}
|
||||
|
||||
/// Unit-test the `BitSet` impl of `TermOrdAccumulator`: insert,
|
||||
/// dedup, and iter_ords order.
|
||||
#[test]
|
||||
fn bitset_accumulator_basic() {
|
||||
use common::BitSet;
|
||||
|
||||
use super::TermOrdAccumulator;
|
||||
let mut set = <BitSet as TermOrdAccumulator>::new(255);
|
||||
for ord in [0u64, 1, 63, 64, 65, 128, 200, 200, 0] {
|
||||
<BitSet as TermOrdAccumulator>::insert(&mut set, ord);
|
||||
}
|
||||
assert_eq!(<BitSet as TermOrdAccumulator>::len(&set), 7);
|
||||
let collected: Vec<u64> = set.iter_ords().collect();
|
||||
assert_eq!(collected, vec![0, 1, 63, 64, 65, 128, 200]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_aggregation_u64() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -1143,42 +705,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A JSON path that resolves to both a Str column and a numeric column
|
||||
/// produces two collector instances per segment — one with `Str` buckets
|
||||
/// and one with `Numeric` buckets. Their `IntermediateMetricResult`s must
|
||||
/// merge into the union cardinality.
|
||||
#[test]
|
||||
fn cardinality_aggregation_json_str_and_numeric() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_json_field("json", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut writer = index.writer_for_tests()?;
|
||||
writer.add_document(doc!(field => json!({"value": "hello"})))?;
|
||||
writer.add_document(doc!(field => json!({"value": "world"})))?;
|
||||
writer.add_document(doc!(field => json!({"value": "hello"})))?; // dup str
|
||||
writer.add_document(doc!(field => json!({"value": i64::from_u64(7u64)})))?;
|
||||
writer.add_document(doc!(field => json!({"value": i64::from_u64(42u64)})))?;
|
||||
writer.add_document(doc!(field => json!({"value": i64::from_u64(7u64)})))?; // dup num
|
||||
writer.commit()?;
|
||||
}
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "json.value"
|
||||
},
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// 4 distinct values: "hello", "world", 7, 42.
|
||||
assert_eq!(res["cardinality"]["value"], 4.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_collector_serde_roundtrip() {
|
||||
use super::CardinalityCollector;
|
||||
|
||||
@@ -9,8 +9,8 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::{
|
||||
intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
|
||||
RequiredOptionalScorer, Scorer, Weight,
|
||||
intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation,
|
||||
Intersection, Occur, RequiredOptionalScorer, Scorer, Weight,
|
||||
};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
@@ -50,9 +50,10 @@ where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
assert!(!scorers.is_empty());
|
||||
if scorers.len() == 1 && !scorers[0].is::<TermScorer>() {
|
||||
if scorers.len() == 1 {
|
||||
return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand
|
||||
}
|
||||
|
||||
{
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
|
||||
if is_all_term_queries {
|
||||
@@ -66,9 +67,6 @@ where
|
||||
{
|
||||
// Block wand is only available if we read frequencies.
|
||||
return SpecializedScorer::TermUnion(scorers);
|
||||
} else if scorers.len() == 1 {
|
||||
// Single TermScorer without freq reading — unwrap directly.
|
||||
return SpecializedScorer::Other(Box::new(scorers.into_iter().next().unwrap()));
|
||||
} else {
|
||||
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
|
||||
Reference in New Issue
Block a user