mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-23 02:40:44 +00:00
Compare commits
1 Commits
trinity.po
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f50cb0208b |
2
.github/workflows/coverage.yml
vendored
2
.github/workflows/coverage.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
- name: Generate code coverage
|
||||
run: cargo +nightly-2025-12-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f # v7.0.0
|
||||
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v6.0.0
|
||||
continue-on-error: true
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
|
||||
|
||||
2
.github/workflows/scorecard.yml
vendored
2
.github/workflows/scorecard.yml
vendored
@@ -44,6 +44,6 @@ jobs:
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: 'Upload to code-scanning'
|
||||
uses: github/codeql-action/upload-sarif@87557b9c84dde89fdd9b10e88954ac2f4248e463 # v4.36.1
|
||||
uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
|
||||
with:
|
||||
sarif_file: results.sarif
|
||||
|
||||
@@ -241,28 +241,6 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::column_values::u64_based::tests::create_and_validate;
|
||||
|
||||
// A block boundary where a high run ends and a low run begins: y0 ≈ 2^32, y511 ≈ 0.
|
||||
// This large jump used to cause an overflow which made us render all value on 64b
|
||||
// when 32 was enough.
|
||||
fn large_descending_jump_vals() -> Vec<u64> {
|
||||
let high_start: u64 = 4_294_967_039; // ≈ 2^32 - 257
|
||||
(0u64..256)
|
||||
.map(|i| high_start + i)
|
||||
.chain(0u64..256)
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_blockwise_linear_large_descending_jump_uses_at_most_32bit() {
|
||||
let vals = large_descending_jump_vals();
|
||||
let (_, actual_rate) =
|
||||
create_and_validate::<BlockwiseLinearCodec>(&vals, "large descending jump").unwrap();
|
||||
assert!(
|
||||
actual_rate <= 0.6,
|
||||
"compression rate {actual_rate:.3} is too high (bug: 64-bit residuals)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets_simple() {
|
||||
create_and_validate::<BlockwiseLinearCodec>(
|
||||
|
||||
@@ -37,7 +37,7 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
|
||||
} else {
|
||||
y0.wrapping_sub(y1)
|
||||
};
|
||||
if abs_dy >= 1 << 31 {
|
||||
if abs_dy >= 1 << 32 {
|
||||
// This is outside of realm we handle.
|
||||
// Let's just bail.
|
||||
return 0u64;
|
||||
|
||||
@@ -41,7 +41,7 @@ pub struct AggregationsSegmentCtx {
|
||||
|
||||
impl AggregationsSegmentCtx {
|
||||
pub(crate) fn push_term_req_data(&mut self, data: TermsAggReqData) -> usize {
|
||||
self.per_request.term_req_data.push(data);
|
||||
self.per_request.term_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.term_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_cardinality_req_data(&mut self, data: CardinalityAggReqData) -> usize {
|
||||
@@ -61,25 +61,31 @@ impl AggregationsSegmentCtx {
|
||||
self.per_request.missing_term_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_histogram_req_data(&mut self, data: HistogramAggReqData) -> usize {
|
||||
self.per_request.histogram_req_data.push(data);
|
||||
self.per_request
|
||||
.histogram_req_data
|
||||
.push(Some(Box::new(data)));
|
||||
self.per_request.histogram_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_range_req_data(&mut self, data: RangeAggReqData) -> usize {
|
||||
self.per_request.range_req_data.push(data);
|
||||
self.per_request.range_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.range_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_filter_req_data(&mut self, data: FilterAggReqData) -> usize {
|
||||
self.per_request.filter_req_data.push(data);
|
||||
self.per_request.filter_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.filter_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_composite_req_data(&mut self, data: CompositeAggReqData) -> usize {
|
||||
self.per_request.composite_req_data.push(data);
|
||||
self.per_request
|
||||
.composite_req_data
|
||||
.push(Some(Box::new(data)));
|
||||
self.per_request.composite_req_data.len() - 1
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_term_req_data(&self, idx: usize) -> &TermsAggReqData {
|
||||
&self.per_request.term_req_data[idx]
|
||||
self.per_request.term_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("term_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_cardinality_req_data(&self, idx: usize) -> &CardinalityAggReqData {
|
||||
@@ -97,6 +103,116 @@ impl AggregationsSegmentCtx {
|
||||
pub(crate) fn get_missing_term_req_data(&self, idx: usize) -> &MissingTermAggReqData {
|
||||
&self.per_request.missing_term_req_data[idx]
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_histogram_req_data(&self, idx: usize) -> &HistogramAggReqData {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_range_req_data(&self, idx: usize) -> &RangeAggReqData {
|
||||
self.per_request.range_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_composite_req_data(&self, idx: usize) -> &CompositeAggReqData {
|
||||
self.per_request.composite_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
// ---------- mutable getters ----------
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
|
||||
&mut self.per_request.stats_metric_req_data[idx]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_cardinality_req_data_mut(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
) -> &mut CardinalityAggReqData {
|
||||
&mut self.per_request.cardinality_req_data[idx]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.as_deref_mut()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
// ---------- take / put (terms, histogram, range) ----------
|
||||
|
||||
/// Move out the boxed Histogram request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.take()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Histogram request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_histogram_req_data(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
value: Box<HistogramAggReqData>,
|
||||
) {
|
||||
debug_assert!(self.per_request.histogram_req_data[idx].is_none());
|
||||
self.per_request.histogram_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the boxed Range request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_range_req_data(&mut self, idx: usize) -> Box<RangeAggReqData> {
|
||||
self.per_request.range_req_data[idx]
|
||||
.take()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Range request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_range_req_data(&mut self, idx: usize, value: Box<RangeAggReqData>) {
|
||||
debug_assert!(self.per_request.range_req_data[idx].is_none());
|
||||
self.per_request.range_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the boxed Filter request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_filter_req_data(&mut self, idx: usize) -> Box<FilterAggReqData> {
|
||||
self.per_request.filter_req_data[idx]
|
||||
.take()
|
||||
.expect("filter_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Filter request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_filter_req_data(&mut self, idx: usize, value: Box<FilterAggReqData>) {
|
||||
debug_assert!(self.per_request.filter_req_data[idx].is_none());
|
||||
self.per_request.filter_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the Composite request at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn take_composite_req_data(&mut self, idx: usize) -> Box<CompositeAggReqData> {
|
||||
self.per_request.composite_req_data[idx]
|
||||
.take()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Composite request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_composite_req_data(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
value: Box<CompositeAggReqData>,
|
||||
) {
|
||||
debug_assert!(self.per_request.composite_req_data[idx].is_none());
|
||||
self.per_request.composite_req_data[idx] = Some(value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Each type of aggregation has its own request data struct. This struct holds
|
||||
@@ -107,14 +223,15 @@ impl AggregationsSegmentCtx {
|
||||
/// for a node with [AggKind::Terms]).
|
||||
#[derive(Default)]
|
||||
pub struct PerRequestAggSegCtx {
|
||||
// Box for cheap take/put - Only necessary for bucket aggs that have sub-aggregations
|
||||
/// TermsAggReqData contains the request data for a terms aggregation.
|
||||
pub term_req_data: Vec<TermsAggReqData>,
|
||||
pub term_req_data: Vec<Option<Box<TermsAggReqData>>>,
|
||||
/// HistogramAggReqData contains the request data for a histogram aggregation.
|
||||
pub histogram_req_data: Vec<HistogramAggReqData>,
|
||||
pub histogram_req_data: Vec<Option<Box<HistogramAggReqData>>>,
|
||||
/// RangeAggReqData contains the request data for a range aggregation.
|
||||
pub range_req_data: Vec<RangeAggReqData>,
|
||||
pub range_req_data: Vec<Option<Box<RangeAggReqData>>>,
|
||||
/// FilterAggReqData contains the request data for a filter aggregation.
|
||||
pub filter_req_data: Vec<FilterAggReqData>,
|
||||
pub filter_req_data: Vec<Option<Box<FilterAggReqData>>>,
|
||||
/// Shared by avg, min, max, sum, stats, extended_stats, count
|
||||
pub stats_metric_req_data: Vec<MetricAggReqData>,
|
||||
/// CardinalityAggReqData contains the request data for a cardinality aggregation.
|
||||
@@ -124,7 +241,7 @@ pub struct PerRequestAggSegCtx {
|
||||
/// MissingTermAggReqData contains the request data for a missing term aggregation.
|
||||
pub missing_term_req_data: Vec<MissingTermAggReqData>,
|
||||
/// CompositeAggReqData contains the request data for a composite aggregation.
|
||||
pub composite_req_data: Vec<CompositeAggReqData>,
|
||||
pub composite_req_data: Vec<Option<Box<CompositeAggReqData>>>,
|
||||
|
||||
/// Request tree used to build collectors.
|
||||
pub agg_tree: Vec<AggRefNode>,
|
||||
@@ -135,22 +252,22 @@ impl PerRequestAggSegCtx {
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
self.term_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.histogram_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.range_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.filter_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.stats_metric_req_data
|
||||
@@ -175,7 +292,7 @@ impl PerRequestAggSegCtx {
|
||||
+ self
|
||||
.composite_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().map(|d| d.get_memory_consumption()).unwrap_or(0))
|
||||
.sum::<usize>()
|
||||
+ self.agg_tree.len() * std::mem::size_of::<AggRefNode>()
|
||||
}
|
||||
@@ -184,16 +301,40 @@ impl PerRequestAggSegCtx {
|
||||
let idx = node.idx_in_req_data;
|
||||
let kind = node.kind;
|
||||
match kind {
|
||||
AggKind::Terms => self.term_req_data[idx].name.as_str(),
|
||||
AggKind::Terms => self.term_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("term_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Cardinality => &self.cardinality_req_data[idx].name,
|
||||
AggKind::StatsKind(_) => &self.stats_metric_req_data[idx].name,
|
||||
AggKind::TopHits => &self.top_hits_req_data[idx].name,
|
||||
AggKind::MissingTerm => &self.missing_term_req_data[idx].name,
|
||||
AggKind::Histogram => self.histogram_req_data[idx].name.as_str(),
|
||||
AggKind::DateHistogram => self.histogram_req_data[idx].name.as_str(),
|
||||
AggKind::Range => self.range_req_data[idx].name.as_str(),
|
||||
AggKind::Filter => self.filter_req_data[idx].name.as_str(),
|
||||
AggKind::Composite => self.composite_req_data[idx].name.as_str(),
|
||||
AggKind::Histogram => self.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::DateHistogram => self.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Range => self.range_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Filter => self.filter_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("filter_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Composite => self.composite_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,7 +412,7 @@ pub(crate) fn build_segment_agg_collector(
|
||||
Ok(Box::new(TermMissingAgg::new(req, node)?))
|
||||
}
|
||||
AggKind::Cardinality => {
|
||||
let req_data = req.get_cardinality_req_data(node.idx_in_req_data);
|
||||
let req_data = &mut req.get_cardinality_req_data_mut(node.idx_in_req_data);
|
||||
// For str columns, choose the per-bucket entries representation
|
||||
// based on the segment's column.max_value():
|
||||
// * small (< BITSET_MAX_TERM_ORD): `BitSet`, pre-allocated, no promotion machinery.
|
||||
@@ -318,7 +459,7 @@ pub(crate) fn build_segment_agg_collector(
|
||||
SegmentExtendedStatsCollector::from_req(req_data, sigma),
|
||||
)),
|
||||
StatsType::Percentiles => {
|
||||
let req_data = req.get_metric_req_data(node.idx_in_req_data);
|
||||
let req_data = req.get_metric_req_data_mut(node.idx_in_req_data);
|
||||
Ok(Box::new(
|
||||
SegmentPercentilesCollector::from_req_and_validate(
|
||||
req_data.field_type,
|
||||
@@ -658,18 +799,23 @@ fn build_nodes(
|
||||
let schema = reader.schema();
|
||||
let tokenizers = &data.context.tokenizers;
|
||||
let query = filter_req.parse_query(schema, tokenizers)?;
|
||||
let evaluator =
|
||||
std::rc::Rc::new(crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
||||
query,
|
||||
schema.clone(),
|
||||
reader,
|
||||
)?);
|
||||
let evaluator = crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
||||
query,
|
||||
schema.clone(),
|
||||
reader,
|
||||
)?;
|
||||
|
||||
// Pre-allocate buffer for batch filtering
|
||||
let max_doc = reader.max_doc();
|
||||
let buffer_capacity = crate::docset::COLLECT_BLOCK_BUFFER_LEN.min(max_doc as usize);
|
||||
let matching_docs_buffer = Vec::with_capacity(buffer_capacity);
|
||||
|
||||
let idx_in_req_data = data.push_filter_req_data(FilterAggReqData {
|
||||
name: agg_name.to_string(),
|
||||
req: filter_req.clone(),
|
||||
segment_reader: reader.clone(),
|
||||
evaluator,
|
||||
matching_docs_buffer,
|
||||
is_top_level,
|
||||
});
|
||||
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
||||
|
||||
@@ -299,12 +299,6 @@ impl AggregationVariants {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_sum(&self) -> Option<&SumAggregation> {
|
||||
match &self {
|
||||
AggregationVariants::Sum(sum) => Some(sum),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -16,7 +16,6 @@ use crate::{SegmentReader, TantivyError};
|
||||
|
||||
/// Contains all information required by the SegmentCompositeCollector to perform the
|
||||
/// composite aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeAggReqData {
|
||||
/// The name of the aggregation.
|
||||
pub name: String,
|
||||
@@ -35,7 +34,6 @@ impl CompositeAggReqData {
|
||||
}
|
||||
|
||||
/// Accessors for a single column in a composite source.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeAccessor {
|
||||
/// The fast field column
|
||||
pub column: Column<u64>,
|
||||
@@ -50,7 +48,6 @@ pub struct CompositeAccessor {
|
||||
}
|
||||
|
||||
/// Accessors to all the columns that belong to the field of a composite source.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeSourceAccessors {
|
||||
/// The accessors for this source
|
||||
pub accessors: Vec<CompositeAccessor>,
|
||||
@@ -361,7 +358,7 @@ impl PrecomputedDateInterval {
|
||||
///
|
||||
/// Some column types (term, IP) might not have an exact representation of the
|
||||
/// specified after key
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug)]
|
||||
pub enum PrecomputedAfterKey {
|
||||
/// The after key could be exactly represented in the column space.
|
||||
Exact(u64),
|
||||
|
||||
@@ -118,7 +118,7 @@ impl InternalValueRepr {
|
||||
pub struct SegmentCompositeCollector {
|
||||
/// One DynArrayHeapMap per parent bucket.
|
||||
parent_buckets: Vec<DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>>,
|
||||
req_data: CompositeAggReqData,
|
||||
accessor_idx: usize,
|
||||
sub_agg: Option<BufferedSubAggs<HighCardSubAggBuffer>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Number of sources, needed when creating new DynArrayHeapMaps.
|
||||
@@ -132,7 +132,10 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data
|
||||
.get_composite_req_data(self.accessor_idx)
|
||||
.name
|
||||
.clone();
|
||||
|
||||
let buckets = self.add_intermediate_bucket_result(agg_data, parent_bucket_id)?;
|
||||
results.push(
|
||||
@@ -150,11 +153,12 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let composite_agg_data = agg_data.take_composite_req_data(self.accessor_idx);
|
||||
|
||||
for doc in docs {
|
||||
let mut visitor = CompositeKeyVisitor {
|
||||
doc_id: *doc,
|
||||
composite_agg_data: &self.req_data,
|
||||
composite_agg_data: &composite_agg_data,
|
||||
buckets: &mut self.parent_buckets[parent_bucket_id as usize],
|
||||
sub_agg: &mut self.sub_agg,
|
||||
bucket_id_provider: &mut self.bucket_id_provider,
|
||||
@@ -162,6 +166,7 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
};
|
||||
visitor.visit(0, true)?;
|
||||
}
|
||||
agg_data.put_back_composite_req_data(self.accessor_idx, composite_agg_data);
|
||||
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
@@ -216,13 +221,7 @@ impl SegmentCompositeCollector {
|
||||
req_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Self> {
|
||||
let composite_req_data =
|
||||
req_data.per_request.composite_req_data[node.idx_in_req_data].clone();
|
||||
validate_req(&composite_req_data)?;
|
||||
req_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(composite_req_data.get_memory_consumption() as u64)?;
|
||||
validate_req(req_data, node.idx_in_req_data)?;
|
||||
|
||||
let has_sub_aggregations = !node.children.is_empty();
|
||||
let sub_agg = if has_sub_aggregations {
|
||||
@@ -232,11 +231,12 @@ impl SegmentCompositeCollector {
|
||||
None
|
||||
};
|
||||
|
||||
let composite_req_data = req_data.get_composite_req_data(node.idx_in_req_data);
|
||||
let num_sources = composite_req_data.req.sources.len();
|
||||
|
||||
Ok(SegmentCompositeCollector {
|
||||
parent_buckets: vec![DynArrayHeapMap::try_new(num_sources)?],
|
||||
req_data: composite_req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
sub_agg,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
num_sources,
|
||||
@@ -258,7 +258,7 @@ impl SegmentCompositeCollector {
|
||||
let mut dict: FxHashMap<Vec<CompositeIntermediateKey>, IntermediateCompositeBucketEntry> =
|
||||
Default::default();
|
||||
dict.reserve(heap_map.size());
|
||||
let composite_data = &self.req_data;
|
||||
let composite_data = agg_data.get_composite_req_data(self.accessor_idx);
|
||||
for (key_internal_repr, agg) in heap_map.into_iter() {
|
||||
let key = resolve_key(&key_internal_repr, composite_data)?;
|
||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||
@@ -298,7 +298,8 @@ impl SegmentCompositeCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_req(composite_data: &CompositeAggReqData) -> crate::Result<()> {
|
||||
fn validate_req(req_data: &mut AggregationsSegmentCtx, accessor_idx: usize) -> crate::Result<()> {
|
||||
let composite_data = req_data.get_composite_req_data(accessor_idx);
|
||||
let req = &composite_data.req;
|
||||
if req.sources.is_empty() {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::fmt::Debug;
|
||||
use std::rc::Rc;
|
||||
|
||||
use common::BitSet;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
@@ -397,7 +396,6 @@ impl PartialEq for FilterAggregation {
|
||||
|
||||
/// Request data for filter aggregation
|
||||
/// This struct holds the per-segment data needed to execute a filter aggregation
|
||||
#[derive(Clone)]
|
||||
pub struct FilterAggReqData {
|
||||
/// The name of the filter aggregation
|
||||
pub name: String,
|
||||
@@ -405,20 +403,22 @@ pub struct FilterAggReqData {
|
||||
pub req: FilterAggregation,
|
||||
/// The segment reader
|
||||
pub segment_reader: SegmentReader,
|
||||
/// Document evaluator for the filter query (precomputed BitSet).
|
||||
/// Wrapped in `Rc` so cloning the request data does not duplicate the (potentially large)
|
||||
/// underlying BitSet.
|
||||
pub evaluator: Rc<DocumentQueryEvaluator>,
|
||||
/// Document evaluator for the filter query (precomputed BitSet)
|
||||
/// This is built once when the request data is created
|
||||
pub evaluator: DocumentQueryEvaluator,
|
||||
/// Reusable buffer for matching documents to minimize allocations during collection
|
||||
pub matching_docs_buffer: Vec<DocId>,
|
||||
/// True if this filter aggregation is at the top level of the aggregation tree (not nested).
|
||||
pub is_top_level: bool,
|
||||
}
|
||||
|
||||
impl FilterAggReqData {
|
||||
pub(crate) fn get_memory_consumption(&self) -> usize {
|
||||
// Estimate: name + segment reader reference + bitset
|
||||
// Estimate: name + segment reader reference + bitset + buffer capacity
|
||||
self.name.len()
|
||||
+ std::mem::size_of::<SegmentReader>()
|
||||
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
|
||||
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
|
||||
+ std::mem::size_of::<bool>()
|
||||
}
|
||||
}
|
||||
@@ -509,10 +509,8 @@ pub struct SegmentFilterCollector<B: SubAggBuffer> {
|
||||
/// Sub-aggregation collectors
|
||||
sub_aggregations: Option<BufferedSubAggs<B>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Per-segment filter request data, owned by this collector.
|
||||
req_data: FilterAggReqData,
|
||||
/// Reusable buffer for matching documents to minimize allocations during collection.
|
||||
matching_docs_buffer: Vec<DocId>,
|
||||
/// Accessor index for this filter aggregation (to access FilterAggReqData)
|
||||
accessor_idx: usize,
|
||||
}
|
||||
|
||||
impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
@@ -520,7 +518,6 @@ impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
req_data: FilterAggReqData,
|
||||
) -> crate::Result<Self> {
|
||||
// Build sub-aggregation collectors if any
|
||||
let sub_agg_collector = if !node.children.is_empty() {
|
||||
@@ -530,15 +527,11 @@ impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
};
|
||||
let sub_agg_collector = sub_agg_collector.map(BufferedSubAggs::new);
|
||||
|
||||
let max_doc = req_data.segment_reader.max_doc();
|
||||
let buffer_capacity = crate::docset::COLLECT_BLOCK_BUFFER_LEN.min(max_doc as usize);
|
||||
|
||||
Ok(SegmentFilterCollector {
|
||||
parent_buckets: Vec::new(),
|
||||
sub_aggregations: sub_agg_collector,
|
||||
req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
matching_docs_buffer: Vec::with_capacity(buffer_capacity),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -547,23 +540,18 @@ pub(crate) fn build_segment_filter_collector(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
let req_data = req.per_request.filter_req_data[node.idx_in_req_data].clone();
|
||||
req.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let is_top_level = req_data.is_top_level;
|
||||
let is_top_level = req.per_request.filter_req_data[node.idx_in_req_data]
|
||||
.as_ref()
|
||||
.expect("filter_req_data slot is empty")
|
||||
.is_top_level;
|
||||
|
||||
if is_top_level {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<LowCardSubAggBuffer>::from_req_and_validate(
|
||||
req, node, req_data,
|
||||
)?,
|
||||
SegmentFilterCollector::<LowCardSubAggBuffer>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
} else {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<HighCardSubAggBuffer>::from_req_and_validate(
|
||||
req, node, req_data,
|
||||
)?,
|
||||
SegmentFilterCollector::<HighCardSubAggBuffer>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -573,7 +561,7 @@ impl<B: SubAggBuffer> Debug for SegmentFilterCollector<B> {
|
||||
f.debug_struct("SegmentFilterCollector")
|
||||
.field("buckets", &self.parent_buckets)
|
||||
.field("has_sub_aggs", &self.sub_aggregations.is_some())
|
||||
.field("name", &self.req_data.name)
|
||||
.field("accessor_idx", &self.accessor_idx)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -610,7 +598,11 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B>
|
||||
};
|
||||
|
||||
// Get the name of this filter aggregation
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data.per_request.filter_req_data[self.accessor_idx]
|
||||
.as_ref()
|
||||
.expect("filter_req_data slot is empty")
|
||||
.name
|
||||
.clone();
|
||||
|
||||
results.push(
|
||||
name,
|
||||
@@ -631,24 +623,27 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B>
|
||||
}
|
||||
|
||||
let mut bucket = self.parent_buckets[parent_bucket_id as usize];
|
||||
// Take the request data to avoid borrow checker issues with sub-aggregations
|
||||
let mut req = agg_data.take_filter_req_data(self.accessor_idx);
|
||||
|
||||
// Use batch filtering with O(1) BitSet lookups
|
||||
self.matching_docs_buffer.clear();
|
||||
self.req_data
|
||||
.evaluator
|
||||
.filter_batch(docs, &mut self.matching_docs_buffer);
|
||||
req.matching_docs_buffer.clear();
|
||||
req.evaluator
|
||||
.filter_batch(docs, &mut req.matching_docs_buffer);
|
||||
|
||||
bucket.doc_count += self.matching_docs_buffer.len() as u64;
|
||||
bucket.doc_count += req.matching_docs_buffer.len() as u64;
|
||||
|
||||
// Batch process sub-aggregations if we have matches
|
||||
if !self.matching_docs_buffer.is_empty() {
|
||||
if !req.matching_docs_buffer.is_empty() {
|
||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||
for &doc_id in &self.matching_docs_buffer {
|
||||
for &doc_id in &req.matching_docs_buffer {
|
||||
sub_aggs.push(bucket.bucket_id, doc_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Put the request data back
|
||||
agg_data.put_back_filter_req_data(self.accessor_idx, req);
|
||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||
sub_aggs.check_flush_local(agg_data)?;
|
||||
}
|
||||
|
||||
@@ -21,7 +21,6 @@ use crate::TantivyError;
|
||||
|
||||
/// Contains all information required by the SegmentHistogramCollector to perform the
|
||||
/// histogram or date_histogram aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HistogramAggReqData {
|
||||
/// The column accessor to access the fast field values.
|
||||
pub accessor: Column<u64>,
|
||||
@@ -298,7 +297,7 @@ pub struct SegmentHistogramCollector {
|
||||
/// One Histogram bucket per parent bucket id.
|
||||
parent_buckets: Vec<HistogramBuckets>,
|
||||
sub_agg: Option<HighCardBufferedSubAggs>,
|
||||
req_data: HistogramAggReqData,
|
||||
accessor_idx: usize,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
}
|
||||
|
||||
@@ -309,7 +308,10 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data
|
||||
.get_histogram_req_data(self.accessor_idx)
|
||||
.name
|
||||
.clone();
|
||||
// TODO: avoid prepare_max_bucket here and handle empty buckets.
|
||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
||||
let histogram = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
||||
@@ -326,10 +328,10 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let req = agg_data.take_histogram_req_data(self.accessor_idx);
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize].buckets;
|
||||
|
||||
let req = &self.req_data;
|
||||
let bounds = req.bounds;
|
||||
let interval = req.req.interval;
|
||||
let offset = req.offset;
|
||||
@@ -359,6 +361,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
agg_data.put_back_histogram_req_data(self.accessor_idx, req);
|
||||
|
||||
let mem_delta = self.get_memory_consumption(parent_bucket_id) - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
@@ -424,7 +427,10 @@ impl SegmentHistogramCollector {
|
||||
}
|
||||
buckets.sort_unstable_by(|b1, b2| b1.key.total_cmp(&b2.key));
|
||||
|
||||
let is_date_agg = self.req_data.field_type == ColumnType::DateTime;
|
||||
let is_date_agg = agg_data
|
||||
.get_histogram_req_data(self.accessor_idx)
|
||||
.field_type
|
||||
== ColumnType::DateTime;
|
||||
Ok(IntermediateBucketResult::Histogram {
|
||||
buckets,
|
||||
is_date_agg,
|
||||
@@ -440,7 +446,7 @@ impl SegmentHistogramCollector {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut req_data = agg_data.per_request.histogram_req_data[node.idx_in_req_data].clone();
|
||||
let req_data = agg_data.get_histogram_req_data_mut(node.idx_in_req_data);
|
||||
req_data.req.validate()?;
|
||||
if req_data.field_type == ColumnType::DateTime && !req_data.is_date_histogram {
|
||||
req_data.req.normalize_date_time();
|
||||
@@ -450,16 +456,12 @@ impl SegmentHistogramCollector {
|
||||
max: f64::MAX,
|
||||
});
|
||||
req_data.offset = req_data.req.offset.unwrap_or(0.0);
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let sub_agg = sub_agg.map(BufferedSubAggs::new);
|
||||
|
||||
Ok(Self {
|
||||
parent_buckets: Default::default(),
|
||||
sub_agg,
|
||||
req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -23,7 +23,6 @@ use crate::TantivyError;
|
||||
|
||||
/// Contains all information required by the SegmentRangeCollector to perform the
|
||||
/// range aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RangeAggReqData {
|
||||
/// The column accessor to access the fast field values.
|
||||
pub accessor: Column<u64>,
|
||||
@@ -162,7 +161,7 @@ pub struct SegmentRangeCollector<B: SubAggBuffer> {
|
||||
/// One for each ParentBucketId
|
||||
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
|
||||
column_type: ColumnType,
|
||||
pub(crate) req_data: RangeAggReqData,
|
||||
pub(crate) accessor_idx: usize,
|
||||
sub_agg: Option<BufferedSubAggs<B>>,
|
||||
/// Here things get a bit weird. We need to assign unique bucket ids across all
|
||||
/// parent buckets. So we keep track of the next available bucket id here.
|
||||
@@ -185,7 +184,7 @@ impl<B: SubAggBuffer> Debug for SegmentRangeCollector<B> {
|
||||
f.debug_struct("SegmentRangeCollector")
|
||||
.field("parent_buckets_len", &self.parent_buckets.len())
|
||||
.field("column_type", &self.column_type)
|
||||
.field("name", &self.req_data.name)
|
||||
.field("accessor_idx", &self.accessor_idx)
|
||||
.field("has_sub_agg", &self.sub_agg.is_some())
|
||||
.finish()
|
||||
}
|
||||
@@ -240,7 +239,10 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
) -> crate::Result<()> {
|
||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
||||
let field_type = self.column_type;
|
||||
let name = self.req_data.name.to_string();
|
||||
let name = agg_data
|
||||
.get_range_req_data(self.accessor_idx)
|
||||
.name
|
||||
.to_string();
|
||||
|
||||
let buckets = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
||||
|
||||
@@ -279,15 +281,17 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let req = agg_data.take_range_req_data(self.accessor_idx);
|
||||
|
||||
agg_data
|
||||
.column_block_accessor
|
||||
.fetch_block(docs, &self.req_data.accessor);
|
||||
.fetch_block(docs, &req.accessor);
|
||||
|
||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize];
|
||||
|
||||
for (doc, val) in agg_data
|
||||
.column_block_accessor
|
||||
.iter_docid_vals(docs, &self.req_data.accessor)
|
||||
.iter_docid_vals(docs, &req.accessor)
|
||||
{
|
||||
let bucket_pos = get_bucket_pos(val, buckets);
|
||||
let bucket = &mut buckets[bucket_pos];
|
||||
@@ -297,6 +301,7 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
}
|
||||
}
|
||||
|
||||
agg_data.put_back_range_req_data(self.accessor_idx, req);
|
||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
@@ -314,10 +319,10 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
fn prepare_max_bucket(
|
||||
&mut self,
|
||||
max_bucket: BucketId,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
while self.parent_buckets.len() <= max_bucket as usize {
|
||||
let new_buckets = self.create_new_buckets()?;
|
||||
let new_buckets = self.create_new_buckets(agg_data)?;
|
||||
self.parent_buckets.push(new_buckets);
|
||||
}
|
||||
|
||||
@@ -341,11 +346,8 @@ pub(crate) fn build_segment_range_collector(
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
let req_data = agg_data.per_request.range_req_data[node.idx_in_req_data].clone();
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let accessor_idx = node.idx_in_req_data;
|
||||
let req_data = agg_data.get_range_req_data(node.idx_in_req_data);
|
||||
let field_type = req_data.field_type;
|
||||
|
||||
// TODO: A better metric instead of is_top_level would be the number of buckets expected.
|
||||
@@ -363,7 +365,7 @@ pub(crate) fn build_segment_range_collector(
|
||||
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggBuffer> {
|
||||
sub_agg: sub_agg.map(LowCardBufferedSubAggs::new),
|
||||
column_type: field_type,
|
||||
req_data,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
limits: agg_data.context.limits.clone(),
|
||||
@@ -372,7 +374,7 @@ pub(crate) fn build_segment_range_collector(
|
||||
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggBuffer> {
|
||||
sub_agg: sub_agg.map(BufferedSubAggs::new),
|
||||
column_type: field_type,
|
||||
req_data,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
limits: agg_data.context.limits.clone(),
|
||||
@@ -381,9 +383,12 @@ pub(crate) fn build_segment_range_collector(
|
||||
}
|
||||
|
||||
impl<B: SubAggBuffer> SegmentRangeCollector<B> {
|
||||
pub(crate) fn create_new_buckets(&mut self) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
|
||||
pub(crate) fn create_new_buckets(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
|
||||
let field_type = self.column_type;
|
||||
let req_data = &self.req_data;
|
||||
let req_data = agg_data.get_range_req_data(self.accessor_idx);
|
||||
// The range input on the request is f64.
|
||||
// We need to convert to u64 ranges, because we read the values as u64.
|
||||
// The mapping from the conversion is monotonic so ordering is preserved.
|
||||
@@ -558,16 +563,17 @@ mod tests {
|
||||
get_test_index_with_num_docs,
|
||||
};
|
||||
|
||||
pub fn build_test_buckets(
|
||||
ranges: &[RangeAggregationRange],
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
field_type: ColumnType,
|
||||
) -> Vec<SegmentRangeAndBucketEntry> {
|
||||
) -> SegmentRangeCollector<HighCardSubAggBuffer> {
|
||||
let req = RangeAggregation {
|
||||
field: "dummy".to_string(),
|
||||
ranges: ranges.to_vec(),
|
||||
ranges,
|
||||
..Default::default()
|
||||
};
|
||||
extend_validate_ranges(&req.ranges, &field_type)
|
||||
// Build buckets directly as in from_req_and_validate without AggregationsData
|
||||
let buckets: Vec<_> = extend_validate_ranges(&req.ranges, &field_type)
|
||||
.expect("unexpected error in extend_validate_ranges")
|
||||
.iter()
|
||||
.map(|range| {
|
||||
@@ -598,7 +604,16 @@ mod tests {
|
||||
},
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
.collect();
|
||||
|
||||
SegmentRangeCollector {
|
||||
parent_buckets: vec![buckets],
|
||||
column_type: field_type,
|
||||
accessor_idx: 0,
|
||||
sub_agg: None,
|
||||
bucket_id_provider: Default::default(),
|
||||
limits: AggregationLimitsGuard::default(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -841,10 +856,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn bucket_test_extend_range_hole() {
|
||||
let buckets = [(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
@@ -860,14 +875,14 @@ mod tests {
|
||||
fn bucket_test_range_conversion_special_case() {
|
||||
// the monotonic conversion between f64 and u64, does not map f64::MIN.to_u64() ==
|
||||
// u64::MIN, but the into trait converts f64::MIN/MAX to None
|
||||
let buckets = [
|
||||
let buckets = vec![
|
||||
(f64::MIN..10f64).into(),
|
||||
(10f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
@@ -879,28 +894,28 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn bucket_range_test_negative_vals() {
|
||||
let buckets = [(-10f64..-1f64).into()];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let buckets = vec![(-10f64..-1f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
|
||||
}
|
||||
#[test]
|
||||
fn bucket_range_test_positive_vals() {
|
||||
let buckets = [(0f64..10f64).into()];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let buckets = vec![(0f64..10f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_u64() {
|
||||
let check_ranges = |ranges: &[RangeAggregationRange]| {
|
||||
let parent_buckets = [build_test_buckets(ranges, ColumnType::U64)];
|
||||
let search = |val: u64| get_bucket_pos(val, &parent_buckets[0]);
|
||||
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
|
||||
let collector = get_collector_from_ranges(ranges, ColumnType::U64);
|
||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9), 0);
|
||||
@@ -913,7 +928,7 @@ mod tests {
|
||||
};
|
||||
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
check_ranges(&ranges);
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
@@ -923,7 +938,7 @@ mod tests {
|
||||
},
|
||||
(10.0..100.0).into(),
|
||||
];
|
||||
check_ranges(&ranges);
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
@@ -938,15 +953,15 @@ mod tests {
|
||||
from: Some(100.0),
|
||||
},
|
||||
];
|
||||
check_ranges(&ranges);
|
||||
check_ranges(ranges);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = [(10.0..100.0).into()];
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
|
||||
let parent_buckets = [build_test_buckets(&ranges, ColumnType::F64)];
|
||||
let search = |val: u64| get_bucket_pos(val, &parent_buckets[0]);
|
||||
let collector = get_collector_from_ranges(ranges, ColumnType::F64);
|
||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9f64.to_u64()), 0);
|
||||
|
||||
@@ -377,22 +377,7 @@ impl IntermediateMetricResult {
|
||||
MetricResult::ExtendedStats(intermediate_stats.finalize())
|
||||
}
|
||||
IntermediateMetricResult::Sum(intermediate_sum) => {
|
||||
// By default match Elasticsearch: empty / all-missing sum
|
||||
// buckets serialize as `"value": 0`, not `"value": null`.
|
||||
// The non-ES `none_if_no_match` flag on `SumAggregation`
|
||||
// opts into SQL-style `null` for downstream consumers.
|
||||
let none_if_no_match = req
|
||||
.agg
|
||||
.as_sum()
|
||||
.and_then(|sum| sum.none_if_no_match)
|
||||
.unwrap_or(false);
|
||||
let value = intermediate_sum.finalize();
|
||||
if none_if_no_match {
|
||||
MetricResult::Sum(value.into())
|
||||
} else {
|
||||
let value = Some(value.unwrap_or(0.0));
|
||||
MetricResult::Sum(value.into())
|
||||
}
|
||||
MetricResult::Sum(intermediate_sum.finalize().into())
|
||||
}
|
||||
IntermediateMetricResult::Percentiles(percentiles) => MetricResult::Percentiles(
|
||||
percentiles
|
||||
|
||||
@@ -171,7 +171,6 @@ impl CouponCache {
|
||||
let uninitialized_coupon = Coupon::from_hash(0);
|
||||
let mut coupon_map: Vec<Coupon> =
|
||||
vec![uninitialized_coupon; highest_term_ord as usize + 1];
|
||||
|
||||
for (term_ord, coupon) in term_ords.into_iter().zip(coupons) {
|
||||
coupon_map[term_ord as usize] = coupon;
|
||||
}
|
||||
|
||||
@@ -27,16 +27,6 @@ pub struct SumAggregation {
|
||||
/// { "field": "my_numbers", "missing": "10.0" }
|
||||
#[serde(default, deserialize_with = "deserialize_option_f64")]
|
||||
pub missing: Option<f64>,
|
||||
/// Non-Elasticsearch extension. When `Some(true)`, the serialized result
|
||||
/// returns `"value": null` if no values were collected (all documents had
|
||||
/// missing/NULL values for the field), matching the behavior of `min`,
|
||||
/// `max`, and `avg`. When `None` or `Some(false)` (the default) the
|
||||
/// result returns `"value": 0`, matching Elasticsearch.
|
||||
///
|
||||
/// Intended for SQL-style consumers where `SUM` of zero rows is `NULL`
|
||||
/// and must be distinguishable from a bucket that genuinely sums to `0`.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub none_if_no_match: Option<bool>,
|
||||
}
|
||||
|
||||
impl SumAggregation {
|
||||
@@ -45,7 +35,6 @@ impl SumAggregation {
|
||||
Self {
|
||||
field: field_name,
|
||||
missing: None,
|
||||
none_if_no_match: None,
|
||||
}
|
||||
}
|
||||
/// Returns the field name the aggregation is computed on.
|
||||
@@ -70,104 +59,8 @@ impl IntermediateSum {
|
||||
pub fn merge_fruits(&mut self, other: IntermediateSum) {
|
||||
self.stats.merge_fruits(other.stats);
|
||||
}
|
||||
/// Computes the final sum value.
|
||||
///
|
||||
/// Returns `None` when no values were collected, matching the Rust-side
|
||||
/// behavior of `IntermediateMin`, `IntermediateMax`, and
|
||||
/// `IntermediateAvg`. The Elasticsearch-vs-SQL choice for the
|
||||
/// user-visible result is made at the boundary in
|
||||
/// [`IntermediateMetricResult::into_final_metric_result`]: by default
|
||||
/// `None` is coerced to `Some(0.0)` to match Elasticsearch
|
||||
/// (`"value": 0`), and the [`SumAggregation::none_if_no_match`] flag
|
||||
/// opts out of that coercion for SQL-style consumers.
|
||||
/// Computes the final sum value.
|
||||
pub fn finalize(&self) -> Option<f64> {
|
||||
let stats = self.stats.finalize();
|
||||
if stats.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(stats.sum)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_sum_finalize_returns_none_when_no_values() {
|
||||
// Default IntermediateSum has count=0 — finalize should return None,
|
||||
// matching MIN/MAX/AVG behavior for all-NULL groups.
|
||||
let sum = IntermediateSum::default();
|
||||
assert_eq!(sum.finalize(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_finalize_returns_value_when_has_values() {
|
||||
let mut sum = IntermediateSum::default();
|
||||
// Merge in a result that has actual values
|
||||
let stats = IntermediateStats {
|
||||
count: 3,
|
||||
sum: 42.0,
|
||||
min: 10.0,
|
||||
max: 20.0,
|
||||
..Default::default()
|
||||
};
|
||||
let other = IntermediateSum::from_stats(stats);
|
||||
sum.merge_fruits(other);
|
||||
assert_eq!(sum.finalize(), Some(42.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_merge_two_empty_still_none() {
|
||||
let mut a = IntermediateSum::default();
|
||||
let b = IntermediateSum::default();
|
||||
a.merge_fruits(b);
|
||||
assert_eq!(a.finalize(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_aggregation_empty_index_default_matches_es() -> crate::Result<()> {
|
||||
use serde_json::json;
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||
|
||||
// Empty index — sum has no values to collect.
|
||||
let values: Vec<Vec<&str>> = vec![];
|
||||
let index = get_test_index_from_terms(false, &values)?;
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"score_sum": { "sum": { "field": "score" } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// Default: match Elasticsearch — empty sum serializes as 0, not null.
|
||||
assert_eq!(res["score_sum"]["value"], 0.0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_aggregation_empty_index_none_if_no_match_opt_in() -> crate::Result<()> {
|
||||
use serde_json::json;
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||
|
||||
let values: Vec<Vec<&str>> = vec![];
|
||||
let index = get_test_index_from_terms(false, &values)?;
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"score_sum": { "sum": { "field": "score", "none_if_no_match": true } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// Opt-in non-ES extension — empty sum serializes as null.
|
||||
assert!(
|
||||
res["score_sum"]["value"].is_null(),
|
||||
"expected null, got {:?}",
|
||||
res["score_sum"]["value"]
|
||||
);
|
||||
Ok(())
|
||||
Some(self.stats.finalize().sum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -287,33 +287,6 @@ impl BlockSegmentPostings {
|
||||
doc
|
||||
}
|
||||
|
||||
/// Returns the number of documents with a doc id strictly smaller than `target`
|
||||
/// (i.e. the *rank* of `target` in this posting list).
|
||||
///
|
||||
/// This jumps to the block that may contain `target` through the skip list, so no
|
||||
/// skipped block is decoded; a single block is then decoded to locate `target`
|
||||
/// within it. The cost is therefore `O(number_of_skip_list_entries)` plus one block
|
||||
/// decode, rather than `O(doc_freq)`.
|
||||
///
|
||||
/// Like [`Self::seek`], the underlying cursor only ever moves forward. This method
|
||||
/// must be called with **non-decreasing** `target` values (galloping); calling it
|
||||
/// with a `target` smaller than a previous one yields an incorrect result. `target`
|
||||
/// must be a valid doc id (i.e. `target <= TERMINATED`), exactly as for `seek`.
|
||||
///
|
||||
/// Edge cases: returns `0` when `target` is smaller than every doc id, and
|
||||
/// `doc_freq()` when `target` is larger than every doc id.
|
||||
pub fn rank(&mut self, target: DocId) -> u32 {
|
||||
if self.doc_freq == 0 {
|
||||
return 0;
|
||||
}
|
||||
// `within` = number of docs in the landed block with a doc id < target.
|
||||
let within = self.seek(target);
|
||||
// `remaining_docs` counts the landed block and everything after it, so the
|
||||
// difference is the number of docs in all blocks strictly before it.
|
||||
let docs_before_block = self.doc_freq - self.skip_reader.remaining_docs();
|
||||
docs_before_block + within as u32
|
||||
}
|
||||
|
||||
pub(crate) fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
@@ -595,38 +568,4 @@ mod tests {
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_rank() -> crate::Result<()> {
|
||||
// ~8 blocks worth of docs so the skip list is actually exercised.
|
||||
let docs: Vec<DocId> = (0..1000u32).map(|i| i * 3).collect();
|
||||
let mut block_postings = build_block_postings(&docs[..])?;
|
||||
let doc_freq = block_postings.doc_freq();
|
||||
|
||||
// rank(target) must equal the number of docs strictly below target.
|
||||
// Targets are queried in non-decreasing order, as the API requires.
|
||||
// Each `target` must be a valid doc id (<= TERMINATED), and targets must be non-decreasing.
|
||||
let targets = [
|
||||
0u32, 1, 2, 3, 4, 299, 300, 301, 1500, 2996, 2997, 3000, 10_000,
|
||||
];
|
||||
for &target in &targets {
|
||||
let expected = docs.iter().filter(|&&d| d < target).count() as u32;
|
||||
assert_eq!(
|
||||
block_postings.rank(target),
|
||||
expected,
|
||||
"rank({target}) mismatch"
|
||||
);
|
||||
}
|
||||
|
||||
// Edge cases: below the first doc -> 0, above the last doc -> doc_freq.
|
||||
let mut fresh = build_block_postings(&docs[..])?;
|
||||
assert_eq!(fresh.rank(0), 0);
|
||||
let mut fresh = build_block_postings(&docs[..])?;
|
||||
assert_eq!(fresh.rank(1_000_000), doc_freq);
|
||||
|
||||
// Empty postings: rank is always 0.
|
||||
let mut empty = BlockSegmentPostings::empty();
|
||||
assert_eq!(empty.rank(42), 0);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,12 +187,6 @@ impl SkipReader {
|
||||
self.last_doc_in_block
|
||||
}
|
||||
|
||||
/// Number of docs from the start of the current block to the end of the postings
|
||||
/// (i.e. the current block plus every block after it).
|
||||
pub(crate) fn remaining_docs(&self) -> u32 {
|
||||
self.remaining_docs
|
||||
}
|
||||
|
||||
pub fn position_offset(&self) -> u64 {
|
||||
self.position_offset
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user