use std::fmt::Debug;
use std::io;
use std::net::Ipv6Addr;

use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{
    Column, ColumnBlockAccessor, ColumnType, Dictionary, MonotonicallyMappableToU128,
    MonotonicallyMappableToU64, NumericalValue, StrColumn,
};
use common::BitSet;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};

use super::{CustomOrder, Order, OrderTarget};
use crate::aggregation::agg_data::{
    build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
};
use crate::aggregation::agg_limits::MemoryConsumption;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::buf_collector::BufAggregationCollector;
use crate::aggregation::intermediate_agg_result::{
    IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
    IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::{format_date, Key};
use crate::error::DataCorruption;
use crate::TantivyError;

/// Contains all information required by the SegmentTermCollector to perform the
/// terms aggregation on a segment.
pub struct TermsAggReqData {
    /// The column accessor to access the fast field values.
    pub accessor: Column<u64>,
    /// The type of the column.
    pub column_type: ColumnType,
    /// The string dictionary column if the field is of type text.
    pub str_dict_column: Option<StrColumn>,
    /// The missing value as a u64 value.
    pub missing_value_for_accessor: Option<u64>,
    /// The column block accessor to access the fast field values.
    pub column_block_accessor: ColumnBlockAccessor<u64>,
    /// Note: `sub_aggregation_blueprint` is filled in later, when the collectors are built.
    pub sub_aggregation_blueprint: Option<Box<dyn SegmentAggregationCollector>>,
    /// Used to build the correct nested result when we have an empty result.
    pub sug_aggregations: Aggregations,
    /// The name of the aggregation.
    pub name: String,
    /// The normalized term aggregation request.
    pub req: TermsAggregationInternal,
    /// Preloaded allowed term ords (string columns only). If set, only ords present are collected.
    pub allowed_term_ids: Option<BitSet>,
    /// True if this terms aggregation is at the top level of the aggregation tree (not nested).
    pub is_top_level: bool,
}

impl TermsAggReqData {
    /// Estimate the memory consumption of this struct in bytes.
    pub fn get_memory_consumption(&self) -> usize {
        std::mem::size_of::<Self>()
            + std::mem::size_of::<TermsAggregationInternal>()
            + self
                .allowed_term_ids
                .as_ref()
                .map(|bs| bs.len() / 8)
                .unwrap_or(0)
    }
}

/// A term filter for the `include`/`exclude` parameters: either a single regex
/// pattern (string) or a set of exact values (array of strings).
#[derive(Clone, Debug, PartialEq)]
pub enum IncludeExcludeParam {
    /// A single string pattern is treated as regex.
    Regex(String),
    /// An array of strings is treated as exact values.
    Values(Vec<String>),
}

impl Serialize for IncludeExcludeParam {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where S: serde::Serializer {
        match self {
            IncludeExcludeParam::Regex(s) => serializer.serialize_str(s),
            IncludeExcludeParam::Values(v) => v.serialize(serializer),
        }
    }
}

// Custom deserializer to accept either a single string (regex) or an array of strings (values).
impl<'de> Deserialize<'de> for IncludeExcludeParam {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where D: serde::Deserializer<'de> {
        use serde::de::{self, SeqAccess, Visitor};
        struct IncludeExcludeVisitor;

        impl<'de> Visitor<'de> for IncludeExcludeVisitor {
            type Value = IncludeExcludeParam;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("a string (regex) or an array of strings")
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where E: de::Error {
                Ok(IncludeExcludeParam::Regex(v.to_string()))
            }

            fn visit_borrowed_str<E>(self, v: &'de str) -> Result<Self::Value, E>
            where E: de::Error {
                Ok(IncludeExcludeParam::Regex(v.to_string()))
            }

            fn visit_string<E>(self, v: String) -> Result<Self::Value, E>
            where E: de::Error {
                Ok(IncludeExcludeParam::Regex(v))
            }

            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
            where A: SeqAccess<'de> {
                let mut values: Vec<String> = Vec::new();
                while let Some(elem) = seq.next_element::<String>()? {
                    values.push(elem);
                }
                Ok(IncludeExcludeParam::Values(values))
            }
        }

        deserializer.deserialize_any(IncludeExcludeVisitor)
    }
}
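
// The custom (de)serialization above maps a bare JSON string to `Regex` and a
// JSON array to `Values`. A minimal sanity-check sketch of that round-trip
// (added for illustration; it only relies on `serde_json`, which the tests
// module at the bottom of this file already uses):
#[cfg(test)]
mod include_exclude_param_serde_tests {
    use super::IncludeExcludeParam;

    #[test]
    fn string_is_regex_and_array_is_values() {
        let regex: IncludeExcludeParam = serde_json::from_str(r#""term.*""#).unwrap();
        assert_eq!(regex, IncludeExcludeParam::Regex("term.*".to_string()));

        let values: IncludeExcludeParam = serde_json::from_str(r#"["terma", "termb"]"#).unwrap();
        assert_eq!(
            values,
            IncludeExcludeParam::Values(vec!["terma".to_string(), "termb".to_string()])
        );

        // Serialization mirrors deserialization.
        assert_eq!(serde_json::to_string(&regex).unwrap(), r#""term.*""#);
        assert_eq!(serde_json::to_string(&values).unwrap(), r#"["terma","termb"]"#);
    }
}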

/// The terms aggregation allows you to group documents by unique values of a field.
///
/// It creates a bucket for every unique term and counts the number of occurrences.
/// Note that doc_count in the response buckets equals term count here.
///
/// If the text is untokenized and single-valued, there is one term per document, so the
/// term count is in fact the doc count.
///
/// ## Prerequisite
/// Term aggregations work only on [fast fields](`crate::fastfield`) of type `u64`, `f64`, `i64` and
/// text.
///
/// ## Document count error
/// To improve performance, results from one segment are cut off at `segment_size`. On an index with
/// a single segment this is fine. When combining results of multiple segments, terms that
/// don't make it into the top n of a segment increase the theoretical upper bound error by the
/// lowest term count.
///
/// Even with a larger `segment_size` value, doc_count values for a terms aggregation may be
/// approximate. As a result, any sub-aggregations on the terms aggregation may also be approximate.
/// `sum_other_doc_count` is the number of documents that didn't make it into the top size
/// terms. If this is greater than 0, you can be sure that the terms agg had to throw away some
/// buckets, either because they didn't fit into size on the root node or they didn't fit into
/// `segment_size` on the segment node.
///
/// ## Per bucket document count error
/// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will include
/// doc_count_error_upper_bound, which is an upper bound to the error on the doc_count returned by
/// each segment. It's the sum of the size of the largest bucket on each segment that didn't fit
/// into segment_size.
///
/// Result type is [`BucketResult`](crate::aggregation::agg_result::BucketResult) with
/// [`BucketEntry`](crate::aggregation::agg_result::BucketEntry) on the
/// `AggregationCollector`.
///
/// Result type is
/// [`IntermediateBucketResult`](crate::aggregation::intermediate_agg_result::IntermediateBucketResult) with
/// [`IntermediateTermBucketEntry`](crate::aggregation::intermediate_agg_result::IntermediateTermBucketEntry) on the
/// `DistributedAggregationCollector`.
///
/// # Limitations/Compatibility
///
/// Each segment returns up to [segment_size](TermsAggregation::segment_size) results. This
/// differs from Elasticsearch's behaviour.
///
/// # Request JSON Format
/// ```json
/// {
///     "genres": {
///         "terms":{ "field": "genre" }
///     }
/// }
/// ```
///
/// # Response JSON Format
/// ```json
/// {
///     ...
///     "aggregations": {
///         "genres": {
///             "doc_count_error_upper_bound": 0,
///             "sum_other_doc_count": 0,
///             "buckets": [
///                 { "key": "drumnbass", "doc_count": 6 },
///                 { "key": "reggae", "doc_count": 4 },
///                 { "key": "jazz", "doc_count": 2 }
///             ]
///         }
///     }
/// }
/// ```
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct TermsAggregation {
    /// The field to aggregate on.
    pub field: String,
    /// By default, the top 10 terms with the most documents are returned.
    /// Larger values for size are more expensive.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub size: Option<u32>,

    /// To get more accurate results, we fetch more than `size` from each segment.
    ///
    /// Increasing this value increases the cost in exchange for more accuracy.
    ///
    /// Defaults to 10 * size.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    #[serde(alias = "shard_size")]
    #[serde(alias = "split_size")]
    pub segment_size: Option<u32>,

    /// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will
    /// include doc_count_error_upper_bound, which is an upper bound to the error on the
    /// doc_count returned by each shard. It's the sum of the size of the largest bucket on
    /// each segment that didn't fit into `shard_size`.
    ///
    /// Defaults to true when ordering by count desc.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub show_term_doc_count_error: Option<bool>,

    /// Filter all terms that are lower than `min_doc_count`. Defaults to 1.
    ///
    /// **Expensive**: When set to 0, this will return all terms in the field.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub min_doc_count: Option<u64>,

    /// Set the order. `String` is here a target, which is either "_count", "_key", or the name of
    /// a metric sub_aggregation.
    ///
    /// Single-value metrics like average can be addressed by their name.
    /// Multi-value metrics like stats must address the specific value by name, e.g.
    /// "stats.avg".
    ///
    /// Examples in JSON format:
    /// { "_count": "asc" }
    /// { "_key": "asc" }
    /// { "average_price": "asc" }
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub order: Option<CustomOrder>,

    /// The missing parameter defines how documents that are missing a value should be treated.
    /// By default they will be ignored, but it is also possible to treat them as if they had a
    /// value. Examples in JSON format:
    /// { "missing": "NO_DATA" }
    ///
    /// # Internal
    ///
    /// Internally, `missing` requires some specialized handling in some scenarios.
    ///
    /// Simple Case:
    /// In the simplest case, we can just put the missing value in the term map and use that. In
    /// the case of text we put a special u64::MAX sentinel and replace it at the end with the
    /// actual missing value, when loading the text.
    /// Special Case 1:
    /// If we have multiple columns on one field, we need a union of the indices of both
    /// columns to find docids without a value. That requires a special missing aggregation.
    /// Special Case 2: if the key is of type text and the column is numerical, we also need to use
    /// the special missing aggregation, since there is no mechanism in the numerical column to
    /// add text.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub missing: Option<Key>,

    /// Include terms by either regex (single string) or exact values (array).
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub include: Option<IncludeExcludeParam>,
    /// Exclude terms by either regex (single string) or exact values (array).
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub exclude: Option<IncludeExcludeParam>,
}

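// A `TermsAggregation` is usually built by deserializing a JSON request like the
// one shown in the doc above. A minimal sketch (note that `shard_size` is
// declared above as a serde alias for `segment_size`):
#[cfg(test)]
mod terms_aggregation_request_tests {
    use super::TermsAggregation;

    #[test]
    fn deserialize_terms_request() {
        let req: TermsAggregation =
            serde_json::from_str(r#"{ "field": "genre", "size": 5, "shard_size": 50 }"#).unwrap();
        assert_eq!(req.field, "genre");
        assert_eq!(req.size, Some(5));
        // Parsed through the `shard_size` serde alias.
        assert_eq!(req.segment_size, Some(50));
    }
}
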
/// Same as TermsAggregation, but with populated defaults.
#[derive(Clone, Debug, PartialEq)]
pub struct TermsAggregationInternal {
    /// The field to aggregate on.
    pub field: String,
    /// By default, the top 10 terms with the most documents are returned.
    /// Larger values for size are more expensive.
    ///
    /// Defaults to 10.
    pub size: u32,

    /// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will
    /// include doc_count_error_upper_bound, which is an upper bound to the error on the
    /// doc_count returned by each shard. It's the sum of the size of the largest bucket on
    /// each segment that didn't fit into `segment_size`.
    pub show_term_doc_count_error: bool,

    /// To get more accurate results, we fetch more than `size` from each segment.
    ///
    /// Increasing this value increases the cost in exchange for more accuracy.
    pub segment_size: u32,

    /// Filter all terms that are lower than `min_doc_count`. Defaults to 1.
    ///
    /// *Expensive*: When set to 0, this will return all terms in the field.
    pub min_doc_count: u64,

    /// Set the order. `String` is here a target, which is either "_count", "_key", or the name of
    /// a metric sub_aggregation.
    pub order: CustomOrder,

    /// The missing parameter defines how documents that are missing a value should be treated.
    pub missing: Option<Key>,
}

impl TermsAggregationInternal {
    pub(crate) fn from_req(req: &TermsAggregation) -> Self {
        let size = req.size.unwrap_or(10);

        let mut segment_size = req.segment_size.unwrap_or(size * 10);

        let order = req.order.clone().unwrap_or_default();
        segment_size = segment_size.max(size);
        TermsAggregationInternal {
            field: req.field.to_string(),
            size,
            segment_size,
            show_term_doc_count_error: req
                .show_term_doc_count_error
                .unwrap_or_else(|| order == CustomOrder::default()),
            min_doc_count: req.min_doc_count.unwrap_or(1),
            order,
            missing: req.missing.clone(),
        }
    }
}

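// How the defaults are resolved by `from_req`: `size` falls back to 10,
// `segment_size` to `10 * size` (but never below `size`), and `min_doc_count`
// to 1. A minimal sketch of that behavior:
#[cfg(test)]
mod terms_aggregation_defaults_tests {
    use super::{TermsAggregation, TermsAggregationInternal};

    #[test]
    fn from_req_populates_defaults() {
        let req = TermsAggregation {
            field: "genre".to_string(),
            ..Default::default()
        };
        let internal = TermsAggregationInternal::from_req(&req);
        assert_eq!(internal.size, 10);
        assert_eq!(internal.segment_size, 100);
        assert_eq!(internal.min_doc_count, 1);
        // With the default order, the doc count error bound is reported.
        assert!(internal.show_term_doc_count_error);
    }
}
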
impl<'a> From<&'a dyn SegmentAggregationCollector> for BufAggregationCollector {
    #[inline(always)]
    fn from(sub_agg_blueprint: &'a dyn SegmentAggregationCollector) -> Self {
        let sub_agg = sub_agg_blueprint.clone_box();
        BufAggregationCollector::new(sub_agg)
    }
}

#[derive(Debug, Clone)]
struct BoxedAggregation(Box<dyn SegmentAggregationCollector>);

impl<'a> From<&'a dyn SegmentAggregationCollector> for BoxedAggregation {
    #[inline(always)]
    fn from(sub_agg_blueprint: &'a dyn SegmentAggregationCollector) -> Self {
        BoxedAggregation(sub_agg_blueprint.clone_box())
    }
}

impl SegmentAggregationCollector for BoxedAggregation {
    #[inline(always)]
    fn add_intermediate_aggregation_result(
        self: Box<Self>,
        agg_data: &AggregationsSegmentCtx,
        results: &mut IntermediateAggregationResults,
    ) -> crate::Result<()> {
        self.0
            .add_intermediate_aggregation_result(agg_data, results)
    }

    #[inline(always)]
    fn collect(
        &mut self,
        doc: crate::DocId,
        agg_data: &mut AggregationsSegmentCtx,
    ) -> crate::Result<()> {
        self.0.collect(doc, agg_data)
    }

    #[inline(always)]
    fn collect_block(
        &mut self,
        docs: &[crate::DocId],
        agg_data: &mut AggregationsSegmentCtx,
    ) -> crate::Result<()> {
        self.0.collect_block(docs, agg_data)
    }
}

#[derive(Debug, Clone, Copy)]
struct NoSubAgg;

impl SegmentAggregationCollector for NoSubAgg {
    #[inline(always)]
    fn add_intermediate_aggregation_result(
        self: Box<Self>,
        _agg_data: &AggregationsSegmentCtx,
        _results: &mut IntermediateAggregationResults,
    ) -> crate::Result<()> {
        Ok(())
    }

    #[inline(always)]
    fn collect(
        &mut self,
        _doc: crate::DocId,
        _agg_data: &mut AggregationsSegmentCtx,
    ) -> crate::Result<()> {
        Ok(())
    }

    #[inline(always)]
    fn collect_block(
        &mut self,
        _docs: &[crate::DocId],
        _agg_data: &mut AggregationsSegmentCtx,
    ) -> crate::Result<()> {
        Ok(())
    }
}

/// Build a concrete `SegmentTermCollector` with either a Vec- or HashMap-backed
/// bucket storage, depending on the column type and aggregation level.
pub(crate) fn build_segment_term_collector(
    req_data: &mut AggregationsSegmentCtx,
    node: &AggRefNode,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
    let accessor_idx = node.idx_in_req_data;
    let column_type = {
        let terms_req_data = req_data.get_term_req_data(accessor_idx);
        terms_req_data.column_type
    };

    if column_type == ColumnType::Bytes {
        return Err(TantivyError::InvalidArgument(format!(
            "terms aggregation is not supported for column type {column_type:?}"
        )));
    }

    // Validate that the sub-aggregation exists when ordering by sub-aggregation.
    {
        let terms_req_data = req_data.get_term_req_data(accessor_idx);
        if let OrderTarget::SubAggregation(sub_agg_name) = &terms_req_data.req.order.target {
            let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);

            node.get_sub_agg(agg_name, &req_data.per_request)
                .ok_or_else(|| {
                    TantivyError::InvalidArgument(format!(
                        "could not find aggregation with name {agg_name} in metric \
                         sub_aggregations"
                    ))
                })?;
        }
    }

    // Build the sub-aggregation blueprint if there are children.
    let has_sub_aggregations = !node.children.is_empty();
    let blueprint = if has_sub_aggregations {
        let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
        Some(sub_aggregation)
    } else {
        None
    };
    {
        let terms_req_data_mut = req_data.get_term_req_data_mut(accessor_idx);
        terms_req_data_mut.sub_aggregation_blueprint = blueprint;
    }

    // Decide whether to use a Vec-backed or HashMap-backed bucket storage.
    let terms_req_data = req_data.get_term_req_data(accessor_idx);

    // TODO: A better metric instead of is_top_level would be the number of buckets expected.
    // E.g. if the term agg is not top level, but the parent is a bucket agg with less than 10
    // buckets, we can still use a Vec.
    let can_use_vec = terms_req_data.is_top_level;

    // TODO: Benchmark to validate the threshold
    const MAX_NUM_TERMS_FOR_VEC: usize = 100;

    // Let's see if we can use a vec to aggregate our data
    // instead of a hashmap.
    let col_max_value = terms_req_data.accessor.max_value();
    let max_term: usize =
        col_max_value.max(terms_req_data.missing_value_for_accessor.unwrap_or(0u64)) as usize;

    // If the term space is compact enough, we:
    // - use a Vec instead of a hashmap for our aggregation.
    // - buffer the aggregation of our child aggregations (if any).
    #[allow(clippy::collapsible_else_if)]
    if can_use_vec && max_term < MAX_NUM_TERMS_FOR_VEC {
        if has_sub_aggregations {
            let sub_agg_blueprint = &req_data
                .get_term_req_data_mut(accessor_idx)
                .sub_aggregation_blueprint
                .as_ref()
                .ok_or_else(|| {
                    // The blueprint was set just above, so this is an internal
                    // invariant violation.
                    TantivyError::InternalError("Sub-aggregation blueprint not found".to_string())
                })?;
            let term_buckets = VecTermBuckets::new(max_term + 1, || {
                let collector_clone = sub_agg_blueprint.clone_box();
                BufAggregationCollector::new(collector_clone)
            });
            let collector = SegmentTermCollector {
                term_buckets,
                accessor_idx,
            };
            Ok(Box::new(collector))
        } else {
            let term_buckets = VecTermBuckets::new(max_term + 1, || NoSubAgg);
            let collector = SegmentTermCollector {
                term_buckets,
                accessor_idx,
            };
            Ok(Box::new(collector))
        }
    } else {
        if has_sub_aggregations {
            let term_buckets: HashMapTermBuckets<BoxedAggregation> = HashMapTermBuckets::default();
            let collector: SegmentTermCollector<HashMapTermBuckets<BoxedAggregation>> =
                SegmentTermCollector {
                    term_buckets,
                    accessor_idx,
                };
            Ok(Box::new(collector))
        } else {
            let term_buckets: HashMapTermBuckets<NoSubAgg> = HashMapTermBuckets::default();
            let collector: SegmentTermCollector<HashMapTermBuckets<NoSubAgg>> =
                SegmentTermCollector {
                    term_buckets,
                    accessor_idx,
                };
            Ok(Box::new(collector))
        }
    }
}

#[derive(Debug, Clone)]
struct Bucket<SubAgg> {
    pub count: u32,
    pub sub_agg: SubAgg,
}

impl<SubAgg> Bucket<SubAgg> {
    #[inline(always)]
    fn new(sub_agg: SubAgg) -> Self {
        Self { count: 0, sub_agg }
    }
}

/// Abstraction over the storage used for term buckets.
trait TermAggregationMap: Clone + Debug + 'static {
    type SubAggregation: SegmentAggregationCollector + Debug + Clone + 'static;

    /// Estimate the memory consumption of this struct in bytes.
    fn get_memory_consumption(&self) -> usize;

    /// Returns the bucket associated with a given term_id.
    fn term_entry(
        &mut self,
        term_id: u64,
        blue_print: &dyn SegmentAggregationCollector,
    ) -> &mut Bucket<Self::SubAggregation>;

    /// If the tree of aggregations contains buffered aggregations, flush them.
    fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()>;

    /// Returns the term aggregation as a vector of (term_id, bucket) pairs,
    /// in any order.
    fn into_vec(self) -> Vec<(u64, Bucket<Self::SubAggregation>)>;
}

#[derive(Clone, Debug)]
struct HashMapTermBuckets<SubAgg> {
    bucket_map: FxHashMap<u64, Bucket<SubAgg>>,
}

impl<SubAgg> Default for HashMapTermBuckets<SubAgg> {
    #[inline(always)]
    fn default() -> Self {
        Self {
            bucket_map: FxHashMap::default(),
        }
    }
}

impl<
        SubAgg: Debug
            + Clone
            + SegmentAggregationCollector
            + for<'a> From<&'a dyn SegmentAggregationCollector>
            + 'static,
    > TermAggregationMap for HashMapTermBuckets<SubAgg>
{
    type SubAggregation = SubAgg;

    #[inline]
    fn get_memory_consumption(&self) -> usize {
        self.bucket_map.memory_consumption()
    }

    #[inline(always)]
    fn term_entry(
        &mut self,
        term_id: u64,
        sub_agg_blueprint: &dyn SegmentAggregationCollector,
    ) -> &mut Bucket<SubAgg> {
        self.bucket_map
            .entry(term_id)
            .or_insert_with(|| Bucket::new(SubAgg::from(sub_agg_blueprint)))
    }

    #[inline(always)]
    fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
        for bucket in self.bucket_map.values_mut() {
            bucket.sub_agg.flush(agg_data)?;
        }
        Ok(())
    }

    fn into_vec(self) -> Vec<(u64, Bucket<SubAgg>)> {
        self.bucket_map.into_iter().collect()
    }
}

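// A minimal sketch of how the hashmap-backed storage behaves: `term_entry`
// lazily creates one bucket per term ord (building the sub-aggregation from the
// blueprint) and the caller increments the count. `NoSubAgg` stands in for a
// real sub-aggregation here.
#[cfg(test)]
mod hashmap_term_buckets_tests {
    use super::*;

    #[test]
    fn hashmap_buckets_count_occurrences() {
        let mut buckets: HashMapTermBuckets<NoSubAgg> = HashMapTermBuckets::default();
        buckets.term_entry(42, &NoSubAgg).count += 1;
        buckets.term_entry(42, &NoSubAgg).count += 1;
        buckets.term_entry(7, &NoSubAgg).count += 1;

        let mut entries = buckets.into_vec();
        entries.sort_unstable_by_key(|(term_id, _)| *term_id);
        assert_eq!(entries[0].0, 7);
        assert_eq!(entries[0].1.count, 1);
        assert_eq!(entries[1].0, 42);
        assert_eq!(entries[1].1.count, 2);
    }
}
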
/// An optimized term map implementation for a compact set of term ordinals.
#[derive(Clone, Debug)]
struct VecTermBuckets<SubAgg> {
    buckets: Vec<Bucket<SubAgg>>,
}

impl<SubAgg> VecTermBuckets<SubAgg> {
    fn new(num_terms: usize, item_factory_fn: impl Fn() -> SubAgg) -> Self {
        VecTermBuckets {
            buckets: std::iter::repeat_with(item_factory_fn)
                .map(Bucket::new)
                .take(num_terms)
                .collect(),
        }
    }
}

impl<SubAgg: Debug + Clone + SegmentAggregationCollector + 'static> TermAggregationMap
    for VecTermBuckets<SubAgg>
{
    type SubAggregation = SubAgg;

    /// Estimate the memory consumption of this struct in bytes.
    fn get_memory_consumption(&self) -> usize {
        // We do not include `std::mem::size_of::<Self>()`.
        // It is already measured by the parent aggregation.
        //
        // The root aggregation mem size is not measured, but we do not care.
        self.buckets.capacity() * std::mem::size_of::<Bucket<SubAgg>>()
    }

    /// Returns the bucket for the given term id; the caller increments the count.
    #[inline(always)]
    fn term_entry(
        &mut self,
        term_id: u64,
        _sub_agg_blueprint: &dyn SegmentAggregationCollector,
    ) -> &mut Bucket<SubAgg> {
        let term_id_usize = term_id as usize;
        debug_assert!(
            term_id_usize < self.buckets.len(),
            "term_id {} out of bounds for VecTermBuckets (len={})",
            term_id,
            self.buckets.len()
        );
        unsafe { self.buckets.get_unchecked_mut(term_id_usize) }
    }

    #[inline(always)]
    fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
        for bucket in &mut self.buckets {
            if bucket.count > 0 {
                bucket.sub_agg.flush(agg_data)?;
            }
        }
        Ok(())
    }

    fn into_vec(self) -> Vec<(u64, Bucket<SubAgg>)> {
        self.buckets
            .into_iter()
            .enumerate()
            .filter(|(_, bucket)| bucket.count > 0)
            .map(|(term_id, bucket)| (term_id as u64, bucket))
            .collect()
    }
}

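// The Vec-backed storage indexes buckets directly by term ordinal; `into_vec`
// then drops the empty slots. A minimal sketch with `NoSubAgg` placeholders:
#[cfg(test)]
mod vec_term_buckets_tests {
    use super::*;

    #[test]
    fn vec_buckets_skip_empty_slots() {
        let mut buckets = VecTermBuckets::new(4, || NoSubAgg);
        buckets.term_entry(2, &NoSubAgg).count += 1;
        buckets.term_entry(2, &NoSubAgg).count += 1;
        buckets.term_entry(0, &NoSubAgg).count += 1;

        let mut entries = buckets.into_vec();
        entries.sort_unstable_by_key(|(term_id, _)| *term_id);
        assert_eq!(entries.len(), 2);
        assert_eq!((entries[0].0, entries[0].1.count), (0, 1));
        assert_eq!((entries[1].0, entries[1].1.count), (2, 2));
    }
}
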
impl<'a> From<&'a dyn SegmentAggregationCollector> for NoSubAgg {
    #[inline(always)]
    fn from(_: &'a dyn SegmentAggregationCollector) -> Self {
        Self
    }
}

/// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype.
#[derive(Clone, Debug)]
struct SegmentTermCollector<TermMap> {
    /// The buckets containing the aggregation data.
    term_buckets: TermMap,
    accessor_idx: usize,
}

pub(crate) fn get_agg_name_and_property(name: &str) -> (&str, &str) {
    let (agg_name, agg_property) = name.split_once('.').unwrap_or((name, ""));
    (agg_name, agg_property)
}

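// `get_agg_name_and_property` splits an order target like "stats.avg" into the
// sub-aggregation name and its property; a bare name yields an empty property.
// A small sanity-check sketch:
#[cfg(test)]
mod agg_name_property_tests {
    use super::get_agg_name_and_property;

    #[test]
    fn splits_name_and_property() {
        assert_eq!(get_agg_name_and_property("stats.avg"), ("stats", "avg"));
        assert_eq!(get_agg_name_and_property("avg_price"), ("avg_price", ""));
    }
}
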
impl<TermMap> SegmentAggregationCollector for SegmentTermCollector<TermMap>
where
    TermMap: TermAggregationMap,
    TermMap::SubAggregation: for<'a> From<&'a dyn SegmentAggregationCollector>,
{
    fn add_intermediate_aggregation_result(
        self: Box<Self>,
        agg_data: &AggregationsSegmentCtx,
        results: &mut IntermediateAggregationResults,
    ) -> crate::Result<()> {
        let name = agg_data.get_term_req_data(self.accessor_idx).name.clone();
        let bucket = self.into_intermediate_bucket_result(agg_data)?;
        results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
        Ok(())
    }

    #[inline]
    fn collect(
        &mut self,
        doc: crate::DocId,
        agg_data: &mut AggregationsSegmentCtx,
    ) -> crate::Result<()> {
        self.collect_block(&[doc], agg_data)
    }

    #[inline]
    fn collect_block(
        &mut self,
        docs: &[crate::DocId],
        agg_data: &mut AggregationsSegmentCtx,
    ) -> crate::Result<()> {
        let mut req_data = agg_data.take_term_req_data(self.accessor_idx);

        let mem_pre = self.get_memory_consumption();

        if let Some(missing) = req_data.missing_value_for_accessor {
            req_data.column_block_accessor.fetch_block_with_missing(
                docs,
                &req_data.accessor,
                missing,
            );
        } else {
            req_data
                .column_block_accessor
                .fetch_block(docs, &req_data.accessor);
        }

        if std::any::TypeId::of::<NoSubAgg>() == std::any::TypeId::of::<TermMap::SubAggregation>() {
            for term_id in req_data.column_block_accessor.iter_vals() {
                if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
                    if !allowed_bs.contains(term_id as u32) {
                        continue;
                    }
                }
                let bucket = self.term_buckets.term_entry(term_id, &NoSubAgg);
                bucket.count += 1;
            }
        } else {
            let Some(sub_aggregation_blueprint) = req_data.sub_aggregation_blueprint.as_deref()
            else {
                return Err(TantivyError::InternalError(
                    "Could not find sub-aggregation blueprint".to_string(),
                ));
            };
            for (doc, term_id) in req_data
                .column_block_accessor
                .iter_docid_vals(docs, &req_data.accessor)
            {
                if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
                    if !allowed_bs.contains(term_id as u32) {
                        continue;
                    }
                }
                let bucket = self
                    .term_buckets
                    .term_entry(term_id, sub_aggregation_blueprint);
                bucket.count += 1;
                bucket.sub_agg.collect(doc, agg_data)?;
            }
        }

        let mem_delta = self.get_memory_consumption() - mem_pre;
        if mem_delta > 0 {
            agg_data
                .context
                .limits
                .add_memory_consumed(mem_delta as u64)?;
        }
        agg_data.put_back_term_req_data(self.accessor_idx, req_data);

        Ok(())
    }

    #[inline(always)]
    fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
        self.term_buckets.flush(agg_data)?;
        Ok(())
    }
}

/// Missing values are represented as a sentinel value in the column.
///
/// This function extracts the missing value from the entries vector,
/// computes the intermediate key, and returns the key and the bucket
/// in an Option.
fn extract_missing_value<T>(
    entries: &mut Vec<(u64, T)>,
    term_req: &TermsAggReqData,
) -> Option<(IntermediateKey, T)> {
    let missing_sentinel = term_req.missing_value_for_accessor?;
    let missing_value_entry_pos = entries
        .iter()
        .position(|(term_id, _)| *term_id == missing_sentinel)?;
    let (_term_id, bucket) = entries.swap_remove(missing_value_entry_pos);
    let missing_key = term_req.req.missing.as_ref()?;
    let key = match missing_key {
        Key::Str(missing) => IntermediateKey::Str(missing.clone()),
        Key::F64(val) => IntermediateKey::F64(*val),
        Key::U64(val) => IntermediateKey::U64(*val),
        Key::I64(val) => IntermediateKey::I64(*val),
    };
    Some((key, bucket))
}

impl<TermMap> SegmentTermCollector<TermMap>
where TermMap: TermAggregationMap
{
    fn get_memory_consumption(&self) -> usize {
        self.term_buckets.get_memory_consumption()
    }

    #[inline]
    pub(crate) fn into_intermediate_bucket_result(
        self,
        agg_data: &AggregationsSegmentCtx,
    ) -> crate::Result<IntermediateBucketResult> {
        let term_req = agg_data.get_term_req_data(self.accessor_idx);
        let mut entries: Vec<(u64, Bucket<TermMap::SubAggregation>)> = self.term_buckets.into_vec();

        let order_by_sub_aggregation =
            matches!(term_req.req.order.target, OrderTarget::SubAggregation(_));

        match &term_req.req.order.target {
            OrderTarget::Key => {
                // We rely on the fact that term ordinals match the order of the strings.
                // TODO: We could have a special collector, that keeps only TOP n results at any
                // time.
                if term_req.req.order.order == Order::Desc {
                    entries.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.0));
                } else {
                    entries.sort_unstable_by_key(|bucket| bucket.0);
                }
            }
            OrderTarget::SubAggregation(_name) => {
                // Don't sort and cut off, since it's hard to make assumptions on the quality of
                // the results when cutting off, due to the unknown nature of the sub_aggregation
                // (possible to check).
            }
            OrderTarget::Count => {
                if term_req.req.order.order == Order::Desc {
                    entries.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.1.count));
                } else {
                    entries.sort_unstable_by_key(|bucket| bucket.1.count);
                }
            }
        }

        let (term_doc_count_before_cutoff, sum_other_doc_count) = if order_by_sub_aggregation {
            (0, 0)
        } else {
            cut_off_buckets(&mut entries, term_req.req.segment_size as usize)
        };

        let mut dict: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> = Default::default();
        dict.reserve(entries.len());

        let into_intermediate_bucket_entry =
            |bucket: Bucket<TermMap::SubAggregation>| -> crate::Result<IntermediateTermBucketEntry> {
                let intermediate_entry = if term_req.sub_aggregation_blueprint.as_ref().is_some() {
                    let mut sub_aggregation_res = IntermediateAggregationResults::default();
                    // TODO remove box new
                    Box::new(bucket.sub_agg)
                        .add_intermediate_aggregation_result(agg_data, &mut sub_aggregation_res)?;
                    IntermediateTermBucketEntry {
                        doc_count: bucket.count,
                        sub_aggregation: sub_aggregation_res,
                    }
                } else {
                    IntermediateTermBucketEntry {
                        doc_count: bucket.count,
                        sub_aggregation: Default::default(),
                    }
                };
                Ok(intermediate_entry)
            };

        if term_req.column_type == ColumnType::Str {
            let fallback_dict = Dictionary::empty();
            let term_dict = term_req
                .str_dict_column
                .as_ref()
                .map(|el| el.dictionary())
                .unwrap_or_else(|| &fallback_dict);

            if let Some((intermediate_key, bucket)) = extract_missing_value(&mut entries, term_req)
            {
                let intermediate_entry = into_intermediate_bucket_entry(bucket)?;
                dict.insert(intermediate_key, intermediate_entry);
            }

            // Sort by term ord
            entries.sort_unstable_by_key(|bucket| bucket.0);

            let (term_ids, buckets): (Vec<u64>, Vec<Bucket<TermMap::SubAggregation>>) =
                entries.into_iter().unzip();
            let mut buckets_it = buckets.into_iter();

            term_dict.sorted_ords_to_term_cb(term_ids.into_iter(), |term| {
                let bucket = buckets_it.next().unwrap();
                let intermediate_entry =
                    into_intermediate_bucket_entry(bucket).map_err(io::Error::other)?;
                dict.insert(
                    IntermediateKey::Str(
                        String::from_utf8(term.to_vec()).expect("could not convert to String"),
                    ),
                    intermediate_entry,
                );
                Ok(())
            })?;

            if term_req.req.min_doc_count == 0 {
                // TODO: Handle rev streaming for descending sorting by keys
                let mut stream = term_dict.stream()?;
                let empty_sub_aggregation =
                    IntermediateAggregationResults::empty_from_req(&term_req.sug_aggregations);
                while stream.advance() {
                    if dict.len() >= term_req.req.segment_size as usize {
                        break;
                    }

                    // Respect allowed filters if present
                    if let Some(allowed_bs) = term_req.allowed_term_ids.as_ref() {
                        if !allowed_bs.contains(stream.term_ord() as u32) {
                            continue;
                        }
                    }

                    let key = IntermediateKey::Str(
                        std::str::from_utf8(stream.key())
                            .map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?
                            .to_string(),
                    );

                    dict.entry(key.clone())
                        .or_insert_with(|| IntermediateTermBucketEntry {
                            doc_count: 0,
                            sub_aggregation: empty_sub_aggregation.clone(),
                        });
                }
            }
        } else if term_req.column_type == ColumnType::DateTime {
            for (val, bucket) in entries {
                let intermediate_entry = into_intermediate_bucket_entry(bucket)?;
                let val = i64::from_u64(val);
                let date = format_date(val)?;
                dict.insert(IntermediateKey::Str(date), intermediate_entry);
            }
        } else if term_req.column_type == ColumnType::Bool {
            for (val, bucket) in entries {
                let intermediate_entry = into_intermediate_bucket_entry(bucket)?;
                let val = bool::from_u64(val);
                dict.insert(IntermediateKey::Bool(val), intermediate_entry);
            }
        } else if term_req.column_type == ColumnType::IpAddr {
            let compact_space_accessor = term_req
                .accessor
                .values
                .clone()
                .downcast_arc::<CompactSpaceU64Accessor>()
                .map_err(|_| {
                    TantivyError::AggregationError(
                        crate::aggregation::AggregationError::InternalError(
                            "Type mismatch: Could not downcast to CompactSpaceU64Accessor"
                                .to_string(),
                        ),
                    )
                })?;

            for (val, bucket) in entries {
                let intermediate_entry = into_intermediate_bucket_entry(bucket)?;
                let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
                let val = Ipv6Addr::from_u128(val);
                dict.insert(IntermediateKey::IpAddr(val), intermediate_entry);
            }
        } else {
            for (val, bucket) in entries {
                let intermediate_entry = into_intermediate_bucket_entry(bucket)?;
                if term_req.column_type == ColumnType::U64 {
                    dict.insert(IntermediateKey::U64(val), intermediate_entry);
                } else if term_req.column_type == ColumnType::I64 {
                    dict.insert(IntermediateKey::I64(i64::from_u64(val)), intermediate_entry);
                } else {
                    let val = f64::from_u64(val);
                    let val: NumericalValue = val.into();

                    match val.normalize() {
                        NumericalValue::U64(val) => {
                            dict.insert(IntermediateKey::U64(val), intermediate_entry);
                        }
                        NumericalValue::I64(val) => {
                            dict.insert(IntermediateKey::I64(val), intermediate_entry);
                        }
                        NumericalValue::F64(val) => {
                            dict.insert(IntermediateKey::F64(val), intermediate_entry);
                        }
                    }
                }
            }
        }

        Ok(IntermediateBucketResult::Terms {
            buckets: IntermediateTermBucketResult {
                entries: dict,
                sum_other_doc_count,
                doc_count_error_upper_bound: term_doc_count_before_cutoff,
            },
        })
    }
}

pub(crate) trait GetDocCount {
    fn doc_count(&self) -> u64;
}

impl GetDocCount for (String, IntermediateTermBucketEntry) {
    fn doc_count(&self) -> u64 {
        self.1.doc_count as u64
    }
}

impl<SubAgg> GetDocCount for (u64, Bucket<SubAgg>) {
    fn doc_count(&self) -> u64 {
        self.1.count as u64
    }
}

pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
    entries: &mut Vec<T>,
    num_elem: usize,
) -> (u64, u64) {
    let term_doc_count_before_cutoff = entries
        .get(num_elem)
        .map(|entry| entry.doc_count())
        .unwrap_or(0);

    let sum_other_doc_count = entries
        .get(num_elem..)
        .map(|cut_off_range| cut_off_range.iter().map(|entry| entry.doc_count()).sum())
        .unwrap_or(0);

    entries.truncate(num_elem);
    (term_doc_count_before_cutoff, sum_other_doc_count)
}

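// `cut_off_buckets` truncates to `num_elem` entries and reports the doc_count of
// the first dropped bucket (the per-segment error bound) plus the total count of
// everything dropped. A minimal sketch using the `NoSubAgg` bucket type:
#[cfg(test)]
mod cut_off_buckets_tests {
    use super::*;

    fn bucket(count: u32) -> Bucket<NoSubAgg> {
        Bucket {
            count,
            sub_agg: NoSubAgg,
        }
    }

    #[test]
    fn reports_error_bound_and_sum_other_doc_count() {
        // Sorted by descending count, as the callers guarantee.
        let mut entries = vec![
            (0u64, bucket(5)),
            (1, bucket(3)),
            (2, bucket(2)),
            (3, bucket(1)),
        ];
        let (error_bound, sum_other) = cut_off_buckets(&mut entries, 2);
        assert_eq!(error_bound, 2); // doc_count of the first cut-off bucket
        assert_eq!(sum_other, 3); // 2 + 1
        assert_eq!(entries.len(), 2);
    }
}
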
#[cfg(test)]
|
||
mod tests {
|
||
use std::net::IpAddr;
|
||
use std::str::FromStr;
|
||
|
||
use common::DateTime;
|
||
use time::{Date, Month};
|
||
|
||
use crate::aggregation::agg_req::Aggregations;
|
||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||
use crate::aggregation::tests::{
|
||
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
|
||
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
||
};
|
||
use crate::aggregation::{AggregationLimitsGuard, DistributedAggregationCollector};
|
||
use crate::indexer::NoMergePolicy;
|
||
use crate::query::AllQuery;
|
||
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
||
use crate::{Index, IndexWriter};
|
||
|
||
#[test]
|
||
fn terms_aggregation_test_single_segment() -> crate::Result<()> {
|
||
terms_aggregation_test_merge_segment(true)
|
||
}
|
||
#[test]
|
||
fn terms_aggregation_test() -> crate::Result<()> {
|
||
terms_aggregation_test_merge_segment(false)
|
||
}
|
||
fn terms_aggregation_test_merge_segment(merge_segments: bool) -> crate::Result<()> {
|
||
let segment_and_terms = vec![
|
||
vec!["terma"],
|
||
vec!["termb"],
|
||
vec!["termc"],
|
||
vec!["terma"],
|
||
vec!["terma"],
|
||
vec!["terma"],
|
||
vec!["termb"],
|
||
vec!["terma"],
|
||
];
|
||
let index = get_test_index_from_terms(merge_segments, &segment_and_terms)?;
|
||
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
},
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
|
||
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 1);
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"size": 2,
|
||
"segment_size": 2
|
||
},
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
|
||
assert_eq!(
|
||
res["my_texts"]["buckets"][2]["key"],
|
||
serde_json::Value::Null
|
||
);
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 1);
|
||
|
||
// include filter: only terma and termc
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"include": ["terma", "termc"],
|
||
},
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// exclude filter: remove termc
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"exclude": ["termc"],
|
||
},
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// include regex (single string): only termb
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"include": "termb",
|
||
},
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// include regex (term.*) with exclude regex (termc): expect terma and termb
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"include": "term.*",
|
||
"exclude": "termc",
|
||
},
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// test min_doc_count
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"size": 2,
|
||
"min_doc_count": 3,
|
||
},
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
|
||
assert_eq!(
|
||
res["my_texts"]["buckets"][1]["key"],
|
||
serde_json::Value::Null
|
||
);
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0); // TODO sum_other_doc_count with min_doc_count
|
||
Ok(())
|
||
}
|
||
|
||
#[test]
|
||
fn terms_aggregation_test_order_count_single_segment() -> crate::Result<()> {
|
||
terms_aggregation_test_order_count_merge_segment(true)
|
||
}
|
||
#[test]
|
||
fn terms_aggregation_test_count_order() -> crate::Result<()> {
|
||
terms_aggregation_test_order_count_merge_segment(false)
|
||
}
|
||
fn terms_aggregation_test_order_count_merge_segment(merge_segments: bool) -> crate::Result<()> {
|
||
let segment_and_terms = vec![
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(2.0, "termb".to_string())],
|
||
vec![(2.0, "terma".to_string())],
|
||
vec![(1.0, "termc".to_string())],
|
||
vec![(1.0, "termc".to_string())],
|
||
vec![(1.0, "termc".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(8.0, "termb".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
];
|
||
let index = get_test_index_from_values_and_terms(merge_segments, &segment_and_terms)?;
|
||
|
||
let sub_agg: Aggregations = serde_json::from_value(json!({
|
||
"avg_score": {
|
||
"avg": {
|
||
"field": "score",
|
||
}
|
||
},
|
||
"stats_score": {
|
||
"stats": {
|
||
"field": "score",
|
||
}
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
// sub agg desc
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"order": {
|
||
"_count": "asc",
|
||
},
|
||
},
|
||
"aggs": sub_agg,
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["buckets"][0]["avg_score"]["value"], 5.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 3);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["avg_score"]["value"], 1.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 6);
|
||
assert_eq!(res["my_texts"]["buckets"][2]["avg_score"]["value"], 4.5);
|
||
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// Agg on non string
|
||
//
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_scores1": {
|
||
"terms": {
|
||
"field": "score",
|
||
"order": {
|
||
"_count": "asc",
|
||
},
|
||
},
|
||
"aggs": sub_agg,
|
||
},
|
||
"my_scores2": {
|
||
"terms": {
|
||
"field": "score_f64",
|
||
"order": {
|
||
"_count": "asc",
|
||
},
|
||
},
|
||
"aggs": sub_agg,
|
||
},
|
||
"my_scores3": {
|
||
"terms": {
|
||
"field": "score_i64",
|
||
"order": {
|
||
"_count": "asc",
|
||
},
|
||
},
|
||
"aggs": sub_agg,
|
||
}
|
||
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_scores1"]["buckets"][0]["key"], 8.0);
|
||
assert_eq!(res["my_scores1"]["buckets"][0]["doc_count"], 1);
|
||
assert_eq!(res["my_scores1"]["buckets"][0]["avg_score"]["value"], 8.0);
|
||
|
||
assert_eq!(res["my_scores1"]["buckets"][1]["key"], 2.0);
|
||
assert_eq!(res["my_scores1"]["buckets"][1]["doc_count"], 2);
|
||
assert_eq!(res["my_scores1"]["buckets"][1]["avg_score"]["value"], 2.0);
|
||
|
||
assert_eq!(res["my_scores1"]["buckets"][2]["key"], 1.0);
|
||
assert_eq!(res["my_scores1"]["buckets"][2]["doc_count"], 3);
|
||
assert_eq!(res["my_scores1"]["buckets"][2]["avg_score"]["value"], 1.0);
|
||
|
||
assert_eq!(res["my_scores1"]["buckets"][3]["key"], 5.0);
|
||
assert_eq!(res["my_scores1"]["buckets"][3]["doc_count"], 5);
|
||
assert_eq!(res["my_scores1"]["buckets"][3]["avg_score"]["value"], 5.0);
|
||
|
||
assert_eq!(res["my_scores1"]["sum_other_doc_count"], 0);
|
||
|
||
assert_eq!(res["my_scores2"]["buckets"][0]["key"], 8.0);
|
||
assert_eq!(res["my_scores2"]["buckets"][0]["doc_count"], 1);
|
||
assert_eq!(res["my_scores2"]["buckets"][0]["avg_score"]["value"], 8.0);
|
||
|
||
assert_eq!(res["my_scores2"]["buckets"][1]["key"], 2.0);
|
||
assert_eq!(res["my_scores2"]["buckets"][1]["doc_count"], 2);
|
||
assert_eq!(res["my_scores2"]["buckets"][1]["avg_score"]["value"], 2.0);
|
||
|
||
assert_eq!(res["my_scores2"]["buckets"][2]["key"], 1.0);
|
||
assert_eq!(res["my_scores2"]["buckets"][2]["doc_count"], 3);
|
||
assert_eq!(res["my_scores2"]["buckets"][2]["avg_score"]["value"], 1.0);
|
||
|
||
assert_eq!(res["my_scores2"]["sum_other_doc_count"], 0);
|
||
|
||
assert_eq!(res["my_scores3"]["buckets"][0]["key"], 8.0);
|
||
assert_eq!(res["my_scores3"]["buckets"][0]["doc_count"], 1);
|
||
assert_eq!(res["my_scores3"]["buckets"][0]["avg_score"]["value"], 8.0);
|
||
|
||
assert_eq!(res["my_scores3"]["buckets"][1]["key"], 2.0);
|
||
assert_eq!(res["my_scores3"]["buckets"][1]["doc_count"], 2);
|
||
assert_eq!(res["my_scores3"]["buckets"][1]["avg_score"]["value"], 2.0);
|
||
|
||
assert_eq!(res["my_scores3"]["buckets"][2]["key"], 1.0);
|
||
assert_eq!(res["my_scores3"]["buckets"][2]["doc_count"], 3);
|
||
assert_eq!(res["my_scores3"]["buckets"][2]["avg_score"]["value"], 1.0);
|
||
|
||
assert_eq!(res["my_scores3"]["sum_other_doc_count"], 0);
|
||
|
||
Ok(())
|
||
}
|
||
|
||
#[test]
|
||
fn test_simple_agg() {
|
||
let segment_and_terms = vec![vec![(5.0, "terma".to_string())]];
|
||
let index = get_test_index_from_values_and_terms(true, &segment_and_terms).unwrap();
|
||
|
||
let sub_agg: Aggregations = serde_json::from_value(json!({
|
||
"avg_score": {
|
||
"avg": {
|
||
"field": "score",
|
||
}
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
// sub agg desc
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"order": {
|
||
"_count": "asc",
|
||
},
|
||
},
|
||
"aggs": sub_agg,
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index).unwrap();
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
|
||
assert_eq!(res["my_texts"]["buckets"][0]["avg_score"]["value"], 5.0);
|
||
}
|
||
|
||
#[test]
|
||
fn terms_aggregation_test_order_sub_agg_single_segment() -> crate::Result<()> {
|
||
terms_aggregation_test_order_sub_agg_merge_segment(true)
|
||
}
|
||
#[test]
|
||
fn terms_aggregation_test_sub_agg_order() -> crate::Result<()> {
|
||
terms_aggregation_test_order_sub_agg_merge_segment(false)
|
||
}
|
||
fn terms_aggregation_test_order_sub_agg_merge_segment(
|
||
merge_segments: bool,
|
||
) -> crate::Result<()> {
|
||
let segment_and_terms = vec![
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(4.0, "termb".to_string())],
|
||
vec![(1.0, "termc".to_string())],
|
||
vec![(1.0, "termc".to_string())],
|
||
vec![(1.0, "termc".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
vec![(8.0, "termb".to_string())],
|
||
vec![(5.0, "terma".to_string())],
|
||
];
|
||
let index = get_test_index_from_values_and_terms(merge_segments, &segment_and_terms)?;
|
||
|
||
let sub_agg: Aggregations = serde_json::from_value(json!({
|
||
"avg_score": {
|
||
"avg": {
|
||
"field": "score",
|
||
}
|
||
},
|
||
"stats_score": {
|
||
"stats": {
|
||
"field": "score",
|
||
}
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
// sub agg desc
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"order": {
|
||
"avg_score": "desc"
|
||
}
|
||
},
|
||
"aggs": sub_agg,
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["buckets"][0]["avg_score"]["value"], 6.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["avg_score"]["value"], 5.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
|
||
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 3);
|
||
assert_eq!(res["my_texts"]["buckets"][2]["avg_score"]["value"], 1.0);
|
||
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// sub agg asc
|
||
//
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"order": {
|
||
"avg_score": "asc"
|
||
}
|
||
},
|
||
"aggs": sub_agg,
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "termc");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 3);
|
||
assert_eq!(res["my_texts"]["buckets"][0]["avg_score"]["value"], 1.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["avg_score"]["value"], 5.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["buckets"][2]["avg_score"]["value"], 6.0);
|
||
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// sub agg multi value asc
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"order": {
|
||
"stats_score.avg": "asc"
|
||
}
|
||
},
|
||
"aggs": sub_agg,
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index)?;
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "termc");
|
||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 3);
|
||
assert_eq!(res["my_texts"]["buckets"][0]["avg_score"]["value"], 1.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "terma");
|
||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 5);
|
||
assert_eq!(res["my_texts"]["buckets"][1]["avg_score"]["value"], 5.0);
|
||
|
||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb");
|
||
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 2);
|
||
assert_eq!(res["my_texts"]["buckets"][2]["avg_score"]["value"], 6.0);
|
||
|
||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||
|
||
// sub agg invalid request
|
||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||
"my_texts": {
|
||
"terms": {
|
||
"field": "string_id",
|
||
"order": {
|
||
"doesnotexist": "asc"
|
||
}
|
||
},
|
||
"aggs": sub_agg,
|
||
}
|
||
}))
|
||
.unwrap();
|
||
|
||
let res = exec_request(agg_req, &index);
|
||
assert!(res.is_err());
|
||
|
||
Ok(())
|
||
}
|
||
|
||
    #[test]
    fn terms_aggregation_test_order_key_single_segment() -> crate::Result<()> {
        terms_aggregation_test_order_key_merge_segment(true)
    }
    #[test]
    fn terms_aggregation_test_key_order() -> crate::Result<()> {
        terms_aggregation_test_order_key_merge_segment(false)
    }
    fn terms_aggregation_test_order_key_merge_segment(merge_segments: bool) -> crate::Result<()> {
        let segment_and_terms = vec![
            vec![(5.0, "terma".to_string())],
            vec![(4.0, "termb".to_string())],
            vec![(1.0, "termc".to_string())],
            vec![(1.0, "termc".to_string())],
            vec![(1.0, "termc".to_string())],
            vec![(5.0, "terma".to_string())],
            vec![(5.0, "terma".to_string())],
            vec![(5.0, "terma".to_string())],
            vec![(8.0, "termb".to_string())],
            vec![(5.0, "terma".to_string())],
        ];
        let index = get_test_index_from_values_and_terms(merge_segments, &segment_and_terms)?;

        // key asc
        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "order": {
                        "_key": "asc"
                    }
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
        assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 3);
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);

        // key asc and size cut_off
        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "order": {
                        "_key": "asc"
                    },
                    "size": 2
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts"]["buckets"][2]["doc_count"],
            serde_json::Value::Null
        );

        assert_eq!(res["my_texts"]["sum_other_doc_count"], 3);

        // key asc and segment_size cut_off
        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "order": {
                        "_key": "asc"
                    },
                    "size": 2,
                    "segment_size": 2
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts"]["buckets"][2]["doc_count"],
            serde_json::Value::Null
        );

        // key desc
        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "order": {
                        "_key": "desc"
                    },
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "termc");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 3);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(res["my_texts"]["buckets"][2]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 5);
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);

        // key desc, size cut_off
        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "order": {
                        "_key": "desc"
                    },
                    "size": 2
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "termc");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 3);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts"]["buckets"][2]["doc_count"],
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 5);

        // key desc, segment_size cut_off
        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "order": {
                        "_key": "desc"
                    },
                    "size": 2,
                    "segment_size": 2
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "termc");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 3);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts"]["buckets"][2]["doc_count"],
            serde_json::Value::Null
        );

        Ok(())
    }

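    // With `min_doc_count: 0`, terms that exist in the column but match no
    // document of the query are still returned as buckets with a doc_count of
    // 0.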
    #[test]
    fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
        let terms_per_segment = vec![
            vec!["terma", "terma", "termb", "termb", "termb"],
            vec!["terma", "terma", "termb"],
        ];

        let index = get_test_index_from_terms(false, &terms_per_segment)?;

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "min_doc_count": 0,
                },
            }
        }))
        .unwrap();

        // the query matches only "terma" docs, but min_doc_count == 0 keeps all terms
        let res = exec_request_with_query(agg_req, &index, Some(("string_id", "terma")))?;

        assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        Ok(())
    }

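    // When `min_doc_count == 0` surfaces a term with no matching documents in
    // a segment, the bucket still needs a well-formed (empty) sub-aggregation
    // result so that merging across segments does not fail.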
    #[test]
    fn terms_aggregation_min_doc_count_special_case_with_sub_agg_empty_merge() -> crate::Result<()>
    {
        let mut schema_builder = Schema::builder();
        let string_field_1 = schema_builder.add_text_field("string1", STRING | FAST);
        let string_field_2 = schema_builder.add_text_field("string2", STRING | FAST);
        let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
        let score_field = schema_builder.add_u64_field("score", score_fieldtype);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            // writing the segment
            index_writer.add_document(doc!(
                string_field_1 => "A".to_string(),
                string_field_2 => "hit".to_string(),
                score_field => 1u64,
            ))?;
            index_writer.add_document(doc!(
                string_field_1 => "B".to_string(),
                string_field_2 => "nohit".to_string(), // this doc gets filtered in this segment,
                                                       // but the term will still be loaded because
                                                       // min_doc_count == 0
                score_field => 2u64,
            ))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(
                string_field_1 => "A".to_string(),
                string_field_2 => "hit".to_string(),
                score_field => 2u64,
            ))?;
            index_writer.add_document(doc!(
                string_field_1 => "B".to_string(),
                string_field_2 => "hit".to_string(),
                score_field => 4u64,
            ))?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string1",
                    "min_doc_count": 0,
                },
                "aggs": {
                    "elhistogram": {
                        "histogram": {
                            "field": "score",
                            "interval": 1
                        }
                    }
                }
            }
        }))
        .unwrap();

        // the query matches only "hit" docs, but min_doc_count == 0 keeps all terms
        let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;

        assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
        assert_eq!(
            res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
            json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
        );
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
        assert_eq!(
            res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
            json!([ { "doc_count": 1, "key": 4.0 } ])
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        Ok(())
    }

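    // Note: with `segment_size` smaller than the number of distinct terms,
    // each segment may drop terms before merging. `doc_count_error_upper_bound`
    // reports how far off a returned `doc_count` could be at most because of
    // those per-segment cut-offs (see the assertions below).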
    #[test]
    fn terms_aggregation_error_count_test() -> crate::Result<()> {
        let terms_per_segment = vec![
            vec!["terma", "terma", "termb", "termb", "termb", "termc"], /* termc doesn't make it
                                                                         * from this segment */
            vec!["terma", "terma", "termb", "termc", "termc"], /* termb doesn't make it from
                                                                * this segment */
        ];

        let index = get_test_index_from_terms(false, &terms_per_segment)?;
        assert_eq!(index.searchable_segments().unwrap().len(), 2);

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "size": 2,
                    "segment_size": 2
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;

        assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 3);
        assert_eq!(
            res["my_texts"]["buckets"][2]["doc_count"],
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 4);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 2);

        // disable doc_count_error_upper_bound

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "size": 2,
                    "segment_size": 2,
                    "show_term_doc_count_error": false
                },
            }
        }))
        .unwrap();

        let res = exec_request(agg_req, &index)?;

        assert_eq!(res["my_texts"]["sum_other_doc_count"], 4);
        assert_eq!(
            res["my_texts"]["doc_count_error_upper_bound"],
            serde_json::Value::Null
        );

        Ok(())
    }

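    // A terms aggregation over many distinct terms allocates one bucket per
    // term; the memory limit passed via `AggregationLimitsGuard` aborts the
    // request once that bucket accounting exceeds the configured budget.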
    #[test]
    fn terms_aggregation_term_bucket_limit() -> crate::Result<()> {
        let terms: Vec<String> = (0..20_000).map(|el| el.to_string()).collect();
        let terms_per_segment = vec![terms.iter().map(|el| el.as_str()).collect()];

        let index = get_test_index_from_terms(true, &terms_per_segment)?;

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "string_id",
                    "min_doc_count": 0,
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query_and_memory_limit(
            agg_req,
            &index,
            None,
            AggregationLimitsGuard::new(Some(50_000), None),
        )
        .unwrap_err();
        assert!(res
            .to_string()
            .contains("Aborting aggregation because memory limit was exceeded. Limit: 50.00 KB"));

        Ok(())
    }

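    // Terms aggregations read the fast field, which stores the raw
    // (untokenized) values; "Hello Hello" therefore stays a single term even
    // though the indexed text field is tokenized.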
    #[test]
    fn terms_aggregation_different_tokenizer_on_ff_test() -> crate::Result<()> {
        let terms = vec!["Hello Hello", "Hallo Hallo", "Hallo Hallo"];

        let index = get_test_index_from_terms(true, &[terms])?;

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "text_id",
                    "min_doc_count": 0,
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None).unwrap();
        println!("{}", serde_json::to_string_pretty(&res).unwrap());

        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);

        assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);

        Ok(())
    }

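    // `segment_size` can also be spelled `shard_size` or `split_size` for
    // Elasticsearch compatibility; the test below checks that all spellings
    // deserialize to the same request.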
    #[test]
    fn test_json_format() -> crate::Result<()> {
        let agg_req: Aggregations = serde_json::from_value(json!({
            "term_agg_test": {
                "terms": {
                    "field": "string_id",
                    "size": 2,
                    "segment_size": 2,
                    "order": {
                        "_key": "desc"
                    }
                },
            }
        }))
        .unwrap();

        let elasticsearch_compatible_json = json!(
        {
            "term_agg_test":{
                "terms": {
                    "field": "string_id",
                    "size": 2u64,
                    "segment_size": 2u64,
                    "order": {"_key": "desc"}
                }
            }
        });

        let agg_req_deser: Aggregations =
            serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
                .unwrap();
        assert_eq!(agg_req, agg_req_deser);

        let elasticsearch_compatible_json = json!(
        {
            "term_agg_test":{
                "terms": {
                    "field": "string_id",
                    "split_size": 2u64,
                }
            }
        });

        // test alias shard_size, split_size
        let agg_req: Aggregations = serde_json::from_value(json!({
            "term_agg_test": {
                "terms": {
                    "field": "string_id",
                    "split_size": 2,
                },
            }
        }))
        .unwrap();

        let agg_req_deser: Aggregations =
            serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
                .unwrap();
        assert_eq!(agg_req, agg_req_deser);

        let elasticsearch_compatible_json = json!(
        {
            "term_agg_test":{
                "terms": {
                    "field": "string_id",
                    "shard_size": 2u64,
                }
            }
        });

        let agg_req_deser: Aggregations =
            serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
                .unwrap();
        assert_eq!(agg_req, agg_req_deser);

        Ok(())
    }
    #[test]
    fn terms_empty_json() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let json = schema_builder.add_json_field("json", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        // => Segment with empty json
        index_writer.add_document(doc!()).unwrap();
        index_writer.commit().unwrap();
        // => Segment with json, but no field partially_empty
        index_writer
            .add_document(doc!(json => json!({"different_field": "blue"})))
            .unwrap();
        index_writer.commit().unwrap();
        // => Segment with field partially_empty
        index_writer
            .add_document(doc!(json => json!({"partially_empty": "blue"})))
            .unwrap();
        index_writer.add_document(doc!())?;
        index_writer.commit().unwrap();

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "json.partially_empty"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        assert_eq!(res["my_texts"]["buckets"][0]["key"], "blue");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
        assert_eq!(res["my_texts"]["buckets"][1], serde_json::Value::Null);
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        Ok(())
    }

    #[test]
    fn terms_aggregation_bytes() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let bytes_field = schema_builder.add_bytes_field("bytes", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            index_writer.add_document(doc!(
                bytes_field => vec![1,2,3],
            ))?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "bytes"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // TODO: Returning an error would be better than an empty result: unlike
        // a missing JSON path, this is a concrete bytes field, which terms
        // aggregations do not support.
        assert_eq!(
            res["my_texts"]["buckets"][0]["key"],
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        Ok(())
    }

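    // The `missing` parameter substitutes a fallback key for documents without
    // any value in the field. A string fallback lands in a string bucket and a
    // numeric fallback in a numeric bucket, independent of the column type, as
    // the `missing` tests below demonstrate.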
    #[test]
    fn terms_aggregation_missing_multi_value() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", FAST);
        let id_field = schema_builder.add_u64_field("id", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
                text_field => "Hello Hello",
                id_field => 1u64,
                id_field => 1u64,
            ))?;
            // Missing
            index_writer.add_document(doc!())?;
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
            ))?;
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
            ))?;
            index_writer.commit()?;
            // Empty segment special case
            index_writer.add_document(doc!())?;
            index_writer.commit()?;
            // Full segment special case
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
                id_field => 1u64,
            ))?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "text",
                    "missing": "Empty"
                },
            },
            "my_texts2": {
                "terms": {
                    "field": "text",
                    "missing": 1337
                },
            },
            "my_ids": {
                "terms": {
                    "field": "id",
                    "missing": 1337
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // text field
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hello Hello");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "Empty");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts"]["buckets"][2]["key"],
            serde_json::Value::Null
        );
        // text field with number as missing fallback
        assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
        assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
        assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
        assert_eq!(res["my_texts2"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts2"]["buckets"][2]["key"],
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        // id field
        assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
        assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 4);
        assert_eq!(res["my_ids"]["buckets"][1]["key"], 1.0);
        assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 3);
        assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }
    #[test]
    fn terms_aggregation_missing_simple_id() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let id_field = schema_builder.add_u64_field("id", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            index_writer.add_document(doc!(
                id_field => 1u64,
            ))?;
            // Missing
            index_writer.add_document(doc!())?;
            index_writer.add_document(doc!())?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_ids": {
                "terms": {
                    "field": "id",
                    "missing": 1337
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // id field
        assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
        assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
        assert_eq!(res["my_ids"]["buckets"][1]["key"], 1.0);
        assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
        assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }

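    // Bucket keys for u64 columns are serialized as exact integers, so values
    // above 2^53 (not exactly representable as f64) must round-trip unchanged.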
    #[test]
    fn terms_aggregation_u64_value() -> crate::Result<()> {
        // Make sure that large u64 are not truncated
        let mut schema_builder = Schema::builder();
        let id_field = schema_builder.add_u64_field("id", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            index_writer.add_document(doc!(
                id_field => 9_223_372_036_854_775_807u64,
            ))?;
            index_writer.add_document(doc!(
                id_field => 1_769_070_189_829_214_202u64,
            ))?;
            index_writer.add_document(doc!(
                id_field => 1_769_070_189_829_214_202u64,
            ))?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_ids": {
                "terms": {
                    "field": "id"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // id field
        assert_eq!(
            res["my_ids"]["buckets"][0]["key"],
            1_769_070_189_829_214_202u64
        );
        assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
        assert_eq!(
            res["my_ids"]["buckets"][1]["key"],
            9_223_372_036_854_775_807u64
        );
        assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
        assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }

    #[test]
    fn terms_aggregation_missing1() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", FAST);
        let id_field = schema_builder.add_u64_field("id", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
                id_field => 1u64,
            ))?;
            // Missing
            index_writer.add_document(doc!())?;
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
            ))?;
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
            ))?;
            index_writer.commit()?;
            // Empty segment special case
            index_writer.add_document(doc!())?;
            index_writer.commit()?;
            // Full segment special case
            index_writer.add_document(doc!(
                text_field => "Hello Hello",
                id_field => 1u64,
            ))?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "text",
                    "missing": "Empty"
                },
            },
            "my_texts2": {
                "terms": {
                    "field": "text",
                    "missing": 1337
                },
            },
            "my_ids": {
                "terms": {
                    "field": "id",
                    "missing": 1337
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // text field
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hello Hello");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
        assert_eq!(res["my_texts"]["buckets"][1]["key"], "Empty");
        assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts"]["buckets"][2]["key"],
            serde_json::Value::Null
        );
        // text field with number as missing fallback
        assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
        assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
        assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
        assert_eq!(res["my_texts2"]["buckets"][1]["doc_count"], 2);
        assert_eq!(
            res["my_texts2"]["buckets"][2]["key"],
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        // id field
        assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
        assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 4);
        assert_eq!(res["my_ids"]["buckets"][1]["key"], 1.0);
        assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 2);
        assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }
    #[test]
    fn terms_aggregation_missing_empty() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("text", FAST);
        schema_builder.add_u64_field("id", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            // Empty segment special case
            index_writer.add_document(doc!())?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "text",
                    "missing": "Empty"
                },
            },
            "my_texts2": {
                "terms": {
                    "field": "text",
                    "missing": 1337
                },
            },
            "my_ids": {
                "terms": {
                    "field": "id",
                    "missing": 1337
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // text field
        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Empty");
        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
        assert_eq!(
            res["my_texts"]["buckets"][1]["key"],
            serde_json::Value::Null
        );
        // text field with number as missing fallback
        assert_eq!(res["my_texts2"]["buckets"][0]["key"], 1337.0);
        assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 1);
        assert_eq!(
            res["my_texts2"]["buckets"][1]["key"],
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        // id field
        assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0);
        assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 1);
        assert_eq!(res["my_ids"]["buckets"][1]["key"], serde_json::Value::Null);

        Ok(())
    }

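    // Date term keys are rendered in RFC 3339 format (e.g.
    // "1982-09-17T00:00:00Z"), and the `missing` fallback for a date field is
    // given in the same format.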
    #[test]
    fn terms_aggregation_date() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field("date_field", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
            writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
            writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
            writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
            writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_date": {
                "terms": {
                    "field": "date_field"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // date_field
        assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
        assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 2);
        assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
        assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
        assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }
    #[test]
    fn terms_aggregation_date_missing() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field("date_field", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
            writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
            writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
            writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
            writer.add_document(doc!())?;
            writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_date": {
                "terms": {
                    "field": "date_field",
                    "missing": "1982-09-17T00:00:00Z"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // date_field
        assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
        assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 3);
        assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
        assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
        assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }

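    // Bool buckets keep a numeric `key` (1.0 / 0.0) and additionally expose
    // `key_as_string` ("true" / "false").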
    #[test]
    fn terms_aggregation_bool() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_bool_field("bool_field", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
            writer.add_document(doc!(field=>true))?;
            writer.add_document(doc!(field=>false))?;
            writer.add_document(doc!(field=>true))?;
            writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_bool": {
                "terms": {
                    "field": "bool_field"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        assert_eq!(res["my_bool"]["buckets"][0]["key"], 1.0);
        assert_eq!(res["my_bool"]["buckets"][0]["key_as_string"], "true");
        assert_eq!(res["my_bool"]["buckets"][0]["doc_count"], 2);
        assert_eq!(res["my_bool"]["buckets"][1]["key"], 0.0);
        assert_eq!(res["my_bool"]["buckets"][1]["key_as_string"], "false");
        assert_eq!(res["my_bool"]["buckets"][1]["doc_count"], 1);
        assert_eq!(res["my_bool"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }

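    // IP addresses are stored as IPv6 internally; bucket keys are rendered in
    // their canonical textual form, so an IPv4-mapped address prints as
    // "127.0.0.1".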
    #[test]
    fn terms_aggregation_ip_addr() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_ip_addr_field("ip_field", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
            // IPv6 loopback
            writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
            writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
            // IPv4
            writer.add_document(
                doc!(field=>IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr()),
            )?;
            writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_bool": {
                "terms": {
                    "field": "ip_field"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;
        // print result as json:
        // println!("{}", serde_json::to_string_pretty(&res).unwrap());

        assert_eq!(res["my_bool"]["buckets"][0]["key"], "::1");
        assert_eq!(res["my_bool"]["buckets"][0]["doc_count"], 2);
        assert_eq!(res["my_bool"]["buckets"][1]["key"], "127.0.0.1");
        assert_eq!(res["my_bool"]["buckets"][1]["doc_count"], 1);
        assert_eq!(res["my_bool"]["buckets"][2]["key"], serde_json::Value::Null);

        Ok(())
    }

    #[test]
    fn terms_aggs_hosts_and_tags_merge_on_mixed_order_request() -> crate::Result<()> {
        // This test ensures that merging of aggregation results works correctly
        // even if the order of the aggregation requests differs and the
        // requests run on different indexes holding the same data.
        let build_index = || -> crate::Result<Index> {
            let mut schema_builder = Schema::builder();
            let fielda = schema_builder.add_text_field("fielda", FAST);
            let fieldb = schema_builder.add_text_field("fieldb", FAST);
            let host = schema_builder.add_text_field("host", FAST);
            let tags = schema_builder.add_text_field("tags", FAST);
            let schema = schema_builder.build();

            let index = Index::create_in_ram(schema.clone());
            let mut writer = index.writer(50_000_000).unwrap();

            // --- Ingest documents (batch #1) ---
            writer.add_document(doc!(
                host => "192.168.0.10",
                tags => "nice",
                fielda => "a",
                fieldb => "b",
            ))?;
            writer.add_document(doc!(
                host => "192.168.0.1",
                tags => "nice",
            ))?;
            writer.add_document(doc!(
                host => "192.168.0.11",
                tags => "nice",
            ))?;
            writer.add_document(doc!(
                host => "192.168.0.10",
                tags => "nice",
                tags => "cool",
            ))?;
            writer.add_document(doc!(
                host => "192.168.0.1",
                tags => "nice",
                tags => "cool",
            ))?;

            writer.commit()?;

            // --- Ingest documents (batch #2) ---
            writer.add_document(doc!())?;
            writer.add_document(doc!())?;
            writer.add_document(doc!(
                host => "192.168.0.10",
            ))?;
            writer.add_document(doc!(
                host => "192.168.0.10",
            ))?;
            writer.add_document(doc!())?;

            writer.commit()?;
            Ok(index)
        };
        let index = build_index()?;
        let index2 = build_index()?;

        let search = |idx: &Index,
                      agg_req: &Aggregations|
         -> crate::Result<IntermediateAggregationResults> {
            let collector =
                DistributedAggregationCollector::from_aggs(agg_req.clone(), Default::default());
            let reader = idx.reader()?;
            let searcher = reader.searcher();
            let agg_res = searcher.search(&AllQuery, &collector)?;
            Ok(agg_res)
        };

        // --- Aggregations: terms on host and tags ---
        let agg_req: Aggregations = serde_json::from_value(json!({
            "hosts": { "terms": { "field": "host" } },
            "tags": { "terms": { "field": "tags" } },
            "fielda": { "terms": { "field": "fielda" } },
            "fieldb": { "terms": { "field": "fieldb" } },
        }))
        .unwrap();

        let mut agg_res = search(&index, &agg_req)?;

        // --- Same aggregations, inserted in a different order ---
        let mut agg_req2: Aggregations =
            Aggregations::with_capacity_and_hasher(20, Default::default());
        agg_req2.insert(
            "tags".to_string(),
            serde_json::from_value(json!({ "terms": { "field": "tags" } }))?,
        );
        agg_req2.insert(
            "fielda".to_string(),
            serde_json::from_value(json!({ "terms": { "field": "fielda" } }))?,
        );
        agg_req2.insert(
            "hosts".to_string(),
            serde_json::from_value(json!({ "terms": { "field": "host" } }))?,
        );
        agg_req2.insert(
            "fieldb".to_string(),
            serde_json::from_value(json!({ "terms": { "field": "fieldb" } }))?,
        );
        // make sure the order of the aggregation request is different
        // disabled to avoid flaky test with hashmap changes
        // assert_ne!(agg_req.keys().next(), agg_req2.keys().next());

        let agg_res2 = search(&index2, &agg_req2)?;

        agg_res.merge_fruits(agg_res2).unwrap();
        let agg_json =
            serde_json::to_value(&agg_res.into_final_result(agg_req2, Default::default())?)?;

        // hosts:
        let hosts = &agg_json["hosts"]["buckets"];
        assert_eq!(hosts[0]["key"], "192.168.0.10");
        assert_eq!(hosts[0]["doc_count"], 8);
        assert_eq!(hosts[1]["key"], "192.168.0.1");
        assert_eq!(hosts[1]["doc_count"], 4);
        assert_eq!(hosts[2]["key"], "192.168.0.11");
        assert_eq!(hosts[2]["doc_count"], 2);
        // Implementation currently reports error bounds/other count; ensure zero.
        assert_eq!(agg_json["hosts"]["doc_count_error_upper_bound"], 0);
        assert_eq!(agg_json["hosts"]["sum_other_doc_count"], 0);

        // tags:
        let tags_buckets = &agg_json["tags"]["buckets"];
        assert_eq!(tags_buckets[0]["key"], "nice");
        assert_eq!(tags_buckets[0]["doc_count"], 10);
        assert_eq!(tags_buckets[1]["key"], "cool");
        assert_eq!(tags_buckets[1]["doc_count"], 4);
        assert_eq!(agg_json["tags"]["doc_count_error_upper_bound"], 0);
        assert_eq!(agg_json["tags"]["sum_other_doc_count"], 0);

        Ok(())
    }
}