mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
* Initial impl * Added `Filter` impl in `build_single_agg_segment_collector_with_reader` + Added tests * Added `Filter(FilterBucketResult)` + Made tests work. * Fixed type issues. * Fixed a test. * 8a7a73a: Pass `segment_reader` * Added more tests. * Improved parsing + tests * refactoring * Added more tests. * refactoring: moved parsing code under QueryParser * Use Tantivy syntax instead of ES * Added a sanity check test. * Simplified impl + tests * Added back tests in a more maintable way * nitz. * nitz * implemented very simple fast-path * improved a comment * implemented fast field support * Used `BoundsRange` * Improved fast field impl + tests * Simplified execution. * Fixed exports + nitz * Improved the tests to check to the expected result. * Improved test by checking the whole result JSON * Removed brittle perf checks. * Added efficiency verification tests. * Added one more efficiency check test. * Improved the efficiency tests. * Removed unnecessary parsing code + added direct Query obj * Fixed tests. * Improved tests * Fixed code structure * Fixed lint issues * nitz. * nitz * nitz. * nitz. * nitz. * Added an example * Fixed PR comments. * Applied PR comments + nitz * nitz. * Improved the code. * Fixed a perf issue. * Added batch processing. * Made the example more interesting * Fixed bucket count * Renamed Direct to CustomQuery * Fixed lint issues. * No need for scorer to be an `Option` * nitz * Used BitSet * Added an optimization for AllQuery * Fixed merge issues. * Fixed lint issues. * Added benchmark for FILTER * Removed the Option wrapper. * nitz. * Applied PR comments. * Fixed the AllQuery optimization * Applied PR comments. * feat: used `erased_serde` to allow filter query to be serialized * further improved a comment * Added back tests. * removed an unused method * removed an unused method * Added documentation * nitz. * Added query builder. * Fixed a comment. * Applied PR comments. * Fixed doctest issues. * Added ser/de * Removed bench in test * Fixed a lint issue.
539 lines
17 KiB
Rust
539 lines
17 KiB
Rust
use binggan::plugins::PeakMemAllocPlugin;
|
|
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
|
use rand::prelude::SliceRandom;
|
|
use rand::rngs::StdRng;
|
|
use rand::{Rng, SeedableRng};
|
|
use rand_distr::Distribution;
|
|
use serde_json::json;
|
|
use tantivy::aggregation::agg_req::Aggregations;
|
|
use tantivy::aggregation::AggregationCollector;
|
|
use tantivy::query::{AllQuery, TermQuery};
|
|
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
|
|
use tantivy::{doc, Index, Term};
|
|
|
|
#[global_allocator]
|
|
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
|
|
|
/// Mini macro to register a function via its name
|
|
/// runner.register("average_u64", move |index| average_u64(index));
|
|
macro_rules! register {
|
|
($runner:expr, $func:ident) => {
|
|
$runner.register(stringify!($func), move |index| {
|
|
$func(index);
|
|
})
|
|
};
|
|
}
|
|
|
|
fn main() {
|
|
let inputs = vec![
|
|
("full", get_test_index_bench(Cardinality::Full).unwrap()),
|
|
(
|
|
"dense",
|
|
get_test_index_bench(Cardinality::OptionalDense).unwrap(),
|
|
),
|
|
(
|
|
"sparse",
|
|
get_test_index_bench(Cardinality::OptionalSparse).unwrap(),
|
|
),
|
|
(
|
|
"multivalue",
|
|
get_test_index_bench(Cardinality::Multivalued).unwrap(),
|
|
),
|
|
];
|
|
|
|
bench_agg(InputGroup::new_with_inputs(inputs));
|
|
}
|
|
|
|
fn bench_agg(mut group: InputGroup<Index>) {
|
|
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
|
|
|
|
register!(group, average_u64);
|
|
register!(group, average_f64);
|
|
register!(group, average_f64_u64);
|
|
register!(group, stats_f64);
|
|
register!(group, extendedstats_f64);
|
|
register!(group, percentiles_f64);
|
|
register!(group, terms_few);
|
|
register!(group, terms_many);
|
|
register!(group, terms_many_top_1000);
|
|
register!(group, terms_many_order_by_term);
|
|
register!(group, terms_many_with_top_hits);
|
|
register!(group, terms_many_with_avg_sub_agg);
|
|
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
|
|
|
|
register!(group, cardinality_agg);
|
|
register!(group, terms_few_with_cardinality_agg);
|
|
|
|
register!(group, range_agg);
|
|
register!(group, range_agg_with_avg_sub_agg);
|
|
register!(group, range_agg_with_term_agg_few);
|
|
register!(group, range_agg_with_term_agg_many);
|
|
register!(group, histogram);
|
|
register!(group, histogram_hard_bounds);
|
|
register!(group, histogram_with_avg_sub_agg);
|
|
register!(group, histogram_with_term_agg_few);
|
|
register!(group, avg_and_range_with_avg_sub_agg);
|
|
|
|
// Filter aggregation benchmarks
|
|
register!(group, filter_agg_all_query_count_agg);
|
|
register!(group, filter_agg_term_query_count_agg);
|
|
register!(group, filter_agg_all_query_with_sub_aggs);
|
|
register!(group, filter_agg_term_query_with_sub_aggs);
|
|
|
|
group.run();
|
|
}
|
|
|
|
fn exec_term_with_agg(index: &Index, agg_req: serde_json::Value) {
|
|
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
|
|
|
|
let reader = index.reader().unwrap();
|
|
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
|
let term_query = TermQuery::new(
|
|
Term::from_field_text(text_field, "cool"),
|
|
IndexRecordOption::Basic,
|
|
);
|
|
let collector = get_collector(agg_req);
|
|
let searcher = reader.searcher();
|
|
black_box(searcher.search(&term_query, &collector).unwrap());
|
|
}
|
|
|
|
fn average_u64(index: &Index) {
|
|
let agg_req = json!({
|
|
"average": { "avg": { "field": "score", } }
|
|
});
|
|
exec_term_with_agg(index, agg_req)
|
|
}
|
|
fn average_f64(index: &Index) {
|
|
let agg_req = json!({
|
|
"average": { "avg": { "field": "score_f64", } }
|
|
});
|
|
exec_term_with_agg(index, agg_req)
|
|
}
|
|
fn average_f64_u64(index: &Index) {
|
|
let agg_req = json!({
|
|
"average_f64": { "avg": { "field": "score_f64" } },
|
|
"average": { "avg": { "field": "score" } },
|
|
});
|
|
exec_term_with_agg(index, agg_req)
|
|
}
|
|
fn stats_f64(index: &Index) {
|
|
let agg_req = json!({
|
|
"average_f64": { "stats": { "field": "score_f64", } }
|
|
});
|
|
exec_term_with_agg(index, agg_req)
|
|
}
|
|
fn extendedstats_f64(index: &Index) {
|
|
let agg_req = json!({
|
|
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
|
|
});
|
|
exec_term_with_agg(index, agg_req)
|
|
}
|
|
fn percentiles_f64(index: &Index) {
|
|
let agg_req = json!({
|
|
"mypercentiles": {
|
|
"percentiles": {
|
|
"field": "score_f64",
|
|
"percents": [ 95, 99, 99.9 ]
|
|
}
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn cardinality_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"cardinality": {
|
|
"cardinality": {
|
|
"field": "text_many_terms"
|
|
},
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn terms_few_with_cardinality_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": {
|
|
"terms": { "field": "text_few_terms" },
|
|
"aggs": {
|
|
"cardinality": {
|
|
"cardinality": {
|
|
"field": "text_many_terms"
|
|
},
|
|
}
|
|
}
|
|
},
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn terms_few(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": { "terms": { "field": "text_few_terms" } },
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn terms_many(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": { "terms": { "field": "text_many_terms" } },
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn terms_many_top_1000(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn terms_many_order_by_term(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn terms_many_with_top_hits(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": {
|
|
"terms": { "field": "text_many_terms" },
|
|
"aggs": {
|
|
"top_hits": { "top_hits":
|
|
{
|
|
"sort": [
|
|
{ "score": "desc" }
|
|
],
|
|
"size": 2,
|
|
"doc_value_fields": ["score_f64"]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn terms_many_with_avg_sub_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": {
|
|
"terms": { "field": "text_many_terms" },
|
|
"aggs": {
|
|
"average_f64": { "avg": { "field": "score_f64" } }
|
|
}
|
|
},
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"my_texts": {
|
|
"terms": { "field": "json.mixed_type" },
|
|
"aggs": {
|
|
"average_f64": { "avg": { "field": "score_f64" } }
|
|
}
|
|
},
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn execute_agg(index: &Index, agg_req: serde_json::Value) {
|
|
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
|
|
let collector = get_collector(agg_req);
|
|
|
|
let reader = index.reader().unwrap();
|
|
let searcher = reader.searcher();
|
|
black_box(searcher.search(&AllQuery, &collector).unwrap());
|
|
}
|
|
fn range_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"range_f64": { "range": { "field": "score_f64", "ranges": [
|
|
{ "from": 3, "to": 7000 },
|
|
{ "from": 7000, "to": 20000 },
|
|
{ "from": 20000, "to": 30000 },
|
|
{ "from": 30000, "to": 40000 },
|
|
{ "from": 40000, "to": 50000 },
|
|
{ "from": 50000, "to": 60000 }
|
|
] } },
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn range_agg_with_avg_sub_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": {
|
|
"range": {
|
|
"field": "score_f64",
|
|
"ranges": [
|
|
{ "from": 3, "to": 7000 },
|
|
{ "from": 7000, "to": 20000 },
|
|
{ "from": 20000, "to": 30000 },
|
|
{ "from": 30000, "to": 40000 },
|
|
{ "from": 40000, "to": 50000 },
|
|
{ "from": 50000, "to": 60000 }
|
|
]
|
|
},
|
|
"aggs": {
|
|
"average_f64": { "avg": { "field": "score_f64" } }
|
|
}
|
|
},
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn range_agg_with_term_agg_few(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": {
|
|
"range": {
|
|
"field": "score_f64",
|
|
"ranges": [
|
|
{ "from": 3, "to": 7000 },
|
|
{ "from": 7000, "to": 20000 },
|
|
{ "from": 20000, "to": 30000 },
|
|
{ "from": 30000, "to": 40000 },
|
|
{ "from": 40000, "to": 50000 },
|
|
{ "from": 50000, "to": 60000 }
|
|
]
|
|
},
|
|
"aggs": {
|
|
"my_texts": { "terms": { "field": "text_few_terms" } },
|
|
}
|
|
},
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn range_agg_with_term_agg_many(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": {
|
|
"range": {
|
|
"field": "score_f64",
|
|
"ranges": [
|
|
{ "from": 3, "to": 7000 },
|
|
{ "from": 7000, "to": 20000 },
|
|
{ "from": 20000, "to": 30000 },
|
|
{ "from": 30000, "to": 40000 },
|
|
{ "from": 40000, "to": 50000 },
|
|
{ "from": 50000, "to": 60000 }
|
|
]
|
|
},
|
|
"aggs": {
|
|
"my_texts": { "terms": { "field": "text_many_terms" } },
|
|
}
|
|
},
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn histogram(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": {
|
|
"histogram": {
|
|
"field": "score_f64",
|
|
"interval": 100 // 1000 buckets
|
|
},
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn histogram_hard_bounds(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn histogram_with_avg_sub_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": {
|
|
"histogram": { "field": "score_f64", "interval": 100 },
|
|
"aggs": {
|
|
"average_f64": { "avg": { "field": "score_f64" } }
|
|
}
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn histogram_with_term_agg_few(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": {
|
|
"histogram": { "field": "score_f64", "interval": 10 },
|
|
"aggs": {
|
|
"my_texts": { "terms": { "field": "text_few_terms" } }
|
|
}
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
fn avg_and_range_with_avg_sub_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"rangef64": {
|
|
"range": {
|
|
"field": "score_f64",
|
|
"ranges": [
|
|
{ "from": 3, "to": 7000 },
|
|
{ "from": 7000, "to": 20000 },
|
|
{ "from": 20000, "to": 60000 }
|
|
]
|
|
},
|
|
"aggs": {
|
|
"average_in_range": { "avg": { "field": "score" } }
|
|
}
|
|
},
|
|
"average": { "avg": { "field": "score" } }
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
|
enum Cardinality {
|
|
/// All documents contain exactly one value.
|
|
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
|
|
#[default]
|
|
Full = 0,
|
|
/// All documents contain at most one value.
|
|
OptionalDense = 1,
|
|
/// All documents may contain any number of values.
|
|
Multivalued = 2,
|
|
/// 1 / 20 documents has a value
|
|
OptionalSparse = 3,
|
|
}
|
|
|
|
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
|
|
AggregationCollector::from_aggs(agg_req, Default::default())
|
|
}
|
|
|
|
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
|
let mut schema_builder = Schema::builder();
|
|
let text_fieldtype = tantivy::schema::TextOptions::default()
|
|
.set_indexing_options(
|
|
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
|
)
|
|
.set_stored();
|
|
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
|
let json_field = schema_builder.add_json_field("json", FAST);
|
|
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
|
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
|
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
|
|
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
|
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
|
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
|
let index = Index::create_from_tempdir(schema_builder.build())?;
|
|
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
|
|
|
|
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
|
|
|
|
let many_terms_data = (0..150_000)
|
|
.map(|num| format!("author{num}"))
|
|
.collect::<Vec<_>>();
|
|
{
|
|
let mut rng = StdRng::from_seed([1u8; 32]);
|
|
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
|
|
// To make the different test cases comparable we just change one doc to force the
|
|
// cardinality
|
|
if cardinality == Cardinality::OptionalDense {
|
|
index_writer.add_document(doc!())?;
|
|
}
|
|
if cardinality == Cardinality::Multivalued {
|
|
index_writer.add_document(doc!(
|
|
json_field => json!({"mixed_type": 10.0}),
|
|
json_field => json!({"mixed_type": 10.0}),
|
|
text_field => "cool",
|
|
text_field => "cool",
|
|
text_field_many_terms => "cool",
|
|
text_field_many_terms => "cool",
|
|
text_field_few_terms => "cool",
|
|
text_field_few_terms => "cool",
|
|
score_field => 1u64,
|
|
score_field => 1u64,
|
|
score_field_f64 => lg_norm.sample(&mut rng),
|
|
score_field_f64 => lg_norm.sample(&mut rng),
|
|
score_field_i64 => 1i64,
|
|
score_field_i64 => 1i64,
|
|
))?;
|
|
}
|
|
let mut doc_with_value = 1_000_000;
|
|
if cardinality == Cardinality::OptionalSparse {
|
|
doc_with_value /= 20;
|
|
}
|
|
let _val_max = 1_000_000.0;
|
|
for _ in 0..doc_with_value {
|
|
let val: f64 = rng.gen_range(0.0..1_000_000.0);
|
|
let json = if rng.gen_bool(0.1) {
|
|
// 10% are numeric values
|
|
json!({ "mixed_type": val })
|
|
} else {
|
|
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
|
|
};
|
|
index_writer.add_document(doc!(
|
|
text_field => "cool",
|
|
json_field => json,
|
|
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
|
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
|
|
score_field => val as u64,
|
|
score_field_f64 => lg_norm.sample(&mut rng),
|
|
score_field_i64 => val as i64,
|
|
))?;
|
|
if cardinality == Cardinality::OptionalSparse {
|
|
for _ in 0..20 {
|
|
index_writer.add_document(doc!(text_field => "cool"))?;
|
|
}
|
|
}
|
|
}
|
|
// writing the segment
|
|
index_writer.commit()?;
|
|
}
|
|
|
|
Ok(index)
|
|
}
|
|
|
|
// Filter aggregation benchmarks
|
|
|
|
fn filter_agg_all_query_count_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"filtered": {
|
|
"filter": "*",
|
|
"aggs": {
|
|
"count": { "value_count": { "field": "score" } }
|
|
}
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn filter_agg_term_query_count_agg(index: &Index) {
|
|
let agg_req = json!({
|
|
"filtered": {
|
|
"filter": "text:cool",
|
|
"aggs": {
|
|
"count": { "value_count": { "field": "score" } }
|
|
}
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn filter_agg_all_query_with_sub_aggs(index: &Index) {
|
|
let agg_req = json!({
|
|
"filtered": {
|
|
"filter": "*",
|
|
"aggs": {
|
|
"avg_score": { "avg": { "field": "score" } },
|
|
"stats_score": { "stats": { "field": "score_f64" } },
|
|
"terms_text": {
|
|
"terms": { "field": "text_few_terms" }
|
|
}
|
|
}
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|
|
|
|
fn filter_agg_term_query_with_sub_aggs(index: &Index) {
|
|
let agg_req = json!({
|
|
"filtered": {
|
|
"filter": "text:cool",
|
|
"aggs": {
|
|
"avg_score": { "avg": { "field": "score" } },
|
|
"stats_score": { "stats": { "field": "score_f64" } },
|
|
"terms_text": {
|
|
"terms": { "field": "text_few_terms" }
|
|
}
|
|
}
|
|
}
|
|
});
|
|
execute_agg(index, agg_req);
|
|
}
|