mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
Added some benchmark for top K by a fast field (#2754)
Also removed query parsing from the bench code. Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
This commit is contained in:
@@ -16,14 +16,14 @@
|
|||||||
// - This bench isolates boolean iteration speed and intersection/union cost.
|
// - This bench isolates boolean iteration speed and intersection/union cost.
|
||||||
// - Use `cargo bench --bench boolean_conjunction` to run.
|
// - Use `cargo bench --bench boolean_conjunction` to run.
|
||||||
|
|
||||||
use binggan::{black_box, BenchRunner};
|
use binggan::{black_box, BenchGroup, BenchRunner};
|
||||||
use rand::prelude::*;
|
use rand::prelude::*;
|
||||||
use rand::rngs::StdRng;
|
use rand::rngs::StdRng;
|
||||||
use rand::SeedableRng;
|
use rand::SeedableRng;
|
||||||
use tantivy::collector::{Count, TopDocs};
|
use tantivy::collector::{Collector, Count, TopDocs};
|
||||||
use tantivy::query::QueryParser;
|
use tantivy::query::{Query, QueryParser};
|
||||||
use tantivy::schema::{Schema, TEXT};
|
use tantivy::schema::{Schema, FAST, TEXT};
|
||||||
use tantivy::{doc, Index, ReloadPolicy, Searcher};
|
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher, SegmentReader};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
struct BenchIndex {
|
struct BenchIndex {
|
||||||
@@ -33,23 +33,6 @@ struct BenchIndex {
|
|||||||
query_parser: QueryParser,
|
query_parser: QueryParser,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BenchIndex {
|
|
||||||
#[inline(always)]
|
|
||||||
fn count_query(&self, query_str: &str) -> usize {
|
|
||||||
let query = self.query_parser.parse_query(query_str).unwrap();
|
|
||||||
self.searcher.search(&query, &Count).unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn topk_len(&self, query_str: &str, k: usize) -> usize {
|
|
||||||
let query = self.query_parser.parse_query(query_str).unwrap();
|
|
||||||
self.searcher
|
|
||||||
.search(&query, &TopDocs::with_limit(k))
|
|
||||||
.unwrap()
|
|
||||||
.len()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Build a single index containing both fields (title, body) and
|
/// Build a single index containing both fields (title, body) and
|
||||||
/// return two BenchIndex views:
|
/// return two BenchIndex views:
|
||||||
/// - single_field: QueryParser defaults to only "body"
|
/// - single_field: QueryParser defaults to only "body"
|
||||||
@@ -59,6 +42,8 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
|
|||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let f_title = schema_builder.add_text_field("title", TEXT);
|
let f_title = schema_builder.add_text_field("title", TEXT);
|
||||||
let f_body = schema_builder.add_text_field("body", TEXT);
|
let f_body = schema_builder.add_text_field("body", TEXT);
|
||||||
|
let f_score = schema_builder.add_u64_field("score", FAST);
|
||||||
|
let f_score2 = schema_builder.add_u64_field("score2", FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema.clone());
|
let index = Index::create_in_ram(schema.clone());
|
||||||
|
|
||||||
@@ -67,11 +52,13 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
|
|||||||
|
|
||||||
// Populate: spread each present token 90/10 to body/title
|
// Populate: spread each present token 90/10 to body/title
|
||||||
{
|
{
|
||||||
let mut writer = index.writer(500_000_000).unwrap();
|
let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
|
||||||
for _ in 0..num_docs {
|
for _ in 0..num_docs {
|
||||||
let has_a = rng.gen_bool(p_a as f64);
|
let has_a = rng.gen_bool(p_a as f64);
|
||||||
let has_b = rng.gen_bool(p_b as f64);
|
let has_b = rng.gen_bool(p_b as f64);
|
||||||
let has_c = rng.gen_bool(p_c as f64);
|
let has_c = rng.gen_bool(p_c as f64);
|
||||||
|
let score = rng.gen_range(0u64..100u64);
|
||||||
|
let score2 = rng.gen_range(0u64..100_000u64);
|
||||||
let mut title_tokens: Vec<&str> = Vec::new();
|
let mut title_tokens: Vec<&str> = Vec::new();
|
||||||
let mut body_tokens: Vec<&str> = Vec::new();
|
let mut body_tokens: Vec<&str> = Vec::new();
|
||||||
if has_a {
|
if has_a {
|
||||||
@@ -101,7 +88,9 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
|
|||||||
writer
|
writer
|
||||||
.add_document(doc!(
|
.add_document(doc!(
|
||||||
f_title=>title_tokens.join(" "),
|
f_title=>title_tokens.join(" "),
|
||||||
f_body=>body_tokens.join(" ")
|
f_body=>body_tokens.join(" "),
|
||||||
|
f_score=>score,
|
||||||
|
f_score2=>score2,
|
||||||
))
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
@@ -153,72 +142,81 @@ fn main() {
|
|||||||
),
|
),
|
||||||
];
|
];
|
||||||
|
|
||||||
|
let queries = &["a", "+a +b", "+a +b +c", "a OR b", "a OR b OR c"];
|
||||||
|
|
||||||
let mut runner = BenchRunner::new();
|
let mut runner = BenchRunner::new();
|
||||||
for (label, n, pa, pb, pc) in scenarios {
|
for (label, n, pa, pb, pc) in scenarios {
|
||||||
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
|
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
|
||||||
|
|
||||||
// Single-field group: default field is body only
|
for (view_name, bench_index) in [("single_field", single_view), ("multi_field", multi_view)]
|
||||||
{
|
{
|
||||||
|
// One group per view: single_field defaults to body only, multi_field to [title, body]
|
||||||
let mut group = runner.new_group();
|
let mut group = runner.new_group();
|
||||||
group.set_name(format!("single_field — {}", label));
|
group.set_name(format!("{} — {}", view_name, label));
|
||||||
group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
|
for query_str in queries {
|
||||||
black_box(benv.count_query("+a +b"))
|
add_bench_task(&mut group, &bench_index, query_str, Count, "count");
|
||||||
});
|
add_bench_task(
|
||||||
group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
|
&mut group,
|
||||||
black_box(benv.count_query("+a +b +c"))
|
&bench_index,
|
||||||
});
|
query_str,
|
||||||
group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
|
TopDocs::with_limit(10),
|
||||||
black_box(benv.topk_len("+a +b", 10))
|
"top10",
|
||||||
});
|
);
|
||||||
group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
|
add_bench_task(
|
||||||
black_box(benv.topk_len("+a +b +c", 10))
|
&mut group,
|
||||||
});
|
&bench_index,
|
||||||
// OR queries
|
query_str,
|
||||||
group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
|
TopDocs::with_limit(10).order_by_fast_field::<u64>("score", Order::Asc),
|
||||||
black_box(benv.count_query("a OR b"))
|
"top10_by_ff",
|
||||||
});
|
);
|
||||||
group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
|
add_bench_task(
|
||||||
black_box(benv.count_query("a OR b OR c"))
|
&mut group,
|
||||||
});
|
&bench_index,
|
||||||
group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
|
query_str,
|
||||||
black_box(benv.topk_len("a OR b", 10))
|
TopDocs::with_limit(10).custom_score(move |reader: &SegmentReader| {
|
||||||
});
|
let score_col = reader.fast_fields().u64("score").unwrap();
|
||||||
group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
|
let score_col2 = reader.fast_fields().u64("score2").unwrap();
|
||||||
black_box(benv.topk_len("a OR b OR c", 10))
|
move |doc| {
|
||||||
});
|
let score = score_col.first(doc);
|
||||||
group.run();
|
let score2 = score_col2.first(doc);
|
||||||
}
|
(score, score2)
|
||||||
|
}
|
||||||
// Multi-field group: default fields are [title, body]
|
}),
|
||||||
{
|
"top10_by_2ff",
|
||||||
let mut group = runner.new_group();
|
);
|
||||||
group.set_name(format!("multi_field — {}", label));
|
}
|
||||||
group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.count_query("+a +b"))
|
|
||||||
});
|
|
||||||
group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.count_query("+a +b +c"))
|
|
||||||
});
|
|
||||||
group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.topk_len("+a +b", 10))
|
|
||||||
});
|
|
||||||
group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.topk_len("+a +b +c", 10))
|
|
||||||
});
|
|
||||||
// OR queries
|
|
||||||
group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.count_query("a OR b"))
|
|
||||||
});
|
|
||||||
group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.count_query("a OR b OR c"))
|
|
||||||
});
|
|
||||||
group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.topk_len("a OR b", 10))
|
|
||||||
});
|
|
||||||
group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
|
|
||||||
black_box(benv.topk_len("a OR b OR c", 10))
|
|
||||||
});
|
|
||||||
group.run();
|
group.run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn add_bench_task<C: Collector + 'static>(
|
||||||
|
bench_group: &mut BenchGroup,
|
||||||
|
bench_index: &BenchIndex,
|
||||||
|
query_str: &str,
|
||||||
|
collector: C,
|
||||||
|
collector_name: &str,
|
||||||
|
) {
|
||||||
|
let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name);
|
||||||
|
let query = bench_index.query_parser.parse_query(query_str).unwrap();
|
||||||
|
let search_task = SearchTask {
|
||||||
|
searcher: bench_index.searcher.clone(),
|
||||||
|
collector,
|
||||||
|
query,
|
||||||
|
};
|
||||||
|
bench_group.register(task_name, move |_| black_box(search_task.run()));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct SearchTask<C: Collector> {
|
||||||
|
searcher: Searcher,
|
||||||
|
collector: C,
|
||||||
|
query: Box<dyn Query>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<C: Collector> SearchTask<C> {
    /// Runs the stored query on the stored searcher with the stored collector.
    ///
    /// Always returns `1`; the collected result itself is not returned. Panics if the
    /// search fails.
    ///
    /// Kept out of line (`#[inline(never)]`) so the search is a distinct call in the
    /// benchmark closure.
    #[inline(never)]
    pub fn run(&self) -> usize {
        let fruit = self.searcher.search(&self.query, &self.collector).unwrap();
        // The caller only black-boxes the constant return value, so route the collected
        // result through `black_box` here; otherwise nothing observes it and the optimizer
        // is free to treat the work that produced it as dead.
        std::hint::black_box(fruit);
        1
    }
}
|
||||||
|
|||||||
Reference in New Issue
Block a user