Added some benchmarks for top K by a fast field (#2754)

Also removed query parsing from the bench code.

Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
This commit is contained in:
Paul Masurel
2025-12-01 14:58:29 +01:00
committed by GitHub
parent ca87fcd454
commit 7d513a44c5

View File

@@ -16,14 +16,14 @@
// - This bench isolates boolean iteration speed and intersection/union cost. // - This bench isolates boolean iteration speed and intersection/union cost.
// - Use `cargo bench --bench boolean_conjunction` to run. // - Use `cargo bench --bench boolean_conjunction` to run.
use binggan::{black_box, BenchRunner}; use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*; use rand::prelude::*;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::SeedableRng; use rand::SeedableRng;
use tantivy::collector::{Count, TopDocs}; use tantivy::collector::{Collector, Count, TopDocs};
use tantivy::query::QueryParser; use tantivy::query::{Query, QueryParser};
use tantivy::schema::{Schema, TEXT}; use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index, ReloadPolicy, Searcher}; use tantivy::{doc, Index, Order, ReloadPolicy, Searcher, SegmentReader};
#[derive(Clone)] #[derive(Clone)]
struct BenchIndex { struct BenchIndex {
@@ -33,23 +33,6 @@ struct BenchIndex {
query_parser: QueryParser, query_parser: QueryParser,
} }
impl BenchIndex {
    /// Parses `query_str` with this view's query parser, runs it with the
    /// `Count` collector and returns the collected count.
    ///
    /// Panics if the query fails to parse or the search returns an error.
    #[inline(always)]
    fn count_query(&self, query_str: &str) -> usize {
        let parsed = self.query_parser.parse_query(query_str).unwrap();
        let search_result = self.searcher.search(&parsed, &Count);
        search_result.unwrap()
    }

    /// Parses `query_str` with this view's query parser, runs it with a
    /// `TopDocs` collector limited to `k`, and returns the length of the
    /// collected result.
    ///
    /// Panics if the query fails to parse or the search returns an error.
    #[inline(always)]
    fn topk_len(&self, query_str: &str, k: usize) -> usize {
        let parsed = self.query_parser.parse_query(query_str).unwrap();
        let top_k = TopDocs::with_limit(k);
        let hits = self.searcher.search(&parsed, &top_k).unwrap();
        hits.len()
    }
}
/// Build a single index containing both fields (title, body) and /// Build a single index containing both fields (title, body) and
/// return two BenchIndex views: /// return two BenchIndex views:
/// - single_field: QueryParser defaults to only "body" /// - single_field: QueryParser defaults to only "body"
@@ -59,6 +42,8 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let f_title = schema_builder.add_text_field("title", TEXT); let f_title = schema_builder.add_text_field("title", TEXT);
let f_body = schema_builder.add_text_field("body", TEXT); let f_body = schema_builder.add_text_field("body", TEXT);
let f_score = schema_builder.add_u64_field("score", FAST);
let f_score2 = schema_builder.add_u64_field("score2", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
@@ -67,11 +52,13 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
// Populate: spread each present token 90/10 to body/title // Populate: spread each present token 90/10 to body/title
{ {
let mut writer = index.writer(500_000_000).unwrap(); let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
for _ in 0..num_docs { for _ in 0..num_docs {
let has_a = rng.gen_bool(p_a as f64); let has_a = rng.gen_bool(p_a as f64);
let has_b = rng.gen_bool(p_b as f64); let has_b = rng.gen_bool(p_b as f64);
let has_c = rng.gen_bool(p_c as f64); let has_c = rng.gen_bool(p_c as f64);
let score = rng.gen_range(0u64..100u64);
let score2 = rng.gen_range(0u64..100_000u64);
let mut title_tokens: Vec<&str> = Vec::new(); let mut title_tokens: Vec<&str> = Vec::new();
let mut body_tokens: Vec<&str> = Vec::new(); let mut body_tokens: Vec<&str> = Vec::new();
if has_a { if has_a {
@@ -101,7 +88,9 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
writer writer
.add_document(doc!( .add_document(doc!(
f_title=>title_tokens.join(" "), f_title=>title_tokens.join(" "),
f_body=>body_tokens.join(" ") f_body=>body_tokens.join(" "),
f_score=>score,
f_score2=>score2,
)) ))
.unwrap(); .unwrap();
} }
@@ -153,72 +142,81 @@ fn main() {
), ),
]; ];
let queries = &["a", "+a +b", "+a +b +c", "a OR b", "a OR b OR c"];
let mut runner = BenchRunner::new(); let mut runner = BenchRunner::new();
for (label, n, pa, pb, pc) in scenarios { for (label, n, pa, pb, pc) in scenarios {
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc); let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
// Single-field group: default field is body only for (view_name, bench_index) in [("single_field", single_view), ("multi_field", multi_view)]
{ {
// Single-field group: default field is body only
let mut group = runner.new_group(); let mut group = runner.new_group();
group.set_name(format!("single_field — {}", label)); group.set_name(format!("{}{}", view_name, label));
group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| { for query_str in queries {
black_box(benv.count_query("+a +b")) add_bench_task(&mut group, &bench_index, query_str, Count, "count");
}); add_bench_task(
group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| { &mut group,
black_box(benv.count_query("+a +b +c")) &bench_index,
}); query_str,
group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| { TopDocs::with_limit(10),
black_box(benv.topk_len("+a +b", 10)) "top10",
}); );
group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| { add_bench_task(
black_box(benv.topk_len("+a +b +c", 10)) &mut group,
}); &bench_index,
// OR queries query_str,
group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| { TopDocs::with_limit(10).order_by_fast_field::<u64>("score", Order::Asc),
black_box(benv.count_query("a OR b")) "top10_by_ff",
}); );
group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| { add_bench_task(
black_box(benv.count_query("a OR b OR c")) &mut group,
}); &bench_index,
group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| { query_str,
black_box(benv.topk_len("a OR b", 10)) TopDocs::with_limit(10).custom_score(move |reader: &SegmentReader| {
}); let score_col = reader.fast_fields().u64("score").unwrap();
group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| { let score_col2 = reader.fast_fields().u64("score2").unwrap();
black_box(benv.topk_len("a OR b OR c", 10)) move |doc| {
}); let score = score_col.first(doc);
group.run(); let score2 = score_col2.first(doc);
} (score, score2)
}
// Multi-field group: default fields are [title, body] }),
{ "top10_by_2ff",
let mut group = runner.new_group(); );
group.set_name(format!("multi_field — {}", label)); }
group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b"))
});
group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b +c"))
});
group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b", 10))
});
group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b +c", 10))
});
// OR queries
group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b"))
});
group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b OR c"))
});
group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b", 10))
});
group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b OR c", 10))
});
group.run(); group.run();
} }
} }
} }
/// Registers one search benchmark on `bench_group`.
///
/// The task name is `query_str` with spaces replaced by underscores, followed
/// by `_` and `collector_name`. The query is parsed here, once, and stored in
/// the task, so the registered closure only calls `SearchTask::run`.
///
/// Panics if `query_str` cannot be parsed by the index's query parser.
fn add_bench_task<C>(
    bench_group: &mut BenchGroup,
    bench_index: &BenchIndex,
    query_str: &str,
    collector: C,
    collector_name: &str,
) where
    C: Collector + 'static,
{
    let query_label = query_str.replace(" ", "_");
    let name = format!("{}_{}", query_label, collector_name);

    let parsed_query = bench_index.query_parser.parse_query(query_str).unwrap();
    let task = SearchTask {
        searcher: bench_index.searcher.clone(),
        collector,
        query: parsed_query,
    };

    // The closure owns the task; its input argument is unused.
    bench_group.register(name, move |_| black_box(task.run()));
}
/// Everything needed to execute one benchmarked search: a searcher, the
/// collector to run, and the query to execute.
struct SearchTask<C>
where
    C: Collector,
{
    /// Searcher the query is executed against.
    searcher: Searcher,
    /// Collector handed to `Searcher::search` on every run.
    collector: C,
    /// Query to execute, stored as a boxed trait object.
    query: Box<dyn Query>,
}
impl<C: Collector> SearchTask<C> {
    /// Executes the stored query against the stored searcher using the stored
    /// collector.
    ///
    /// Always returns `1`; the return value carries no information about the
    /// search. The collected result is passed through `std::hint::black_box`
    /// before being dropped, so the optimizer must treat it as used even
    /// though callers only see the constant.
    ///
    /// Panics if the search returns an error.
    // NOTE(review): `#[inline(never)]` presumably keeps this as a distinct
    // frame for profiling — confirm the intent.
    #[inline(never)]
    pub fn run(&self) -> usize {
        let fruit = self.searcher.search(&self.query, &self.collector).unwrap();
        // Observe the result instead of silently discarding it.
        let _ = std::hint::black_box(fruit);
        1
    }
}