From 7d513a44c530ee6792a588c300802cf5cd3ded82 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 1 Dec 2025 14:58:29 +0100 Subject: [PATCH] Added some benchmark for top K by a fast field (#2754) Also removed query parsing from the bench code. Co-authored-by: Paul Masurel --- benches/and_or_queries.rs | 164 +++++++++++++++++++------------------- 1 file changed, 81 insertions(+), 83 deletions(-) diff --git a/benches/and_or_queries.rs b/benches/and_or_queries.rs index 377634c9a..3e6e0dcd3 100644 --- a/benches/and_or_queries.rs +++ b/benches/and_or_queries.rs @@ -16,14 +16,14 @@ // - This bench isolates boolean iteration speed and intersection/union cost. // - Use `cargo bench --bench boolean_conjunction` to run. -use binggan::{black_box, BenchRunner}; +use binggan::{black_box, BenchGroup, BenchRunner}; use rand::prelude::*; use rand::rngs::StdRng; use rand::SeedableRng; -use tantivy::collector::{Count, TopDocs}; -use tantivy::query::QueryParser; -use tantivy::schema::{Schema, TEXT}; -use tantivy::{doc, Index, ReloadPolicy, Searcher}; +use tantivy::collector::{Collector, Count, TopDocs}; +use tantivy::query::{Query, QueryParser}; +use tantivy::schema::{Schema, FAST, TEXT}; +use tantivy::{doc, Index, Order, ReloadPolicy, Searcher, SegmentReader}; #[derive(Clone)] struct BenchIndex { @@ -33,23 +33,6 @@ struct BenchIndex { query_parser: QueryParser, } -impl BenchIndex { - #[inline(always)] - fn count_query(&self, query_str: &str) -> usize { - let query = self.query_parser.parse_query(query_str).unwrap(); - self.searcher.search(&query, &Count).unwrap() - } - - #[inline(always)] - fn topk_len(&self, query_str: &str, k: usize) -> usize { - let query = self.query_parser.parse_query(query_str).unwrap(); - self.searcher - .search(&query, &TopDocs::with_limit(k)) - .unwrap() - .len() - } -} - /// Build a single index containing both fields (title, body) and /// return two BenchIndex views: /// - single_field: QueryParser defaults to only "body" @@ -59,6 +42,8 @@ fn 
build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench let mut schema_builder = Schema::builder(); let f_title = schema_builder.add_text_field("title", TEXT); let f_body = schema_builder.add_text_field("body", TEXT); + let f_score = schema_builder.add_u64_field("score", FAST); + let f_score2 = schema_builder.add_u64_field("score2", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); @@ -67,11 +52,13 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench // Populate: spread each present token 90/10 to body/title { - let mut writer = index.writer(500_000_000).unwrap(); + let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap(); for _ in 0..num_docs { let has_a = rng.gen_bool(p_a as f64); let has_b = rng.gen_bool(p_b as f64); let has_c = rng.gen_bool(p_c as f64); + let score = rng.gen_range(0u64..100u64); + let score2 = rng.gen_range(0u64..100_000u64); let mut title_tokens: Vec<&str> = Vec::new(); let mut body_tokens: Vec<&str> = Vec::new(); if has_a { @@ -101,7 +88,9 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench writer .add_document(doc!( f_title=>title_tokens.join(" "), - f_body=>body_tokens.join(" ") + f_body=>body_tokens.join(" "), + f_score=>score, + f_score2=>score2, )) .unwrap(); } @@ -153,72 +142,81 @@ fn main() { ), ]; + let queries = &["a", "+a +b", "+a +b +c", "a OR b", "a OR b OR c"]; + let mut runner = BenchRunner::new(); for (label, n, pa, pb, pc) in scenarios { let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc); - // Single-field group: default field is body only + for (view_name, bench_index) in [("single_field", single_view), ("multi_field", multi_view)] { + // Single-field group: default field is body only let mut group = runner.new_group(); - group.set_name(format!("single_field — {}", label)); - group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| { - 
black_box(benv.count_query("+a +b")) - }); - group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| { - black_box(benv.count_query("+a +b +c")) - }); - group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| { - black_box(benv.topk_len("+a +b", 10)) - }); - group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| { - black_box(benv.topk_len("+a +b +c", 10)) - }); - // OR queries - group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| { - black_box(benv.count_query("a OR b")) - }); - group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| { - black_box(benv.count_query("a OR b OR c")) - }); - group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| { - black_box(benv.topk_len("a OR b", 10)) - }); - group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| { - black_box(benv.topk_len("a OR b OR c", 10)) - }); - group.run(); - } - - // Multi-field group: default fields are [title, body] - { - let mut group = runner.new_group(); - group.set_name(format!("multi_field — {}", label)); - group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| { - black_box(benv.count_query("+a +b")) - }); - group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| { - black_box(benv.count_query("+a +b +c")) - }); - group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| { - black_box(benv.topk_len("+a +b", 10)) - }); - group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| { - black_box(benv.topk_len("+a +b +c", 10)) - }); - // OR queries - group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| { - black_box(benv.count_query("a OR b")) - }); - group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| { - black_box(benv.count_query("a OR b OR c")) - }); - group.register_with_input("a_OR_b_top10", &multi_view, |benv: 
&BenchIndex| { - black_box(benv.topk_len("a OR b", 10)) - }); - group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| { - black_box(benv.topk_len("a OR b OR c", 10)) - }); + group.set_name(format!("{} — {}", view_name, label)); + for query_str in queries { + add_bench_task(&mut group, &bench_index, query_str, Count, "count"); + add_bench_task( + &mut group, + &bench_index, + query_str, + TopDocs::with_limit(10), + "top10", + ); + add_bench_task( + &mut group, + &bench_index, + query_str, + TopDocs::with_limit(10).order_by_fast_field::<u64>("score", Order::Asc), + "top10_by_ff", + ); + add_bench_task( + &mut group, + &bench_index, + query_str, + TopDocs::with_limit(10).custom_score(move |reader: &SegmentReader| { + let score_col = reader.fast_fields().u64("score").unwrap(); + let score_col2 = reader.fast_fields().u64("score2").unwrap(); + move |doc| { + let score = score_col.first(doc); + let score2 = score_col2.first(doc); + (score, score2) + } + }), + "top10_by_2ff", + ); + } group.run(); } } } + +fn add_bench_task<C: Collector + 'static>( + bench_group: &mut BenchGroup, + bench_index: &BenchIndex, + query_str: &str, + collector: C, + collector_name: &str, +) { + let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name); + let query = bench_index.query_parser.parse_query(query_str).unwrap(); + let search_task = SearchTask { + searcher: bench_index.searcher.clone(), + collector, + query, + }; + bench_group.register(task_name, move |_| black_box(search_task.run())); +} + +struct SearchTask<C: Collector> { + searcher: Searcher, + collector: C, + query: Box<dyn Query>, +} + +impl<C: Collector> SearchTask<C> { + #[inline(never)] + pub fn run(&self) -> usize { + self.searcher.search(&self.query, &self.collector).unwrap(); + 1 + } +}