Files
tantivy/benches/regex_all_terms.rs
PSeitz 28db952131 Add regex search and merge segments benchmark (#2826)
* add merge_segments benchmark

* add regex search bench
2026-02-02 17:28:02 +01:00

114 lines
3.2 KiB
Rust

// Benchmarks regex query that matches all terms in a synthetic index.
//
// Corpus model:
// - N unique terms: t000000, t000001, ...
// - M docs
// - K tokens per doc: doc i gets terms derived from (i, token_index)
//
// Query:
// - Regex "t.*" to match all terms
//
// Run with:
// - cargo bench --bench regex_all_terms
//
use std::fmt::Write;
use binggan::{black_box, BenchRunner};
use tantivy::collector::Count;
use tantivy::query::RegexQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, ReloadPolicy};
const HEAP_SIZE_BYTES: usize = 200_000_000;
#[derive(Clone, Copy)]
struct BenchConfig {
num_terms: usize,
num_docs: usize,
tokens_per_doc: usize,
}
fn main() {
let configs = default_configs();
let mut runner = BenchRunner::new();
for config in configs {
let (index, text_field) = build_index(config, HEAP_SIZE_BYTES);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.expect("reader");
let searcher = reader.searcher();
let query = RegexQuery::from_pattern("t.*", text_field).expect("regex query");
let mut group = runner.new_group();
group.set_name(format!(
"regex_all_terms_t{}_d{}_k{}",
config.num_terms, config.num_docs, config.tokens_per_doc
));
group.register("regex_count", move |_| {
let count = searcher.search(&query, &Count).expect("search");
black_box(count);
});
group.run();
}
}
fn default_configs() -> Vec<BenchConfig> {
vec![
BenchConfig {
num_terms: 10_000,
num_docs: 100_000,
tokens_per_doc: 1,
},
BenchConfig {
num_terms: 10_000,
num_docs: 100_000,
tokens_per_doc: 8,
},
BenchConfig {
num_terms: 100_000,
num_docs: 100_000,
tokens_per_doc: 1,
},
BenchConfig {
num_terms: 100_000,
num_docs: 100_000,
tokens_per_doc: 8,
},
]
}
fn build_index(config: BenchConfig, heap_size_bytes: usize) -> (Index, tantivy::schema::Field) {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let term_width = config.num_terms.to_string().len();
{
let mut writer = index
.writer_with_num_threads(1, heap_size_bytes)
.expect("writer");
let mut buffer = String::new();
for doc_id in 0..config.num_docs {
buffer.clear();
for token_idx in 0..config.tokens_per_doc {
if token_idx > 0 {
buffer.push(' ');
}
let term_id = (doc_id * config.tokens_per_doc + token_idx) % config.num_terms;
write!(&mut buffer, "t{term_id:0term_width$}").expect("write token");
}
writer
.add_document(doc!(text_field => buffer.as_str()))
.expect("add_document");
}
writer.commit().expect("commit");
}
(index, text_field)
}