mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-16 00:00:40 +00:00
114 lines
3.2 KiB
Rust
114 lines
3.2 KiB
Rust
// Benchmarks a regex query that matches all terms in a synthetic index.
//
// Corpus model:
// - N unique terms: t000000, t000001, ... (zero-padded to the number of
//   decimal digits in N)
// - M docs
// - K tokens per doc: token j of doc i is term (i * K + j) mod N
//
// Query:
// - Regex "t.*" to match all terms
//
// Run with:
// - cargo bench --bench regex_all_terms
//
|
|
|
|
use std::fmt::Write;
|
|
|
|
use binggan::{black_box, BenchRunner};
|
|
use tantivy::collector::Count;
|
|
use tantivy::query::RegexQuery;
|
|
use tantivy::schema::{Schema, TEXT};
|
|
use tantivy::{doc, Index, ReloadPolicy};
|
|
|
|
/// Size, in bytes, that `main` hands to `build_index`, which forwards it to
/// `Index::writer_with_num_threads` when creating the index writer.
const HEAP_SIZE_BYTES: usize = 200_000_000;
|
|
|
|
/// Shape of one synthetic corpus used by the benchmark.
///
/// `Debug`, `PartialEq` and `Eq` are derived in addition to `Clone`/`Copy` so
/// a configuration can be printed in diagnostics and compared in assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BenchConfig {
    /// Number of distinct terms in the corpus; term ids wrap modulo this value.
    num_terms: usize,
    /// Number of documents added to the index.
    num_docs: usize,
    /// Number of whitespace-separated tokens written into each document.
    tokens_per_doc: usize,
}
|
|
|
|
fn main() {
|
|
let configs = default_configs();
|
|
|
|
let mut runner = BenchRunner::new();
|
|
for config in configs {
|
|
let (index, text_field) = build_index(config, HEAP_SIZE_BYTES);
|
|
let reader = index
|
|
.reader_builder()
|
|
.reload_policy(ReloadPolicy::Manual)
|
|
.try_into()
|
|
.expect("reader");
|
|
let searcher = reader.searcher();
|
|
let query = RegexQuery::from_pattern("t.*", text_field).expect("regex query");
|
|
|
|
let mut group = runner.new_group();
|
|
group.set_name(format!(
|
|
"regex_all_terms_t{}_d{}_k{}",
|
|
config.num_terms, config.num_docs, config.tokens_per_doc
|
|
));
|
|
group.register("regex_count", move |_| {
|
|
let count = searcher.search(&query, &Count).expect("search");
|
|
black_box(count);
|
|
});
|
|
group.run();
|
|
}
|
|
}
|
|
|
|
fn default_configs() -> Vec<BenchConfig> {
|
|
vec![
|
|
BenchConfig {
|
|
num_terms: 10_000,
|
|
num_docs: 100_000,
|
|
tokens_per_doc: 1,
|
|
},
|
|
BenchConfig {
|
|
num_terms: 10_000,
|
|
num_docs: 100_000,
|
|
tokens_per_doc: 8,
|
|
},
|
|
BenchConfig {
|
|
num_terms: 100_000,
|
|
num_docs: 100_000,
|
|
tokens_per_doc: 1,
|
|
},
|
|
BenchConfig {
|
|
num_terms: 100_000,
|
|
num_docs: 100_000,
|
|
tokens_per_doc: 8,
|
|
},
|
|
]
|
|
}
|
|
|
|
fn build_index(config: BenchConfig, heap_size_bytes: usize) -> (Index, tantivy::schema::Field) {
|
|
let mut schema_builder = Schema::builder();
|
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
|
let schema = schema_builder.build();
|
|
let index = Index::create_in_ram(schema);
|
|
|
|
let term_width = config.num_terms.to_string().len();
|
|
{
|
|
let mut writer = index
|
|
.writer_with_num_threads(1, heap_size_bytes)
|
|
.expect("writer");
|
|
let mut buffer = String::new();
|
|
for doc_id in 0..config.num_docs {
|
|
buffer.clear();
|
|
for token_idx in 0..config.tokens_per_doc {
|
|
if token_idx > 0 {
|
|
buffer.push(' ');
|
|
}
|
|
let term_id = (doc_id * config.tokens_per_doc + token_idx) % config.num_terms;
|
|
write!(&mut buffer, "t{term_id:0term_width$}").expect("write token");
|
|
}
|
|
writer
|
|
.add_document(doc!(text_field => buffer.as_str()))
|
|
.expect("add_document");
|
|
}
|
|
writer.commit().expect("commit");
|
|
}
|
|
|
|
(index, text_field)
|
|
}
|