//! Indexing throughput benchmarks for tantivy, driven by three
//! newline-delimited JSON datasets (hdfs.json, gh.json, wiki.json).

use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion, Throughput};
use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::{tokenizer, Index, IndexWriter};

const HDFS_LOGS: &str = include_str!("hdfs.json");
const GH_LOGS: &str = include_str!("gh.json");
const WIKI: &str = include_str!("wiki.json");

/// Dispatches either to the dynamic-JSON path or to schema-driven parsing
/// via `TantivyDocument::parse_json`.
fn benchmark(
    b: &mut Bencher,
    input: &str,
    schema: tantivy::schema::Schema,
    commit: bool,
    parse_json: bool,
    is_dynamic: bool,
) {
    if is_dynamic {
        benchmark_dynamic_json(b, input, schema, commit, parse_json)
    } else {
        _benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
            TantivyDocument::parse_json(schema, doc_json).unwrap()
        })
    }
}

/// Builds a fresh in-RAM index and registers a "raw" tokenizer
/// (with a 255-byte long-token filter) for fast fields.
fn get_index(schema: tantivy::schema::Schema) -> Index {
    let mut index = Index::create_in_ram(schema);
    let ff_tokenizer_manager = tokenizer::TokenizerManager::default();
    ff_tokenizer_manager.register(
        "raw",
        tokenizer::TextAnalyzer::builder(tokenizer::RawTokenizer::default())
            .filter(tokenizer::RemoveLongFilter::limit(255))
            .build(),
    );
    index.set_fast_field_tokenizers(ff_tokenizer_manager);
    index
}

/// Core measurement loop. When `include_json_parsing` is set, JSON-to-document
/// conversion runs inside the timed closure; otherwise all documents are built
/// up front and only indexing (plus an optional commit) is timed.
fn _benchmark(
    b: &mut Bencher,
    input: &str,
    schema: tantivy::schema::Schema,
    commit: bool,
    include_json_parsing: bool,
    create_doc: impl Fn(&tantivy::schema::Schema, &str) -> TantivyDocument,
) {
    if include_json_parsing {
        let lines: Vec<&str> = input.trim().split('\n').collect();
        b.iter(|| {
            let index = get_index(schema.clone());
            let mut index_writer: IndexWriter =
                index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let doc = create_doc(&schema, doc_json);
                index_writer.add_document(doc).unwrap();
            }
            if commit {
                index_writer.commit().unwrap();
            }
        })
    } else {
        let docs: Vec<_> = input
            .trim()
            .split('\n')
            .map(|doc_json| create_doc(&schema, doc_json))
            .collect();
        b.iter_batched(
            || docs.clone(),
            |docs| {
                let index = get_index(schema.clone());
                let mut index_writer: IndexWriter =
                    index.writer_with_num_threads(1, 100_000_000).unwrap();
                for doc in docs {
                    index_writer.add_document(doc).unwrap();
                }
                if commit {
                    index_writer.commit().unwrap();
                }
            },
            BatchSize::SmallInput,
        )
    }
}

/// Parses each input line into a `serde_json::Value` and indexes it into the
/// single dynamic `json` field.
fn benchmark_dynamic_json(
    b: &mut Bencher,
    input: &str,
    schema: tantivy::schema::Schema,
    commit: bool,
    parse_json: bool,
) {
    let json_field = schema.get_field("json").unwrap();
    _benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
        let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap();
        tantivy::doc!(json_field => json_val)
    })
}

pub fn hdfs_index_benchmark(c: &mut Criterion) {
    let schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", INDEXED);
        schema_builder.add_text_field("body", TEXT);
        schema_builder.add_text_field("severity", STRING);
        schema_builder.build()
    };
    let schema_only_fast = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", FAST);
        schema_builder.add_text_field("body", FAST);
        schema_builder.add_text_field("severity", FAST);
        schema_builder.build()
    };
    let _schema_with_store = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", INDEXED | STORED);
        schema_builder.add_text_field("body", TEXT | STORED);
        schema_builder.add_text_field("severity", STRING | STORED);
        schema_builder.build()
    };
    let dynamic_schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_json_field("json", TEXT | FAST);
        schema_builder.build()
    };
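    // The schema variants above are crossed with commit/no-commit below;
    // `_schema_with_store` is kept for reference but its bench entry is
    // currently disabled.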
    let mut group = c.benchmark_group("index-hdfs");
    group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
    group.sample_size(20);

    let benches = [
        ("only-indexed-".to_string(), schema, false),
        //("stored-".to_string(), _schema_with_store, false),
        ("only-fast-".to_string(), schema_only_fast, false),
        ("dynamic-".to_string(), dynamic_schema, true),
    ];
    for (prefix, schema, is_dynamic) in benches {
        for commit in [false, true] {
            let suffix = if commit { "with-commit" } else { "no-commit" };
            {
                let parse_json = false;
                // for parse_json in [false, true] {
                let suffix = if parse_json {
                    format!("{suffix}-with-json-parsing")
                } else {
                    suffix.to_string()
                };
                let bench_name = format!("{prefix}{suffix}");
                group.bench_function(bench_name, |b| {
                    benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
                });
            }
        }
    }
}

pub fn gh_index_benchmark(c: &mut Criterion) {
    let dynamic_schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_json_field("json", TEXT | FAST);
        schema_builder.build()
    };
    let dynamic_schema_fast = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_json_field("json", FAST);
        schema_builder.build()
    };

    let mut group = c.benchmark_group("index-gh");
    group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
    group.bench_function("index-gh-no-commit", |b| {
        benchmark_dynamic_json(b, GH_LOGS, dynamic_schema.clone(), false, false)
    });
    group.bench_function("index-gh-fast", |b| {
        benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), false, false)
    });
    group.bench_function("index-gh-fast-with-commit", |b| {
        benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), true, false)
    });
}

pub fn wiki_index_benchmark(c: &mut Criterion) {
    let dynamic_schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_json_field("json", TEXT | FAST);
        schema_builder.build()
    };

    let mut group = c.benchmark_group("index-wiki");
    group.throughput(Throughput::Bytes(WIKI.len() as u64));
    group.bench_function("index-wiki-no-commit", |b| {
        benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), false, false)
    });
    group.bench_function("index-wiki-with-commit", |b| {
        benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), true, false)
    });
}

criterion_group! {
    name = benches;
    config = Criterion::default();
    targets = hdfs_index_benchmark
}
criterion_group! {
    name = gh_benches;
    config = Criterion::default();
    targets = gh_index_benchmark
}
criterion_group! {
    name = wiki_benches;
    config = Criterion::default();
    targets = wiki_index_benchmark
}
criterion_main!(benches, gh_benches, wiki_benches);
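// A usage sketch, assuming this file is wired up as a Criterion bench target
// (a `[[bench]]` entry with `harness = false` in Cargo.toml; the target name
// `index-bench` below is illustrative) and that hdfs.json, gh.json, and
// wiki.json sit next to this source file (include_str! resolves paths
// relative to it):
//
//     cargo bench --bench index-bench
//     cargo bench --bench index-bench -- index-hdfs    # run only the HDFS group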