mirror of https://github.com/quickwit-oss/tantivy.git
fix and extend benchmark (#2030)

* add benchmark, add missing inlines
* fix stacker bench
* add wiki benchmark
* move line split out of bench
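The central fix, visible in the first hunks of benches/index-bench.rs below, is that line splitting used to run inside b.iter, so the timed region measured string splitting as well as indexing, and the NUM_REPEATS double loop made the reported numbers harder to relate to input size. The new get_lines helper hoists the split into bench setup, and Throughput::Bytes makes Criterion report bytes per second. A minimal standalone sketch of the same pattern (the names DATA, bench_pattern, and process-lines are illustrative, not from the diff):

    use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};

    const DATA: &str = "line one\nline two\nline three";

    fn bench_pattern(c: &mut Criterion) {
        let mut group = c.benchmark_group("example");
        // Report results as bytes of input processed per second.
        group.throughput(Throughput::Bytes(DATA.len() as u64));
        group.bench_function("process-lines", |b| {
            // Setup: splitting happens once, outside the timed closure.
            let lines: Vec<&str> = DATA.trim().split('\n').collect();
            b.iter(|| {
                for line in &lines {
                    black_box(line.len()); // stand-in for per-line indexing work
                }
            })
        });
        group.finish();
    }

    criterion_group!(benches, bench_pattern);
    criterion_main!(benches);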
benches/gh.json (new file, 1000 lines)
File diff suppressed because one or more lines are too long

benches/index-bench.rs
@@ -1,10 +1,15 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 use pprof::criterion::{Output, PProfProfiler};
-use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
+use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
 use tantivy::Index;
 
 const HDFS_LOGS: &str = include_str!("hdfs.json");
-const NUM_REPEATS: usize = 2;
+const GH_LOGS: &str = include_str!("gh.json");
+const WIKI: &str = include_str!("wiki.json");
+
+fn get_lines(input: &str) -> Vec<&str> {
+    input.trim().split('\n').collect()
+}
 
 pub fn hdfs_index_benchmark(c: &mut Criterion) {
     let schema = {
@@ -28,85 +33,147 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
     };
 
     let mut group = c.benchmark_group("index-hdfs");
+    group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
     group.sample_size(20);
     group.bench_function("index-hdfs-no-commit", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema.clone());
             let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
         })
     });
     group.bench_function("index-hdfs-with-commit", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema.clone());
             let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
         })
     });
     group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema_with_store.clone());
             let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
         })
     });
     group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema_with_store.clone());
             let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
         })
     });
     group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(dynamic_schema.clone());
             let json_field = dynamic_schema.get_field("json").unwrap();
             let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
         })
     });
-    group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
-        b.iter(|| {
-            let index = Index::create_in_ram(dynamic_schema.clone());
-            let json_field = dynamic_schema.get_field("json").unwrap();
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
-            }
-            index_writer.commit().unwrap();
-        })
-    });
 }
+
+pub fn gh_index_benchmark(c: &mut Criterion) {
+    let dynamic_schema = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_json_field("json", TEXT | FAST);
+        schema_builder.build()
+    };
+
+    let mut group = c.benchmark_group("index-gh");
+    group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
+
+    group.bench_function("index-gh-no-commit", |b| {
+        let lines = get_lines(GH_LOGS);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-gh-with-commit", |b| {
+        let lines = get_lines(GH_LOGS);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+            index_writer.commit().unwrap();
+        })
+    });
+}
+
+pub fn wiki_index_benchmark(c: &mut Criterion) {
+    let dynamic_schema = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_json_field("json", TEXT | FAST);
+        schema_builder.build()
+    };
+
+    let mut group = c.benchmark_group("index-wiki");
+    group.throughput(Throughput::Bytes(WIKI.len() as u64));
+
+    group.bench_function("index-wiki-no-commit", |b| {
+        let lines = get_lines(WIKI);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-wiki-with-commit", |b| {
+        let lines = get_lines(WIKI);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+            index_writer.commit().unwrap();
+        })
+    });
+}
@@ -115,7 +182,17 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
 
 criterion_group! {
     name = benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    config = Criterion::default();
     targets = hdfs_index_benchmark
 }
-criterion_main!(benches);
+criterion_group! {
+    name = gh_benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = gh_index_benchmark
+}
+criterion_group! {
+    name = wiki_benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = wiki_index_benchmark
+}
+criterion_main!(benches, gh_benches, wiki_benches);
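With the hunk above, the existing hdfs group now runs without the pprof profiler, while the two new groups keep flamegraph profiling. Assuming the bench target keeps its existing registration in Cargo.toml (not shown in this diff) under a name like index-bench, individual groups can be selected via Criterion's substring filter:

    cargo bench --bench index-bench -- index-hdfs
    cargo bench --bench index-bench -- index-gh
    cargo bench --bench index-bench -- index-wiki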
benches/wiki.json (new file, 1000 lines)
File diff suppressed because one or more lines are too long

src/postings/json_postings_writer.rs
@@ -23,6 +23,7 @@ impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
 }
 
 impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
+    #[inline]
     fn subscribe(
         &mut self,
         doc: crate::DocId,
src/postings/postings_writer.rs
@@ -179,6 +179,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
 }
 
 impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
+    #[inline]
     fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) {
         debug_assert!(term.serialized_term().len() >= 4);
         self.total_num_tokens += 1;
src/postings/recorder.rs
@@ -98,17 +98,21 @@ impl Default for DocIdRecorder {
 }
 
 impl Recorder for DocIdRecorder {
+    #[inline]
     fn current_doc(&self) -> DocId {
         self.current_doc
     }
 
+    #[inline]
     fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
         self.current_doc = doc;
         self.stack.writer(arena).write_u32_vint(doc);
     }
 
+    #[inline]
     fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {}
 
+    #[inline]
     fn close_doc(&mut self, _arena: &mut MemoryArena) {}
 
     fn serialize(
@@ -153,20 +157,24 @@ pub struct TermFrequencyRecorder {
 }
 
 impl Recorder for TermFrequencyRecorder {
+    #[inline]
     fn current_doc(&self) -> DocId {
         self.current_doc
     }
 
+    #[inline]
     fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
         self.term_doc_freq += 1;
         self.current_doc = doc;
         self.stack.writer(arena).write_u32_vint(doc);
     }
 
+    #[inline]
     fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {
         self.current_tf += 1;
     }
 
+    #[inline]
     fn close_doc(&mut self, arena: &mut MemoryArena) {
         debug_assert!(self.current_tf > 0);
         self.stack.writer(arena).write_u32_vint(self.current_tf);
@@ -226,22 +234,26 @@ impl Default for TfAndPositionRecorder {
 }
 
 impl Recorder for TfAndPositionRecorder {
+    #[inline]
     fn current_doc(&self) -> DocId {
         self.current_doc
     }
 
+    #[inline]
     fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
         self.current_doc = doc;
         self.term_doc_freq += 1u32;
         self.stack.writer(arena).write_u32_vint(doc);
     }
 
+    #[inline]
     fn record_position(&mut self, position: u32, arena: &mut MemoryArena) {
         self.stack
             .writer(arena)
             .write_u32_vint(position.wrapping_add(1u32));
     }
 
+    #[inline]
     fn close_doc(&mut self, arena: &mut MemoryArena) {
         self.stack.writer(arena).write_u32_vint(POSITION_END);
     }
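The postings-writer and recorder hunks above all add #[inline] to small trait methods that run for every token during indexing, matching the "add missing inlines" item in the commit message. Generic code is already monomorphized, but #[inline] additionally permits inlining across codegen-unit and crate boundaries, which matters for one-line methods on a hot path. The shape of the pattern, with illustrative types unrelated to tantivy:

    pub trait Recorder {
        fn record(&mut self, value: u32);
    }

    pub struct CountingRecorder {
        pub count: u64,
    }

    impl Recorder for CountingRecorder {
        #[inline] // tiny hot-path method: allow inlining at every call site
        fn record(&mut self, _value: u32) {
            self.count += 1;
        }
    }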
stacker/Cargo.toml
@@ -21,3 +21,8 @@ path = "example/hashmap.rs"
 rand = "0.8.5"
 zipf = "7.0.0"
 criterion = "0.4.0"
+
+
+[features]
+unstable = [] # useful for benches.
+
stacker/src/arena_hashmap.rs
@@ -259,6 +259,7 @@ impl ArenaHashMap {
     /// will be in charge of returning a default value.
     /// If the key already as an associated value, then it will be passed
     /// `Some(previous_value)`.
+    #[inline]
     pub fn mutate_or_create<V>(
         &mut self,
         key: &[u8],
stacker/src/lib.rs
@@ -1,3 +1,8 @@
+#![cfg_attr(all(feature = "unstable", test), feature(test))]
+
+#[cfg(all(test, feature = "unstable"))]
+extern crate test;
+
 mod arena_hashmap;
 mod expull;
 mod memory_arena;
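The stacker/Cargo.toml and stacker/src/lib.rs hunks wire up nightly-only benchmarks behind an unstable cargo feature, which is the "fix stacker bench" item: the cfg_attr line enables the feature(test) gate only when the crate is compiled for tests with that feature on, and extern crate test pulls in the nightly bench harness. A sketch of the kind of bench this makes possible (illustrative only; the actual stacker bench code is not part of this diff):

    #[cfg(all(test, feature = "unstable"))]
    mod bench {
        use test::Bencher;

        #[bench]
        fn bench_push_1000(b: &mut Bencher) {
            // The closure is timed by the nightly harness.
            b.iter(|| {
                let mut v = Vec::with_capacity(1000);
                for i in 0..1000u32 {
                    v.push(i);
                }
                v
            });
        }
    }

Such benches then run on a nightly toolchain with something like: cargo +nightly bench --features unstable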