fix and extend benchmark (#2030)

* add benchmark, add missing inlines

* fix stacker bench

* add wiki benchmark

* move line split out of bench
PSeitz
2023-05-10 19:01:56 +08:00
committed by GitHub
parent 0eafbaab8e
commit d1988be8e9
9 changed files with 2143 additions and 41 deletions
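The benchmarks below are ordinary Criterion harnesses: assuming the bench target keeps its usual name (index-bench, per the file shown below), `cargo bench --bench index-bench` runs them, and the groups configured with the pprof profiler can emit a flamegraph per benchmark when run in profiling mode (roughly `cargo bench --bench index-bench -- --profile-time 10`, with output under target/criterion/). Both invocations are inferred from Criterion/pprof-rs conventions rather than stated in this commit.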

benches/gh.json Normal file (1000 lines added)

File diff suppressed because one or more lines are too long

benches/index-bench.rs

@@ -1,10 +1,15 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 use pprof::criterion::{Output, PProfProfiler};
-use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
+use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
 use tantivy::Index;
 
 const HDFS_LOGS: &str = include_str!("hdfs.json");
-const NUM_REPEATS: usize = 2;
+const GH_LOGS: &str = include_str!("gh.json");
+const WIKI: &str = include_str!("wiki.json");
+
+fn get_lines(input: &str) -> Vec<&str> {
+    input.trim().split('\n').collect()
+}
 
 pub fn hdfs_index_benchmark(c: &mut Criterion) {
     let schema = {
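The new get_lines helper is the "move line split out of bench" fix: anything executed inside Criterion's b.iter closure counts toward the measurement, so re-splitting the corpus on every iteration inflated the apparent indexing time. A minimal, self-contained sketch of the pattern (hypothetical names, not tied to tantivy):

use criterion::Criterion;

fn bench_pattern(c: &mut Criterion) {
    let corpus = "doc1\ndoc2\ndoc3";
    c.bench_function("hoisted-setup", |b| {
        // Setup outside b.iter is excluded from the timing.
        let lines: Vec<&str> = corpus.trim().split('\n').collect();
        b.iter(|| {
            // Only this closure is measured; `lines` is reused, not rebuilt.
            lines.iter().map(|line| line.len()).sum::<usize>()
        })
    });
}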
@@ -28,85 +33,147 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
     };
 
     let mut group = c.benchmark_group("index-hdfs");
+    group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
     group.sample_size(20);
     group.bench_function("index-hdfs-no-commit", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema.clone());
             let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
         })
     });
     group.bench_function("index-hdfs-with-commit", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema.clone());
             let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
         })
     });
     group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema_with_store.clone());
             let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
         })
     });
     group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema_with_store.clone());
             let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
         })
     });
     group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(dynamic_schema.clone());
             let json_field = dynamic_schema.get_field("json").unwrap();
             let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
         })
     });
-    group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
-        b.iter(|| {
-            let index = Index::create_in_ram(dynamic_schema.clone());
-            let json_field = dynamic_schema.get_field("json").unwrap();
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
-            }
+}
+
+pub fn gh_index_benchmark(c: &mut Criterion) {
+    let dynamic_schema = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_json_field("json", TEXT | FAST);
+        schema_builder.build()
+    };
+
+    let mut group = c.benchmark_group("index-gh");
+    group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
+
+    group.bench_function("index-gh-no-commit", |b| {
+        let lines = get_lines(GH_LOGS);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-gh-with-commit", |b| {
+        let lines = get_lines(GH_LOGS);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+            index_writer.commit().unwrap();
+        })
+    });
+}
+
+pub fn wiki_index_benchmark(c: &mut Criterion) {
+    let dynamic_schema = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_json_field("json", TEXT | FAST);
+        schema_builder.build()
+    };
+
+    let mut group = c.benchmark_group("index-wiki");
+    group.throughput(Throughput::Bytes(WIKI.len() as u64));
+
+    group.bench_function("index-wiki-no-commit", |b| {
+        let lines = get_lines(WIKI);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-wiki-with-commit", |b| {
+        let lines = get_lines(WIKI);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
             index_writer.commit().unwrap();
         })
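Attaching Throughput::Bytes to a group makes Criterion divide the byte count by the measured time and report a bytes-per-second figure next to each timing, which keeps results comparable across the differently sized hdfs/gh/wiki corpora. A minimal sketch (standalone example, not this PR's code):

use criterion::{Criterion, Throughput};

fn throughput_demo(c: &mut Criterion) {
    let data = vec![1u8; 1 << 20]; // 1 MiB of input per iteration
    let mut group = c.benchmark_group("demo");
    // Criterion now prints e.g. "1.2 GiB/s" alongside the timing statistics.
    group.throughput(Throughput::Bytes(data.len() as u64));
    group.bench_function("sum-bytes", |b| {
        b.iter(|| data.iter().map(|&byte| byte as u64).sum::<u64>())
    });
    group.finish();
}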
@@ -115,7 +182,17 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
 criterion_group! {
     name = benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    config = Criterion::default();
     targets = hdfs_index_benchmark
 }
-criterion_main!(benches);
+criterion_group! {
+    name = gh_benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = gh_index_benchmark
+}
+criterion_group! {
+    name = wiki_benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = wiki_index_benchmark
+}
+criterion_main!(benches, gh_benches, wiki_benches);
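Note the split: the hdfs group now runs under a plain Criterion::default(), while the new gh and wiki groups keep the PProfProfiler, so flamegraphs are only produced for the JSON-indexing benchmarks. With pprof-rs the profiler only engages in profiling mode, presumably via something like `cargo bench --bench index-bench -- --profile-time 10` here (invocation inferred, not stated in the commit).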

benches/wiki.json Normal file (1000 lines added)

File diff suppressed because one or more lines are too long

src/postings/json_postings_writer.rs

@@ -23,6 +23,7 @@ impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
 }
 
 impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
+    #[inline]
     fn subscribe(
         &mut self,
         doc: crate::DocId,

src/postings/postings_writer.rs

@@ -179,6 +179,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
 }
 
 impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
+    #[inline]
     fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) {
         debug_assert!(term.serialized_term().len() >= 4);
         self.total_num_tokens += 1;

src/postings/recorder.rs

@@ -98,17 +98,21 @@ impl Default for DocIdRecorder {
 }
 
 impl Recorder for DocIdRecorder {
+    #[inline]
     fn current_doc(&self) -> DocId {
         self.current_doc
     }
 
+    #[inline]
     fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
         self.current_doc = doc;
         self.stack.writer(arena).write_u32_vint(doc);
     }
 
+    #[inline]
     fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {}
 
+    #[inline]
     fn close_doc(&mut self, _arena: &mut MemoryArena) {}
 
     fn serialize(
@@ -153,20 +157,24 @@ pub struct TermFrequencyRecorder {
 }
 
 impl Recorder for TermFrequencyRecorder {
+    #[inline]
     fn current_doc(&self) -> DocId {
         self.current_doc
     }
 
+    #[inline]
     fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
         self.term_doc_freq += 1;
         self.current_doc = doc;
         self.stack.writer(arena).write_u32_vint(doc);
     }
 
+    #[inline]
     fn record_position(&mut self, _position: u32, _arena: &mut MemoryArena) {
         self.current_tf += 1;
     }
 
+    #[inline]
     fn close_doc(&mut self, arena: &mut MemoryArena) {
         debug_assert!(self.current_tf > 0);
         self.stack.writer(arena).write_u32_vint(self.current_tf);
@@ -226,22 +234,26 @@ impl Default for TfAndPositionRecorder {
 }
 
 impl Recorder for TfAndPositionRecorder {
+    #[inline]
     fn current_doc(&self) -> DocId {
         self.current_doc
     }
 
+    #[inline]
     fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
         self.current_doc = doc;
        self.term_doc_freq += 1u32;
         self.stack.writer(arena).write_u32_vint(doc);
     }
 
+    #[inline]
     fn record_position(&mut self, position: u32, arena: &mut MemoryArena) {
         self.stack
             .writer(arena)
             .write_u32_vint(position.wrapping_add(1u32));
     }
 
+    #[inline]
     fn close_doc(&mut self, arena: &mut MemoryArena) {
         self.stack.writer(arena).write_u32_vint(POSITION_END);
     }
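All of the #[inline] additions in this commit sit on per-token methods in the indexing hot loop. The hint matters because rustc only considers a non-generic function for inlining across crate and codegen-unit boundaries if it is marked #[inline] (or when LTO is enabled). A generic illustration of the hint, unrelated to tantivy's actual types:

// In a library crate consumed by another crate:
pub struct Counter {
    count: u64,
}

impl Counter {
    // Without #[inline] (or LTO), a call from another crate stays an
    // actual function call; with it, the body is available for
    // inlining at every call site.
    #[inline]
    pub fn bump(&mut self) {
        self.count += 1;
    }
}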

stacker/Cargo.toml

@@ -21,3 +21,8 @@ path = "example/hashmap.rs"
 rand = "0.8.5"
 zipf = "7.0.0"
 criterion = "0.4.0"
+
+[features]
+unstable = [] # useful for benches.
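`unstable = []` is a pure compile-time switch: it pulls in no dependencies and exists only so code can be gated behind cfg(feature = "unstable"). As the comment says, it enables the nightly-only bench scaffolding added to lib.rs below, presumably activated with something like `cargo +nightly bench --features unstable`.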

stacker/src/arena_hashmap.rs

@@ -259,6 +259,7 @@ impl ArenaHashMap {
     /// will be in charge of returning a default value.
     /// If the key already as an associated value, then it will be passed
     /// `Some(previous_value)`.
+    #[inline]
     pub fn mutate_or_create<V>(
         &mut self,
         key: &[u8],
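The inline hint here lands on a hot write path of the arena hashmap during indexing. Per the doc comment, mutate_or_create follows the same contract as std's entry API: the caller supplies a closure that either creates a default value or mutates the previous one. A sketch of that contract using std::collections::HashMap (an analogy only, not stacker's actual signature):

use std::collections::HashMap;

/// Count occurrences of `key`, creating the entry on first sight.
/// ArenaHashMap serves the same purpose but keeps values in a bump arena.
fn mutate_or_create(map: &mut HashMap<Vec<u8>, u32>, key: &[u8]) {
    map.entry(key.to_vec())
        .and_modify(|count| *count += 1) // previous value exists: mutate it
        .or_insert(1); // no previous value: create the default
}

fn main() {
    let mut counts = HashMap::new();
    mutate_or_create(&mut counts, b"term");
    mutate_or_create(&mut counts, b"term");
    assert_eq!(counts[b"term".as_slice()], 2);
}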

stacker/src/lib.rs

@@ -1,3 +1,8 @@
+#![cfg_attr(all(feature = "unstable", test), feature(test))]
+
+#[cfg(all(test, feature = "unstable"))]
+extern crate test;
+
 mod arena_hashmap;
 mod expull;
 mod memory_arena;