Compare commits

...

2 Commits

Author SHA1 Message Date
François Massot
951a898633 Update bench. 2022-10-30 14:12:07 +01:00
François Massot
003722d831 Add bench to reproduce performance drop on array of texts. 2022-10-29 02:54:07 +02:00
2 changed files with 100129 additions and 88 deletions

100000
benches/hdfs_with_array.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,116 +1,157 @@
use criterion::{criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
use serde_json::{self, Value as JsonValue};
use tantivy::directory::RamDirectory;
use tantivy::schema::{INDEXED, STORED, STRING, TEXT, TextOptions, TextFieldIndexing, FieldValue, Value};
use tantivy::{Index, IndexBuilder, Document};
/// HDFS log corpus embedded at compile time from `hdfs.json`; the benchmark reads it as one JSON
/// object per line.
const HDFS_LOGS: &str = include_str!("hdfs.json");
/// Number of passes over the corpus performed inside each benchmark iteration.
const NUM_REPEATS: usize = 2;
/// Criterion benchmark that indexes the `HDFS_LOGS` corpus with tantivy, registered under the
/// `index-hdfs` benchmark group.
///
/// NOTE(review): this text comes from a rendered diff (`@@ -1,116 +1,157 @@`) whose removed and
/// added lines are interleaved without `-`/`+` markers. The body below is therefore not a single
/// coherent version of the function: `schema` is bound twice, some `bench_function` calls are
/// opened back-to-back, and several `for` loops are opened more often than they are closed.
/// Recover the pre- or post-change file before compiling or changing any logic.
pub fn hdfs_index_benchmark(c: &mut Criterion) {
// Schema with an indexed u64 "timestamp", a TEXT "body" and a STRING "severity"; nothing stored.
// NOTE(review): presumably the pre-change side of the diff — `schema` is rebound further down.
let schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED);
schema_builder.add_text_field("body", TEXT);
schema_builder.add_text_field("severity", STRING);
schema_builder.build()
};
// Same three fields as above, each additionally flagged STORED.
let schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED | STORED);
schema_builder.add_text_field("body", TEXT | STORED);
schema_builder.add_text_field("severity", STRING | STORED);
schema_builder.build()
};
// Schema made of a single JSON field named "json", indexed as TEXT.
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT);
schema_builder.build()
};
// Schema with only a "body" text field: "default" tokenizer, fieldnorms disabled, term
// frequencies and positions recorded. The resulting value shadows the earlier `schema` binding.
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
let text_indexing_options = TextFieldIndexing::default()
.set_tokenizer("default")
.set_fieldnorms(false)
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
let mut text_options = TextOptions::default()
.set_indexing_options(text_indexing_options);
let text_field = schema_builder.add_text_field("body", text_options);
let schema = schema_builder.build();
// prepare doc
// Two document sets are built once, up front, from the same corpus:
//   - `documents_no_array`: the whole "body" string as a single value of `text_field`;
//   - `documents_with_array`: one `text_field` value per piece of "body" split on a single space.
let mut documents_no_array = Vec::new();
let mut documents_with_array = Vec::new();
for doc_json in HDFS_LOGS.trim().split("\n") {
// Each line must be a JSON object with a string "body"; any violation panics via `unwrap`.
let json_obj: serde_json::Map<String, JsonValue> = serde_json::from_str(doc_json).unwrap();
let text = json_obj.get("body").unwrap().as_str().unwrap();
let text_array = text.split(" ");
let mut doc_no_array = Document::new();
doc_no_array.add_field_value(text_field, Value::Str(text.to_string()));
documents_no_array.push(doc_no_array);
let mut doc_with_array = Document::new();
for text_element in text_array {
doc_with_array.add_field_value(text_field, Value::Str(text_element.to_string()));
}
documents_with_array.push(doc_with_array);
}
let mut group = c.benchmark_group("index-hdfs");
group.sample_size(20);
// Benchmark: add every single-value document to a writer, without committing.
group.bench_function("index-hdfs-no-commit", |b| {
b.iter(|| {
// NOTE(review): the next two lines (in-RAM `Index` + multi-threaded writer API) look like the
// removed side of the diff; the `RamDirectory` single-segment writer after them shadows
// `index_writer`.
let index = Index::create_in_ram(schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let ram_directory = RamDirectory::create();
let mut index_writer = IndexBuilder::new()
.schema(schema.clone())
.single_segment_index_writer(ram_directory, 100_000_000)
.unwrap();
for _ in 0..NUM_REPEATS {
// NOTE(review): the JSON-parsing loop header and `parse_document` line below look like removed
// lines; their closing brace is not present in this rendering.
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
// The pre-built documents are cloned inside the `b.iter` closure, once per repeat.
let documents_cloned = documents_no_array.clone();
for doc in documents_cloned {
index_writer.add_document(doc).unwrap();
}
}
})
});
// NOTE(review): two `bench_function` openers in a row — "index-hdfs-with-commit" is presumably
// the removed name and "index-hdfs-with-array-no-commit" the added one.
group.bench_function("index-hdfs-with-commit", |b| {
group.bench_function("index-hdfs-with-array-no-commit", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let ram_directory = RamDirectory::create();
let mut index_writer = IndexBuilder::new()
.schema(schema.clone())
.single_segment_index_writer(ram_directory, 100_000_000)
.unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
// NOTE(review): from here to the commented-out section, the text mixes the remaining
// `Index::create_in_ram`-based benchmarks with the tail of the "with-array" benchmark (the
// `documents_with_array` clone loop below).
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
// Adds the multi-valued documents (one field value per space-separated piece of "body").
let documents_with_array_cloned = documents_with_array.clone();
for doc in documents_with_array_cloned {
index_writer.add_document(doc).unwrap();
}
}
})
});
// Benchmark: parse each corpus line with `schema.parse_document`, add it to an index created
// from `schema_with_store`, then commit.
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
// Benchmark: index each corpus line as a whole JSON object into the "json" field.
// Despite "no-commit" in the name, the body shown here does call `commit()`.
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
// Benchmark: same JSON-field indexing as above, followed by a commit.
group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
// The remaining benchmark variants are disabled (commented out). As written they all build
// their writer from `schema.clone()` over a fresh `RamDirectory`.
// group.bench_function("index-hdfs-with-commit", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let doc = schema.parse_document(doc_json).unwrap();
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
// });
// group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let doc = schema.parse_document(doc_json).unwrap();
// index_writer.add_document(doc).unwrap();
// }
// }
// })
// });
// group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let doc = schema.parse_document(doc_json).unwrap();
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
// });
// group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let json_val: serde_json::Map<String, serde_json::Value> =
// serde_json::from_str(doc_json).unwrap();
// let doc = tantivy::doc!(json_field=>json_val);
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
// });
// group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let json_val: serde_json::Map<String, serde_json::Value> =
// serde_json::from_str(doc_json).unwrap();
// let doc = tantivy::doc!(json_field=>json_val);
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
//});
}
criterion_group! {