POC: Tantivy documents as a trait (#2071)

* fix windows build (#1)

* Fix windows build

* Add doc traits

* Add field value iter

* Add value and serialization

* Adjust order

* Fix bug

* Correct type

* Fix generic bugs

* Reformat code

* Add generic to index writer which I forgot about

* Fix missing generics on single segment writer

* Add missing type export

* Add default methods for convenience

* Cleanup

* Fix more-like-this query to use standard types

* Update API and fix tests

* Add doc traits

* Add field value iter

* Add value and serialization

* Adjust order

* Fix bug

* Correct type

* Rebase main and fix conflicts

* Reformat code

* Merge upstream

* Fix missing generics on single segment writer

* Add missing type export

* Add default methods for convenience

* Cleanup

* Fix more-like-this query to use standard types

* Update API and fix tests

* Add tokenizer improvements from previous commits

* Add tokenizer improvements from previous commits

* Reformat

* Fix unit tests

* Fix unit tests

* Use enum in changes

* Stage changes

* Add new deserializer logic

* Add serializer integration

* Add document deserializer

* Implement new (de)serialization api for existing types

* Fix bugs and type errors

* Add helper implementations

* Fix errors

* Reformat code

* Add unit tests and some code organisation for serialization

* Add unit tests to deserializer

* Add some small docs

* Add support for deserializing serde values

* Reformat

* Fix typo

* Fix typo

* Change repr of facet

* Remove unused trait methods

* Add child value type

* Resolve comments

* Fix build

* Fix more build errors

* Fix more build errors

* Fix the tests I missed

* Fix examples

* fix numerical order, serialize PreTok Str

* fix coverage

* rename Document to TantivyDocument, rename DocumentAccess to Document

add Binary prefix to binary de/serialization

* fix coverage

---------

Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
This commit is contained in:
Harrison Burt
2023-10-02 09:01:16 +01:00
committed by GitHub
parent b525f653c0
commit 1c7c6fd591
96 changed files with 4191 additions and 1567 deletions

View File

@@ -39,9 +39,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
@@ -50,9 +50,10 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
@@ -62,9 +63,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
@@ -73,9 +74,10 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
@@ -86,7 +88,8 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -113,7 +116,7 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -127,7 +130,8 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -154,7 +158,7 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -168,7 +172,8 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();