POC: Tantivy documents as a trait (#2071)

* fix windows build (#1)

* Fix windows build

* Add doc traits

* Add field value iter

* Add value and serialization

* Adjust order

* Fix bug

* Correct type

* Fix generic bugs

* Reformat code

* Add generic to index writer which I forgot about

* Fix missing generics on single segment writer

* Add missing type export

* Add default methods for convenience

* Cleanup

* Fix more-like-this query to use standard types

* Update API and fix tests

* Add doc traits

* Add field value iter

* Add value and serialization

* Adjust order

* Fix bug

* Correct type

* Rebase main and fix conflicts

* Reformat code

* Merge upstream

* Fix missing generics on single segment writer

* Add missing type export

* Add default methods for convenience

* Cleanup

* Fix more-like-this query to use standard types

* Update API and fix tests

* Add tokenizer improvements from previous commits

* Add tokenizer improvements from previous commits

* Reformat

* Fix unit tests

* Fix unit tests

* Use enum in changes

* Stage changes

* Add new deserializer logic

* Add serializer integration

* Add document deserializer

* Implement new (de)serialization api for existing types

* Fix bugs and type errors

* Add helper implementations

* Fix errors

* Reformat code

* Add unit tests and some code organisation for serialization

* Add unit tests to deserializer

* Add some small docs

* Add support for deserializing serde values

* Reformat

* Fix typo

* Fix typo

* Change repr of facet

* Remove unused trait methods

* Add child value type

* Resolve comments

* Fix build

* Fix more build errors

* Fix more build errors

* Fix the tests I missed

* Fix examples

* fix numerical order, serialize PreTok Str

* fix coverage

* rename Document to TantivyDocument, rename DocumentAccess to Document

add Binary prefix to binary de/serialization

* fix coverage

---------

Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
Authored by Harrison Burt on 2023-10-02 09:01:16 +01:00, committed by GitHub
parent b525f653c0
commit 1c7c6fd591
96 changed files with 4191 additions and 1567 deletions
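Before the per-file diffs, a minimal sketch of how the renamed API reads for a typical caller, pieced together from the example updates in this commit; the schema, field names, and memory budget below are placeholders rather than part of the diff:

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};

fn sketch() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());

    // `Index::writer` is now generic over the document type; a type
    // annotation (or turbofish) selects the default `TantivyDocument`.
    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

    // `Schema::parse_document` becomes `TantivyDocument::parse_json`.
    let doc = TantivyDocument::parse_json(&schema, r#"{"title": "Of Mice and Men"}"#)?;
    index_writer.add_document(doc)?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("mice")?;
    for (_score, doc_address) in searcher.search(&query, &TopDocs::with_limit(10))? {
        // `Searcher::doc` is generic as well, and `Schema::to_json` moves
        // onto the document itself.
        let retrieved: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved.to_json(&schema));
    }
    Ok(())
}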


@@ -39,9 +39,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
@@ -50,9 +50,10 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
@@ -62,9 +63,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
@@ -73,9 +74,10 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
let doc = Document::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
@@ -86,7 +88,8 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -113,7 +116,7 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -127,7 +130,8 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -154,7 +158,7 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -168,7 +172,8 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();


@@ -1,11 +1,14 @@
#![allow(deprecated)]
use std::fmt;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
use crate::BinarySerializable;
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
#[derive(
@@ -164,3 +167,15 @@ impl fmt::Debug for DateTime {
f.write_str(&utc_rfc3339)
}
}
impl BinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
let timestamp_micros = self.into_timestamp_micros();
<i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}
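For orientation, a hedged round-trip sketch of the `DateTime` impl above, written as a hypothetical test inside the same crate (the crate-root re-exports are assumed, matching the `use crate::BinarySerializable;` import above):

#[cfg(test)]
mod datetime_serialization_sketch {
    use crate::{BinarySerializable, DateTime};

    #[test]
    fn roundtrips_at_microsecond_precision() {
        let dt = DateTime::from_timestamp_micros(1_696_233_676_000_000);
        let mut buf: Vec<u8> = Vec::new();
        // The impl writes a single i64: microseconds since the Unix epoch.
        dt.serialize(&mut buf).unwrap();
        let decoded = DateTime::deserialize(&mut &buf[..]).unwrap();
        assert_eq!(decoded.into_timestamp_micros(), dt.into_timestamp_micros());
    }
}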


@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::io::{Read, Write};
use std::{fmt, io};
@@ -249,6 +250,43 @@ impl BinarySerializable for String {
}
}
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
}
}
#[cfg(test)]
pub mod test {
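Similarly, a hypothetical round-trip sketch for the two `Cow` impls added above, written as it might sit inside the test module that follows (it assumes `BinarySerializable` is in scope, as elsewhere in this file):

fn cow_roundtrip_sketch() -> io::Result<()> {
    let text: Cow<'static, str> = Cow::Borrowed("tantivy");
    let mut buf: Vec<u8> = Vec::new();
    // A VInt length prefix followed by the raw UTF-8 bytes.
    text.serialize(&mut buf)?;
    let decoded = <Cow<'static, str> as BinarySerializable>::deserialize(&mut &buf[..])?;
    assert_eq!(decoded, text);

    let bytes: Cow<'static, [u8]> = Cow::Borrowed(b"abc");
    let mut buf: Vec<u8> = Vec::new();
    // A VInt element count followed by each byte serialized individually.
    bytes.serialize(&mut buf)?;
    let decoded = <Cow<'static, [u8]> as BinarySerializable>::deserialize(&mut &buf[..])?;
    assert_eq!(decoded, bytes);
    Ok(())
}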


@@ -12,7 +12,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
use tantivy::Index;
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
// # Create Schema
@@ -132,10 +132,10 @@ fn main() -> tantivy::Result<()> {
let stream = Deserializer::from_str(data).into_iter::<Value>();
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut num_indexed = 0;
for value in stream {
let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?;
index_writer.add_document(doc)?;
num_indexed += 1;
if num_indexed > 4 {


@@ -15,7 +15,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -75,7 +75,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = Document::default();
let mut old_man_doc = TantivyDocument::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(
body,
@@ -217,8 +217,8 @@ fn main() -> tantivy::Result<()> {
// the document returned will only contain
// a title.
for (_score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
}
// We can also get an explanation to understand


@@ -13,7 +13,7 @@ use columnar::Column;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Score, SegmentReader};
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
#[derive(Default)]
struct Stats {
@@ -142,7 +142,7 @@ fn main() -> tantivy::Result<()> {
// this example.
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
index_writer.add_document(doc!(
product_name => "Super Broom 2000",
product_description => "While it is ok for short distance travel, this broom \


@@ -6,7 +6,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index};
use tantivy::{doc, Index, IndexWriter};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
//
// Here we use a buffer of 50MB per thread. Using a bigger
// memory arena for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
}
Ok(())


@@ -5,7 +5,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -22,16 +22,18 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment"
@@ -58,13 +60,13 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc(doc_address)?;
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(Value::Date(_))
));
assert_eq!(
schema.to_json(&retrieved_doc),
retrieved_doc.to_json(&schema),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
);
}


@@ -11,7 +11,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader};
use tantivy::{doc, Index, IndexReader, IndexWriter};
// A simple helper function to fetch a single document
// given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader};
fn extract_doc_given_isbn(
reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<Document>> {
) -> tantivy::Result<Option<TantivyDocument>> {
let searcher = reader.searcher();
// This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// Let's add a couple of documents, for the sake of the example.
let mut old_man_doc = Document::default();
let mut old_man_doc = TantivyDocument::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
index_writer.add_document(doc!(
isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
// Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
schema.to_json(&frankenstein_doc_misspelled),
frankenstein_doc_misspelled.to_json(&schema),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
);
@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
// No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
schema.to_json(&frankenstein_new_doc),
frankenstein_new_doc.to_json(&schema),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
);


@@ -17,7 +17,7 @@
use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*;
use tantivy::{doc, Index};
use tantivy::{doc, Index, IndexWriter};
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(30_000_000)?;
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.


@@ -12,7 +12,7 @@ use std::collections::HashSet;
use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, Score, SegmentReader};
use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader};
fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(30_000_000)?;
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
index_writer.add_document(doc!(
title => "Fried egg",
@@ -91,11 +91,10 @@ fn main() -> tantivy::Result<()> {
.iter()
.map(|(_, doc_id)| {
searcher
.doc(*doc_id)
.doc::<TantivyDocument>(*doc_id)
.unwrap()
.get_first(title)
.unwrap()
.as_text()
.and_then(|v| v.as_str())
.unwrap()
.to_owned()
})


@@ -14,7 +14,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -66,7 +66,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -151,10 +151,10 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count, 3);
assert_eq!(top_docs.len(), 3);
for (score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
// Note that the score is not lower for the fuzzy hit.
// There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
// score 1.0 doc {"title":["The Diary of Muadib"]}
//
// score 1.0 doc {"title":["The Diary of a Young Girl"]}


@@ -21,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#;
// We can parse our document
let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
let _mice_and_men_doc = TantivyDocument::parse_json(&schema, mice_and_men_doc_json)?;
// Multi-valued field are allowed, they are
// expressed in JSON by an array.
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818
}"#;
let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?;
// Note that the schema is saved in your index directory.
//


@@ -5,7 +5,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, Result};
use tantivy::{doc, Index, IndexWriter, Result};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
{
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?;
}


@@ -6,7 +6,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::Index;
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -22,20 +22,22 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6.
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"ip": "192.168.0.33",
"event_type": "login"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"ip": "192.168.0.80",
"event_type": "checkout"
@@ -44,7 +46,8 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc)?;
// ### IPv6
// Adding a document that contains an IPv6 address.
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"


@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
@@ -24,7 +24,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?;


@@ -7,7 +7,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::Index;
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -20,8 +20,9 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer(50_000_000)?;
let doc = schema.parse_document(
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"timestamp": "2022-02-22T23:20:50.53Z",
"event_type": "click",
@@ -33,7 +34,8 @@ fn main() -> tantivy::Result<()> {
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"timestamp": "2022-02-22T23:20:51.53Z",
"event_type": "click",


@@ -1,7 +1,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy, Result};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result};
use tempfile::TempDir;
fn main() -> Result<()> {
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
@@ -67,8 +67,12 @@ fn main() -> Result<()> {
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc(doc_address)?;
let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let title = doc
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.to_owned();
Ok(title)
})
.collect::<Result<Vec<_>>>()?;


@@ -13,7 +13,7 @@ use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, ReloadPolicy};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> {
@@ -38,7 +38,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// We can create a document manually, by setting the fields
// one by one in a Document object.
@@ -83,7 +83,7 @@ fn main() -> tantivy::Result<()> {
}]
}"#;
let short_man_doc = schema.parse_document(short_man_json)?;
let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?;
index_writer.add_document(short_man_doc)?;
@@ -115,8 +115,8 @@ fn main() -> tantivy::Result<()> {
// Note that the tokens are not stored along with the original text
// in the document store
for (_score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("Document: {}", schema.to_json(&retrieved_doc));
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
}
// In contrary to the previous query, when we search for the "man" term we


@@ -10,7 +10,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -27,7 +27,7 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// we'll only need one doc for this example.
index_writer.add_document(doc!(
@@ -54,13 +54,10 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!(
"title: {}",
doc.get_first(title).unwrap().as_text().unwrap()
);
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
}


@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::{doc, Index};
use tantivy::{doc, Index, IndexWriter};
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
index.tokenizers().register("stoppy", tokenizer);
let mut index_writer = index.writer(50_000_000)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
@@ -105,9 +105,9 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("\n==\nDocument score {score}:");
println!("{}", schema.to_json(&retrieved_doc));
println!("{}", retrieved_doc.to_json(&schema));
}
Ok(())


@@ -6,8 +6,8 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
Warmer,
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
};
// This example shows how warmers can be used to
@@ -143,7 +143,7 @@ fn main() -> tantivy::Result<()> {
const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;


@@ -9,7 +9,7 @@ use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_v
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, Term};
use crate::{Index, IndexWriter, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({
@@ -586,7 +586,7 @@ fn test_aggregation_on_json_object() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color": "red"})))
.unwrap();
@@ -630,7 +630,7 @@ fn test_aggregation_on_json_object_empty_columns() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Empty column when accessing color
index_writer
.add_document(doc!(json => json!({"price": 10.0})))
@@ -748,7 +748,7 @@ fn test_aggregation_on_json_object_mixed_types() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))


@@ -252,7 +252,7 @@ pub mod tests {
use crate::aggregation::tests::exec_request;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::Index;
use crate::{Index, IndexWriter, TantivyDocument};
#[test]
fn test_parse_into_millisecs() {
@@ -316,7 +316,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_docs {
for doc_str in values {
let doc = schema.parse_document(doc_str)?;
let doc = TantivyDocument::parse_json(&schema, doc_str)?;
index_writer.add_document(doc)?;
}
// writing the segment
@@ -328,7 +328,7 @@ pub mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() > 1 {
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}


@@ -601,7 +601,7 @@ mod tests {
use crate::aggregation::AggregationLimits;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::Index;
use crate::{Index, IndexWriter};
#[test]
fn terms_aggregation_test_single_segment() -> crate::Result<()> {
@@ -1473,7 +1473,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();


@@ -117,7 +117,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::Index;
use crate::{Index, IndexWriter};
#[test]
fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
@@ -186,7 +186,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
index_writer.add_document(doc!(score => 5.0))?;
@@ -231,7 +231,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
@@ -278,7 +278,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
@@ -323,7 +323,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -385,7 +385,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -427,7 +427,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))


@@ -71,7 +71,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::Index;
use crate::{Index, IndexWriter};
#[test]
fn test_max_agg_with_missing() -> crate::Result<()> {
@@ -79,7 +79,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();


@@ -88,7 +88,7 @@ mod tests {
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
use crate::schema::{NumericOptions, Schema};
use crate::Index;
use crate::{Index, IndexWriter};
#[test]
fn test_metric_aggregations() {
@@ -96,7 +96,7 @@ mod tests {
let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for i in 0..3 {
index_writer


@@ -300,7 +300,7 @@ mod tests {
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, Term};
use crate::{Index, IndexWriter, Term};
#[test]
fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -494,7 +494,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
@@ -541,7 +541,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();


@@ -319,7 +319,7 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, Term};
use crate::{Index, IndexWriter, Term};
pub fn get_test_index_with_num_docs(
merge_segments: bool,
@@ -451,7 +451,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() > 1 {
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -565,7 +565,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}


@@ -495,8 +495,8 @@ mod tests {
use crate::collector::Count;
use crate::core::Index;
use crate::query::{AllQuery, QueryParser, TermQuery};
use crate::schema::{Document, Facet, FacetOptions, IndexRecordOption, Schema};
use crate::Term;
use crate::schema::{Facet, FacetOptions, IndexRecordOption, Schema, TantivyDocument};
use crate::{IndexWriter, Term};
fn test_collapse_mapping_aux(
facet_terms: &[&str],
@@ -559,7 +559,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from("/facet/a")))
.unwrap();
@@ -588,7 +588,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -601,7 +601,7 @@ mod tests {
})
.collect();
for i in 0..num_facets * 10 {
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc).unwrap();
}
@@ -732,24 +732,25 @@ mod tests {
let index = Index::create_in_ram(schema);
let uniform = Uniform::new_inclusive(1, 100_000);
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
);
doc
})
.collect();
let mut docs: Vec<TantivyDocument> =
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
);
doc
})
.collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}
@@ -780,7 +781,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let docs: Vec<Document> = vec![("b", 2), ("a", 2), ("c", 4)]
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
@@ -828,7 +829,7 @@ mod bench {
use crate::collector::FacetCollector;
use crate::query::AllQuery;
use crate::schema::{Facet, Schema, INDEXED};
use crate::Index;
use crate::{Index, IndexWriter};
#[bench]
fn bench_facet_collector(b: &mut Bencher) {
@@ -847,7 +848,7 @@ mod bench {
// 40425 docs
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}


@@ -7,7 +7,9 @@ use crate::query::{AllQuery, QueryParser};
use crate::schema::{Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
use crate::{
doc, DateTime, DocAddress, DocId, Index, Score, Searcher, SegmentOrdinal, TantivyDocument,
};
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true,
@@ -280,8 +282,8 @@ fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.commit()?;
Ok(index.reader()?.searcher())
}


@@ -19,6 +19,7 @@ use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
@@ -184,11 +185,11 @@ impl IndexBuilder {
///
/// It expects an originally empty directory, and will not run any GC operation.
#[doc(hidden)]
pub fn single_segment_index_writer(
pub fn single_segment_index_writer<D: Document>(
self,
dir: impl Into<Box<dyn Directory>>,
mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter> {
) -> crate::Result<SingleSegmentIndexWriter<D>> {
let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer)
@@ -531,11 +532,11 @@ impl Index {
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer_with_num_threads(
pub fn writer_with_num_threads<D: Document>(
&self,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter> {
) -> crate::Result<IndexWriter<D>> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
@@ -564,7 +565,7 @@ impl Index {
/// That index writer only simply has a single thread and a memory budget of 15 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<D>> {
self.writer_with_num_threads(1, 15_000_000)
}
@@ -579,7 +580,10 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer(&self, memory_budget_in_bytes: usize) -> crate::Result<IndexWriter> {
pub fn writer<D: Document>(
&self,
memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {


@@ -5,6 +5,7 @@ use rustc_hash::FxHashMap;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{DocValue, ReferenceValue};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
@@ -64,9 +65,9 @@ impl IndexingPositionsPerPath {
}
}
pub(crate) fn index_json_values<'a>(
pub(crate) fn index_json_values<'a, V: DocValue<'a>>(
doc: DocId,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
@@ -75,11 +76,11 @@ pub(crate) fn index_json_values<'a>(
) -> crate::Result<()> {
let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_value_res in json_values {
let json_value = json_value_res?;
index_json_object(
for json_visitor_res in json_visitors {
let json_visitor = json_visitor_res?;
index_json_object::<V>(
doc,
json_value,
json_visitor,
text_analyzer,
&mut json_term_writer,
postings_writer,
@@ -90,20 +91,20 @@ pub(crate) fn index_json_values<'a>(
Ok(())
}
fn index_json_object(
fn index_json_object<'a, V: DocValue<'a>>(
doc: DocId,
json_value: &serde_json::Map<String, serde_json::Value>,
json_visitor: V::ObjectIter,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {
for (json_path_segment, json_value) in json_value {
for (json_path_segment, json_value_visitor) in json_visitor {
json_term_writer.push_path_segment(json_path_segment);
index_json_value(
doc,
json_value,
json_value_visitor,
text_analyzer,
json_term_writer,
postings_writer,
@@ -114,9 +115,9 @@ fn index_json_object(
}
}
fn index_json_value(
fn index_json_value<'a, V: DocValue<'a>>(
doc: DocId,
json_value: &serde_json::Value,
json_value: ReferenceValue<'a, V>,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
@@ -124,43 +125,56 @@ fn index_json_value(
positions_per_path: &mut IndexingPositionsPerPath,
) {
match json_value {
serde_json::Value::Null => {}
serde_json::Value::Bool(val_bool) => {
json_term_writer.set_fast_value(*val_bool);
ReferenceValue::Null => {}
ReferenceValue::Str(val) => {
let mut token_stream = text_analyzer.token_stream(val);
// TODO: make sure the chain position works out.
json_term_writer.close_path_and_set_type(Type::Str);
let indexing_position = positions_per_path.get_position(json_term_writer.term());
postings_writer.index_text(
doc,
&mut *token_stream,
json_term_writer.term_buffer,
ctx,
indexing_position,
);
}
ReferenceValue::U64(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
serde_json::Value::Number(number) => {
if let Some(number_i64) = number.as_i64() {
json_term_writer.set_fast_value(number_i64);
} else if let Some(number_u64) = number.as_u64() {
json_term_writer.set_fast_value(number_u64);
} else if let Some(number_f64) = number.as_f64() {
json_term_writer.set_fast_value(number_f64);
}
ReferenceValue::I64(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
serde_json::Value::String(text) => match infer_type_from_str(text) {
TextOrDateTime::Text(text) => {
let mut token_stream = text_analyzer.token_stream(text);
// TODO make sure the chain position works out.
json_term_writer.close_path_and_set_type(Type::Str);
let indexing_position = positions_per_path.get_position(json_term_writer.term());
postings_writer.index_text(
doc,
&mut *token_stream,
json_term_writer.term_buffer,
ctx,
indexing_position,
);
}
TextOrDateTime::DateTime(dt) => {
json_term_writer.set_fast_value(DateTime::from_utc(dt));
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
},
serde_json::Value::Array(arr) => {
for val in arr {
index_json_value(
ReferenceValue::F64(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::Bool(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValue::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
}
ReferenceValue::Date(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::PreTokStr(_) => {
unimplemented!("Pre-tokenized string support in dynamic fields is not yet implemented")
}
ReferenceValue::Bytes(_) => {
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValue::Array(elements) => {
for val in elements {
index_json_value::<V::ChildValue>(
doc,
val,
text_analyzer,
@@ -171,10 +185,10 @@ fn index_json_value(
);
}
}
serde_json::Value::Object(map) => {
index_json_object(
ReferenceValue::Object(object) => {
index_json_object::<V>(
doc,
map,
object,
text_analyzer,
json_term_writer,
postings_writer,
@@ -185,21 +199,6 @@ fn index_json_value(
}
}
enum TextOrDateTime<'a> {
Text(&'a str),
DateTime(OffsetDateTime),
}
fn infer_type_from_str(text: &str) -> TextOrDateTime {
match OffsetDateTime::parse(text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(UtcOffset::UTC);
TextOrDateTime::DateTime(dt_utc)
}
Err(_) => TextOrDateTime::Text(text),
}
}
// Tries to infer a JSON type from a string.
pub fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter,


@@ -5,7 +5,8 @@ use std::{fmt, io};
use crate::collector::Collector;
use crate::core::{Executor, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::{Document, Schema, Term};
use crate::schema::document::Document;
use crate::schema::{Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
@@ -83,7 +84,7 @@ impl Searcher {
///
/// The searcher uses the segment ordinal to route the
/// request to the right `Segment`.
pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
pub fn doc<D: Document>(&self, doc_address: DocAddress) -> crate::Result<D> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id)
}
@@ -103,7 +104,7 @@ impl Searcher {
/// Fetches a document in an asynchronous manner.
#[cfg(feature = "quickwit")]
pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
pub async fn doc_async<D: Document>(&self, doc_address: DocAddress) -> crate::Result<D> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id).await
}


@@ -355,7 +355,7 @@ impl fmt::Debug for SegmentReader {
mod test {
use crate::core::Index;
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::DocId;
use crate::{DocId, IndexWriter};
#[test]
fn test_num_alive() -> crate::Result<()> {
@@ -366,7 +366,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
@@ -392,7 +392,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
@@ -402,7 +402,7 @@ mod test {
}
{
let mut index_writer2 = index.writer(50_000_000)?;
let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_text(name, "horse"));
index_writer2.delete_term(Term::from_field_text(name, "cap"));


@@ -1,16 +1,20 @@
use std::marker::PhantomData;
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter {
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
segment_writer: SegmentWriter,
segment: Segment,
opstamp: Opstamp,
_phantom: PhantomData<D>,
}
impl SingleSegmentIndexWriter {
impl<D: Document> SingleSegmentIndexWriter<D> {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
@@ -18,6 +22,7 @@ impl SingleSegmentIndexWriter {
segment_writer,
segment,
opstamp: 0,
_phantom: PhantomData,
})
}
@@ -25,7 +30,7 @@ impl SingleSegmentIndexWriter {
self.segment_writer.mem_usage()
}
pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
pub fn add_document(&mut self, document: D) -> crate::Result<()> {
let opstamp = self.opstamp;
self.opstamp += 1;
self.segment_writer


@@ -5,8 +5,8 @@ use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{
Directory, Document, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, SegmentId,
Term,
Directory, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
SegmentId, TantivyDocument, Term,
};
#[test]
@@ -159,7 +159,7 @@ mod mmap_specific {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let mut index = Index::create_from_tempdir(schema)?;
let mut writer = index.writer_for_tests()?;
let mut writer: IndexWriter = index.writer_for_tests()?;
writer.commit()?;
let reader = index
.reader_builder()
@@ -208,7 +208,7 @@ fn test_index_on_commit_reload_policy_aux(
.watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
let mut writer = index.writer_for_tests()?;
let mut writer: IndexWriter = index.writer_for_tests()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?;
writer.commit().unwrap();
@@ -242,7 +242,7 @@ fn garbage_collect_works_as_intended() -> crate::Result<()> {
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
let mut writer = index.writer_with_num_threads(1, 32_000_000).unwrap();
let mut writer: IndexWriter = index.writer_with_num_threads(1, 32_000_000).unwrap();
for _seg in 0..8 {
for i in 0u64..1_000u64 {
writer.add_document(doc!(field => i))?;
@@ -306,7 +306,7 @@ fn test_merging_segment_update_docfreq() {
let id_field = schema_builder.add_text_field("id", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5 {
writer.add_document(doc!(text_field=>"hello")).unwrap();
@@ -317,13 +317,13 @@ fn test_merging_segment_update_docfreq() {
writer
.add_document(doc!(text_field=>"hello", id_field=>"TO_BE_DELETED"))
.unwrap();
writer.add_document(Document::default()).unwrap();
writer.add_document(TantivyDocument::default()).unwrap();
writer.commit().unwrap();
for _ in 0..7 {
writer.add_document(doc!(text_field=>"hello")).unwrap();
}
writer.add_document(Document::default()).unwrap();
writer.add_document(Document::default()).unwrap();
writer.add_document(TantivyDocument::default()).unwrap();
writer.add_document(TantivyDocument::default()).unwrap();
writer.delete_term(Term::from_field_text(id_field, "TO_BE_DELETED"));
writer.commit().unwrap();

View File

@@ -533,7 +533,7 @@ mod tests {
use super::*;
use crate::indexer::LogMergePolicy;
use crate::schema::{Schema, SchemaBuilder, TEXT};
use crate::{Index, IndexSettings, ReloadPolicy};
use crate::{Index, IndexSettings, IndexWriter, ReloadPolicy};
#[test]
fn test_open_non_existent_path() {
@@ -645,7 +645,7 @@ mod tests {
let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));

View File

@@ -11,6 +11,7 @@ use crate::directory::error::{
Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::document::DeserializeError;
use crate::{query, schema};
/// Represents a `DataCorruption` error.
@@ -106,6 +107,9 @@ pub enum TantivyError {
/// e.g. a data structure is incorrectly initialized.
#[error("Internal error: '{0}'")]
InternalError(String),
/// An error occurred while attempting to deserialize a document.
#[error("Deserialize error: {0}")]
DeserializeError(DeserializeError),
}
impl From<io::Error> for TantivyError {
@@ -176,3 +180,9 @@ impl From<rayon::ThreadPoolBuildError> for TantivyError {
TantivyError::SystemError(error.to_string())
}
}
impl From<DeserializeError> for TantivyError {
fn from(error: DeserializeError) -> TantivyError {
TantivyError::DeserializeError(error)
}
}
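
The `From` impl lets document deserialization failures propagate with `?` inside any function returning `crate::Result`. A small hypothetical illustration (the public path of `DeserializeError` is an assumption here):

use tantivy::schema::document::DeserializeError;

// Hypothetical helper: `?` converts `DeserializeError` into
// `TantivyError::DeserializeError` through the `From` impl above.
fn propagate(res: Result<u64, DeserializeError>) -> tantivy::Result<u64> {
    let value = res?;
    Ok(value)
}

fn main() {
    assert_eq!(propagate(Ok(7)).ok(), Some(7));
}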

View File

@@ -62,8 +62,9 @@ impl FacetReader {
#[cfg(test)]
mod tests {
use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::{DocAddress, Document, Index};
use crate::schema::document::DocValue;
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test]
fn test_facet_only_indexed() {
@@ -71,7 +72,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))
.unwrap();
@@ -85,8 +86,10 @@ mod tests {
let mut facet = Facet::default();
facet_reader.facet_from_ord(0, &mut facet).unwrap();
assert_eq!(facet.to_path_string(), "/a/b");
let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
let value = doc.get_first(facet_field).and_then(Value::as_facet);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
assert_eq!(value, None);
}
@@ -96,7 +99,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap()))
.unwrap();
@@ -142,8 +145,8 @@ mod tests {
let mut facet_ords = Vec::new();
facet_ords.extend(facet_reader.facet_ords(0u32));
assert_eq!(&facet_ords, &[0u64]);
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet());
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
Ok(())
}
@@ -156,7 +159,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.add_document(Document::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
@@ -176,8 +179,8 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
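
The retrieval pattern used by these tests now names the document type at the call site and reads values through accessor methods. A compact sketch mirroring the first test above (the `DocValue` import path is taken from this diff and may differ in the published crate):

use tantivy::schema::document::DocValue;
use tantivy::schema::{Facet, FacetOptions, Schema};
use tantivy::{doc, DocAddress, Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // Indexed only: the facet is searchable but not stored in the doc store.
    let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
    index_writer.add_document(doc!(facet_field => Facet::from_text("/a/b").unwrap()))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // The concrete document type is chosen via the turbofish.
    let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
    // The field is not stored, so no value comes back for it.
    let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
    assert_eq!(value, None);
    Ok(())
}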

View File

@@ -90,12 +90,12 @@ mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{
Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DateTimePrecision, Index, SegmentId, SegmentReader};
use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
@@ -271,7 +271,7 @@ mod tests {
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
for i in -100i64..10_000i64 {
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc).unwrap();
}
@@ -312,7 +312,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = Document::default();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -345,7 +345,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = Document::default();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -416,7 +416,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer
.add_document(doc!(date_field => DateTime::from_utc(OffsetDateTime::now_utc())))
@@ -452,7 +452,7 @@ mod tests {
{
// first segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer
.add_document(doc!(
@@ -506,7 +506,7 @@ mod tests {
{
// second segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(
@@ -537,7 +537,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.merge(&segment_ids).wait().unwrap();
index_writer.wait_merging_threads().unwrap();
}
@@ -662,7 +662,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -824,7 +824,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = Document::default();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -846,7 +846,7 @@ mod tests {
assert_eq!(col.get_val(0), true);
}
fn get_index(docs: &[crate::Document], schema: &Schema) -> crate::Result<RamDirectory> {
fn get_index(docs: &[crate::TantivyDocument], schema: &Schema) -> crate::Result<RamDirectory> {
let directory: RamDirectory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
@@ -888,7 +888,7 @@ mod tests {
let field = schema_builder.add_date_field("field", date_options);
let schema = schema_builder.build();
let docs: Vec<Document> = times.iter().map(|time| doc!(field=>*time)).collect();
let docs: Vec<TantivyDocument> = times.iter().map(|time| doc!(field=>*time)).collect();
let directory = get_index(&docs[..], &schema).unwrap();
let path = Path::new("test");
@@ -962,11 +962,15 @@ mod tests {
let ip_field = schema_builder.add_u64_field("ip", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let ip_addr = Ipv6Addr::new(1, 2, 3, 4, 5, 1, 2, 3);
index_writer.add_document(Document::default()).unwrap();
index_writer
.add_document(TantivyDocument::default())
.unwrap();
index_writer.add_document(doc!(ip_field=>ip_addr)).unwrap();
index_writer.add_document(Document::default()).unwrap();
index_writer
.add_document(TantivyDocument::default())
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fastfields = searcher.segment_reader(0u32).fast_fields();
@@ -1086,7 +1090,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"attr.age": 32})))
.unwrap();
@@ -1112,7 +1116,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"age": 32})))
.unwrap();
@@ -1139,7 +1143,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"attr.age": 32})))
.unwrap();
@@ -1162,7 +1166,7 @@ mod tests {
let field_with_dot = schema_builder.add_i64_field("field.with.dot", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field_with_dot => 32i64))
.unwrap();
@@ -1184,7 +1188,7 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json_field=> json!({"attr": {"age": 32}}), shadowing_json_field=>json!({"age": 33})))
.unwrap();
@@ -1215,7 +1219,7 @@ mod tests {
let mut index = Index::create_in_ram(schema);
index.set_fast_field_tokenizers(ff_tokenizer_manager);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "Test1 test2"))
.unwrap();
@@ -1244,7 +1248,7 @@ mod tests {
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(log_field => "info"))
.unwrap();
@@ -1277,7 +1281,7 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json_field=> json!({"attr.age": 32}), shadowing_json_field=>json!({"age": 33})))
.unwrap();
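
Throughout these tests the concrete document type is now spelled `TantivyDocument`; an empty default document is valid and simply carries no values. A brief sketch of building documents against the renamed type (field options are illustrative):

use tantivy::schema::{Schema, FAST, STORED};
use tantivy::{Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let count_field = schema_builder.add_i64_field("count", FAST | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

    // An empty document has no value for any field but is still accepted.
    index_writer.add_document(TantivyDocument::default())?;

    // Values are added through the same `add_*` helpers as before the rename.
    let mut doc = TantivyDocument::default();
    doc.add_i64(count_field, -100);
    index_writer.add_document(doc)?;
    index_writer.commit()?;
    Ok(())
}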

View File

@@ -357,7 +357,7 @@ mod tests {
use columnar::ColumnType;
use crate::schema::{JsonObjectOptions, Schema, FAST};
use crate::{Document, Index};
use crate::{Index, IndexWriter, TantivyDocument};
#[test]
fn test_fast_field_reader_resolve_with_dynamic_internal() {
@@ -373,8 +373,10 @@ mod tests {
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::default()).unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(TantivyDocument::default())
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
@@ -443,7 +445,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(id=> 1u64, json => json!({"foo": 42})))
.unwrap();

View File

@@ -5,8 +5,9 @@ use common::replace_in_place;
use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{DocValue, Document, ReferenceValue};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError};
@@ -117,114 +118,115 @@ impl FastFieldsWriter {
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
let doc_id = self.num_docs;
for field_value in doc.field_values() {
if let Some(field_name) =
&self.fast_field_names[field_value.field().field_id() as usize]
{
match &field_value.value {
Value::U64(u64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*u64_val),
);
}
Value::I64(i64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*i64_val),
);
}
Value::F64(f64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*f64_val),
);
}
Value::Str(text_val) => {
if let Some(tokenizer) =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
self.columnar_writer
.record_bytes(doc_id, field_name.as_str(), bytes_val);
}
Value::PreTokStr(pre_tok) => {
for token in &pre_tok.tokens {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
}
}
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);
}
Value::Date(datetime) => {
let date_precision =
self.date_precisions[field_value.field().field_id() as usize];
let truncated_datetime = datetime.truncate(date_precision);
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
truncated_datetime,
);
}
Value::Facet(facet) => {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
facet.encoded_str(),
);
}
Value::JsonObject(json_obj) => {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
for (field, value) in doc.iter_fields_and_values() {
let value_access = value as D::Value<'_>;
let text_analyzer =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
.record_ip_addr(doc_id, field_name.as_str(), *ip_addr);
}
}
}
self.add_doc_value(doc_id, field, value_access.as_value())?;
}
self.num_docs += 1;
Ok(())
}
fn add_doc_value<'a, V: DocValue<'a>>(
&mut self,
doc_id: DocId,
field: Field,
value: ReferenceValue<'a, V>,
) -> crate::Result<()> {
let field_name = match &self.fast_field_names[field.field_id() as usize] {
None => return Ok(()),
Some(name) => name,
};
match value {
ReferenceValue::Null => {}
ReferenceValue::Str(val) => {
if let Some(tokenizer) = &mut self.per_field_tokenizer[field.field_id() as usize] {
let mut token_stream = tokenizer.token_stream(val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
})
} else {
self.columnar_writer.record_str(doc_id, field_name, val);
}
}
ReferenceValue::U64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValue::I64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValue::F64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValue::Date(val) => {
let date_precision = self.date_precisions[field.field_id() as usize];
let truncated_datetime = val.truncate(date_precision);
self.columnar_writer
.record_datetime(doc_id, field_name, truncated_datetime);
}
ReferenceValue::Facet(val) => {
self.columnar_writer
.record_str(doc_id, field_name, val.encoded_str());
}
ReferenceValue::Bytes(val) => {
self.columnar_writer.record_bytes(doc_id, field_name, val);
}
ReferenceValue::IpAddr(val) => {
self.columnar_writer.record_ip_addr(doc_id, field_name, val);
}
ReferenceValue::Bool(val) => {
self.columnar_writer.record_bool(doc_id, field_name, val);
}
ReferenceValue::PreTokStr(val) => {
for token in &val.tokens {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
}
}
ReferenceValue::Array(val) => {
// TODO: Check that this is the correct behaviour we want.
for value in val {
self.add_doc_value(doc_id, field, value)?;
}
}
ReferenceValue::Object(val) => {
let expand_dots = self.expand_dots[field.field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize];
record_json_obj_to_columnar_writer::<V>(
doc_id,
val,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
}
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
@@ -241,31 +243,16 @@ impl FastFieldsWriter {
}
}
#[inline]
fn columnar_numerical_value(json_number: &serde_json::Number) -> Option<NumericalValue> {
if let Some(num_i64) = json_number.as_i64() {
return Some(num_i64.into());
}
if let Some(num_u64) = json_number.as_u64() {
return Some(num_u64.into());
}
if let Some(num_f64) = json_number.as_f64() {
return Some(num_f64.into());
}
// This can happen with arbitrary precision.... but we do not handle it.
None
}
fn record_json_obj_to_columnar_writer(
fn record_json_obj_to_columnar_writer<'a, V: DocValue<'a>>(
doc: DocId,
json_obj: &serde_json::Map<String, serde_json::Value>,
json_visitor: V::ObjectIter,
expand_dots: bool,
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_obj {
for (key, child) in json_visitor {
let len_path = json_path_buffer.len();
if !json_path_buffer.is_empty() {
json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
@@ -295,9 +282,9 @@ fn record_json_obj_to_columnar_writer(
}
}
fn record_json_value_to_columnar_writer(
fn record_json_value_to_columnar_writer<'a, V: DocValue<'a>>(
doc: DocId,
json_val: &serde_json::Value,
json_val: ReferenceValue<'a, V>,
expand_dots: bool,
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
@@ -308,31 +295,63 @@ fn record_json_value_to_columnar_writer(
return;
}
remaining_depth_limit -= 1;
match json_val {
serde_json::Value::Null => {
// TODO handle null
}
serde_json::Value::Bool(bool_val) => {
columnar_writer.record_bool(doc, json_path_writer, *bool_val);
}
serde_json::Value::Number(json_number) => {
if let Some(numerical_value) = columnar_numerical_value(json_number) {
columnar_writer.record_numerical(doc, json_path_writer.as_str(), numerical_value);
}
}
serde_json::Value::String(text) => {
ReferenceValue::Null => {} // TODO: Handle null
ReferenceValue::Str(val) => {
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
let mut token_stream = text_analyzer.token_stream(val);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
columnar_writer.record_str(doc, json_path_writer.as_str(), val);
}
}
serde_json::Value::Array(arr) => {
for el in arr {
record_json_value_to_columnar_writer(
ReferenceValue::U64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValue::I64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValue::F64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValue::Bool(val) => {
columnar_writer.record_bool(doc, json_path_writer, val);
}
ReferenceValue::Date(val) => {
columnar_writer.record_datetime(doc, json_path_writer.as_str(), val);
}
ReferenceValue::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValue::Bytes(_) => {
// TODO: This can be re-added once it is added to the JSON Utils section as well.
// columnar_writer.record_bytes(doc, json_path_writer.as_str(), val);
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValue::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
}
ReferenceValue::PreTokStr(_) => {
unimplemented!("Pre-tokenized string support in dynamic fields is not yet implemented")
}
ReferenceValue::Array(elements) => {
for el in elements {
record_json_value_to_columnar_writer::<V::ChildValue>(
doc,
el,
expand_dots,
@@ -343,10 +362,10 @@ fn record_json_value_to_columnar_writer(
);
}
}
serde_json::Value::Object(json_obj) => {
record_json_obj_to_columnar_writer(
ReferenceValue::Object(object) => {
record_json_obj_to_columnar_writer::<V>(
doc,
json_obj,
object,
expand_dots,
remaining_depth_limit,
json_path_writer,
@@ -363,6 +382,7 @@ mod tests {
use super::record_json_value_to_columnar_writer;
use crate::fastfield::writer::JSON_DEPTH_LIMIT;
use crate::schema::document::DocValue;
use crate::DocId;
fn test_columnar_from_jsons_aux(
@@ -374,7 +394,7 @@ mod tests {
for (doc, json_doc) in json_docs.iter().enumerate() {
record_json_value_to_columnar_writer(
doc as u32,
json_doc,
json_doc.as_value(),
expand_dots,
JSON_DEPTH_LIMIT,
&mut json_path,

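The traversal above flattens arrays element by element and extends the JSON path for each object key before recursing, truncating the path back afterwards. A standalone sketch of that traversal on a simplified value type (plain Rust, not tantivy's `ReferenceValue`):

enum Val<'a> {
    Null,
    Str(&'a str),
    U64(u64),
    Array(Vec<Val<'a>>),
    Object(Vec<(&'a str, Val<'a>)>),
}

fn record(doc_id: u32, path: &mut String, value: &Val<'_>) {
    match value {
        // Nulls are skipped, matching the TODO above.
        Val::Null => {}
        Val::Str(s) => println!("doc {doc_id}: {path} = {s:?}"),
        Val::U64(n) => println!("doc {doc_id}: {path} = {n}"),
        // Arrays are flattened: each element is recorded under the same path.
        Val::Array(elements) => {
            for el in elements {
                record(doc_id, path, el);
            }
        }
        // Objects push a path segment, recurse, then truncate the path back.
        Val::Object(entries) => {
            for (key, child) in entries {
                let len = path.len();
                if !path.is_empty() {
                    path.push('.');
                }
                path.push_str(key);
                record(doc_id, path, child);
                path.truncate(len);
            }
        }
    }
}

fn main() {
    let nested = Val::Object(vec![(
        "attr",
        Val::Object(vec![("age", Val::U64(32)), ("name", Val::Str("prometheus"))]),
    )]);
    let mut path = String::from("json");
    record(0, &mut path, &nested);
}
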
View File

@@ -4,7 +4,7 @@ use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher};
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
assert!(searcher.segment_readers().len() < 20);
@@ -12,7 +12,7 @@ fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
for segment_reader in searcher.segment_readers() {
let store_reader = segment_reader.get_store_reader(1)?;
for doc_id in 0..segment_reader.max_doc() {
let _doc = store_reader.get(doc_id)?;
let _doc: TantivyDocument = store_reader.get(doc_id)?;
}
}
Ok(())
@@ -31,7 +31,8 @@ fn test_functional_store() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut doc_set: Vec<u64> = Vec::new();
@@ -91,7 +92,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -114,7 +115,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
@@ -166,7 +167,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -189,7 +190,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);

View File

@@ -158,6 +158,7 @@ mod tests_indexsorting {
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::document::DocValue;
use crate::schema::{Schema, *};
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};
@@ -308,16 +309,16 @@ mod tests_indexsorting {
{
assert_eq!(
searcher
.doc(DocAddress::new(0, 0))?
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field),
None
);
assert_eq!(
searcher
.doc(DocAddress::new(0, 3))?
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
.get_first(my_string_field)
.unwrap()
.as_text(),
.as_str(),
Some("blublub")
);
}
@@ -337,13 +338,13 @@ mod tests_indexsorting {
{
assert_eq!(
searcher
.doc(DocAddress::new(0, 0))?
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.unwrap()
.as_text(),
.as_str(),
Some("blublub")
);
let doc = searcher.doc(DocAddress::new(0, 4))?;
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None);
}
// sort by field desc
@@ -360,9 +361,9 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
let doc = searcher.doc(DocAddress::new(0, 4))?;
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(my_string_field).unwrap().as_text(),
doc.get_first(my_string_field).unwrap().as_str(),
Some("blublub")
);
}
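
The accessor rename from `as_text()` to `as_str()` shows up throughout these assertions. A short sketch of the new read path (the `DocValue` trait import path is taken from this diff):

use tantivy::schema::document::DocValue;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, DocAddress, Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer: IndexWriter = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "blublub"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
    // `as_str()` replaces the old `as_text()` accessor.
    assert_eq!(doc.get_first(title).unwrap().as_str(), Some("blublub"));
    Ok(())
}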

View File

@@ -20,7 +20,8 @@ use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
use crate::query::{EnableScoring, Query, TermQuery};
use crate::schema::{Document, IndexRecordOption, Term};
use crate::schema::document::Document;
use crate::schema::{IndexRecordOption, TantivyDocument, Term};
use crate::{FutureResult, Opstamp};
// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
@@ -50,7 +51,7 @@ fn error_in_index_worker_thread(context: &str) -> TantivyError {
/// indexing queue.
/// Each indexing thread builds its own independent [`Segment`], via
/// a `SegmentWriter` object.
pub struct IndexWriter {
pub struct IndexWriter<D: Document = TantivyDocument> {
// the lock is just used to bind the
// lifetime of the lock with that of the IndexWriter.
_directory_lock: Option<DirectoryLock>,
@@ -62,8 +63,8 @@ pub struct IndexWriter {
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
index_writer_status: IndexWriterStatus,
operation_sender: AddBatchSender,
index_writer_status: IndexWriterStatus<D>,
operation_sender: AddBatchSender<D>,
segment_updater: SegmentUpdater,
@@ -164,10 +165,10 @@ pub(crate) fn advance_deletes(
Ok(())
}
fn index_documents(
fn index_documents<D: Document>(
memory_budget: usize,
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch<D>>,
segment_updater: &SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
@@ -247,7 +248,7 @@ fn apply_deletes(
})
}
impl IndexWriter {
impl<D: Document> IndexWriter<D> {
/// Create a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
@@ -267,7 +268,7 @@ impl IndexWriter {
num_threads: usize,
memory_budget_in_bytes_per_thread: usize,
directory_lock: DirectoryLock,
) -> crate::Result<IndexWriter> {
) -> crate::Result<Self> {
if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
let err_msg = format!(
"The memory arena in bytes per thread needs to be at least \
@@ -281,7 +282,7 @@ impl IndexWriter {
);
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
let (document_sender, document_receiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
@@ -293,7 +294,7 @@ impl IndexWriter {
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = IndexWriter {
let mut index_writer = Self {
_directory_lock: Some(directory_lock),
memory_budget_in_bytes_per_thread,
@@ -375,7 +376,7 @@ impl IndexWriter {
self.index.new_segment()
}
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver<D>> {
self.index_writer_status
.operation_receiver()
.ok_or_else(|| {
@@ -525,7 +526,7 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) {
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
let (document_sender, document_receiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender;
self.index_writer_status = IndexWriterStatus::from(document_receiver);
@@ -552,7 +553,7 @@ impl IndexWriter {
.take()
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
let new_index_writer: IndexWriter = IndexWriter::new(
let new_index_writer = IndexWriter::new(
&self.index,
self.num_threads,
self.memory_budget_in_bytes_per_thread,
@@ -598,7 +599,7 @@ impl IndexWriter {
/// It is also possible to add a payload to the `commit`
/// using this API.
/// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.
@@ -707,7 +708,7 @@ impl IndexWriter {
/// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own
/// document queue.
pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
pub fn add_document(&self, document: D) -> crate::Result<Opstamp> {
let opstamp = self.stamper.stamp();
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
Ok(opstamp)
@@ -744,7 +745,7 @@ impl IndexWriter {
/// visible to readers only after calling `commit()`.
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
where
I: IntoIterator<Item = UserOperation>,
I: IntoIterator<Item = UserOperation<D>>,
I::IntoIter: ExactSizeIterator,
{
let user_operations_it = user_operations.into_iter();
@@ -778,7 +779,7 @@ impl IndexWriter {
Ok(batch_opstamp)
}
fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
fn send_add_documents_batch(&self, add_ops: AddBatch<D>) -> crate::Result<()> {
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
Ok(())
} else {
@@ -787,7 +788,7 @@ impl IndexWriter {
}
}
impl Drop for IndexWriter {
impl<D: Document> Drop for IndexWriter<D> {
fn drop(&mut self) {
self.segment_updater.kill();
self.drop_sender();
@@ -814,13 +815,15 @@ mod tests {
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::DocValue;
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term,
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, IndexWriter, Order,
ReloadPolicy, TantivyDocument, Term,
};
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
@@ -852,7 +855,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "hello1"))
.unwrap();
@@ -905,7 +908,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
@@ -943,7 +946,7 @@ mod tests {
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_for_tests().unwrap();
let index_writer: IndexWriter = index.writer_for_tests().unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1).unwrap();
assert_eq!(batch_opstamp1, 0u64);
@@ -956,8 +959,8 @@ mod tests {
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"),
}
@@ -967,8 +970,8 @@ mod tests {
fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
@@ -981,7 +984,7 @@ mod tests {
fn test_set_merge_policy() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_for_tests().unwrap();
let index_writer: IndexWriter = index.writer_for_tests().unwrap();
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \
@@ -1000,11 +1003,11 @@ mod tests {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer = index.writer_for_tests().unwrap();
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
// the lock should be released when the
// index_writer leaves the scope.
}
let _index_writer_two = index.writer_for_tests().unwrap();
let _index_writer_two: IndexWriter = index.writer_for_tests().unwrap();
}
#[test]
@@ -1056,7 +1059,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
// this should create 1 segment
@@ -1096,7 +1099,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -1382,7 +1385,7 @@ mod tests {
fn test_delete_all_documents_empty_index() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index
let mut index_writer: IndexWriter = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let clear = index_writer.delete_all_documents();
@@ -1395,7 +1398,7 @@ mod tests {
fn test_delete_all_documents_index_twice() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index
let mut index_writer: IndexWriter = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let clear = index_writer.delete_all_documents();
@@ -1415,7 +1418,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::builder().schema(schema).create_in_ram().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "one"))
.unwrap();
@@ -1777,7 +1780,7 @@ mod tests {
let num_segments_before_merge = searcher.segment_readers().len();
if force_end_merge {
index_writer.wait_merging_threads()?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -1973,14 +1976,14 @@ mod tests {
.get_store_reader(DOCSTORE_CACHE_CAPACITY)
.unwrap();
// test store iterator
for doc in store_reader.iter(segment_reader.alive_bitset()) {
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
}
// test store random access
for doc_id in segment_reader.doc_ids_alive() {
let id = store_reader
.get(doc_id)
.get::<TantivyDocument>(doc_id)
.unwrap()
.get_first(id_field)
.unwrap()
@@ -1989,7 +1992,7 @@ mod tests {
assert!(expected_ids_and_num_occurrences.contains_key(&id));
if id_exists(id) {
let id2 = store_reader
.get(doc_id)
.get::<TantivyDocument>(doc_id)
.unwrap()
.get_first(multi_numbers)
.unwrap()
@@ -1997,13 +2000,13 @@ mod tests {
.unwrap();
assert_eq!(id, id2);
let bool = store_reader
.get(doc_id)
.get::<TantivyDocument>(doc_id)
.unwrap()
.get_first(bool_field)
.unwrap()
.as_bool()
.unwrap();
let doc = store_reader.get(doc_id).unwrap();
let doc = store_reader.get::<TantivyDocument>(doc_id).unwrap();
let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
@@ -2543,7 +2546,7 @@ mod tests {
// Merge
{
assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -2585,7 +2588,7 @@ mod tests {
// Merge
{
assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
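
Batched operations get the same generic treatment: `run` accepts an iterator of `UserOperation<D>`, which with the default document type are plain `UserOperation<TantivyDocument>` values. A hedged sketch of a batch (the import path of `UserOperation` is an assumption; adjust it to wherever the crate exports it):

// NOTE: the `UserOperation` path below is assumed, not taken from this diff.
use tantivy::indexer::UserOperation;
use tantivy::schema::{Schema, STRING};
use tantivy::{doc, Index, IndexWriter, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let id = schema_builder.add_text_field("id", STRING);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer: IndexWriter = index.writer(50_000_000)?;
    // Adds and deletes submitted through `run` are applied as one atomic batch.
    let operations = vec![
        UserOperation::Add(doc!(id => "a")),
        UserOperation::Delete(Term::from_field_text(id, "a")),
        UserOperation::Add(doc!(id => "b")),
    ];
    writer.run(operations)?;
    writer.commit()?;
    Ok(())
}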

View File

@@ -2,13 +2,15 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use super::AddBatchReceiver;
use crate::schema::document::Document;
use crate::TantivyDocument;
#[derive(Clone)]
pub(crate) struct IndexWriterStatus {
inner: Arc<Inner>,
pub(crate) struct IndexWriterStatus<D: Document = TantivyDocument> {
inner: Arc<Inner<D>>,
}
impl IndexWriterStatus {
impl<D: Document> IndexWriterStatus<D> {
/// Returns true iff the index writer is alive.
pub fn is_alive(&self) -> bool {
self.inner.as_ref().is_alive()
@@ -16,7 +18,7 @@ impl IndexWriterStatus {
/// Returns a copy of the operation receiver.
/// If the index writer was killed, returns `None`.
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
pub fn operation_receiver(&self) -> Option<AddBatchReceiver<D>> {
let rlock = self
.inner
.receive_channel
@@ -27,19 +29,19 @@ impl IndexWriterStatus {
/// Create an index writer bomb.
/// If dropped, the index writer status will be killed.
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
pub(crate) fn create_bomb(&self) -> IndexWriterBomb<D> {
IndexWriterBomb {
inner: Some(self.inner.clone()),
}
}
}
struct Inner {
struct Inner<D: Document> {
is_alive: AtomicBool,
receive_channel: RwLock<Option<AddBatchReceiver>>,
receive_channel: RwLock<Option<AddBatchReceiver<D>>>,
}
impl Inner {
impl<D: Document> Inner<D> {
fn is_alive(&self) -> bool {
self.is_alive.load(Ordering::Relaxed)
}
@@ -53,8 +55,8 @@ impl Inner {
}
}
impl From<AddBatchReceiver> for IndexWriterStatus {
fn from(receiver: AddBatchReceiver) -> Self {
impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> {
fn from(receiver: AddBatchReceiver<D>) -> Self {
IndexWriterStatus {
inner: Arc::new(Inner {
is_alive: AtomicBool::new(true),
@@ -66,11 +68,11 @@ impl From<AddBatchReceiver> for IndexWriterStatus {
/// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb {
inner: Option<Arc<Inner>>,
pub(crate) struct IndexWriterBomb<D: Document> {
inner: Option<Arc<Inner<D>>>,
}
impl IndexWriterBomb {
impl<D: Document> IndexWriterBomb<D> {
/// Defuses the bomb.
///
/// This is the only way to drop the bomb without killing
@@ -80,7 +82,7 @@ impl IndexWriterBomb {
}
}
impl Drop for IndexWriterBomb {
impl<D: Document> Drop for IndexWriterBomb<D> {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.kill();

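`IndexWriterStatus` and its "bomb" form a drop-guard: unless the bomb is defused, dropping it kills the writer status. A self-contained sketch of the pattern in plain Rust (names are illustrative, not tantivy types):

// A guard that runs its cleanup on drop unless it is explicitly defused.
struct Bomb<'a> {
    armed: Option<&'a mut bool>,
}

impl<'a> Bomb<'a> {
    fn defuse(mut self) {
        self.armed = None;
    }
}

impl<'a> Drop for Bomb<'a> {
    fn drop(&mut self) {
        if let Some(alive_flag) = self.armed.take() {
            *alive_flag = false; // the "kill" action
        }
    }
}

fn main() {
    let mut alive = true;
    {
        let bomb = Bomb { armed: Some(&mut alive) };
        bomb.defuse(); // defused: dropping it leaves the status alive
    }
    assert!(alive);
    {
        let _bomb = Bomb { armed: Some(&mut alive) };
        // dropped without defusing: the cleanup runs
    }
    assert!(!alive);
}
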
View File

@@ -753,9 +753,10 @@ mod tests {
use crate::collector::{Count, FacetCollector};
use crate::core::Index;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::DocValue;
use crate::schema::{
Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term, TextFieldIndexing,
INDEXED, TEXT,
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{
@@ -817,7 +818,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -866,30 +867,24 @@ mod tests {
);
}
{
let doc = searcher.doc(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
}
{
let doc = searcher.doc(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("a b c"));
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
}
{
let doc = searcher.doc(DocAddress::new(0, 2))?;
assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c d")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
}
{
let doc = searcher.doc(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
}
{
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c g")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g"));
}
{
@@ -1300,10 +1295,10 @@ mod tests {
let reader = index.reader().unwrap();
let mut int_val = 0;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_doc =
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
@@ -1384,7 +1379,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.merge(&segment_ids)
.wait()
@@ -1406,7 +1401,7 @@ mod tests {
// Deleting one term
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
@@ -1431,7 +1426,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64))?;
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64))?;
@@ -1460,7 +1455,7 @@ mod tests {
let reader = index.reader()?;
{
let mut index_writer = index.writer_for_tests()?;
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone())?;
index_writer.commit()?;
@@ -1503,7 +1498,7 @@ mod tests {
{
let mut index_writer = index.writer_for_tests()?;
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
@@ -1566,7 +1561,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -1613,7 +1608,7 @@ mod tests {
writer.set_merge_policy(Box::new(policy));
for i in 0..100 {
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27);

View File

@@ -4,11 +4,15 @@ mod tests {
use crate::core::Index;
use crate::fastfield::AliveBitSet;
use crate::query::QueryParser;
use crate::schema::document::DocValue;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings,
TantivyDocument, Term,
};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
@@ -26,7 +30,7 @@ mod tests {
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
.unwrap();
@@ -45,7 +49,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -133,7 +137,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -272,12 +276,16 @@ mod tests {
} else {
2
};
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_text(),
doc.get_first(my_text_field).unwrap().as_str(),
Some("blubber")
);
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
}
}
@@ -494,7 +502,7 @@ mod bench_sorted_index_merge {
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
index_writer.add_document(doc!(int_field=>val)).unwrap();
};

View File

@@ -44,9 +44,9 @@ pub type DefaultMergePolicy = LogMergePolicy;
// - all docs in the operation will be written to the same segment, with contiguous doc_ids.
// - all operations in the group are committed at the same time, making the group
// atomic.
type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver = channel::Receiver<AddBatch>;
type AddBatch<D> = SmallVec<[AddOperation<D>; 4]>;
type AddBatchSender<D> = channel::Sender<AddBatch<D>>;
type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
#[cfg(feature = "mmap")]
#[cfg(test)]
@@ -55,14 +55,14 @@ mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
use crate::{Index, IndexWriter, Term};
#[test]
fn test_advance_delete_bug() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -79,7 +79,7 @@ mod tests_mmap {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
@@ -110,7 +110,7 @@ mod tests_mmap {
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();

View File

@@ -1,5 +1,6 @@
use crate::query::Weight;
use crate::schema::{Document, Term};
use crate::schema::document::Document;
use crate::schema::{TantivyDocument, Term};
use crate::Opstamp;
/// Timestamped Delete operation.
@@ -10,16 +11,16 @@ pub struct DeleteOperation {
/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation {
pub struct AddOperation<D: Document = TantivyDocument> {
pub opstamp: Opstamp,
pub document: Document,
pub document: D,
}
/// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation {
pub enum UserOperation<D: Document = TantivyDocument> {
/// Add operation
Add(Document),
Add(D),
/// Delete operation
Delete(Term),
}
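
Both operation types now take a defaulted document type parameter, so existing code that names `AddOperation` or `UserOperation` without a parameter keeps compiling. A minimal standalone illustration of a defaulted generic (plain Rust, not the tantivy types):

struct Operation<D = String> {
    opstamp: u64,
    document: D,
}

fn main() {
    // No parameter given: `D` falls back to the default.
    let default_op: Operation = Operation { opstamp: 0, document: "hello".to_string() };
    // An explicit parameter swaps in another document representation.
    let bytes_op: Operation<Vec<u8>> = Operation { opstamp: 1, document: vec![1, 2, 3] };
    assert_eq!(default_op.opstamp + 1, bytes_op.opstamp);
    assert_eq!(default_op.document.len(), 5);
    assert_eq!(bytes_op.document.len(), 3);
}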

View File

@@ -1,16 +1,17 @@
use super::IndexWriter;
use crate::{FutureResult, Opstamp};
use crate::schema::document::Document;
use crate::{FutureResult, Opstamp, TantivyDocument};
/// A prepared commit
pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
pub struct PreparedCommit<'a, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<D>,
payload: Option<String>,
opstamp: Opstamp,
}
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> {
PreparedCommit {
impl<'a, D: Document> PreparedCommit<'a, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<D>, opstamp: Opstamp) -> Self {
Self {
index_writer,
payload: None,
opstamp,

View File

@@ -13,10 +13,11 @@ use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::schema::document::{DocValue, Document, ReferenceValue};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
use crate::{DocId, Opstamp, SegmentComponent, TantivyError};
/// Computes the initial size of the hash table.
///
@@ -81,10 +82,7 @@ impl SegmentWriter {
/// the flushing behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
) -> crate::Result<SegmentWriter> {
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
@@ -113,7 +111,7 @@ impl SegmentWriter {
})
})
.collect::<Result<Vec<_>, _>>()?;
Ok(SegmentWriter {
Ok(Self {
max_doc: 0,
ctx: IndexingContext::new(table_size),
per_field_postings_writers,
@@ -164,18 +162,21 @@ impl SegmentWriter {
+ self.segment_serializer.mem_usage()
}
fn index_document(&mut self, doc: &Document) -> crate::Result<()> {
fn index_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
let doc_id = self.max_doc;
// TODO: Can this be optimised a bit?
let vals_grouped_by_field = doc
.field_values()
.iter()
.sorted_by_key(|el| el.field())
.group_by(|el| el.field());
.iter_fields_and_values()
.sorted_by_key(|(field, _)| *field)
.group_by(|(field, _)| *field);
for (field, field_values) in &vals_grouped_by_field {
let values = field_values.map(|field_value| field_value.value());
let values = field_values.map(|el| el.1);
let field_entry = self.schema.get_field_entry(field);
let make_schema_error = || {
crate::TantivyError::SchemaError(format!(
TantivyError::SchemaError(format!(
"Expected a {:?} for field {:?}",
field_entry.field_type().value_type(),
field_entry.name()
@@ -193,7 +194,10 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -209,19 +213,18 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
}
_ => {
continue;
}
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
} else {
continue;
};
assert!(term_buffer.is_empty());
@@ -240,7 +243,10 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
@@ -252,9 +258,13 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
num_vals += 1;
let date_val = value.as_date().ok_or_else(make_schema_error)?;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
term_buffer
.set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
@@ -265,7 +275,10 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
@@ -277,7 +290,10 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -289,7 +305,10 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -301,7 +320,10 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -314,9 +336,17 @@ impl SegmentWriter {
FieldType::JsonObject(json_options) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
index_json_values(
let json_values_it = values.map(|value_access| {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
match value {
ReferenceValue::Object(object_iter) => Ok(object_iter),
_ => Err(make_schema_error()),
}
});
index_json_values::<D::Value<'_>>(
doc_id,
json_values_it,
text_analyzer,
@@ -328,7 +358,10 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
term_buffer.set_ip_addr(ip_addr);
@@ -346,7 +379,10 @@ impl SegmentWriter {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
pub fn add_document<D: Document>(
&mut self,
add_operation: AddOperation<D>,
) -> crate::Result<()> {
let AddOperation { document, opstamp } = add_operation;
self.doc_opstamps.push(opstamp);
self.fast_field_writers.add_document(&document)?;
@@ -445,6 +481,7 @@ fn remap_and_write(
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
@@ -455,6 +492,7 @@ mod tests {
use crate::directory::RamDirectory;
use crate::postings::TermInfo;
use crate::query::PhraseQuery;
use crate::schema::document::DocValue;
use crate::schema::{
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
};
@@ -463,7 +501,8 @@ mod tests {
use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{
DateTime, Directory, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED,
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument,
Term, TERMINATED,
};
#[test]
@@ -480,7 +519,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
let pre_tokenized_text = PreTokenizedString {
text: String::from("A"),
tokens: vec![Token {
@@ -504,11 +543,11 @@ mod tests {
store_writer.close().unwrap();
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get(0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap();
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
}
#[test]
@@ -539,13 +578,13 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let doc = searcher
.doc(DocAddress {
.doc::<TantivyDocument>(DocAddress {
segment_ord: 0u32,
doc_id: 0u32,
})
.unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&schema.to_json(&doc),
&doc.to_json(&schema),
)
.unwrap()
.get("json")
@@ -675,10 +714,10 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
let json_val: serde_json::Map<String, serde_json::Value> =
let mut doc = TantivyDocument::default();
let json_val: BTreeMap<String, crate::schema::Value> =
serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
doc.add_json_object(json_field, json_val);
doc.add_object(json_field, json_val);
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
@@ -802,11 +841,10 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let doc = schema
.parse_document(r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
let doc = TantivyDocument::parse_json(&schema, r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
.unwrap();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
// On debug this did panic on the underflow
index_writer.commit().unwrap();
@@ -831,7 +869,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "roller-coaster".to_string(),
@@ -846,7 +884,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens.clone());
doc.add_pre_tokenized_text(text, tokens);
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -869,7 +907,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
@@ -894,7 +932,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -930,7 +968,7 @@ mod tests {
let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap();
let mut document = Document::default();
let mut document = TantivyDocument::default();
document.add_text(title, "The Old Man and the Sea");
index_writer.add_document(document).unwrap();
let error = index_writer.commit().unwrap_err();
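Two API moves are visible in the tests above: JSON parsing now lives on the document type (`TantivyDocument::parse_json`) rather than on `Schema`, and stored documents are rendered back with `doc.to_json(&schema)`. A small round-trip sketch using only those calls:

use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::TantivyDocument;

fn json_round_trip() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    // Parsing is now a constructor on the concrete document type.
    let doc = TantivyDocument::parse_json(&schema, r#"{"title": "The Old Man and the Sea"}"#)?;
    // ...and serialization back to JSON is a method on the document, not on the schema.
    let json = doc.to_json(&schema);
    assert!(json.contains("The Old Man and the Sea"));
    Ok(())
}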

View File

@@ -21,7 +21,7 @@
//! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::*;
//! # use tantivy::{doc, DocAddress, Index, Score};
//! # use tantivy::{doc, DocAddress, Index, IndexWriter, Score};
//! #
//! # fn main() {
//! # // Let's create a temporary directory for the
@@ -53,7 +53,7 @@
//!
//! // Here we use a buffer of 100MB that will be split
//! // between indexing threads.
//! let mut index_writer = index.writer(100_000_000)?;
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
//!
//! // Let's index one document!
//! index_writer.add_document(doc!(
@@ -89,8 +89,8 @@
//!
//! for (_score, doc_address) in top_docs {
//! // Retrieve the actual content of documents given its `doc_address`.
//! let retrieved_doc = searcher.doc(doc_address)?;
//! println!("{}", schema.to_json(&retrieved_doc));
//! let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
//! println!("{}", retrieved_doc.to_json(&schema));
//! }
//!
//! # Ok(())
@@ -186,7 +186,7 @@ pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, Pr
pub use crate::postings::Postings;
#[allow(deprecated)]
pub use crate::schema::DatePrecision;
pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
pub use crate::schema::{DateOptions, DateTimePrecision, TantivyDocument, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
@@ -342,8 +342,9 @@ pub mod tests {
use crate::docset::{DocSet, TERMINATED};
use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery;
use crate::schema::document::DocValue;
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, Postings, ReloadPolicy};
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
@@ -414,7 +415,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc)?;
@@ -436,7 +437,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -463,7 +464,7 @@ pub mod tests {
let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
let index_reader = index.reader()?;
@@ -485,7 +486,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(text_field=>"a b"))?;
@@ -528,7 +529,7 @@ pub mod tests {
.unwrap();
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -575,7 +576,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -612,7 +613,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
@@ -662,7 +663,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -685,7 +686,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val))?;
index_writer.commit()?;
@@ -709,7 +710,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val))?;
index_writer.commit()?;
@@ -733,7 +734,7 @@ pub mod tests {
let absent_field = schema_builder.add_text_field("absent_text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?;
assert!(index_writer.commit().is_ok());
let reader = index.reader()?;
@@ -756,7 +757,7 @@ pub mod tests {
.try_into()?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"))?;
index_writer.add_document(doc!(text_field=>"70"))?;
index_writer.add_document(doc!(text_field=>"34"))?;
@@ -781,7 +782,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
index_writer.commit()?;
}
@@ -813,7 +814,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -877,7 +878,7 @@ pub mod tests {
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -987,11 +988,11 @@ pub mod tests {
assert_eq!(document.len(), 3);
let values: Vec<&Value> = document.get_all(text_field).collect();
assert_eq!(values.len(), 2);
assert_eq!(values[0].as_text(), Some("tantivy"));
assert_eq!(values[1].as_text(), Some("some other value"));
assert_eq!(values[0].as_str(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value"));
let values: Vec<&Value> = document.get_all(other_text_field).collect();
assert_eq!(values.len(), 1);
assert_eq!(values[0].as_text(), Some("short"));
assert_eq!(values[0].as_str(), Some("short"));
}
#[test]
@@ -1005,7 +1006,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
@@ -1071,7 +1072,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let index_reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT {
@@ -1124,7 +1125,7 @@ pub mod tests {
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?;
let mut writer: IndexWriter = index.writer(50_000_000)?;
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?;
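The crate-level docs above switch retrieval to `searcher.doc::<TantivyDocument>(..)`, since `Searcher::doc` is now generic over the document type it deserializes into. A minimal helper sketch based on that signature:

use tantivy::schema::Schema;
use tantivy::{DocAddress, Searcher, TantivyDocument};

fn fetch_as_json(searcher: &Searcher, schema: &Schema, addr: DocAddress) -> tantivy::Result<String> {
    // The target type can also be inferred from the binding instead of using the turbofish.
    let retrieved: TantivyDocument = searcher.doc(addr)?;
    Ok(retrieved.to_json(schema))
}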

View File

@@ -45,12 +45,12 @@
macro_rules! doc(
() => {
{
($crate::Document::default())
($crate::TantivyDocument::default())
}
}; // avoids a warning due to the useless `mut`.
($($field:expr => $value:expr),*) => {
{
let mut document = $crate::Document::default();
let mut document = $crate::TantivyDocument::default();
$(
document.add_field_value($field, $value);
)*
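Callers of the macro are untouched; it simply expands to `TantivyDocument::default()` now. For example:

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, TantivyDocument};

fn build_doc() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();
    // Same macro invocation as before; only the concrete document type behind it changed.
    let document: TantivyDocument = doc!(title => "The Diary of Muadib");
    assert_eq!(document.field_values().len(), 1);
}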

View File

@@ -52,7 +52,7 @@ pub mod tests {
Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT,
};
use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
use crate::{DocId, HasLen, Score};
use crate::{DocId, HasLen, IndexWriter, Score};
#[test]
pub fn test_position_write() -> crate::Result<()> {
@@ -432,7 +432,7 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
@@ -483,7 +483,7 @@ pub mod tests {
// delete everything else
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
@@ -568,8 +568,8 @@ mod bench {
use crate::docset::TERMINATED;
use crate::query::Intersection;
use crate::schema::{Document, Field, IndexRecordOption, Schema, Term, STRING};
use crate::{tests, DocSet, Index};
use crate::schema::{Field, IndexRecordOption, Schema, TantivyDocument, Term, STRING};
use crate::{tests, DocSet, Index, IndexWriter};
pub static TERM_A: Lazy<Term> = Lazy::new(|| {
let field = Field::from_field_id(0);
@@ -598,9 +598,9 @@ mod bench {
let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a");
}

View File

@@ -99,14 +99,14 @@ mod tests {
use crate::docset::{DocSet, BUFFER_LEN, TERMINATED};
use crate::query::{AllScorer, EnableScoring, Query};
use crate::schema::{Schema, TEXT};
use crate::Index;
use crate::{Index, IndexWriter};
fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>"aaa"))?;
index_writer.add_document(doc!(field=>"bbb"))?;
index_writer.commit()?;

View File

@@ -117,13 +117,13 @@ mod tests {
use crate::docset::TERMINATED;
use crate::query::Weight;
use crate::schema::{Schema, STRING};
use crate::Index;
use crate::{Index, IndexWriter};
fn create_index() -> crate::Result<Index> {
let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(title=>"abc"))?;
index_writer.add_document(doc!(title=>"bcd"))?;
index_writer.add_document(doc!(title=>"abcd"))?;

View File

@@ -24,6 +24,7 @@ use crate::schema::{IndexRecordOption, Term};
/// use tantivy::schema::{IndexRecordOption, Schema, TEXT};
/// use tantivy::Term;
/// use tantivy::Index;
/// use tantivy::IndexWriter;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -32,7 +33,7 @@ use crate::schema::{IndexRecordOption, Term};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;

View File

@@ -19,7 +19,7 @@ mod tests {
TermQuery,
};
use crate::schema::*;
use crate::{assert_nearly_equals, DocAddress, DocId, Index, Score};
use crate::{assert_nearly_equals, DocAddress, DocId, Index, IndexWriter, Score};
fn aux_test_helper() -> crate::Result<(Index, Field)> {
let mut schema_builder = Schema::builder();
@@ -28,7 +28,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "a b c"))?;
index_writer.add_document(doc!(text_field => "a c"))?;
index_writer.add_document(doc!(text_field => "b c"))?;
@@ -224,7 +224,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "a b c"))?;
index_writer.add_document(doc!(text_field => "a c"))?;
index_writer.add_document(doc!(text_field => "b c"))?;
@@ -297,7 +297,7 @@ mod tests {
let text = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text=>"a"))?;
index_writer.add_document(doc!(text=>"b"))?;
index_writer.commit()?;

View File

@@ -136,14 +136,14 @@ mod tests {
use super::BoostQuery;
use crate::query::{AllQuery, Query};
use crate::schema::Schema;
use crate::{DocAddress, Document, Index};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test]
fn test_boost_query_explain() -> crate::Result<()> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::new())?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::new())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();

View File

@@ -143,14 +143,14 @@ mod tests {
use super::ConstScoreQuery;
use crate::query::{AllQuery, Query};
use crate::schema::Schema;
use crate::{DocAddress, Document, Index};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test]
fn test_const_score_query_explain() -> crate::Result<()> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::new())?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::new())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();

View File

@@ -15,6 +15,7 @@ use crate::{Score, Term};
/// use tantivy::schema::{IndexRecordOption, Schema, TEXT};
/// use tantivy::Term;
/// use tantivy::Index;
/// use tantivy::IndexWriter;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -23,7 +24,7 @@ use crate::{Score, Term};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of Girl",
/// ))?;

View File

@@ -38,7 +38,7 @@ impl Automaton for DfaWrapper {
/// use tantivy::collector::{Count, TopDocs};
/// use tantivy::query::FuzzyTermQuery;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index, Term};
/// use tantivy::{doc, Index, IndexWriter, Term};
///
/// fn example() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -46,7 +46,7 @@ impl Automaton for DfaWrapper {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -188,7 +188,7 @@ mod test {
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Schema, STORED, TEXT};
use crate::{assert_nearly_equals, Index, Term};
use crate::{assert_nearly_equals, Index, IndexWriter, TantivyDocument, Term};
#[test]
pub fn test_fuzzy_json_path() -> crate::Result<()> {
@@ -202,7 +202,8 @@ mod test {
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"attributes": {
"a": "japan"
@@ -210,7 +211,8 @@ mod test {
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"attributes": {
"aa": "japan"
@@ -275,7 +277,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(
country_field => "japan",
))?;
@@ -324,7 +326,7 @@ mod test {
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(country_field => "japan"))?;
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -1,11 +1,14 @@
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};
use tokenizer_api::Token;
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::schema::document::{DocValue, Document};
use crate::schema::{Field, FieldType, IndexRecordOption, Term};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::{DocAddress, Result, Searcher, TantivyError};
use crate::{DocAddress, Result, Searcher, TantivyDocument, TantivyError};
#[derive(Debug, PartialEq)]
struct ScoreTerm {
@@ -90,10 +93,10 @@ impl MoreLikeThis {
}
/// Creates a [`BooleanQuery`] using a set of field values.
pub fn query_with_document_fields(
pub fn query_with_document_fields<'a, V: DocValue<'a>>(
&self,
searcher: &Searcher,
doc_fields: &[(Field, Vec<Value>)],
doc_fields: &[(Field, Vec<V>)],
) -> Result<BooleanQuery> {
let score_terms = self.retrieve_terms_from_doc_fields(searcher, doc_fields)?;
let query = self.create_query(score_terms);
@@ -126,26 +129,18 @@ impl MoreLikeThis {
searcher: &Searcher,
doc_address: DocAddress,
) -> Result<Vec<ScoreTerm>> {
let doc = searcher.doc(doc_address)?;
let field_to_values = doc
.get_sorted_field_values()
.iter()
.map(|(field, values)| {
(
*field,
values.iter().map(|v| (**v).clone()).collect::<Vec<Value>>(),
)
})
.collect::<Vec<_>>();
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let field_to_values = doc.get_sorted_field_values();
self.retrieve_terms_from_doc_fields(searcher, &field_to_values)
}
/// Finds terms for a more-like-this query.
/// field_to_field_values is a mapping from field to possible values of that field.
fn retrieve_terms_from_doc_fields(
fn retrieve_terms_from_doc_fields<'a, V: DocValue<'a>>(
&self,
searcher: &Searcher,
field_to_values: &[(Field, Vec<Value>)],
field_to_values: &[(Field, Vec<V>)],
) -> Result<Vec<ScoreTerm>> {
if field_to_values.is_empty() {
return Err(TantivyError::InvalidArgument(
@@ -164,11 +159,11 @@ impl MoreLikeThis {
/// Computes the frequency of values for a field while updating the term frequencies
/// Note: A FieldValue can be made up of multiple terms.
/// We are interested in extracting terms within FieldValue
fn add_term_frequencies(
fn add_term_frequencies<'a, V: DocValue<'a>>(
&self,
searcher: &Searcher,
field: Field,
values: &[Value],
values: &[V],
term_frequencies: &mut HashMap<Term, usize>,
) -> Result<()> {
let schema = searcher.schema();
@@ -184,11 +179,10 @@ impl MoreLikeThis {
FieldType::Facet(_) => {
let facets: Vec<&str> = values
.iter()
.map(|value| match value {
Value::Facet(ref facet) => Ok(facet.encoded_str()),
_ => Err(TantivyError::InvalidArgument(
"invalid field value".to_string(),
)),
.map(|value| {
value.as_facet().map(|f| f.encoded_str()).ok_or_else(|| {
TantivyError::InvalidArgument("invalid field value".to_string())
})
})
.collect::<Result<Vec<_>>>()?;
for fake_str in facets {
@@ -203,35 +197,31 @@ impl MoreLikeThis {
}
}
FieldType::Str(text_options) => {
let mut tokenizer_opt = text_options
.get_indexing_options()
.map(|options| options.tokenizer())
.and_then(|tokenizer_name| tokenizer_manager.get(tokenizer_name));
let sink = &mut |token: &Token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
};
// TODO: Validate that these changes align with the HEAD branch.
for value in values {
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
Value::Str(ref text) => {
if let Some(mut tokenizer) = text_options
.get_indexing_options()
.map(|text_indexing_options| {
text_indexing_options.tokenizer().to_string()
})
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
{
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
_ => (),
if let Some(text) = value.as_str() {
let tokenizer = match &mut tokenizer_opt {
None => continue,
Some(tokenizer) => tokenizer,
};
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(sink);
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(sink);
}
}
}
@@ -248,7 +238,7 @@ impl MoreLikeThis {
}
FieldType::Date(_) => {
for value in values {
let timestamp = value.as_date().ok_or_else(|| {
let timestamp = value.as_datetime().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
let term = Term::from_field_date(field, timestamp);
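The extraction loops above now go through `DocValue` accessors instead of matching on the concrete `Value` enum. A rough, crate-internal sketch of that dispatch style, assuming the accessor names used in this diff (`as_str`, `as_facet`, `as_datetime`):

use crate::schema::document::DocValue;

fn classify<'a, V: DocValue<'a>>(value: V) -> &'static str {
    // Every accessor returns an `Option`, so unsupported value kinds fall through,
    // mirroring how the loops above skip non-text values.
    if value.as_str().is_some() {
        "text"
    } else if value.as_facet().is_some() {
        "facet"
    } else if value.as_datetime().is_some() {
        "date"
    } else {
        "other"
    }
}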

View File

@@ -1,3 +1,5 @@
use std::fmt::Debug;
use super::MoreLikeThis;
use crate::query::{EnableScoring, Query, Weight};
use crate::schema::{Field, Value};
@@ -28,9 +30,9 @@ pub struct MoreLikeThisQuery {
target: TargetDocument,
}
#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, Clone, PartialEq)]
enum TargetDocument {
DocumentAdress(DocAddress),
DocumentAddress(DocAddress),
DocumentFields(Vec<(Field, Vec<Value>)>),
}
@@ -51,14 +53,20 @@ impl Query for MoreLikeThisQuery {
}
};
match &self.target {
TargetDocument::DocumentAdress(doc_address) => self
TargetDocument::DocumentAddress(doc_address) => self
.mlt
.query_with_document(searcher, *doc_address)?
.weight(enable_scoring),
TargetDocument::DocumentFields(doc_fields) => self
.mlt
.query_with_document_fields(searcher, doc_fields)?
.weight(enable_scoring),
TargetDocument::DocumentFields(doc_fields) => {
let values = doc_fields
.iter()
.map(|(field, values)| (*field, values.iter().collect::<Vec<&Value>>()))
.collect::<Vec<_>>();
self.mlt
.query_with_document_fields(searcher, &values)?
.weight(enable_scoring)
}
}
}
}
@@ -156,7 +164,7 @@ impl MoreLikeThisQueryBuilder {
pub fn with_document(self, doc_address: DocAddress) -> MoreLikeThisQuery {
MoreLikeThisQuery {
mlt: self.mlt,
target: TargetDocument::DocumentAdress(doc_address),
target: TargetDocument::DocumentAddress(doc_address),
}
}
@@ -180,7 +188,7 @@ mod tests {
use super::{MoreLikeThisQuery, TargetDocument};
use crate::collector::TopDocs;
use crate::schema::{Schema, STORED, TEXT};
use crate::{DocAddress, Index};
use crate::{DocAddress, Index, IndexWriter};
fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -188,7 +196,7 @@ mod tests {
let body = schema_builder.add_text_field("body", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title => "aaa", body => "the old man and the sea"))?;
index_writer.add_document(doc!(title => "bbb", body => "an old man sailing on the sea"))?;
index_writer.add_document(doc!(title => "ccc", body=> "send this message to alice"))?;
@@ -236,7 +244,7 @@ mod tests {
);
assert_eq!(
query.target,
TargetDocument::DocumentAdress(DocAddress::new(1, 2))
TargetDocument::DocumentAddress(DocAddress::new(1, 2))
);
}
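From a caller's point of view the builder-based construction exercised by these tests is unchanged; only the internal document handling became generic. A usage sketch, assuming the existing `MoreLikeThisQuery::builder()` entry point and its frequency setters:

use tantivy::collector::TopDocs;
use tantivy::query::MoreLikeThisQuery;
use tantivy::{DocAddress, Searcher};

fn similar_docs(searcher: &Searcher) -> tantivy::Result<Vec<(f32, DocAddress)>> {
    // Internally this now loads the target document as a `TantivyDocument`
    // and feeds its values through the generic term extraction shown earlier.
    let query = MoreLikeThisQuery::builder()
        .with_min_doc_frequency(1)
        .with_min_term_frequency(1)
        .with_document(DocAddress::new(0, 0));
    searcher.search(&query, &TopDocs::with_limit(5))
}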

View File

@@ -161,7 +161,7 @@ mod tests {
use crate::docset::TERMINATED;
use crate::query::{EnableScoring, PhrasePrefixQuery, Query};
use crate::schema::{Schema, TEXT};
use crate::{DocSet, Term};
use crate::{DocSet, IndexWriter, Term};
pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -169,7 +169,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc)?;

View File

@@ -17,7 +17,7 @@ pub mod tests {
use crate::core::Index;
use crate::query::{EnableScoring, QueryParser, Weight};
use crate::schema::{Schema, Term, TEXT};
use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED};
use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};
pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -25,7 +25,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc)?;
@@ -135,7 +135,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
}
@@ -278,7 +278,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.add_document(doc!(text_field=>"b a"))?;
@@ -310,7 +310,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c d e f g h"))?;
index_writer.commit()?;
}
@@ -348,7 +348,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field=>json!({
"text": "elliot smith the happy who"
})))?;

View File

@@ -41,14 +41,14 @@ use crate::{DateTime, DocId, Score};
/// use tantivy::collector::Count;
/// use tantivy::query::RangeQuery;
/// use tantivy::schema::{Schema, INDEXED};
/// use tantivy::{doc, Index};
/// use tantivy::{doc, Index, IndexWriter};
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 20_000_000)?;
/// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year {
@@ -474,8 +474,10 @@ mod tests {
use crate::collector::{Count, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT};
use crate::{doc, Index};
use crate::schema::{
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
};
use crate::{doc, Index, IndexWriter};
#[test]
fn test_range_query_simple() -> crate::Result<()> {
@@ -552,7 +554,7 @@ mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for i in 1..100 {
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
for j in 1..100 {
if i % j == 0 {
doc.add_i64(int_field, j as i64);
@@ -617,7 +619,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 60_000_000).unwrap();
let mut docs = Vec::new();
for i in 1..100 {
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
for j in 1..100 {
if i % j == 0 {
doc.add_f64(float_field, j as f64);
@@ -722,7 +724,7 @@ mod tests {
let ip_addr_2 = IpAddr::from_str("127.0.0.20").unwrap().into_ipv6_addr();
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for _ in 0..1_000 {
index_writer
.add_document(doc!(

View File

@@ -88,7 +88,7 @@ pub mod tests {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::Index;
use crate::{Index, IndexWriter};
#[derive(Clone, Debug)]
pub struct Doc {
@@ -158,7 +158,7 @@ pub mod tests {
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
.into_iter()
.map(Ipv6Addr::from_u128)

View File

@@ -141,7 +141,7 @@ pub mod tests {
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::query::{QueryParser, Weight};
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
use crate::{Index, TERMINATED};
use crate::{Index, IndexWriter, TERMINATED};
#[derive(Clone, Debug)]
pub struct Doc {
@@ -209,7 +209,7 @@ pub mod tests {
let field = schema_builder.add_u64_field("test_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
writer.add_document(doc!(field=>52_000u64)).unwrap();
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();

View File

@@ -18,7 +18,7 @@ use crate::schema::Field;
/// use tantivy::collector::Count;
/// use tantivy::query::RegexQuery;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index, Term};
/// use tantivy::{doc, Index, IndexWriter, Term};
///
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -26,7 +26,7 @@ use crate::schema::Field;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -95,7 +95,7 @@ mod test {
use super::RegexQuery;
use crate::collector::TopDocs;
use crate::schema::{Field, Schema, TEXT};
use crate::{assert_nearly_equals, Index, IndexReader};
use crate::{assert_nearly_equals, Index, IndexReader, IndexWriter};
fn build_test_index() -> crate::Result<(IndexReader, Field)> {
let mut schema_builder = Schema::builder();
@@ -103,7 +103,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
))?;

View File

@@ -116,7 +116,7 @@ mod tests {
use crate::collector::TopDocs;
use crate::query::{QueryParser, TermSetQuery};
use crate::schema::{Schema, TEXT};
use crate::{assert_nearly_equals, Index, Term};
use crate::{assert_nearly_equals, Index, IndexWriter, Term};
#[test]
pub fn test_term_set_query() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(
field1 => "doc1",
field2 => "val1",
@@ -233,7 +233,7 @@ mod tests {
schema_builder.add_text_field("field", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let field = schema.get_field("field").unwrap();
index_writer.add_document(doc!(
field => "val1",

View File

@@ -14,7 +14,7 @@ mod tests {
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::{assert_nearly_equals, DocAddress, Index, Term, TERMINATED};
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};
#[test]
pub fn test_term_query_no_freq() -> crate::Result<()> {
@@ -24,7 +24,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let doc = doc!(text_field => "a");
index_writer.add_document(doc)?;
index_writer.commit()?;
@@ -50,7 +50,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc)?;
@@ -86,7 +86,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(
left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde",
right_field => "right1 right2",
@@ -133,7 +133,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.add_document(doc!(text_field=>"a c"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -151,7 +151,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
@@ -185,7 +185,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"a"))?;

View File

@@ -20,14 +20,14 @@ use crate::Term;
/// use tantivy::collector::{Count, TopDocs};
/// use tantivy::query::TermQuery;
/// use tantivy::schema::{Schema, TEXT, IndexRecordOption};
/// use tantivy::{doc, Index, Term};
/// use tantivy::{doc, Index, IndexWriter, Term};
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -139,7 +139,7 @@ mod tests {
use crate::collector::{Count, TopDocs};
use crate::query::{Query, QueryParser, TermQuery};
use crate::schema::{IndexRecordOption, IntoIpv6Addr, Schema, INDEXED, STORED};
use crate::{doc, Index, Term};
use crate::{doc, Index, IndexWriter, Term};
#[test]
fn search_ip_test() {
@@ -151,7 +151,7 @@ mod tests {
let ip_addr_2 = Ipv6Addr::from_u128(10);
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(
ip_field => ip_addr_1

View File

@@ -133,7 +133,8 @@ mod tests {
use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{
assert_nearly_equals, DocId, DocSet, Index, Score, Searcher, SegmentId, Term, TERMINATED,
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, SegmentId, Term,
TERMINATED,
};
#[test]
@@ -295,7 +296,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(3, 30_000_000)?;
let mut writer: IndexWriter = index.writer_with_num_threads(3, 30_000_000)?;
use rand::Rng;
let mut rng = rand::thread_rng();
writer.set_merge_policy(Box::new(NoMergePolicy));

View File

@@ -1,282 +0,0 @@
use std::collections::{HashMap, HashSet};
use std::io::{self, Read, Write};
use std::mem;
use std::net::Ipv6Addr;
use common::{BinarySerializable, VInt};
use super::*;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
/// Tantivy's Document is the object that can
/// be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
pub struct Document {
field_values: Vec<FieldValue>,
}
impl From<Vec<FieldValue>> for Document {
fn from(field_values: Vec<FieldValue>) -> Self {
Document { field_values }
}
}
impl PartialEq for Document {
fn eq(&self, other: &Document) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
let json_val = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
.or_default()
.insert(json_val);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values)
}
}
impl Eq for Document {}
impl IntoIterator for Document {
type Item = FieldValue;
type IntoIter = std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
}
}
impl Document {
/// Creates a new, empty document object
pub fn new() -> Document {
Document::default()
}
/// Returns the number of `(field, value)` pairs.
pub fn len(&self) -> usize {
self.field_values.len()
}
/// Returns true if the document contains no fields.
pub fn is_empty(&self) -> bool {
self.field_values.is_empty()
}
/// Adding a facet to the document.
pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> {
let facet = Facet::from(path);
let value = Value::Facet(facet);
self.add_field_value(field, value);
}
/// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = Value::Str(text.to_string());
self.add_field_value(field, value);
}
/// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) {
self.add_field_value(field, pre_tokenized_text);
}
/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add_field_value(field, value);
}
/// Add a IP address field. Internally only Ipv6Addr is used.
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
self.add_field_value(field, value);
}
/// Add a i64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add_field_value(field, value);
}
/// Add a f64 field
pub fn add_f64(&mut self, field: Field, value: f64) {
self.add_field_value(field, value);
}
/// Add a bool field
pub fn add_bool(&mut self, field: Field, value: bool) {
self.add_field_value(field, value);
}
/// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value);
}
/// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add_field_value(field, value.into());
}
/// Add a JSON field
pub fn add_json_object(
&mut self,
field: Field,
json_object: serde_json::Map<String, serde_json::Value>,
) {
self.add_field_value(field, json_object);
}
/// Add a (field, value) to the document.
pub fn add_field_value<T: Into<Value>>(&mut self, field: Field, typed_val: T) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
self.field_values.push(field_value);
}
/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
}
/// Sort and groups the field_values by field.
///
/// The result of this method is not cached and is
/// computed on the fly when this method is called.
pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> {
let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
field_values.sort_by_key(|field_value| field_value.field());
let mut field_values_it = field_values.into_iter();
let first_field_value = if let Some(first_field_value) = field_values_it.next() {
first_field_value
} else {
return Vec::new();
};
let mut grouped_field_values = vec![];
let mut current_field = first_field_value.field();
let mut current_group = vec![first_field_value.value()];
for field_value in field_values_it {
if field_value.field() == current_field {
current_group.push(field_value.value());
} else {
grouped_field_values.push((
current_field,
mem::replace(&mut current_group, vec![field_value.value()]),
));
current_field = field_value.field();
}
}
grouped_field_values.push((current_field, current_group));
grouped_field_values
}
/// Returns all of the `FieldValue`s associated with the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
self.field_values
.iter()
.filter(move |field_value| field_value.field() == field)
.map(FieldValue::value)
}
/// Returns the first `FieldValue` associated with the given field
pub fn get_first(&self, field: Field) -> Option<&Value> {
self.get_all(field).next()
}
/// Serializes stored field values.
pub fn serialize_stored<W: Write>(&self, schema: &Schema, writer: &mut W) -> io::Result<()> {
let stored_field_values = || {
self.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
};
let num_field_values = stored_field_values().count();
VInt(num_field_values as u64).serialize(writer)?;
for field_value in stored_field_values() {
match field_value {
FieldValue {
field,
value: Value::PreTokStr(pre_tokenized_text),
} => {
let field_value = FieldValue {
field: *field,
value: Value::Str(pre_tokenized_text.text.to_string()),
};
field_value.serialize(writer)?;
}
field_value => field_value.serialize(writer)?,
};
}
Ok(())
}
}
impl BinarySerializable for Document {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let field_values = self.field_values();
VInt(field_values.len() as u64).serialize(writer)?;
for field_value in field_values {
field_value.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let num_field_values = VInt::deserialize(reader)?.val() as usize;
let field_values = (0..num_field_values)
.map(|_| FieldValue::deserialize(reader))
.collect::<io::Result<Vec<FieldValue>>>()?;
Ok(Document::from(field_values))
}
}
#[cfg(test)]
mod tests {
use common::BinarySerializable;
use crate::schema::*;
#[test]
fn test_doc() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = Document::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}
#[test]
fn test_doc_serialization_issue() {
let mut doc = Document::default();
doc.add_json_object(
Field::from_field_id(0),
serde_json::json!({"key": 2u64})
.as_object()
.unwrap()
.clone(),
);
doc.add_text(Field::from_field_id(1), "hello");
assert_eq!(doc.field_values().len(), 2);
let mut payload: Vec<u8> = Vec::new();
doc.serialize(&mut payload).unwrap();
assert_eq!(payload.len(), 26);
Document::deserialize(&mut &payload[..]).unwrap();
}
}

1029 src/schema/document/de.rs (new file; diff suppressed because it is too large)

View File

@@ -0,0 +1,310 @@ src/schema/document/default_doc_type.rs (new file)
use std::collections::{BTreeMap, HashMap, HashSet};
use std::net::Ipv6Addr;
use common::DateTime;
use serde_json::Map;
use crate::schema::document::{
DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
};
use crate::schema::field_type::ValueParsingError;
use crate::schema::field_value::FieldValueIter;
use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, Schema, Value};
use crate::tokenizer::PreTokenizedString;
/// Tantivy's Document is the object that can be indexed and then searched for.
/// It provides a default implementation of the `Document` trait.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
pub struct TantivyDocument {
field_values: Vec<FieldValue>,
}
impl Document for TantivyDocument {
type Value<'a> = &'a Value;
type FieldsValuesIter<'a> = FieldValueIter<'a>;
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldValueIter(self.field_values.iter())
}
}
impl DocumentDeserialize for TantivyDocument {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut field_values = Vec::with_capacity(deserializer.size_hint());
while let Some((field, value)) = deserializer.next_field()? {
field_values.push(FieldValue::new(field, value));
}
Ok(Self { field_values })
}
}
impl From<Vec<FieldValue>> for TantivyDocument {
fn from(field_values: Vec<FieldValue>) -> Self {
Self { field_values }
}
}
impl PartialEq for TantivyDocument {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
let value = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values)
}
}
impl Eq for TantivyDocument {}
impl IntoIterator for TantivyDocument {
type Item = FieldValue;
type IntoIter = std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
}
}
impl TantivyDocument {
/// Creates a new, empty document object
pub fn new() -> TantivyDocument {
TantivyDocument::default()
}
/// Returns the number of `(field, value)` pairs in the document.
pub fn len(&self) -> usize {
self.field_values.len()
}
/// Add a facet to the document.
pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> {
let facet = Facet::from(path);
let value = Value::Facet(facet);
self.add_field_value(field, value);
}
/// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = Value::Str(text.to_string());
self.add_field_value(field, value);
}
/// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) {
self.add_field_value(field, pre_tokenized_text);
}
/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add_field_value(field, value);
}
/// Add an IP address field. Internally only `Ipv6Addr` is used.
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
self.add_field_value(field, value);
}
/// Add an i64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add_field_value(field, value);
}
/// Add an f64 field
pub fn add_f64(&mut self, field: Field, value: f64) {
self.add_field_value(field, value);
}
/// Add a bool field
pub fn add_bool(&mut self, field: Field, value: bool) {
self.add_field_value(field, value);
}
/// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value);
}
/// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add_field_value(field, value.into());
}
/// Add a dynamic object field
pub fn add_object(&mut self, field: Field, object: BTreeMap<String, Value>) {
self.add_field_value(field, object);
}
/// Add a (field, value) to the document.
pub fn add_field_value<T: Into<Value>>(&mut self, field: Field, typed_val: T) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
self.field_values.push(field_value);
}
/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
}
/// Returns all of the `FieldValue`s associated with the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
self.field_values
.iter()
.filter(move |field_value| field_value.field() == field)
.map(FieldValue::value)
}
/// Returns the first `FieldValue` associated with the given field
pub fn get_first(&self, field: Field) -> Option<&Value> {
self.get_all(field).next()
}
/// Create a document from a named doc.
pub fn convert_named_doc(
schema: &Schema,
named_doc: NamedFieldDocument,
) -> Result<TantivyDocument, DocParsingError> {
let mut document = TantivyDocument::new();
for (field_name, values) in named_doc.0 {
if let Ok(field) = schema.get_field(&field_name) {
for value in values {
document.add_field_value(field, value);
}
}
}
Ok(document)
}
/// Create a named document from the doc.
pub fn to_named_doc(&self, schema: &Schema) -> NamedFieldDocument {
let mut field_map = BTreeMap::new();
for (field, field_values) in self.get_sorted_field_values() {
let field_name = schema.get_field_name(field);
let values: Vec<Value> = field_values.into_iter().cloned().collect();
field_map.insert(field_name.to_string(), values);
}
NamedFieldDocument(field_map)
}
/// Encode the document as JSON.
///
/// Encoding a document cannot fail.
pub fn to_json(&self, schema: &Schema) -> String {
serde_json::to_string(&self.to_named_doc(schema))
.expect("doc encoding failed. This is a bug")
}
/// Build a document object from a JSON string.
pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<TantivyDocument, DocParsingError> {
let json_obj: Map<String, serde_json::Value> =
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
Self::from_json_object(schema, json_obj)
}
/// Build a document object from a JSON object.
pub fn from_json_object(
schema: &Schema,
json_obj: Map<String, serde_json::Value>,
) -> Result<TantivyDocument, DocParsingError> {
let mut doc = TantivyDocument::default();
for (field_name, json_value) in json_obj {
if let Ok(field) = schema.get_field(&field_name) {
let field_entry = schema.get_field_entry(field);
let field_type = field_entry.field_type();
match json_value {
serde_json::Value::Array(json_items) => {
for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
}
}
_ => {
let value = field_type
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
}
}
}
}
Ok(doc)
}
}
/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug, Error, PartialEq)]
pub enum DocParsingError {
/// The payload given is not valid JSON.
#[error("The provided string is not valid JSON")]
InvalidJson(String),
/// One of the value nodes could not be parsed.
#[error("The field '{0:?}' could not be parsed: {1:?}")]
ValueError(String, ValueParsingError),
}
impl DocParsingError {
/// Builds an `InvalidJson` DocParsingError
fn invalid_json(invalid_json: &str) -> Self {
let sample = invalid_json.chars().take(20).collect();
DocParsingError::InvalidJson(sample)
}
}
#[cfg(test)]
mod tests {
use crate::schema::document::default_doc_type::TantivyDocument;
use crate::schema::*;
#[test]
fn test_doc() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = TantivyDocument::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}
// TODO: Should this be re-added with the serialize method?
// Technically this is no longer useful since the doc types
// do not implement BinarySerializable due to orphan rules.
// #[test]
// fn test_doc_serialization_issue() {
// let mut doc = Document::default();
// doc.add_json_object(
// Field::from_field_id(0),
// serde_json::json!({"key": 2u64})
// .as_object()
// .unwrap()
// .clone(),
// );
// doc.add_text(Field::from_field_id(1), "hello");
// assert_eq!(doc.field_values().len(), 2);
// let mut payload: Vec<u8> = Vec::new();
// doc_binary_wrappers::serialize(&doc, &mut payload).unwrap();
// assert_eq!(payload.len(), 26);
// doc_binary_wrappers::deserialize::<Document, _>(&mut &payload[..]).unwrap();
// }
}
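The doc comment at the top of this file describes documents as unordered `(field, value)` couples in which a field may appear more than once. A minimal sketch of what that looks like with the `TantivyDocument` helpers defined above (the schema, field names, and values here are illustrative only, not taken from this PR):

    use tantivy::schema::{Schema, TantivyDocument, STORED, TEXT};

    fn main() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", TEXT | STORED);
        let schema = schema_builder.build();

        // The same field may be added more than once.
        let mut doc = TantivyDocument::new();
        doc.add_text(title, "Of Mice and Men");
        doc.add_text(title, "A Novel");
        assert_eq!(doc.len(), 2);

        // Round-trip through the JSON helpers defined in this file.
        let json = doc.to_json(&schema);
        let parsed = TantivyDocument::parse_json(&schema, &json).unwrap();
        assert_eq!(parsed, doc);
    }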

View File

@@ -0,0 +1,207 @@ src/schema/document/existing_type_impls.rs (new file)
//! Implementations of some of the core traits on various types to improve the ergonomics
//! of the API when providing custom documents.
//!
//! This allows users a bit more freedom and ergonomics if they want a simple API
//! and don't care about some of the more specialised types or only want to customise
//! part of the document structure.
use std::collections::{btree_map, hash_map, BTreeMap, HashMap};
use serde_json::Number;
use crate::schema::document::{
ArrayAccess, DeserializeError, DocValue, Document, DocumentDeserialize, DocumentDeserializer,
ObjectAccess, ReferenceValue, ValueDeserialize, ValueDeserializer, ValueVisitor,
};
use crate::schema::Field;
// Serde compatibility support.
impl<'a> DocValue<'a> for &'a serde_json::Value {
type ChildValue = Self;
type ArrayIter = JsonArrayIter<'a>;
type ObjectIter = JsonObjectIter<'a>;
fn as_value(&self) -> ReferenceValue<'a, Self> {
match self {
serde_json::Value::Null => ReferenceValue::Null,
serde_json::Value::Bool(value) => ReferenceValue::Bool(*value),
serde_json::Value::Number(number) => {
if let Some(val) = number.as_u64() {
ReferenceValue::U64(val)
} else if let Some(val) = number.as_i64() {
ReferenceValue::I64(val)
} else if let Some(val) = number.as_f64() {
ReferenceValue::F64(val)
} else {
panic!("Unsupported serde_json number {number}");
}
}
serde_json::Value::String(val) => ReferenceValue::Str(val),
serde_json::Value::Array(elements) => {
ReferenceValue::Array(JsonArrayIter(elements.iter()))
}
serde_json::Value::Object(object) => {
ReferenceValue::Object(JsonObjectIter(object.iter()))
}
}
}
}
impl ValueDeserialize for serde_json::Value {
fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
where D: ValueDeserializer<'de> {
struct SerdeValueVisitor;
impl ValueVisitor for SerdeValueVisitor {
type Value = serde_json::Value;
fn visit_null(&self) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Null)
}
fn visit_string(&self, val: String) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::String(val))
}
fn visit_u64(&self, val: u64) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Number(val.into()))
}
fn visit_i64(&self, val: i64) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Number(val.into()))
}
fn visit_f64(&self, val: f64) -> Result<Self::Value, DeserializeError> {
let num = Number::from_f64(val).ok_or_else(|| {
DeserializeError::custom(format!(
"serde_json::Value cannot deserialize float {val}"
))
})?;
Ok(serde_json::Value::Number(num))
}
fn visit_bool(&self, val: bool) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Bool(val.into()))
}
fn visit_array<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ArrayAccess<'de> {
let mut elements = Vec::with_capacity(access.size_hint());
while let Some(value) = access.next_element()? {
elements.push(value);
}
Ok(serde_json::Value::Array(elements))
}
fn visit_object<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ObjectAccess<'de> {
let mut object = serde_json::Map::with_capacity(access.size_hint());
while let Some((key, value)) = access.next_entry()? {
object.insert(key, value);
}
Ok(serde_json::Value::Object(object))
}
}
deserializer.deserialize_any(SerdeValueVisitor)
}
}
/// A wrapper struct for an iterator producing [Value]s.
pub struct JsonArrayIter<'a>(pub(crate) std::slice::Iter<'a, serde_json::Value>);
impl<'a> Iterator for JsonArrayIter<'a> {
type Item = ReferenceValue<'a, &'a serde_json::Value>;
fn next(&mut self) -> Option<Self::Item> {
let value = self.0.next()?;
Some(value.as_value())
}
}
/// A wrapper struct for an iterator producing [Value]s.
pub struct JsonObjectIter<'a>(pub(crate) serde_json::map::Iter<'a>);
impl<'a> Iterator for JsonObjectIter<'a> {
type Item = (&'a str, ReferenceValue<'a, &'a serde_json::Value>);
fn next(&mut self) -> Option<Self::Item> {
let (key, value) = self.0.next()?;
Some((key, value.as_value()))
}
}
// Custom document types
// BTreeMap based documents
impl Document for BTreeMap<Field, crate::schema::Value> {
type Value<'a> = &'a crate::schema::Value;
type FieldsValuesIter<'a> = FieldCopyingIterator<
'a,
btree_map::Iter<'a, Field, crate::schema::Value>,
crate::schema::Value,
>;
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldCopyingIterator(self.iter())
}
}
impl DocumentDeserialize for BTreeMap<Field, crate::schema::Value> {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut document = BTreeMap::new();
while let Some((field, value)) = deserializer.next_field()? {
document.insert(field, value);
}
Ok(document)
}
}
// HashMap based documents
impl Document for HashMap<Field, crate::schema::Value> {
type Value<'a> = &'a crate::schema::Value;
type FieldsValuesIter<'a> = FieldCopyingIterator<
'a,
hash_map::Iter<'a, Field, crate::schema::Value>,
crate::schema::Value,
>;
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldCopyingIterator(self.iter())
}
}
impl DocumentDeserialize for HashMap<Field, crate::schema::Value> {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut document = HashMap::with_capacity(deserializer.size_hint());
while let Some((field, value)) = deserializer.next_field()? {
document.insert(field, value);
}
Ok(document)
}
}
pub struct FieldCopyingIterator<'a, I, V>(I)
where
V: 'a,
I: Iterator<Item = (&'a Field, &'a V)>;
impl<'a, I, V> Iterator for FieldCopyingIterator<'a, I, V>
where
V: 'a,
I: Iterator<Item = (&'a Field, &'a V)>,
{
type Item = (Field, &'a V);
fn next(&mut self) -> Option<Self::Item> {
let (field, value) = self.0.next()?;
Some((*field, value))
}
}
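Because of these blanket impls, a plain map can stand in for a whole document with no wrapper type. A minimal sketch using the `Document` impl for `BTreeMap<Field, Value>` above (the field ids and values are illustrative only):

    use std::collections::BTreeMap;

    use tantivy::schema::{Document, Field, Value};

    fn main() {
        // The map itself is the document; the keys are schema fields.
        let mut doc: BTreeMap<Field, Value> = BTreeMap::new();
        doc.insert(Field::from_field_id(0), Value::Str("hello".to_string()));
        doc.insert(Field::from_field_id(1), Value::U64(5));

        // `iter_fields_and_values` comes from the `Document` trait impl above.
        for (field, value) in doc.iter_fields_and_values() {
            println!("{field:?} => {value:?}");
        }
    }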

532 src/schema/document/mod.rs (new file)
View File

@@ -0,0 +1,532 @@
//! Document definition for Tantivy to index and store.
//!
//! A document and its values are defined by a few core traits:
//! - [Document] which describes your top-level document and its fields.
//! - [DocValue] which provides tantivy with a way to access the document's values in a common way
//! without performing any additional allocations.
//! - [DocumentDeserialize] which implements the necessary code to deserialize the document from the
//! doc store.
//!
//! Tantivy provides a few out-of-box implementations of these core traits to provide
//! some simple usage if you don't want to implement these traits on a custom type yourself.
//!
//! # Out-of-box document implementations
//! - [TantivyDocument] the old document type used by Tantivy before the trait-based approach was
//! implemented. This type is still valid and provides all of the original behaviour you might
//! expect.
//! - `BTreeMap<Field, Value>` a mapping of field_ids to their relevant schema value using a
//! BTreeMap.
//! - `HashMap<Field, Value>` a mapping of field_ids to their relevant schema value using a HashMap.
//!
//! # Implementing your custom documents
//! In larger projects or higher-performance applications you often want to avoid the extra
//! overhead of converting your own types to the Tantivy [TantivyDocument] type; skipping that
//! conversion can save a significant amount of indexing time by avoiding the additional allocations.
//!
//! ### Important Note
//! The implementor of the `Document` trait must be `'static` and safe to send across
//! thread boundaries.
//!
//! ## Reusing existing types
//! The API design of the document traits allows you to reuse as much or as little of the
//! existing trait implementations as you like, which can save quite a bit of boilerplate,
//! as shown by the following example.
//!
//! ## A basic custom document
//! ```
//! use std::collections::{btree_map, BTreeMap};
//! use tantivy::schema::{Document, Field};
//! use tantivy::schema::document::{DeserializeError, DocumentDeserialize, DocumentDeserializer};
//!
//! /// Our custom document to let us use a map of `serde_json::Values`.
//! pub struct MyCustomDocument {
//! // Tantivy provides trait implementations for common `serde_json` types.
//! fields: BTreeMap<Field, serde_json::Value>
//! }
//!
//! impl Document for MyCustomDocument {
//! // The value type produced by the `iter_fields_and_values` iterator.
//! type Value<'a> = &'a serde_json::Value;
//! // The iterator which is produced by `iter_fields_and_values`.
//! // Often this is a simple new-type wrapper unless you like super long generics.
//! type FieldsValuesIter<'a> = MyCustomIter<'a>;
//!
//! /// Produces an iterator over the document fields and values.
//! /// This method will be called multiple times, so it's important
//! /// not to do anything too heavy in this step; any heavy operations
//! /// should be done beforehand and effectively cached.
//! fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
//! MyCustomIter(self.fields.iter())
//! }
//! }
//!
//! // Our document must also provide a way to get the original doc
//! // back when it's deserialized from the doc store.
//! // The API for this is very similar to serde but a little bit
//! // more specialised, giving you access to types like IP addresses, datetime, etc...
//! impl DocumentDeserialize for MyCustomDocument {
//! fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
//! where D: DocumentDeserializer<'de>
//! {
//! // We're not going to implement the necessary logic for this example
//! // see the `Deserialization` section of implementing a custom document
//! // for more information on how this works.
//! unimplemented!()
//! }
//! }
//!
//! /// Our custom iterator just helps us to avoid some messy generics.
//! pub struct MyCustomIter<'a>(btree_map::Iter<'a, Field, serde_json::Value>);
//! impl<'a> Iterator for MyCustomIter<'a> {
//! // Here we can see our field-value pairs being produced by the iterator.
//! // The value returned alongside the field is the same type as `Document::Value<'_>`.
//! type Item = (Field, &'a serde_json::Value);
//!
//! fn next(&mut self) -> Option<Self::Item> {
//! let (field, value) = self.0.next()?;
//! Some((*field, value))
//! }
//! }
//! ```
//!
//! You may have noticed in this example that we haven't needed to implement any custom value
//! types; instead we've just used the [serde_json::Value] type, for which Tantivy provides an
//! existing implementation.
//!
//! ## Implementing custom values
//! Internally, Tantivy only works with `ReferenceValue`, an enum that tries to borrow
//! as much data as it can. In order to allow documents to return custom types, those types
//! must implement the `DocValue` trait, which provides a way for Tantivy to get a
//! `ReferenceValue` that it can then index and store.
//!
//! Values can be customised just as easily as documents by implementing the `DocValue` trait.
//!
//! The implementor of this trait should not own the data it's returning; instead it should just
//! hold references to the data held by the parent [Document], which can then be passed
//! on to the [ReferenceValue].
//!
//! This is why `DocValue` is implemented for `&'a serde_json::Value` and `&'a
//! tantivy::schema::Value` but not for their owned counterparts, as we cannot satisfy the lifetime
//! bounds necessary when indexing the documents.
//!
//! ### A note about returning values
//! The custom value type does not have to be the type stored by the document; instead, the
//! implementor of `DocValue` can simply act as a way to convert between the owned type
//! kept in the parent document and the value passed into Tantivy.
//!
//! ```
//! use tantivy::schema::document::ReferenceValue;
//! use tantivy::schema::{DocValue};
//!
//! #[derive(Debug)]
//! /// Our custom value type which has 3 types, a string, float and bool.
//! #[allow(dead_code)]
//! pub enum MyCustomValue<'a> {
//! // Our string data is owned by the parent document, instead we just
//! // hold onto a reference of this data.
//! String(&'a str),
//! Float(f64),
//! Bool(bool),
//! }
//!
//! impl<'a> DocValue<'a> for MyCustomValue<'a> {
//! type ChildValue = Self;
//! // We don't need to worry about these types here as we're not
//! // working with nested types, but if we wanted to we would
//! // define our two iterator types, a sequence of ReferenceValues
//! // for the array iterator and a sequence of key-value pairs for objects.
//! type ArrayIter = std::iter::Empty<ReferenceValue<'a, Self>>;
//! type ObjectIter = std::iter::Empty<(&'a str, ReferenceValue<'a, Self>)>;
//!
//! // The ReferenceValue which Tantivy can use.
//! fn as_value(&self) -> ReferenceValue<'a, Self> {
//! // We can support any type that Tantivy itself supports.
//! match self {
//! MyCustomValue::String(val) => ReferenceValue::Str(val),
//! MyCustomValue::Float(val) => ReferenceValue::F64(*val),
//! MyCustomValue::Bool(val) => ReferenceValue::Bool(*val),
//! }
//! }
//!
//! }
//! ```
//!
//! TODO: Complete this section...
mod de;
mod default_doc_type;
mod existing_type_impls;
mod se;
use std::fmt::Debug;
use std::mem;
use std::net::Ipv6Addr;
pub(crate) use self::de::BinaryDocumentDeserializer;
pub use self::de::{
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess,
ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor,
};
pub use self::default_doc_type::{DocParsingError, TantivyDocument};
pub(crate) use self::se::BinaryDocumentSerializer;
use super::*;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
/// The core trait representing a document within the index.
pub trait Document: DocumentDeserialize + Send + Sync + 'static {
/// The value of the field.
type Value<'a>: DocValue<'a> + Clone
where Self: 'a;
/// The iterator over all of the fields and values within the doc.
type FieldsValuesIter<'a>: Iterator<Item = (Field, Self::Value<'a>)>
where Self: 'a;
/// Get an iterator iterating over all fields and values in a document.
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_>;
/// Sort and groups the field_values by field.
///
/// The result of this method is not cached and is
/// computed on the fly when this method is called.
fn get_sorted_field_values(&self) -> Vec<(Field, Vec<Self::Value<'_>>)> {
let mut field_values: Vec<(Field, Self::Value<'_>)> =
self.iter_fields_and_values().collect();
field_values.sort_by_key(|(field, _)| *field);
let mut field_values_it = field_values.into_iter();
let first_field_value = if let Some(first_field_value) = field_values_it.next() {
first_field_value
} else {
return Vec::new();
};
let mut grouped_field_values = vec![];
let mut current_field = first_field_value.0;
let mut current_group = vec![first_field_value.1];
for (field, value) in field_values_it {
if field == current_field {
current_group.push(value);
} else {
grouped_field_values
.push((current_field, mem::replace(&mut current_group, vec![value])));
current_field = field;
}
}
grouped_field_values.push((current_field, current_group));
grouped_field_values
}
}
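// A quick illustration of what the default `get_sorted_field_values` above returns
// (field ids and values are invented for the example): given a document whose iterator
// yields (Field(1), "b"), (Field(0), "a1"), (Field(0), "a2"), the default implementation
// sorts by field and groups the values into
// vec![(Field(0), vec!["a1", "a2"]), (Field(1), vec!["b"])].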
/// A single field value.
pub trait DocValue<'a>: Send + Sync + Debug {
/// The child value type returned by this doc value.
type ChildValue: DocValue<'a>;
/// The iterator for walking through the elements within the array.
type ArrayIter: Iterator<Item = ReferenceValue<'a, Self::ChildValue>>;
/// The iterator walking through the key-value pairs within
/// the object.
type ObjectIter: Iterator<Item = (&'a str, ReferenceValue<'a, Self::ChildValue>)>;
/// Returns the field value represented by an enum which borrows its data.
fn as_value(&self) -> ReferenceValue<'a, Self>;
#[inline]
/// Returns if the value is `null` or not.
fn is_null(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Null)
}
#[inline]
/// If the Value is a String, returns the associated str. Returns None otherwise.
fn as_str(&self) -> Option<&'a str> {
if let ReferenceValue::Str(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a u64, returns the associated u64. Returns None otherwise.
fn as_u64(&self) -> Option<u64> {
if let ReferenceValue::U64(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is an i64, returns the associated i64. Returns None otherwise.
fn as_i64(&self) -> Option<i64> {
if let ReferenceValue::I64(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is an f64, returns the associated f64. Returns None otherwise.
fn as_f64(&self) -> Option<f64> {
if let ReferenceValue::F64(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a datetime, returns the associated datetime. Returns None otherwise.
fn as_datetime(&self) -> Option<DateTime> {
if let ReferenceValue::Date(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is an IP address, returns the associated IP. Returns None otherwise.
fn as_ip_addr(&self) -> Option<Ipv6Addr> {
if let ReferenceValue::IpAddr(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a bool, returns the associated bool. Returns None otherwise.
fn as_bool(&self) -> Option<bool> {
if let ReferenceValue::Bool(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
/// otherwise.
fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
if let ReferenceValue::PreTokStr(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a bytes value, returns the associated set of bytes. Returns None otherwise.
fn as_bytes(&self) -> Option<&'a [u8]> {
if let ReferenceValue::Bytes(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
fn as_facet(&self) -> Option<&'a Facet> {
if let ReferenceValue::Facet(val) = self.as_value() {
Some(val)
} else {
None
}
}
#[inline]
/// Returns true if the Value is an array.
fn is_array(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Array(_))
}
#[inline]
/// Returns true if the Value is an object.
fn is_object(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
}
/// An enum representing a value for Tantivy to index.
pub enum ReferenceValue<'a, V>
where V: DocValue<'a> + ?Sized
{
/// A null value.
Null,
/// The str type is used for any text information.
Str(&'a str),
/// Unsigned 64-bits Integer `u64`
U64(u64),
/// Signed 64-bits Integer `i64`
I64(i64),
/// 64-bits Float `f64`
F64(f64),
/// Date/time with nanoseconds precision
Date(DateTime),
/// Facet
Facet(&'a Facet),
/// Arbitrarily sized byte array
Bytes(&'a [u8]),
/// IPv6 address. Internally there is no IPv4; addresses need to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
/// Bool value
Bool(bool),
/// Pre-tokenized str type.
PreTokStr(&'a PreTokenizedString),
/// An array containing multiple values.
Array(V::ArrayIter),
/// A nested / dynamic object.
Object(V::ObjectIter),
}
impl<'a, V> ReferenceValue<'a, V>
where V: DocValue<'a>
{
#[inline]
/// Returns if the value is `null` or not.
pub fn is_null(&self) -> bool {
matches!(self, Self::Null)
}
#[inline]
/// If the Value is a String, returns the associated str. Returns None otherwise.
pub fn as_str(&self) -> Option<&'a str> {
if let Self::Str(val) = self {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a u64, returns the associated u64. Returns None otherwise.
pub fn as_u64(&self) -> Option<u64> {
if let Self::U64(val) = self {
Some(*val)
} else {
None
}
}
#[inline]
/// If the Value is an i64, returns the associated i64. Returns None otherwise.
pub fn as_i64(&self) -> Option<i64> {
if let Self::I64(val) = self {
Some(*val)
} else {
None
}
}
#[inline]
/// If the Value is an f64, returns the associated f64. Returns None otherwise.
pub fn as_f64(&self) -> Option<f64> {
if let Self::F64(val) = self {
Some(*val)
} else {
None
}
}
#[inline]
/// If the Value is a datetime, returns the associated datetime. Returns None otherwise.
pub fn as_datetime(&self) -> Option<DateTime> {
if let Self::Date(val) = self {
Some(*val)
} else {
None
}
}
#[inline]
/// If the Value is an IP address, returns the associated IP. Returns None otherwise.
pub fn as_ip_addr(&self) -> Option<Ipv6Addr> {
if let Self::IpAddr(val) = self {
Some(*val)
} else {
None
}
}
#[inline]
/// If the Value is a bool, returns the associated bool. Returns None otherwise.
pub fn as_bool(&self) -> Option<bool> {
if let Self::Bool(val) = self {
Some(*val)
} else {
None
}
}
#[inline]
/// If the Value is a pre-tokenized string, returns the associated string. Returns None
/// otherwise.
pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
if let Self::PreTokStr(val) = self {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a bytes value, returns the associated set of bytes. Returns None otherwise.
pub fn as_bytes(&self) -> Option<&'a [u8]> {
if let Self::Bytes(val) = self {
Some(val)
} else {
None
}
}
#[inline]
/// If the Value is a facet, returns the associated facet. Returns None otherwise.
pub fn as_facet(&self) -> Option<&'a Facet> {
if let Self::Facet(val) = self {
Some(val)
} else {
None
}
}
#[inline]
/// Returns true if the Value is an array.
pub fn is_array(&self) -> bool {
matches!(self, Self::Array(_))
}
#[inline]
/// Returns true if the Value is an object.
pub fn is_object(&self) -> bool {
matches!(self, Self::Object(_))
}
}
pub(crate) mod type_codes {
pub const TEXT_CODE: u8 = 0;
pub const U64_CODE: u8 = 1;
pub const I64_CODE: u8 = 2;
pub const HIERARCHICAL_FACET_CODE: u8 = 3;
pub const BYTES_CODE: u8 = 4;
pub const DATE_CODE: u8 = 5;
pub const F64_CODE: u8 = 6;
pub const EXT_CODE: u8 = 7;
// Replaced by the `OBJECT_CODE`.
// -- pub const JSON_OBJ_CODE: u8 = 8;
pub const BOOL_CODE: u8 = 9;
pub const IP_CODE: u8 = 10;
pub const NULL_CODE: u8 = 11;
pub const ARRAY_CODE: u8 = 12;
pub const OBJECT_CODE: u8 = 13;
// Extended type codes
pub const TOK_STR_EXT_CODE: u8 = 0;
}
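Because every value surfaces through `as_value()` as a `ReferenceValue`, code that is generic over the document type can handle any value with a single match. A minimal, hypothetical helper sketching that pattern (not part of this PR):

    use tantivy::schema::document::{DocValue, ReferenceValue};

    // Illustrative only: renders any `DocValue` as a short description.
    fn describe<'a, V: DocValue<'a>>(value: &V) -> String {
        match value.as_value() {
            ReferenceValue::Null => "null".to_string(),
            ReferenceValue::Str(text) => format!("str: {text}"),
            ReferenceValue::U64(num) => format!("u64: {num}"),
            ReferenceValue::I64(num) => format!("i64: {num}"),
            ReferenceValue::F64(num) => format!("f64: {num}"),
            ReferenceValue::Bool(flag) => format!("bool: {flag}"),
            ReferenceValue::Date(date) => format!("date: {date:?}"),
            ReferenceValue::Facet(facet) => format!("facet: {facet:?}"),
            ReferenceValue::Bytes(bytes) => format!("{} raw bytes", bytes.len()),
            ReferenceValue::IpAddr(addr) => format!("ip: {addr}"),
            ReferenceValue::PreTokStr(pre_tok) => format!("pre-tokenized: {}", pre_tok.text),
            ReferenceValue::Array(elements) => format!("array with {} elements", elements.count()),
            ReferenceValue::Object(entries) => format!("object with {} entries", entries.count()),
        }
    }

With the impls in existing_type_impls.rs, the same helper works for `&serde_json::Value` and `&tantivy::schema::Value` alike.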

760 src/schema/document/se.rs (new file)
View File

@@ -0,0 +1,760 @@
use std::borrow::Cow;
use std::io;
use std::io::Write;
use columnar::MonotonicallyMappableToU128;
use common::{f64_to_u64, BinarySerializable, VInt};
use crate::schema::document::{type_codes, DocValue, Document, ReferenceValue};
use crate::schema::Schema;
/// A serializer writing documents which implement [`Document`] to a provided writer.
pub struct BinaryDocumentSerializer<'se, W> {
writer: &'se mut W,
schema: &'se Schema,
}
impl<'se, W> BinaryDocumentSerializer<'se, W>
where W: Write
{
/// Creates a new serializer with a provided writer.
pub(crate) fn new(writer: &'se mut W, schema: &'se Schema) -> Self {
Self { writer, schema }
}
/// Attempts to serialize a given document and write the output
/// to the writer.
pub(crate) fn serialize_doc<D>(&mut self, doc: &D) -> io::Result<()>
where D: Document {
let stored_field_values = || {
doc.iter_fields_and_values()
.filter(|(field, _)| self.schema.get_field_entry(*field).is_stored())
};
let num_field_values = stored_field_values().count();
let mut actual_length = 0;
VInt(num_field_values as u64).serialize(self.writer)?;
for (field, value_access) in stored_field_values() {
field.serialize(self.writer)?;
let mut serializer = BinaryValueSerializer::new(self.writer);
match value_access.as_value() {
ReferenceValue::PreTokStr(pre_tokenized_text) => {
serializer.serialize_value(ReferenceValue::Str::<&'_ crate::schema::Value>(
&pre_tokenized_text.text,
))?;
}
_ => {
serializer.serialize_value(value_access.as_value())?;
}
}
actual_length += 1;
}
if num_field_values != actual_length {
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unexpected number of entries written to serializer, expected {} entries, got \
{} entries",
num_field_values, actual_length,
),
));
}
Ok(())
}
}
/// A serializer for a single value.
pub struct BinaryValueSerializer<'se, W> {
writer: &'se mut W,
}
impl<'se, W> BinaryValueSerializer<'se, W>
where W: Write
{
/// Creates a new serializer with a provided writer.
pub(crate) fn new(writer: &'se mut W) -> Self {
Self { writer }
}
/// Attempts to serialize a given value and write the output
/// to the writer.
pub(crate) fn serialize_value<'a, V>(
&mut self,
value: ReferenceValue<'a, V>,
) -> io::Result<()>
where
V: DocValue<'a>,
{
match value {
ReferenceValue::Null => self.write_type_code(type_codes::NULL_CODE),
ReferenceValue::Str(val) => {
self.write_type_code(type_codes::TEXT_CODE)?;
let temp_val = Cow::Borrowed(val);
temp_val.serialize(self.writer)
}
ReferenceValue::U64(val) => {
self.write_type_code(type_codes::U64_CODE)?;
val.serialize(self.writer)
}
ReferenceValue::I64(val) => {
self.write_type_code(type_codes::I64_CODE)?;
val.serialize(self.writer)
}
ReferenceValue::F64(val) => {
self.write_type_code(type_codes::F64_CODE)?;
f64_to_u64(val).serialize(self.writer)
}
ReferenceValue::Date(val) => {
self.write_type_code(type_codes::DATE_CODE)?;
val.serialize(self.writer)
}
ReferenceValue::Facet(val) => {
self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;
val.serialize(self.writer)
}
ReferenceValue::Bytes(val) => {
self.write_type_code(type_codes::BYTES_CODE)?;
let temp_val = Cow::Borrowed(val);
temp_val.serialize(self.writer)
}
ReferenceValue::IpAddr(val) => {
self.write_type_code(type_codes::IP_CODE)?;
val.to_u128().serialize(self.writer)
}
ReferenceValue::Bool(val) => {
self.write_type_code(type_codes::BOOL_CODE)?;
val.serialize(self.writer)
}
ReferenceValue::PreTokStr(val) => {
self.write_type_code(type_codes::EXT_CODE)?;
self.write_type_code(type_codes::TOK_STR_EXT_CODE)?;
val.serialize(self.writer)
}
ReferenceValue::Array(elements) => {
self.write_type_code(type_codes::ARRAY_CODE)?;
// Somewhat unfortunate that we do this here; however, writing the
// length at the end complicates things quite considerably.
let elements: Vec<ReferenceValue<'_, V::ChildValue>> = elements.collect();
let mut serializer = BinaryArraySerializer::begin(elements.len(), self.writer)?;
for value in elements {
serializer.serialize_value(value)?;
}
serializer.end()
}
ReferenceValue::Object(object) => {
self.write_type_code(type_codes::OBJECT_CODE)?;
// Somewhat unfortunate that we do this here; however, writing the
// length at the end complicates things quite considerably.
let entries: Vec<(&str, ReferenceValue<'_, V::ChildValue>)> = object.collect();
let mut serializer = BinaryObjectSerializer::begin(entries.len(), self.writer)?;
for (key, value) in entries {
serializer.serialize_entry(key, value)?;
}
serializer.end()
}
}
}
fn write_type_code(&mut self, code: u8) -> io::Result<()> {
code.serialize(self.writer)
}
}
/// A serializer for writing a sequence of values to a writer.
pub struct BinaryArraySerializer<'se, W> {
writer: &'se mut W,
expected_length: usize,
actual_length: usize,
}
impl<'se, W> BinaryArraySerializer<'se, W>
where W: Write
{
/// Creates a new array serializer and writes the length of the array to the writer.
pub(crate) fn begin(length: usize, writer: &'se mut W) -> io::Result<Self> {
VInt(length as u64).serialize(writer)?;
Ok(Self {
writer,
expected_length: length,
actual_length: 0,
})
}
/// Attempts to serialize a given value and write the output
/// to the writer.
pub(crate) fn serialize_value<'a, V>(
&mut self,
value: ReferenceValue<'a, V>,
) -> io::Result<()>
where
V: DocValue<'a>,
{
let mut serializer = BinaryValueSerializer::new(self.writer);
serializer.serialize_value(value)?;
self.actual_length += 1;
Ok(())
}
/// Finishes writing the array to the writer and validates it.
pub(crate) fn end(self) -> io::Result<()> {
if self.expected_length != self.actual_length {
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unexpected number of entries written to serializer, expected {} entries, got \
{} entries",
self.expected_length, self.actual_length,
),
));
}
Ok(())
}
}
/// A serializer for writing a set of key-value pairs to a writer.
pub struct BinaryObjectSerializer<'se, W> {
inner: BinaryArraySerializer<'se, W>,
expected_length: usize,
actual_length: usize,
}
impl<'se, W> BinaryObjectSerializer<'se, W>
where W: Write
{
/// Creates a new object serializer and writes the length of the object to the writer.
pub(crate) fn begin(length: usize, writer: &'se mut W) -> io::Result<Self> {
// We mul by 2 here to count the keys and values separately:
// [("a", 1), ("b", 2)] is actually stored as ["a", 1, "b", 2]
let inner = BinaryArraySerializer::begin(length * 2, writer)?;
Ok(Self {
inner,
expected_length: length,
actual_length: 0,
})
}
/// Attempts to serialize a given value and write the output
/// to the writer.
pub(crate) fn serialize_entry<'a, V>(
&mut self,
key: &'a str,
value: ReferenceValue<'a, V>,
) -> io::Result<()>
where
V: DocValue<'a>,
{
// Keys and values are stored inline with one another.
// Technically this isn't the *most* optimal way of storing the objects
// as we could avoid writing the extra byte per key. But the gain is
// largely not worth it for the extra complexity it brings.
self.inner
.serialize_value(ReferenceValue::<'a, V>::Str(key))?;
self.inner.serialize_value(value)?;
self.actual_length += 1;
Ok(())
}
/// Finishes writing the array to the writer and validates it.
pub(crate) fn end(self) -> io::Result<()> {
if self.expected_length != self.actual_length {
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unexpected number of entries written to serializer, expected {} entries, got \
{} entries",
self.expected_length, self.actual_length,
),
));
}
// This should never fail if the above statement is valid.
self.inner.end()?;
Ok(())
}
}
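// To make the length-doubling concrete: a sketch (not part of this file) of the byte
// stream the object serializer above produces for the JSON object {"a": 1}; exact
// integer widths follow the `BinarySerializable` impls in the common crate.
//
//   OBJECT_CODE (13)         type code written by BinaryValueSerializer
//   VInt(2)                  1 key + 1 value stored inline, hence `length * 2`
//   TEXT_CODE (0)            the key "a" is written like an ordinary string value
//   <VInt len><utf-8 "a">    length-prefixed key bytes
//   U64_CODE (1)             type code of the value
//   <u64 1>                  the value itself as a fixed-width integer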
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use common::DateTime;
use serde_json::Number;
use tokenizer_api::Token;
use super::*;
use crate::schema::document::existing_type_impls::{JsonArrayIter, JsonObjectIter};
use crate::schema::{Facet, Field, FAST, STORED, TEXT};
use crate::tokenizer::PreTokenizedString;
fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
let mut writer = Vec::new();
let mut serializer = BinaryValueSerializer::new(&mut writer);
serializer.serialize_value(value).expect("Serialize value");
writer
}
/// A macro for defining the expected binary representation
/// of the serialized values in a somewhat human readable way.
macro_rules! binary_repr {
($( $type_code:expr $(, $ext_code:expr)? => $value:expr $(,)?)*) => {{
let mut writer = Vec::new();
$(
$type_code.serialize(&mut writer).unwrap();
$(
$ext_code.serialize(&mut writer).unwrap();
)?
$value.serialize(&mut writer).unwrap();
)*
writer
}};
(collection $code:expr, length $len:expr, $( $type_code:expr $(, $ext_code:expr)? => $value:expr $(,)?)*) => {{
let mut writer = Vec::new();
$code.serialize(&mut writer).unwrap();
VInt($len as u64).serialize(&mut writer).unwrap();
$(
$type_code.serialize(&mut writer).unwrap();
$(
$ext_code.serialize(&mut writer).unwrap();
)?
$value.serialize(&mut writer).unwrap();
)*
writer
}};
}
#[test]
fn test_simple_value_serialize() {
let result = serialize_value(ReferenceValue::Null);
let expected = binary_repr!(
type_codes::NULL_CODE => (),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let result = serialize_value(ReferenceValue::Str("hello, world"));
let expected = binary_repr!(
type_codes::TEXT_CODE => String::from("hello, world"),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let result = serialize_value(ReferenceValue::U64(123));
let expected = binary_repr!(
type_codes::U64_CODE => 123u64,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let result = serialize_value(ReferenceValue::I64(-123));
let expected = binary_repr!(
type_codes::I64_CODE => -123i64,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let result = serialize_value(ReferenceValue::F64(123.3845));
let expected = binary_repr!(
type_codes::F64_CODE => f64_to_u64(123.3845f64),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let result = serialize_value(ReferenceValue::Bool(false));
let expected = binary_repr!(
type_codes::BOOL_CODE => false,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let result = serialize_value(ReferenceValue::Date(DateTime::MAX));
let expected = binary_repr!(
type_codes::DATE_CODE => DateTime::MAX,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let facet = Facet::from_text("/hello/world").unwrap();
let result = serialize_value(ReferenceValue::Facet(&facet));
let expected = binary_repr!(
type_codes::HIERARCHICAL_FACET_CODE => Facet::from_text("/hello/world").unwrap(),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let pre_tok_str = PreTokenizedString {
text: "hello, world".to_string(),
tokens: vec![Token::default(), Token::default()],
};
let result = serialize_value(ReferenceValue::PreTokStr(&pre_tok_str));
let expected = binary_repr!(
type_codes::EXT_CODE, type_codes::TOK_STR_EXT_CODE => pre_tok_str,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
}
#[test]
fn test_array_serialize() {
let elements = vec![serde_json::Value::Null, serde_json::Value::Null];
let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
let expected = binary_repr!(
collection type_codes::ARRAY_CODE,
length elements.len(),
type_codes::NULL_CODE => (),
type_codes::NULL_CODE => (),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let elements = vec![
serde_json::Value::String("Hello, world".into()),
serde_json::Value::String("Some demo".into()),
];
let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
let expected = binary_repr!(
collection type_codes::ARRAY_CODE,
length elements.len(),
type_codes::TEXT_CODE => String::from("Hello, world"),
type_codes::TEXT_CODE => String::from("Some demo"),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let elements = vec![];
let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
let expected = binary_repr!(
collection type_codes::ARRAY_CODE,
length elements.len(),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let elements = vec![
serde_json::Value::Null,
serde_json::Value::String("Hello, world".into()),
serde_json::Value::Number(12345.into()),
];
let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
let expected = binary_repr!(
collection type_codes::ARRAY_CODE,
length elements.len(),
type_codes::NULL_CODE => (),
type_codes::TEXT_CODE => String::from("Hello, world"),
type_codes::U64_CODE => 12345u64,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
}
#[test]
fn test_object_serialize() {
let mut object = serde_json::Map::new();
object.insert(
"my-first-key".into(),
serde_json::Value::String("Hello".into()),
);
object.insert("my-second-key".into(), serde_json::Value::Null);
object.insert(
"my-third-key".into(),
serde_json::Value::Number(Number::from_f64(123.0).unwrap()),
);
let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
let expected = binary_repr!(
collection type_codes::OBJECT_CODE,
length object.len() * 2, // To account for keys counting towards the length
type_codes::TEXT_CODE => String::from("my-first-key"),
type_codes::TEXT_CODE => String::from("Hello"),
type_codes::TEXT_CODE => String::from("my-second-key"),
type_codes::NULL_CODE => (),
type_codes::TEXT_CODE => String::from("my-third-key"),
type_codes::F64_CODE => f64_to_u64(123.0),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let object = serde_json::Map::new();
let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
let expected = binary_repr!(
collection type_codes::OBJECT_CODE,
length object.len(),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
let mut object = serde_json::Map::new();
object.insert("my-first-key".into(), serde_json::Value::Null);
object.insert("my-second-key".into(), serde_json::Value::Null);
object.insert("my-third-key".into(), serde_json::Value::Null);
let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
let expected = binary_repr!(
collection type_codes::OBJECT_CODE,
length object.len() * 2, // To account for keys counting towards the length
type_codes::TEXT_CODE => String::from("my-first-key"),
type_codes::NULL_CODE => (),
type_codes::TEXT_CODE => String::from("my-second-key"),
type_codes::NULL_CODE => (),
type_codes::TEXT_CODE => String::from("my-third-key"),
type_codes::NULL_CODE => (),
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
}
#[test]
fn test_nested_serialize() {
let mut object = serde_json::Map::new();
object.insert(
"my-array".into(),
serde_json::Value::Array(vec![
serde_json::Value::Null,
serde_json::Value::String(String::from("bobby of the sea")),
]),
);
object.insert(
"my-object".into(),
serde_json::Value::Object(
vec![
(
"inner-1".to_string(),
serde_json::Value::Number((-123i64).into()),
),
(
"inner-2".to_string(),
serde_json::Value::String(String::from("bobby of the sea 2")),
),
]
.into_iter()
.collect(),
),
);
let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
let mut expected = Vec::new();
let header = binary_repr!(
collection type_codes::OBJECT_CODE,
length object.len() * 2,
);
expected.extend_from_slice(&header);
expected
.extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("my-array")));
let nested_array = binary_repr!(
collection type_codes::ARRAY_CODE,
length 2,
type_codes::NULL_CODE => (),
type_codes::TEXT_CODE => String::from("bobby of the sea"),
);
expected.extend_from_slice(&nested_array);
expected
.extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("my-object")));
let nested_object = binary_repr!(
collection type_codes::OBJECT_CODE,
length 4, // 2 keys, 2 values
type_codes::TEXT_CODE => String::from("inner-1"),
type_codes::I64_CODE => -123i64,
type_codes::TEXT_CODE => String::from("inner-2"),
type_codes::TEXT_CODE => String::from("bobby of the sea 2"),
);
expected.extend_from_slice(&nested_object);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
// Some more extreme nesting that might behave weirdly
let mut object = serde_json::Map::new();
object.insert(
"my-array".into(),
serde_json::Value::Array(vec![serde_json::Value::Array(vec![
serde_json::Value::Array(vec![]),
serde_json::Value::Array(vec![serde_json::Value::Null]),
])]),
);
let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
let mut expected = Vec::new();
let header = binary_repr!(
collection type_codes::OBJECT_CODE,
length object.len() * 2,
);
expected.extend_from_slice(&header);
expected
.extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("my-array")));
let nested_array = binary_repr!(
collection type_codes::ARRAY_CODE,
length 1,
);
expected.extend_from_slice(&nested_array);
let nested_array = binary_repr!(
collection type_codes::ARRAY_CODE,
length 2,
);
expected.extend_from_slice(&nested_array);
let nested_array = binary_repr!(
collection type_codes::ARRAY_CODE,
length 0,
);
expected.extend_from_slice(&nested_array);
let nested_array = binary_repr!(
collection type_codes::ARRAY_CODE,
length 1,
type_codes::NULL_CODE => (),
);
expected.extend_from_slice(&nested_array);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);
}
fn serialize_doc<D: Document>(doc: &D, schema: &Schema) -> Vec<u8> {
let mut writer = Vec::new();
let mut serializer = BinaryDocumentSerializer::new(&mut writer, schema);
serializer.serialize_doc(doc).expect("Serialize value");
writer
}
/// A helper macro for generating the expected binary representation of the document.
macro_rules! expected_doc_data {
(length $len:expr) => {{
let mut writer = Vec::new();
VInt($len as u64).serialize(&mut writer).unwrap();
writer
}};
(length $len:expr, $( $field_id:expr => $value:expr $(,)?)*) => {{
let mut writer = Vec::new();
VInt($len as u64).serialize(&mut writer).unwrap();
$(
$field_id.serialize(&mut writer).unwrap();
$value.serialize(&mut writer).unwrap();
)*
writer
}};
}
#[test]
fn test_document_serialize() {
let mut builder = Schema::builder();
let name = builder.add_text_field("name", TEXT | STORED);
let age = builder.add_u64_field("age", FAST | STORED);
let schema = builder.build();
let mut document = BTreeMap::new();
document.insert(name, crate::schema::Value::Str("ChillFish8".into()));
document.insert(age, crate::schema::Value::U64(20));
let result = serialize_doc(&document, &schema);
let mut expected = expected_doc_data!(length document.len());
name.serialize(&mut expected).unwrap();
expected
.extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("ChillFish8")));
age.serialize(&mut expected).unwrap();
expected.extend_from_slice(&binary_repr!(type_codes::U64_CODE => 20u64));
assert_eq!(
result, expected,
"Expected serialized document to match the binary representation"
);
let mut builder = Schema::builder();
let name = builder.add_text_field("name", TEXT | STORED);
// This should be skipped when serializing.
let age = builder.add_u64_field("age", FAST);
let schema = builder.build();
let mut document = BTreeMap::new();
document.insert(name, crate::schema::Value::Str("ChillFish8".into()));
document.insert(age, crate::schema::Value::U64(20));
let result = serialize_doc(&document, &schema);
let mut expected = expected_doc_data!(length 1);
name.serialize(&mut expected).unwrap();
expected
.extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("ChillFish8")));
assert_eq!(
result, expected,
"Expected serialized document to match the binary representation"
);
let builder = Schema::builder();
let schema = builder.build();
let document = BTreeMap::<Field, crate::schema::Value>::new();
let result = serialize_doc(&document, &schema);
let expected = expected_doc_data!(length document.len());
assert_eq!(
result, expected,
"Expected serialized document to match the binary representation"
);
}
}

View File

@@ -481,7 +481,7 @@ impl FieldType {
})
}
}
FieldType::JsonObject(_) => Ok(Value::JsonObject(json_map)),
FieldType::JsonObject(_) => Ok(Value::from(json_map)),
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: JsonValue::Object(json_map),
@@ -538,27 +538,27 @@ mod tests {
use crate::schema::{NumericOptions, Schema, TextOptions, Type, Value, COERCE, INDEXED};
use crate::time::{Date, Month, PrimitiveDateTime, Time};
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{DateTime, Document};
use crate::{DateTime, TantivyDocument};
#[test]
fn test_to_string_coercion() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("id", COERCE);
let schema = schema_builder.build();
let doc = schema.parse_document(r#"{"id": 100}"#).unwrap();
let doc = TantivyDocument::parse_json(&schema, r#"{"id": 100}"#).unwrap();
assert_eq!(
&Value::Str("100".to_string()),
doc.get_first(text_field).unwrap()
);
let doc = schema.parse_document(r#"{"id": true}"#).unwrap();
let doc = TantivyDocument::parse_json(&schema, r#"{"id": true}"#).unwrap();
assert_eq!(
&Value::Str("true".to_string()),
doc.get_first(text_field).unwrap()
);
// Not sure if this null coercion is the best approach
let doc = schema.parse_document(r#"{"id": null}"#).unwrap();
let doc = TantivyDocument::parse_json(&schema, r#"{"id": null}"#).unwrap();
assert_eq!(
&Value::Str("null".to_string()),
doc.get_first(text_field).unwrap()
@@ -573,7 +573,7 @@ mod tests {
let f64_field = schema_builder.add_f64_field("f64", COERCE);
let schema = schema_builder.build();
let doc_json = r#"{"i64": "100", "u64": "100", "f64": "100"}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&Value::I64(100), doc.get_first(i64_field).unwrap());
assert_eq!(&Value::U64(100), doc.get_first(u64_field).unwrap());
assert_eq!(&Value::F64(100.0), doc.get_first(f64_field).unwrap());
@@ -585,11 +585,11 @@ mod tests {
let bool_field = schema_builder.add_bool_field("bool", COERCE);
let schema = schema_builder.build();
let doc_json = r#"{"bool": "true"}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&Value::Bool(true), doc.get_first(bool_field).unwrap());
let doc_json = r#"{"bool": "false"}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&Value::Bool(false), doc.get_first(bool_field).unwrap());
}
@@ -600,20 +600,17 @@ mod tests {
schema_builder.add_u64_field("u64", NumericOptions::default());
schema_builder.add_f64_field("f64", NumericOptions::default());
let schema = schema_builder.build();
assert!(schema
.parse_document(r#"{"u64": "100"}"#)
assert!(TantivyDocument::parse_json(&schema, r#"{"u64": "100"}"#)
.unwrap_err()
.to_string()
.contains("a u64"));
assert!(schema
.parse_document(r#"{"i64": "100"}"#)
assert!(TantivyDocument::parse_json(&schema, r#"{"i64": "100"}"#)
.unwrap_err()
.to_string()
.contains("a i64"));
assert!(schema
.parse_document(r#"{"f64": "100"}"#)
assert!(TantivyDocument::parse_json(&schema, r#"{"f64": "100"}"#)
.unwrap_err()
.to_string()
.contains("a f64"));
@@ -625,7 +622,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date", INDEXED);
let schema = schema_builder.build();
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let date = doc.get_first(date_field).unwrap();
// Time zone is converted to UTC
assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
@@ -633,7 +630,7 @@ mod tests {
#[test]
fn test_serialize_json_date() {
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", INDEXED);
let schema = schema_builder.build();
@@ -641,7 +638,7 @@ mod tests {
let naive_time = Time::from_hms(13, 20, 0).unwrap();
let date_time = PrimitiveDateTime::new(naive_date, naive_time);
doc.add_date(date_field, DateTime::from_primitive(date_time));
let doc_json = schema.to_json(&doc);
let doc_json = doc.to_json(&schema);
assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00Z"]}"#);
}
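
The coercion and date tests above exercise the relocated JSON APIs. A minimal usage sketch, assuming the crate-root `TantivyDocument` re-export introduced by this change and the array-wrapped JSON rendering shown in the date test:

```rust
#[test]
fn parse_json_and_to_json_sketch() {
    use tantivy::schema::{Schema, Value, COERCE};
    use tantivy::TantivyDocument;

    let mut schema_builder = Schema::builder();
    let id = schema_builder.add_text_field("id", COERCE);
    let schema = schema_builder.build();

    // Parsing now goes through the document type instead of the schema.
    let doc = TantivyDocument::parse_json(&schema, r#"{"id": 100}"#).unwrap();
    assert_eq!(doc.get_first(id), Some(&Value::Str("100".to_string())));

    // JSON rendering moved the same way: the document serializes itself against a
    // schema, with values wrapped in arrays as in the date test above.
    assert_eq!(doc.to_json(&schema), r#"{"id":["100"]}"#);
}
```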

View File

@@ -1,7 +1,3 @@
use std::io::{self, Read, Write};
use common::BinarySerializable;
use crate::schema::{Field, Value};
/// `FieldValue` holds together a `Field` and its `Value`.
@@ -35,15 +31,16 @@ impl From<FieldValue> for Value {
}
}
impl BinarySerializable for FieldValue {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
self.field.serialize(writer)?;
self.value.serialize(writer)
}
/// A helper wrapper for creating standard iterators
/// out of a document's field values.
pub struct FieldValueIter<'a>(pub(crate) std::slice::Iter<'a, FieldValue>);
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let field = Field::deserialize(reader)?;
let value = Value::deserialize(reader)?;
Ok(FieldValue { field, value })
impl<'a> Iterator for FieldValueIter<'a> {
type Item = (Field, &'a Value);
fn next(&mut self) -> Option<Self::Item> {
self.0
.next()
.map(|field_value| (field_value.field, &field_value.value))
}
}
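
`FieldValueIter` backs the `(Field, &Value)` pairs a `TantivyDocument` yields when walked through the new `Document` trait. A minimal sketch, assuming the `Document` and `DocValue` re-exports and the `iter_fields_and_values`/`as_str` accessor names shown elsewhere in this diff:

```rust
#[test]
fn iterate_field_values_sketch() {
    use tantivy::schema::{DocValue, Document, Schema, STORED, TEXT};
    use tantivy::TantivyDocument;

    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let mut doc = TantivyDocument::new();
    doc.add_text(title, "hello");

    // `iter_fields_and_values` is a `Document` trait method; for `TantivyDocument`
    // it is backed by `FieldValueIter` and yields `(Field, &Value)` pairs.
    for (field, value) in doc.iter_fields_and_values() {
        if let Some(text) = value.as_str() {
            println!("{} = {}", schema.get_field_name(field), text);
        }
    }
}
```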

View File

@@ -106,7 +106,7 @@
//! let schema = schema_builder.build();
//! ```
mod document;
pub mod document;
mod facet;
mod facet_options;
mod schema;
@@ -134,7 +134,7 @@ pub use self::bytes_options::BytesOptions;
#[allow(deprecated)]
pub use self::date_time_options::DatePrecision;
pub use self::date_time_options::{DateOptions, DateTimePrecision, DATE_TIME_PRECISION_INDEXED};
pub use self::document::Document;
pub use self::document::{DocParsingError, DocValue, Document, TantivyDocument};
pub(crate) use self::facet::FACET_SEP_BYTE;
pub use self::facet::{Facet, FacetParseError};
pub use self::facet_options::FacetOptions;
@@ -150,7 +150,7 @@ pub use self::named_field_document::NamedFieldDocument;
#[allow(deprecated)]
pub use self::numeric_options::IntOptions;
pub use self::numeric_options::NumericOptions;
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
pub use self::schema::{Schema, SchemaBuilder};
pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;

View File

@@ -1,16 +1,14 @@
use std::collections::{BTreeMap, HashMap};
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
use serde::de::{SeqAccess, Visitor};
use serde::ser::SerializeSeq;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::{self, Value as JsonValue};
use super::ip_options::IpAddrOptions;
use super::*;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::field_type::ValueParsingError;
use crate::TantivyError;
/// Tantivy has a very strict schema.
@@ -317,78 +315,6 @@ impl Schema {
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))
}
/// Create document from a named doc.
pub fn convert_named_doc(
&self,
named_doc: NamedFieldDocument,
) -> Result<Document, DocParsingError> {
let mut document = Document::new();
for (field_name, values) in named_doc.0 {
if let Ok(field) = self.get_field(&field_name) {
for value in values {
document.add_field_value(field, value);
}
}
}
Ok(document)
}
/// Create a named document from the doc.
pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
let mut field_map = BTreeMap::new();
for (field, field_values) in doc.get_sorted_field_values() {
let field_name = self.get_field_name(field);
let values: Vec<Value> = field_values.into_iter().cloned().collect();
field_map.insert(field_name.to_string(), values);
}
NamedFieldDocument(field_map)
}
/// Encode the document in JSON.
///
/// Encoding a document cannot fail.
pub fn to_json(&self, doc: &Document) -> String {
serde_json::to_string(&self.to_named_doc(doc)).expect("doc encoding failed. This is a bug")
}
/// Build a document object from a json-object.
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
let json_obj: serde_json::Map<String, JsonValue> =
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
self.json_object_to_doc(json_obj)
}
/// Build a document object from a json-object.
pub fn json_object_to_doc(
&self,
json_obj: serde_json::Map<String, JsonValue>,
) -> Result<Document, DocParsingError> {
let mut doc = Document::default();
for (field_name, json_value) in json_obj {
if let Ok(field) = self.get_field(&field_name) {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match json_value {
JsonValue::Array(json_items) => {
for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
}
}
_ => {
let value = field_type
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
}
}
}
}
Ok(doc)
}
/// Searches for a full_path in the schema, returning the field name and a JSON path.
///
/// This function works by checking if the field exists for the exact given full_path.
@@ -478,26 +404,6 @@ impl<'de> Deserialize<'de> for Schema {
}
}
/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug, Error, PartialEq)]
pub enum DocParsingError {
/// The payload given is not valid JSON.
#[error("The provided string is not valid JSON")]
InvalidJson(String),
/// One of the value nodes could not be parsed.
#[error("The field '{0:?}' could not be parsed: {1:?}")]
ValueError(String, ValueParsingError),
}
impl DocParsingError {
/// Builds a NotJson DocParsingError
fn invalid_json(invalid_json: &str) -> Self {
let sample = invalid_json.chars().take(20).collect();
DocParsingError::InvalidJson(sample)
}
}
#[cfg(test)]
mod tests {
@@ -507,6 +413,7 @@ mod tests {
use pretty_assertions::assert_eq;
use serde_json;
use crate::schema::document::DocValue;
use crate::schema::field_type::ValueParsingError;
use crate::schema::schema::DocParsingError::InvalidJson;
use crate::schema::*;
@@ -675,9 +582,9 @@ mod tests {
"ip": "127.0.0.1",
"is_read": true
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let doc_serdeser = schema.parse_document(&schema.to_json(&doc)).unwrap();
let doc_serdeser = TantivyDocument::parse_json(&schema, &doc.to_json(&schema)).unwrap();
assert_eq!(doc, doc_serdeser);
}
@@ -691,26 +598,26 @@ mod tests {
let doc_json = r#"{
"ip": "127.0.0.1"
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap();
assert_eq!(value["ip"][0], "127.0.0.1");
// Special case IpV6 loopback. We don't want to map that to IPv4
let doc_json = r#"{
"ip": "::1"
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap();
assert_eq!(value["ip"][0], "::1");
// testing ip address of every router in the world
let doc_json = r#"{
"ip": "192.168.0.1"
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap();
assert_eq!(value["ip"][0], "192.168.0.1");
}
@@ -729,9 +636,8 @@ mod tests {
"val".to_string(),
vec![Value::from(14u64), Value::from(-1i64)],
);
let doc = schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap();
let doc =
TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();
assert_eq!(
doc.get_all(title).collect::<Vec<_>>(),
vec![
@@ -753,9 +659,7 @@ mod tests {
"title".to_string(),
vec![Value::from("title1"), Value::from("title2")],
);
schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap();
TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();
}
#[test]
@@ -771,27 +675,27 @@ mod tests {
let score_field = schema_builder.add_f64_field("score", score_options);
let schema = schema_builder.build();
{
let doc = schema.parse_document("{}").unwrap();
let doc = TantivyDocument::parse_json(&schema, "{}").unwrap();
assert!(doc.field_values().is_empty());
}
{
let doc = schema
.parse_document(
r#"{
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10,
"score": 80.5
}"#,
)
.unwrap();
)
.unwrap();
assert_eq!(
doc.get_first(title_field).unwrap().as_text(),
doc.get_first(title_field).unwrap().as_str(),
Some("my title")
);
assert_eq!(
doc.get_first(author_field).unwrap().as_text(),
doc.get_first(author_field).unwrap().as_str(),
Some("fulmicoton")
);
assert_eq!(doc.get_first(count_field).unwrap().as_u64(), Some(4));
@@ -799,7 +703,8 @@ mod tests {
assert_eq!(doc.get_first(score_field).unwrap().as_f64(), Some(80.5f64));
}
{
let res = schema.parse_document(
let res = TantivyDocument::parse_json(
&schema,
r#"{
"thisfieldisnotdefinedintheschema": "my title",
"title": "my title",
@@ -813,7 +718,8 @@ mod tests {
assert!(res.is_ok());
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -832,7 +738,8 @@ mod tests {
);
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -850,7 +757,8 @@ mod tests {
);
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -868,7 +776,8 @@ mod tests {
));
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -887,11 +796,12 @@ mod tests {
}
{
// Short JSON, under the 20 char take.
let json_err = schema.parse_document(r#"{"count": 50,}"#);
let json_err = TantivyDocument::parse_json(&schema, r#"{"count": 50,}"#);
assert_matches!(json_err, Err(InvalidJson(_)));
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",

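With `parse_document`, `to_json` and `convert_named_doc` removed from `Schema`, callers construct and render documents through `TantivyDocument` instead. A minimal migration sketch, assuming the same re-exports used in the tests above:

```rust
#[test]
fn convert_named_doc_sketch() {
    use std::collections::BTreeMap;
    use tantivy::schema::{NamedFieldDocument, Schema, Value, STORED, TEXT};
    use tantivy::TantivyDocument;

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let mut named = BTreeMap::new();
    named.insert("title".to_string(), vec![Value::from("my title")]);

    // Previously `schema.convert_named_doc(..)` and `schema.to_json(&doc)`;
    // both conversions now live on the document type.
    let doc =
        TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named)).unwrap();
    assert_eq!(doc.to_json(&schema), r#"{"title":["my title"]}"#);
}
```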
View File

@@ -1,12 +1,17 @@
use std::collections::{btree_map, BTreeMap};
use std::fmt;
use std::net::Ipv6Addr;
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Map;
use serde::de::{MapAccess, SeqAccess};
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use crate::schema::document::{
ArrayAccess, DeserializeError, DocValue, ObjectAccess, ReferenceValue, ValueDeserialize,
ValueDeserializer, ValueVisitor,
};
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
@@ -15,6 +20,8 @@ use crate::DateTime;
/// It is an enum over all of the possible field types.
#[derive(Debug, Clone, PartialEq)]
pub enum Value {
/// A null value.
Null,
/// The str type is used for any text information.
Str(String),
/// Pre-tokenized str type,
@@ -33,18 +40,127 @@ pub enum Value {
Facet(Facet),
/// Arbitrarily sized byte array
Bytes(Vec<u8>),
/// Json object value.
JsonObject(serde_json::Map<String, serde_json::Value>),
/// An array of values.
Array(Vec<Self>),
/// Dynamic object value.
Object(BTreeMap<String, Self>),
/// IpV6 address. There is no IpV4 variant internally; IpV4 addresses need to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
}
impl<'a> DocValue<'a> for &'a Value {
type ChildValue = Self;
type ArrayIter = ArrayIter<'a>;
type ObjectIter = ObjectMapIter<'a>;
fn as_value(&self) -> ReferenceValue<'a, Self> {
match self {
Value::Null => ReferenceValue::Null,
Value::Str(val) => ReferenceValue::Str(val),
Value::PreTokStr(val) => ReferenceValue::PreTokStr(val),
Value::U64(val) => ReferenceValue::U64(*val),
Value::I64(val) => ReferenceValue::I64(*val),
Value::F64(val) => ReferenceValue::F64(*val),
Value::Bool(val) => ReferenceValue::Bool(*val),
Value::Date(val) => ReferenceValue::Date(*val),
Value::Facet(val) => ReferenceValue::Facet(val),
Value::Bytes(val) => ReferenceValue::Bytes(val),
Value::IpAddr(val) => ReferenceValue::IpAddr(*val),
Value::Array(array) => ReferenceValue::Array(ArrayIter(array.iter())),
Value::Object(object) => ReferenceValue::Object(ObjectMapIter(object.iter())),
}
}
}
impl ValueDeserialize for Value {
fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
where D: ValueDeserializer<'de> {
struct Visitor;
impl ValueVisitor for Visitor {
type Value = Value;
fn visit_null(&self) -> Result<Self::Value, DeserializeError> {
Ok(Value::Null)
}
fn visit_string(&self, val: String) -> Result<Self::Value, DeserializeError> {
Ok(Value::Str(val))
}
fn visit_u64(&self, val: u64) -> Result<Self::Value, DeserializeError> {
Ok(Value::U64(val))
}
fn visit_i64(&self, val: i64) -> Result<Self::Value, DeserializeError> {
Ok(Value::I64(val))
}
fn visit_f64(&self, val: f64) -> Result<Self::Value, DeserializeError> {
Ok(Value::F64(val))
}
fn visit_bool(&self, val: bool) -> Result<Self::Value, DeserializeError> {
Ok(Value::Bool(val))
}
fn visit_datetime(&self, val: DateTime) -> Result<Self::Value, DeserializeError> {
Ok(Value::Date(val))
}
fn visit_ip_address(&self, val: Ipv6Addr) -> Result<Self::Value, DeserializeError> {
Ok(Value::IpAddr(val))
}
fn visit_facet(&self, val: Facet) -> Result<Self::Value, DeserializeError> {
Ok(Value::Facet(val))
}
fn visit_bytes(&self, val: Vec<u8>) -> Result<Self::Value, DeserializeError> {
Ok(Value::Bytes(val))
}
fn visit_pre_tokenized_string(
&self,
val: PreTokenizedString,
) -> Result<Self::Value, DeserializeError> {
Ok(Value::PreTokStr(val))
}
fn visit_array<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ArrayAccess<'de> {
let mut elements = Vec::with_capacity(access.size_hint());
while let Some(value) = access.next_element()? {
elements.push(value);
}
Ok(Value::Array(elements))
}
fn visit_object<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ObjectAccess<'de> {
let mut elements = BTreeMap::new();
while let Some((key, value)) = access.next_entry()? {
elements.insert(key, value);
}
Ok(Value::Object(elements))
}
}
deserializer.deserialize_any(Visitor)
}
}
impl Eq for Value {}
impl Serialize for Value {
impl serde::Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
where S: serde::Serializer {
match *self {
Value::Null => serializer.serialize_unit(),
Value::Str(ref v) => serializer.serialize_str(v),
Value::PreTokStr(ref v) => v.serialize(serializer),
Value::U64(u) => serializer.serialize_u64(u),
@@ -54,31 +170,36 @@ impl Serialize for Value {
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)),
Value::JsonObject(ref obj) => obj.serialize(serializer),
Value::IpAddr(ref obj) => {
Value::Object(ref obj) => obj.serialize(serializer),
Value::IpAddr(ref ip_v6) => {
// Ensure IpV4 addresses get serialized as IpV4, but excluding IpV6 loopback.
if let Some(ip_v4) = obj.to_ipv4_mapped() {
if let Some(ip_v4) = ip_v6.to_ipv4_mapped() {
ip_v4.serialize(serializer)
} else {
obj.serialize(serializer)
ip_v6.serialize(serializer)
}
}
Value::Array(ref array) => array.serialize(serializer),
}
}
}
impl<'de> Deserialize<'de> for Value {
impl<'de> serde::Deserialize<'de> for Value {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> {
where D: serde::Deserializer<'de> {
struct ValueVisitor;
impl<'de> Visitor<'de> for ValueVisitor {
impl<'de> serde::de::Visitor<'de> for ValueVisitor {
type Value = Value;
fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
formatter.write_str("a string or u32")
}
fn visit_bool<E>(self, v: bool) -> Result<Self::Value, E> {
Ok(Value::Bool(v))
}
fn visit_i64<E>(self, v: i64) -> Result<Self::Value, E> {
Ok(Value::I64(v))
}
@@ -91,10 +212,6 @@ impl<'de> Deserialize<'de> for Value {
Ok(Value::F64(v))
}
fn visit_bool<E>(self, v: bool) -> Result<Self::Value, E> {
Ok(Value::Bool(v))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
Ok(Value::Str(v.to_owned()))
}
@@ -102,130 +219,39 @@ impl<'de> Deserialize<'de> for Value {
fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
Ok(Value::Str(v))
}
fn visit_unit<E>(self) -> Result<Self::Value, E>
where E: serde::de::Error {
Ok(Value::Null)
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where A: SeqAccess<'de> {
let mut elements = Vec::with_capacity(seq.size_hint().unwrap_or_default());
while let Some(value) = seq.next_element()? {
elements.push(value);
}
Ok(Value::Array(elements))
}
fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
where A: MapAccess<'de> {
let mut object = BTreeMap::new();
while let Some((key, value)) = map.next_entry()? {
object.insert(key, value);
}
Ok(Value::Object(object))
}
}
deserializer.deserialize_any(ValueVisitor)
}
}
impl Value {
/// Returns the text value, provided the value is of the `Str` type.
/// (Returns `None` if the value is not of the `Str` type).
pub fn as_text(&self) -> Option<&str> {
if let Value::Str(text) = self {
Some(text)
} else {
None
}
}
/// Returns the facet value, provided the value is of the `Facet` type.
/// (Returns `None` if the value is not of the `Facet` type).
pub fn as_facet(&self) -> Option<&Facet> {
if let Value::Facet(facet) = self {
Some(facet)
} else {
None
}
}
/// Returns the tokenized text, provided the value is of the `PreTokStr` type.
/// (Returns `None` if the value is not of the `PreTokStr` type.)
pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
if let Value::PreTokStr(tokenized_text) = self {
Some(tokenized_text)
} else {
None
}
}
/// Returns the u64-value, provided the value is of the `U64` type.
/// (Returns `None` if the value is not of the `U64` type)
pub fn as_u64(&self) -> Option<u64> {
if let Value::U64(val) = self {
Some(*val)
} else {
None
}
}
/// Returns the i64-value, provided the value is of the `I64` type.
///
/// Returns `None` if the value is not of type `I64`.
pub fn as_i64(&self) -> Option<i64> {
if let Value::I64(val) = self {
Some(*val)
} else {
None
}
}
/// Returns the f64-value, provided the value is of the `F64` type.
///
/// Returns `None` if the value is not of type `F64`.
pub fn as_f64(&self) -> Option<f64> {
if let Value::F64(value) = self {
Some(*value)
} else {
None
}
}
/// Returns the bool value, provided the value is of the `Bool` type.
///
/// Returns `None` if the value is not of type `Bool`.
pub fn as_bool(&self) -> Option<bool> {
if let Value::Bool(value) = self {
Some(*value)
} else {
None
}
}
/// Returns the Date-value, provided the value is of the `Date` type.
///
/// Returns `None` if the value is not of type `Date`.
pub fn as_date(&self) -> Option<DateTime> {
if let Value::Date(date) = self {
Some(*date)
} else {
None
}
}
/// Returns the Bytes-value, provided the value is of the `Bytes` type.
///
/// Returns `None` if the value is not of type `Bytes`.
pub fn as_bytes(&self) -> Option<&[u8]> {
if let Value::Bytes(bytes) = self {
Some(bytes)
} else {
None
}
}
/// Returns the json object, provided the value is of the `JsonObject` type.
///
/// Returns `None` if the value is not of type `JsonObject`.
pub fn as_json(&self) -> Option<&Map<String, serde_json::Value>> {
if let Value::JsonObject(json) = self {
Some(json)
} else {
None
}
}
/// Returns the ip addr, provided the value is of the `Ip` type.
/// (Returns None if the value is not of the `Ip` type)
pub fn as_ip_addr(&self) -> Option<Ipv6Addr> {
if let Value::IpAddr(val) = self {
Some(*val)
} else {
None
}
}
}
impl From<String> for Value {
fn from(s: String) -> Value {
Value::Str(s)
@@ -298,188 +324,93 @@ impl From<PreTokenizedString> for Value {
}
}
impl From<serde_json::Map<String, serde_json::Value>> for Value {
fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
Value::JsonObject(json_object)
impl From<BTreeMap<String, Value>> for Value {
fn from(object: BTreeMap<String, Value>) -> Value {
Value::Object(object)
}
}
fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if (b'0'..=b'9').contains(&first_byte) {
return true;
}
}
false
}
impl From<serde_json::Value> for Value {
fn from(json_value: serde_json::Value) -> Value {
match json_value {
serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
_ => {
panic!("Expected a json object.");
fn from(value: serde_json::Value) -> Self {
match value {
serde_json::Value::Null => Self::Null,
serde_json::Value::Bool(val) => Self::Bool(val),
serde_json::Value::Number(number) => {
if let Some(val) = number.as_i64() {
Self::I64(val)
} else if let Some(val) = number.as_u64() {
Self::U64(val)
} else if let Some(val) = number.as_f64() {
Self::F64(val)
} else {
panic!("Unsupported serde_json number {number}");
}
}
serde_json::Value::String(text) => {
if can_be_rfc3339_date_time(&text) {
match OffsetDateTime::parse(&text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(time::UtcOffset::UTC);
Self::Date(DateTime::from_utc(dt_utc))
}
Err(_) => Self::Str(text),
}
} else {
Self::Str(text)
}
}
serde_json::Value::Array(elements) => {
let converted_elements = elements.into_iter().map(Self::from).collect();
Self::Array(converted_elements)
}
serde_json::Value::Object(object) => Self::from(object),
}
}
}
mod binary_serialize {
use std::io::{self, Read, Write};
use std::net::Ipv6Addr;
impl From<serde_json::Map<String, serde_json::Value>> for Value {
fn from(map: serde_json::Map<String, serde_json::Value>) -> Self {
let mut object = BTreeMap::new();
use columnar::MonotonicallyMappableToU128;
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
use super::Value;
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
const TEXT_CODE: u8 = 0;
const U64_CODE: u8 = 1;
const I64_CODE: u8 = 2;
const HIERARCHICAL_FACET_CODE: u8 = 3;
const BYTES_CODE: u8 = 4;
const DATE_CODE: u8 = 5;
const F64_CODE: u8 = 6;
const EXT_CODE: u8 = 7;
const JSON_OBJ_CODE: u8 = 8;
const BOOL_CODE: u8 = 9;
const IP_CODE: u8 = 10;
// extended types
const TOK_STR_CODE: u8 = 0;
impl BinarySerializable for Value {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
match *self {
Value::Str(ref text) => {
TEXT_CODE.serialize(writer)?;
text.serialize(writer)
}
Value::PreTokStr(ref tok_str) => {
EXT_CODE.serialize(writer)?;
TOK_STR_CODE.serialize(writer)?;
if let Ok(text) = serde_json::to_string(tok_str) {
text.serialize(writer)
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to dump Value::PreTokStr(_) to json.",
))
}
}
Value::U64(ref val) => {
U64_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::I64(ref val) => {
I64_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::F64(ref val) => {
F64_CODE.serialize(writer)?;
f64_to_u64(*val).serialize(writer)
}
Value::Bool(ref val) => {
BOOL_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::Date(ref val) => {
DATE_CODE.serialize(writer)?;
let timestamp_micros = val.into_timestamp_micros();
timestamp_micros.serialize(writer)
}
Value::Facet(ref facet) => {
HIERARCHICAL_FACET_CODE.serialize(writer)?;
facet.serialize(writer)
}
Value::Bytes(ref bytes) => {
BYTES_CODE.serialize(writer)?;
bytes.serialize(writer)
}
Value::JsonObject(ref map) => {
JSON_OBJ_CODE.serialize(writer)?;
serde_json::to_writer(writer, &map)?;
Ok(())
}
Value::IpAddr(ref ip) => {
IP_CODE.serialize(writer)?;
ip.to_u128().serialize(writer)
}
}
for (key, value) in map {
object.insert(key, Value::from(value));
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let type_code = u8::deserialize(reader)?;
match type_code {
TEXT_CODE => {
let text = String::deserialize(reader)?;
Ok(Value::Str(text))
}
U64_CODE => {
let value = u64::deserialize(reader)?;
Ok(Value::U64(value))
}
I64_CODE => {
let value = i64::deserialize(reader)?;
Ok(Value::I64(value))
}
F64_CODE => {
let value = u64_to_f64(u64::deserialize(reader)?);
Ok(Value::F64(value))
}
BOOL_CODE => {
let value = bool::deserialize(reader)?;
Ok(Value::Bool(value))
}
DATE_CODE => {
let timestamp_micros = i64::deserialize(reader)?;
Ok(Value::Date(DateTime::from_timestamp_micros(
timestamp_micros,
)))
}
HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
EXT_CODE => {
let ext_type_code = u8::deserialize(reader)?;
match ext_type_code {
TOK_STR_CODE => {
let str_val = String::deserialize(reader)?;
if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
{
Ok(Value::PreTokStr(value))
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to parse string data as Value::PreTokStr(_).",
))
}
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"No extended field type is associated with code {ext_type_code:?}"
),
)),
}
}
JSON_OBJ_CODE => {
// As explained in
// https://docs.serde.rs/serde_json/fn.from_reader.html
//
// `T::from_reader(..)` expects EOF after reading the object,
// which is not what we want here.
//
// For this reason we need to create our own `Deserializer`.
let mut de = serde_json::Deserializer::from_reader(reader);
let json_map = <serde_json::Map::<String, serde_json::Value> as serde::Deserialize>::deserialize(&mut de)?;
Ok(Value::JsonObject(json_map))
}
IP_CODE => {
let value = u128::deserialize(reader)?;
Ok(Value::IpAddr(Ipv6Addr::from_u128(value)))
}
Value::Object(object)
}
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("No field type is associated with code {type_code:?}"),
)),
}
}
/// A wrapper type for iterating over a `Value` array, producing reference values.
pub struct ArrayIter<'a>(std::slice::Iter<'a, Value>);
impl<'a> Iterator for ArrayIter<'a> {
type Item = ReferenceValue<'a, &'a Value>;
fn next(&mut self) -> Option<Self::Item> {
let value = self.0.next()?;
Some(value.as_value())
}
}
/// A wrapper type for iterating over a `Value` object map, producing reference values.
pub struct ObjectMapIter<'a>(btree_map::Iter<'a, String, Value>);
impl<'a> Iterator for ObjectMapIter<'a> {
type Item = (&'a str, ReferenceValue<'a, &'a Value>);
fn next(&mut self) -> Option<Self::Item> {
let (key, value) = self.0.next()?;
Some((key.as_str(), value.as_value()))
}
}
@@ -489,7 +420,7 @@ mod tests {
use crate::schema::{BytesOptions, Schema};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, Document};
use crate::{DateTime, TantivyDocument};
#[test]
fn test_parse_bytes_doc() {
@@ -497,9 +428,9 @@ mod tests {
let bytes_options = BytesOptions::default();
let bytes_field = schema_builder.add_bytes_field("my_bytes", bytes_options);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_bytes(bytes_field, "this is a test".as_bytes());
let json_string = schema.to_json(&doc);
let json_string = doc.to_json(&schema);
assert_eq!(json_string, r#"{"my_bytes":["dGhpcyBpcyBhIHRlc3Q="]}"#);
}
@@ -509,9 +440,9 @@ mod tests {
let bytes_options = BytesOptions::default();
let bytes_field = schema_builder.add_bytes_field("my_bytes", bytes_options);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_bytes(bytes_field, "".as_bytes());
let json_string = schema.to_json(&doc);
let json_string = doc.to_json(&schema);
assert_eq!(json_string, r#"{"my_bytes":[""]}"#);
}
@@ -521,12 +452,12 @@ mod tests {
let bytes_options = BytesOptions::default();
let bytes_field = schema_builder.add_bytes_field("my_bytes", bytes_options);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_bytes(
bytes_field,
"A bigger test I guess\nspanning on multiple lines\nhoping this will work".as_bytes(),
);
let json_string = schema.to_json(&doc);
let json_string = doc.to_json(&schema);
assert_eq!(
json_string,
r#"{"my_bytes":["QSBiaWdnZXIgdGVzdCBJIGd1ZXNzCnNwYW5uaW5nIG9uIG11bHRpcGxlIGxpbmVzCmhvcGluZyB0aGlzIHdpbGwgd29yaw=="]}"#

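The `Value` enum now carries `Null`, `Array` and `Object` variants, and `From<serde_json::Value>` converts any JSON value rather than panicking on non-objects. A minimal construction sketch, assuming `serde_json` is available as a dependency:

```rust
use std::collections::BTreeMap;
use tantivy::schema::Value;

fn build_values() -> (Value, Value) {
    // Nested dynamic values can now be represented directly on `Value`.
    let mut object = BTreeMap::new();
    object.insert(
        "tags".to_string(),
        Value::Array(vec![Value::Str("rust".into()), Value::U64(2023)]),
    );
    object.insert("draft".to_string(), Value::Null);
    let by_hand = Value::Object(object);

    // `From<serde_json::Value>` accepts any JSON value; note that RFC 3339-looking
    // strings become `Value::Date` and integers come back as `I64` when they fit.
    let from_json = Value::from(serde_json::json!({
        "tags": ["rust", 2023],
        "draft": null
    }));

    (by_hand, from_json)
}
```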
View File

@@ -5,9 +5,10 @@ use std::ops::Range;
use htmlescape::encode_minimal;
use crate::query::Query;
use crate::schema::{Field, Value};
use crate::schema::document::{DocValue, Document};
use crate::schema::Field;
use crate::tokenizer::{TextAnalyzer, Token};
use crate::{Document, Score, Searcher, Term};
use crate::{Score, Searcher, Term};
const DEFAULT_MAX_NUM_CHARS: usize = 150;
@@ -359,13 +360,21 @@ impl SnippetGenerator {
///
/// This method extracts the text associated with the `SnippetGenerator`'s field
/// and computes a snippet.
pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
let text: String = doc
.get_all(self.field)
.flat_map(Value::as_text)
.collect::<Vec<&str>>()
.join(" ");
self.snippet(&text)
pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Snippet {
let mut text = String::new();
for (field, value) in doc.iter_fields_and_values() {
let value = value as D::Value<'_>;
if field != self.field {
continue;
}
if let Some(val) = value.as_str() {
text.push(' ');
text.push_str(val);
}
}
self.snippet(text.trim())
}
/// Generates a snippet for the given text.

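Since `snippet_from_doc` is now generic over the `Document` trait, the retrieved document type drives which accessor implementation is walked. A minimal end-to-end sketch; `QueryParser`, `TopDocs` and `SnippetGenerator::create` are pre-existing tantivy APIs assumed unchanged by this diff:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, SnippetGenerator, TantivyDocument};

fn snippet_example() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    let mut writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
    writer.add_document(doc!(body => "tantivy documents are now a trait"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![body]).parse_query("trait")?;
    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

    for (_score, doc_address) in searcher.search(&query, &TopDocs::with_limit(1))? {
        // The retrieved type decides which `Document` accessors the generator walks.
        let doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", snippet_generator.snippet_from_doc(&doc).to_html());
    }
    Ok(())
}
```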
View File

@@ -293,7 +293,7 @@ mod test {
use crate::core::Index;
use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT};
use crate::space_usage::PerFieldSpaceUsage;
use crate::Term;
use crate::{IndexWriter, Term};
#[test]
fn test_empty() {
@@ -447,7 +447,7 @@ mod test {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(name => 1u64))?;
index_writer.add_document(doc!(name => 2u64))?;
index_writer.add_document(doc!(name => 3u64))?;
@@ -456,7 +456,7 @@ mod test {
}
{
let mut index_writer2 = index.writer(50_000_000)?;
let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_u64(name, 2u64));
index_writer2.delete_term(Term::from_field_u64(name, 3u64));
// ok, now we should have a deleted doc

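The `: IndexWriter` annotations added throughout these tests exist because the writer is now generic over the document type it indexes; the bare annotation resolves to the default parameter, assumed here to be `TantivyDocument`. A minimal sketch:

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn build_index() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    // Without the annotation the document type parameter can no longer be inferred.
    let mut writer: IndexWriter = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "hello"))?;
    writer.commit()?;
    Ok(index)
}
```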
View File

@@ -48,7 +48,7 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::schema::{SchemaBuilder, STORED, TEXT};
use crate::store::index::Checkpoint;
use crate::{DocAddress, DocId, Index, Term};
use crate::{DocAddress, DocId, Index, IndexWriter, TantivyDocument, Term};
#[test]
fn test_skip_index_empty() -> io::Result<()> {
@@ -129,7 +129,7 @@ mod tests {
let body = schema_builder.add_text_field("body", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let long_text: String = "abcdefghijklmnopqrstuvwxyz".repeat(1_000);
for _ in 0..20 {
@@ -149,7 +149,7 @@ mod tests {
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 30);
for i in 0..searcher.num_docs() as u32 {
let _doc = searcher.doc(DocAddress::new(0u32, i))?;
let _doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, i))?;
}
Ok(())
}
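
Document retrieval is generic in the same way, so `Searcher::doc` takes the turbofish (or an annotated binding) seen in the test above. A minimal sketch:

```rust
use tantivy::{DocAddress, Searcher, TantivyDocument};

// `Searcher::doc` is now generic over the document trait; the turbofish
// (or a type annotation on the binding) selects the concrete type to decode into.
fn first_doc(searcher: &Searcher) -> tantivy::Result<TantivyDocument> {
    searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))
}
```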

View File

@@ -59,8 +59,11 @@ pub mod tests {
use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::AliveBitSet;
use crate::schema::{self, Document, Schema, TextFieldIndexing, TextOptions, STORED, TEXT};
use crate::{Index, Term};
use crate::schema::document::DocValue;
use crate::schema::{
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT,
};
use crate::{Index, IndexWriter, Term};
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad \
@@ -88,7 +91,7 @@ pub mod tests {
let mut store_writer =
StoreWriter::new(writer, compressor, blocksize, separate_thread).unwrap();
for i in 0..num_docs {
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_field_value(field_body, LOREM.to_string());
doc.add_field_value(field_title, format!("Doc {i}"));
store_writer.store(&doc, &schema).unwrap();
@@ -117,18 +120,21 @@ pub mod tests {
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_text()
.as_str()
.unwrap(),
format!("Doc {i}")
);
}
for (_, doc) in store.iter(Some(&alive_bitset)).enumerate() {
for (_, doc) in store
.iter::<TantivyDocument>(Some(&alive_bitset))
.enumerate()
{
let doc = doc?;
let title_content = doc.get_first(field_title).unwrap().as_text().unwrap();
let title_content = doc.get_first(field_title).unwrap().as_str().unwrap();
if !title_content.starts_with("Doc ") {
panic!("unexpected title_content {title_content}");
}
@@ -162,17 +168,17 @@ pub mod tests {
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_text()
.as_str()
.unwrap(),
format!("Doc {i}")
);
}
for (i, doc) in store.iter(None).enumerate() {
for (i, doc) in store.iter::<TantivyDocument>(None).enumerate() {
assert_eq!(
*doc?.get_first(field_title).unwrap().as_text().unwrap(),
*doc?.get_first(field_title).unwrap().as_str().unwrap(),
format!("Doc {i}")
);
}
@@ -222,7 +228,7 @@ pub mod tests {
let index = index_builder.create_in_ram()?;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=> "deleteme"))?;
index_writer.add_document(doc!(text_field=> "deletemenot"))?;
index_writer.add_document(doc!(text_field=> "deleteme"))?;
@@ -236,9 +242,9 @@ pub mod tests {
let searcher = index.reader()?.searcher();
let reader = searcher.segment_reader(0);
let store = reader.get_store_reader(10)?;
for doc in store.iter(reader.alive_bitset()) {
for doc in store.iter::<TantivyDocument>(reader.alive_bitset()) {
assert_eq!(
*doc?.get_first(text_field).unwrap().as_text().unwrap(),
*doc?.get_first(text_field).unwrap().as_str().unwrap(),
"deletemenot".to_string()
);
}
@@ -258,7 +264,7 @@ pub mod tests {
let mut index = index_builder.create_in_ram().unwrap();
index.settings_mut().docstore_compression = Compressor::Lz4;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// put enough data to create enough blocks in the doc store to be considered for stacking
for _ in 0..200 {
index_writer.add_document(doc!(text_field=> LOREM))?;
@@ -284,7 +290,7 @@ pub mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -294,9 +300,12 @@ pub mod tests {
let reader = searcher.segment_readers().iter().last().unwrap();
let store = reader.get_store_reader(10).unwrap();
for doc in store.iter(reader.alive_bitset()).take(50) {
for doc in store
.iter::<TantivyDocument>(reader.alive_bitset())
.take(50)
{
assert_eq!(
*doc?.get_first(text_field).unwrap().as_text().unwrap(),
*doc?.get_first(text_field).and_then(|v| v.as_str()).unwrap(),
LOREM.to_string()
);
}
@@ -331,7 +340,7 @@ pub mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -355,6 +364,7 @@ mod bench {
use super::tests::write_lorem_ipsum_store;
use crate::directory::{Directory, RamDirectory};
use crate::store::{Compressor, StoreReader};
use crate::TantivyDocument;
#[bench]
#[cfg(feature = "mmap")]
@@ -386,6 +396,6 @@ mod bench {
);
let store_file = directory.open_read(path).unwrap();
let store = StoreReader::open(store_file, 10).unwrap();
b.iter(|| store.iter(None).collect::<Vec<_>>());
b.iter(|| store.iter::<TantivyDocument>(None).collect::<Vec<_>>());
}
}

View File

@@ -14,7 +14,7 @@ use super::Decompressor;
use crate::directory::FileSlice;
use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::schema::Document;
use crate::schema::document::{BinaryDocumentDeserializer, Document};
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
@@ -198,9 +198,12 @@ impl StoreReader {
///
/// It should not be called to score documents
/// for instance.
pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
pub fn get<D: Document>(&self, doc_id: DocId) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes(doc_id)?;
Ok(Document::deserialize(&mut doc_bytes)?)
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}
/// Returns raw bytes of a given document.
@@ -232,13 +235,16 @@ impl StoreReader {
/// Iterator over all documents in the order in which they are stored in the doc store.
/// Use this if you want to extract all documents from the doc store.
/// The `alive_bitset` has to be forwarded from the `SegmentReader`, or the results may be wrong.
pub fn iter<'a: 'b, 'b>(
pub fn iter<'a: 'b, 'b, D: Document>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<Document>> + 'b {
) -> impl Iterator<Item = crate::Result<D>> + 'b {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
Ok(Document::deserialize(&mut doc_bytes)?)
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
})
}
@@ -364,9 +370,12 @@ impl StoreReader {
}
/// Fetches a document asynchronously. Async version of [`get`](Self::get).
pub async fn get_async(&self, doc_id: DocId) -> crate::Result<Document> {
pub async fn get_async<D: Document>(&self, doc_id: DocId) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes_async(doc_id).await?;
Ok(Document::deserialize(&mut doc_bytes)?)
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}
}
@@ -376,15 +385,16 @@ mod tests {
use super::*;
use crate::directory::RamDirectory;
use crate::schema::{Document, Field};
use crate::schema::document::DocValue;
use crate::schema::{Field, TantivyDocument};
use crate::store::tests::write_lorem_ipsum_store;
use crate::store::Compressor;
use crate::Directory;
const BLOCK_SIZE: usize = 16_384;
fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_text())
fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_str())
}
#[test]

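`StoreReader::get` and `StoreReader::iter` now decode into whichever `Document` implementation the caller names; `TantivyDocument` is simply the dynamic default. A minimal sketch, assuming the public `tantivy::store::StoreReader` path:

```rust
use tantivy::store::StoreReader;
use tantivy::TantivyDocument;

// Collects every stored document, decoded as the default dynamic document type.
fn load_all_docs(store: &StoreReader) -> tantivy::Result<Vec<TantivyDocument>> {
    store.iter::<TantivyDocument>(None).collect()
}
```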
View File

@@ -5,7 +5,8 @@ use common::BinarySerializable;
use super::compressors::Compressor;
use super::StoreReader;
use crate::directory::WritePtr;
use crate::schema::{Document, Schema};
use crate::schema::document::{BinaryDocumentSerializer, Document};
use crate::schema::Schema;
use crate::store::store_compressor::BlockCompressor;
use crate::DocId;
@@ -95,9 +96,12 @@ impl StoreWriter {
///
/// The document id is implicitly the current number
/// of documents.
pub fn store(&mut self, document: &Document, schema: &Schema) -> io::Result<()> {
pub fn store<D: Document>(&mut self, document: &D, schema: &Schema) -> io::Result<()> {
self.doc_pos.push(self.current_block.len() as u32);
document.serialize_stored(schema, &mut self.current_block)?;
let mut serializer = BinaryDocumentSerializer::new(&mut self.current_block, schema);
serializer.serialize_doc(document)?;
self.num_docs_in_current_block += 1;
self.check_flush_block()?;
Ok(())

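Outside the store writer, the same serializer can turn any document into its binary doc-store representation. A minimal sketch, assuming `BinaryDocumentSerializer` is reachable through the now-public `schema::document` module and accepts any `io::Write` sink, as it does above:

```rust
use tantivy::schema::document::BinaryDocumentSerializer;
use tantivy::schema::Schema;
use tantivy::TantivyDocument;

// Mirrors `StoreWriter::store`: one serializer per document, writing the
// binary doc-store representation into a byte buffer.
fn doc_to_bytes(doc: &TantivyDocument, schema: &Schema) -> std::io::Result<Vec<u8>> {
    let mut buffer: Vec<u8> = Vec::new();
    let mut serializer = BinaryDocumentSerializer::new(&mut buffer, schema);
    serializer.serialize_doc(doc)?;
    Ok(buffer)
}
```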
View File

@@ -1,11 +1,13 @@
use std::cmp::Ordering;
use std::io;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize};
use common::BinarySerializable;
use crate::tokenizer::{Token, TokenStream};
/// Struct representing pre-tokenized text
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Eq, PartialEq)]
pub struct PreTokenizedString {
/// Original text
pub text: String,
@@ -25,6 +27,32 @@ impl PartialOrd for PreTokenizedString {
}
}
impl BinarySerializable for PreTokenizedString {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
if let Ok(text) = serde_json::to_string(self) {
<String as BinarySerializable>::serialize(&text, writer)
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to dump PreTokenizedString to json.",
))
}
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let json_text = <String as BinarySerializable>::deserialize(reader)?;
if let Ok(value) = serde_json::from_str(&json_text) {
Ok(value)
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to parse string data as PreTokenizedString.",
))
}
}
}
/// [`TokenStream`] implementation which wraps [`PreTokenizedString`]
pub struct PreTokenizedStream {
tokenized_string: PreTokenizedString,

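The new `BinarySerializable` impl stores the pre-tokenized text as JSON internally, so a round trip through a byte buffer should be lossless. A minimal sketch, assuming the `BinarySerializable` trait from the workspace `common` crate, as imported above:

```rust
use common::BinarySerializable;
use tantivy::tokenizer::{PreTokenizedString, Token};

fn round_trip() -> std::io::Result<()> {
    let pre_tok = PreTokenizedString {
        text: "hello world".to_string(),
        tokens: vec![Token {
            offset_from: 0,
            offset_to: 5,
            position: 0,
            text: "hello".to_string(),
            position_length: 1,
        }],
    };

    // Serialize into a byte buffer, then read it back and compare.
    let mut buffer = Vec::new();
    pre_tok.serialize(&mut buffer)?;
    let decoded = PreTokenizedString::deserialize(&mut &buffer[..])?;
    assert_eq!(pre_tok, decoded);
    Ok(())
}
```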
View File

@@ -2,7 +2,7 @@ use std::path::Path;
use tantivy::directory::{Directory, ManagedDirectory, RamDirectory, TerminatingWrite};
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};
use tantivy::{doc, Index, IndexWriter, Term};
#[test]
fn test_failpoints_managed_directory_gc_if_delete_fails() {
@@ -45,7 +45,7 @@ fn test_write_commit_fails() -> tantivy::Result<()> {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 15_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
@@ -75,7 +75,7 @@ fn test_fail_on_flush_segment() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_with_num_threads(1, 15_000_000)?;
let index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
fail::cfg("FieldSerializer::close_term", "return(simulatederror)").unwrap();
for i in 0..100_000 {
if index_writer
@@ -94,7 +94,7 @@ fn test_fail_on_flush_segment_but_one_worker_remains() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_with_num_threads(2, 30_000_000)?;
let index_writer: IndexWriter = index.writer_with_num_threads(2, 30_000_000)?;
fail::cfg("FieldSerializer::close_term", "1*return(simulatederror)").unwrap();
for i in 0..100_000 {
if index_writer
@@ -113,7 +113,7 @@ fn test_fail_on_commit_segment() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 15_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
fail::cfg("FieldSerializer::close_term", "return(simulatederror)").unwrap();
for i in 0..10 {
index_writer