Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-23 02:29:57 +00:00)
POC: Tantivy documents as a trait (#2071)
* fix windows build (#1)
* Fix windows build
* Add doc traits
* Add field value iter
* Add value and serialization
* Adjust order
* Fix bug
* Correct type
* Fix generic bugs
* Reformat code
* Add generic to index writer which I forgot about
* Fix missing generics on single segment writer
* Add missing type export
* Add default methods for convenience
* Cleanup
* Fix more-like-this query to use standard types
* Update API and fix tests
* Add doc traits
* Add field value iter
* Add value and serialization
* Adjust order
* Fix bug
* Correct type
* Rebase main and fix conflicts
* Reformat code
* Merge upstream
* Fix missing generics on single segment writer
* Add missing type export
* Add default methods for convenience
* Cleanup
* Fix more-like-this query to use standard types
* Update API and fix tests
* Add tokenizer improvements from previous commits
* Add tokenizer improvements from previous commits
* Reformat
* Fix unit tests
* Fix unit tests
* Use enum in changes
* Stage changes
* Add new deserializer logic
* Add serializer integration
* Add document deserializer
* Implement new (de)serialization api for existing types
* Fix bugs and type errors
* Add helper implementations
* Fix errors
* Reformat code
* Add unit tests and some code organisation for serialization
* Add unit tests to deserializer
* Add some small docs
* Add support for deserializing serde values
* Reformat
* Fix typo
* Fix typo
* Change repr of facet
* Remove unused trait methods
* Add child value type
* Resolve comments
* Fix build
* Fix more build errors
* Fix more build errors
* Fix the tests I missed
* Fix examples
* fix numerical order, serialize PreTok Str
* fix coverage
* rename Document to TantivyDocument, rename DocumentAccess to Document; add Binary prefix to binary de/serialization
* fix coverage

---------

Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
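The user-visible shape of the change, pulled together from the hunks below: the concrete document type is now `TantivyDocument`, the `Document` name is taken by the trait (previously `DocumentAccess`), and writer construction is generic over that trait, so most call sites gain a type annotation. A minimal sketch, assuming the default generic parameter of `IndexWriter` is `TantivyDocument` (the field name and memory budget are illustrative, not from the diff):

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());

    // `Index::writer` is now generic over the `Document` trait, so the
    // concrete document type must be pinned down, e.g. with an annotation.
    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

    // `schema.parse_document(json)` becomes an associated function on the
    // concrete document type, taking the schema as an argument.
    let doc = TantivyDocument::parse_json(&schema, r#"{"title": "Of Mice and Men"}"#)?;
    index_writer.add_document(doc)?;
    index_writer.commit()?;

    // Retrieval is generic too, and JSON rendering moves from the schema
    // onto the document.
    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("mice")?;
    for (_score, addr) in searcher.search(&query, &TopDocs::with_limit(1))? {
        let retrieved: TantivyDocument = searcher.doc(addr)?;
        println!("{}", retrieved.to_json(&schema));
    }
    Ok(())
}
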
@@ -39,9 +39,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
         let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema.clone());
-            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
-                let doc = schema.parse_document(doc_json).unwrap();
+                let doc = Document::parse_json(&schema, doc_json).unwrap();
                 index_writer.add_document(doc).unwrap();
             }
         })
@@ -50,9 +50,10 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
         let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema.clone());
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let mut index_writer: IndexWriter =
+                index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
-                let doc = schema.parse_document(doc_json).unwrap();
+                let doc = Document::parse_json(&schema, doc_json).unwrap();
                 index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
@@ -62,9 +63,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
         let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
             let index = Index::create_in_ram(schema_with_store.clone());
-            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
-                let doc = schema.parse_document(doc_json).unwrap();
+                let doc = Document::parse_json(&schema, doc_json).unwrap();
                 index_writer.add_document(doc).unwrap();
             }
         })
@@ -73,9 +74,10 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
         let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
             let index = Index::create_in_ram(schema_with_store.clone());
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let mut index_writer: IndexWriter =
+                index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
-                let doc = schema.parse_document(doc_json).unwrap();
+                let doc = Document::parse_json(&schema, doc_json).unwrap();
                 index_writer.add_document(doc).unwrap();
             }
             index_writer.commit().unwrap();
@@ -86,7 +88,8 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
         b.iter(|| {
             let index = Index::create_in_ram(dynamic_schema.clone());
             let json_field = dynamic_schema.get_field("json").unwrap();
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let mut index_writer: IndexWriter =
+                index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
                 let json_val: serde_json::Map<String, serde_json::Value> =
                     serde_json::from_str(doc_json).unwrap();
@@ -113,7 +116,7 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
         b.iter(|| {
             let json_field = dynamic_schema.get_field("json").unwrap();
             let index = Index::create_in_ram(dynamic_schema.clone());
-            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
                 let json_val: serde_json::Map<String, serde_json::Value> =
                     serde_json::from_str(doc_json).unwrap();
@@ -127,7 +130,8 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
         b.iter(|| {
             let json_field = dynamic_schema.get_field("json").unwrap();
             let index = Index::create_in_ram(dynamic_schema.clone());
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let mut index_writer: IndexWriter =
+                index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
                 let json_val: serde_json::Map<String, serde_json::Value> =
                     serde_json::from_str(doc_json).unwrap();
@@ -154,7 +158,7 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
         b.iter(|| {
             let json_field = dynamic_schema.get_field("json").unwrap();
             let index = Index::create_in_ram(dynamic_schema.clone());
-            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
                 let json_val: serde_json::Map<String, serde_json::Value> =
                     serde_json::from_str(doc_json).unwrap();
@@ -168,7 +172,8 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
         b.iter(|| {
             let json_field = dynamic_schema.get_field("json").unwrap();
             let index = Index::create_in_ram(dynamic_schema.clone());
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let mut index_writer: IndexWriter =
+                index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
                 let json_val: serde_json::Map<String, serde_json::Value> =
                     serde_json::from_str(doc_json).unwrap();

@@ -1,11 +1,14 @@
 #![allow(deprecated)]

 use std::fmt;
+use std::io::{Read, Write};

 use serde::{Deserialize, Serialize};
 use time::format_description::well_known::Rfc3339;
 use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};

+use crate::BinarySerializable;
+
 /// Precision with which datetimes are truncated when stored in fast fields. This setting is only
 /// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
 #[derive(
@@ -164,3 +167,15 @@ impl fmt::Debug for DateTime {
         f.write_str(&utc_rfc3339)
     }
 }
+
+impl BinarySerializable for DateTime {
+    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
+        let timestamp_micros = self.into_timestamp_micros();
+        <i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
+    }
+
+    fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
+        let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
+        Ok(Self::from_timestamp_micros(timestamp_micros))
+    }
+}

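The hunk above adds binary (de)serialization for `DateTime` by round-tripping through the microsecond timestamp. A small sketch of how it composes, assuming the impl is reachable as `common::{BinarySerializable, DateTime}` (the exact module path is not shown in the diff):

use common::{BinarySerializable, DateTime}; // assumed paths, see lead-in

fn roundtrip(dt: DateTime) -> std::io::Result<DateTime> {
    let mut buf: Vec<u8> = Vec::new();
    // serialize() writes dt.into_timestamp_micros() as a plain i64.
    dt.serialize(&mut buf)?;
    // deserialize() reads the i64 back; precision finer than one
    // microsecond does not survive the round trip.
    DateTime::deserialize(&mut &buf[..])
}
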
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::io::{Read, Write};
 use std::{fmt, io};

@@ -249,6 +250,43 @@ impl BinarySerializable for String {
     }
 }
+
+impl<'a> BinarySerializable for Cow<'a, str> {
+    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        let data: &[u8] = self.as_bytes();
+        VInt(data.len() as u64).serialize(writer)?;
+        writer.write_all(data)
+    }
+
+    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
+        let string_length = VInt::deserialize(reader)?.val() as usize;
+        let mut result = String::with_capacity(string_length);
+        reader
+            .take(string_length as u64)
+            .read_to_string(&mut result)?;
+        Ok(Cow::Owned(result))
+    }
+}
+
+impl<'a> BinarySerializable for Cow<'a, [u8]> {
+    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        VInt(self.len() as u64).serialize(writer)?;
+        for it in self.iter() {
+            it.serialize(writer)?;
+        }
+        Ok(())
+    }
+
+    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
+        let num_items = VInt::deserialize(reader)?.val();
+        let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
+        for _ in 0..num_items {
+            let item = u8::deserialize(reader)?;
+            items.push(item);
+        }
+        Ok(Cow::Owned(items))
+    }
+}

 #[cfg(test)]
 pub mod test {

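Note that both `Cow` impls above deserialize into the `Owned` variant: the borrowed form only ever exists on the write side, since nothing can borrow from the reader's transient buffer. A usage sketch, under the same assumed `common` paths as the previous sketch:

use std::borrow::Cow;

use common::BinarySerializable; // assumed path, as above

fn cow_roundtrip() -> std::io::Result<()> {
    let mut buf: Vec<u8> = Vec::new();
    // Writing a borrowed Cow<str> costs no allocation: a VInt length
    // prefix followed by the raw bytes.
    let borrowed: Cow<str> = Cow::Borrowed("tantivy");
    borrowed.serialize(&mut buf)?;
    // Reading always yields Cow::Owned.
    let restored: Cow<str> = BinarySerializable::deserialize(&mut &buf[..])?;
    assert!(matches!(restored, Cow::Owned(_)));
    assert_eq!(restored, "tantivy");
    Ok(())
}
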
@@ -12,7 +12,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
 use tantivy::aggregation::AggregationCollector;
 use tantivy::query::AllQuery;
 use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
-use tantivy::Index;
+use tantivy::{Index, IndexWriter, TantivyDocument};

 fn main() -> tantivy::Result<()> {
     // # Create Schema
@@ -132,10 +132,10 @@ fn main() -> tantivy::Result<()> {

     let stream = Deserializer::from_str(data).into_iter::<Value>();

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
     let mut num_indexed = 0;
     for value in stream {
-        let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
+        let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?;
         index_writer.add_document(doc)?;
         num_indexed += 1;
         if num_indexed > 4 {

@@ -15,7 +15,7 @@
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
-use tantivy::{doc, Index, ReloadPolicy};
+use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
 use tempfile::TempDir;

 fn main() -> tantivy::Result<()> {
@@ -75,7 +75,7 @@ fn main() -> tantivy::Result<()> {
     // Here we give tantivy a budget of `50MB`.
     // Using a bigger memory_arena for the indexer may increase
     // throughput, but 50 MB is already plenty.
-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     // Let's index our documents!
     // We first need a handle on the title and the body field.
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
     let title = schema.get_field("title").unwrap();
     let body = schema.get_field("body").unwrap();

-    let mut old_man_doc = Document::default();
+    let mut old_man_doc = TantivyDocument::default();
     old_man_doc.add_text(title, "The Old Man and the Sea");
     old_man_doc.add_text(
         body,
@@ -217,8 +217,8 @@ fn main() -> tantivy::Result<()> {
     // the document returned will only contain
     // a title.
     for (_score, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     // We can also get an explanation to understand

@@ -13,7 +13,7 @@ use columnar::Column;
 use tantivy::collector::{Collector, SegmentCollector};
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
-use tantivy::{doc, Index, Score, SegmentReader};
+use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};

 #[derive(Default)]
 struct Stats {
@@ -142,7 +142,7 @@ fn main() -> tantivy::Result<()> {
     // this example.
     let index = Index::create_in_ram(schema);

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
     index_writer.add_document(doc!(
         product_name => "Super Broom 2000",
         product_description => "While it is ok for short distance travel, this broom \

@@ -6,7 +6,7 @@ use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::NgramTokenizer;
-use tantivy::{doc, Index};
+use tantivy::{doc, Index, IndexWriter};

 fn main() -> tantivy::Result<()> {
     // # Defining the schema
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
     //
     // Here we use a buffer of 50MB per thread. Using a bigger
     // memory arena for the indexer can increase its throughput.
-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
     index_writer.add_document(doc!(
         title => "The Old Man and the Sea",
         body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     Ok(())

@@ -5,7 +5,7 @@
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
-use tantivy::Index;
+use tantivy::{Index, IndexWriter, TantivyDocument};

 fn main() -> tantivy::Result<()> {
     // # Defining the schema
@@ -22,16 +22,18 @@ fn main() -> tantivy::Result<()> {
     // # Indexing documents
     let index = Index::create_in_ram(schema.clone());

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
     // The dates are passed as string in the RFC3339 format
-    let doc = schema.parse_document(
+    let doc = TantivyDocument::parse_json(
+        &schema,
         r#"{
             "occurred_at": "2022-06-22T12:53:50.53Z",
             "event": "pull-request"
         }"#,
     )?;
     index_writer.add_document(doc)?;
-    let doc = schema.parse_document(
+    let doc = TantivyDocument::parse_json(
+        &schema,
         r#"{
             "occurred_at": "2022-06-22T13:00:00.22Z",
             "event": "comment"
@@ -58,13 +60,13 @@ fn main() -> tantivy::Result<()> {
     let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
     assert_eq!(count_docs.len(), 1);
     for (_score, doc_address) in count_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
+        let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
         assert!(matches!(
             retrieved_doc.get_first(occurred_at),
             Some(Value::Date(_))
         ));
         assert_eq!(
-            schema.to_json(&retrieved_doc),
+            retrieved_doc.to_json(&schema),
             r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
         );
     }

@@ -11,7 +11,7 @@
 use tantivy::collector::TopDocs;
 use tantivy::query::TermQuery;
 use tantivy::schema::*;
-use tantivy::{doc, Index, IndexReader};
+use tantivy::{doc, Index, IndexReader, IndexWriter};

 // A simple helper function to fetch a single document
 // given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader};
 fn extract_doc_given_isbn(
     reader: &IndexReader,
     isbn_term: &Term,
-) -> tantivy::Result<Option<Document>> {
+) -> tantivy::Result<Option<TantivyDocument>> {
     let searcher = reader.searcher();

     // This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {

     let index = Index::create_in_ram(schema.clone());

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     // Let's add a couple of documents, for the sake of the example.
-    let mut old_man_doc = Document::default();
+    let mut old_man_doc = TantivyDocument::default();
     old_man_doc.add_text(title, "The Old Man and the Sea");
     index_writer.add_document(doc!(
         isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
     // Oops our frankenstein doc seems misspelled
     let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
     assert_eq!(
-        schema.to_json(&frankenstein_doc_misspelled),
+        frankenstein_doc_misspelled.to_json(&schema),
         r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
     );

@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
     // No more typo!
     let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
     assert_eq!(
-        schema.to_json(&frankenstein_new_doc),
+        frankenstein_new_doc.to_json(&schema),
         r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
     );

@@ -17,7 +17,7 @@
 use tantivy::collector::FacetCollector;
 use tantivy::query::{AllQuery, TermQuery};
 use tantivy::schema::*;
-use tantivy::{doc, Index};
+use tantivy::{doc, Index, IndexWriter};

 fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);

-    let mut index_writer = index.writer(30_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(30_000_000)?;

     // For convenience, tantivy also comes with a macro to
     // reduce the boilerplate above.

@@ -12,7 +12,7 @@ use std::collections::HashSet;
 use tantivy::collector::TopDocs;
 use tantivy::query::BooleanQuery;
 use tantivy::schema::*;
-use tantivy::{doc, DocId, Index, Score, SegmentReader};
+use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader};

 fn main() -> tantivy::Result<()> {
     let mut schema_builder = Schema::builder();
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);

-    let mut index_writer = index.writer(30_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(30_000_000)?;

     index_writer.add_document(doc!(
         title => "Fried egg",
@@ -91,11 +91,10 @@ fn main() -> tantivy::Result<()> {
         .iter()
         .map(|(_, doc_id)| {
             searcher
-                .doc(*doc_id)
+                .doc::<TantivyDocument>(*doc_id)
                 .unwrap()
                 .get_first(title)
-                .unwrap()
-                .as_text()
+                .and_then(|v| v.as_str())
                 .unwrap()
                 .to_owned()
         })

@@ -14,7 +14,7 @@
 use tantivy::collector::{Count, TopDocs};
 use tantivy::query::FuzzyTermQuery;
 use tantivy::schema::*;
-use tantivy::{doc, Index, ReloadPolicy};
+use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
 use tempfile::TempDir;

 fn main() -> tantivy::Result<()> {
@@ -66,7 +66,7 @@ fn main() -> tantivy::Result<()> {
     // Here we give tantivy a budget of `50MB`.
     // Using a bigger memory_arena for the indexer may increase
     // throughput, but 50 MB is already plenty.
-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     // Let's index our documents!
     // We first need a handle on the title and the body field.
@@ -151,10 +151,10 @@ fn main() -> tantivy::Result<()> {
     assert_eq!(count, 3);
     assert_eq!(top_docs.len(), 3);
     for (score, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
         // Note that the score is not lower for the fuzzy hit.
         // There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
-        println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
         // score 1.0 doc {"title":["The Diary of Muadib"]}
         //
         // score 1.0 doc {"title":["The Diary of a Young Girl"]}

@@ -21,7 +21,7 @@ fn main() -> tantivy::Result<()> {
     }"#;

     // We can parse our document
-    let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
+    let _mice_and_men_doc = TantivyDocument::parse_json(&schema, mice_and_men_doc_json)?;

     // Multi-valued field are allowed, they are
     // expressed in JSON by an array.
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
         "title": ["Frankenstein", "The Modern Prometheus"],
         "year": 1818
     }"#;
-    let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
+    let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?;

     // Note that the schema is saved in your index directory.
     //

@@ -5,7 +5,7 @@
 use tantivy::collector::Count;
 use tantivy::query::RangeQuery;
 use tantivy::schema::{Schema, INDEXED};
-use tantivy::{doc, Index, Result};
+use tantivy::{doc, Index, IndexWriter, Result};

 fn main() -> Result<()> {
     // For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
     let index = Index::create_in_ram(schema);
     let reader = index.reader()?;
     {
-        let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
+        let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?;
         for year in 1950u64..2019u64 {
             index_writer.add_document(doc!(year_field => year))?;
         }

@@ -6,7 +6,7 @@
 use tantivy::collector::{Count, TopDocs};
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
-use tantivy::Index;
+use tantivy::{Index, IndexWriter, TantivyDocument};

 fn main() -> tantivy::Result<()> {
     // # Defining the schema
@@ -22,20 +22,22 @@ fn main() -> tantivy::Result<()> {
     // # Indexing documents
     let index = Index::create_in_ram(schema.clone());

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     // ### IPv4
     // Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
     // `String`. Since the field is of type ip, we parse the IP address from the string and store it
     // internally as IPv6.
-    let doc = schema.parse_document(
+    let doc = TantivyDocument::parse_json(
+        &schema,
         r#"{
             "ip": "192.168.0.33",
             "event_type": "login"
         }"#,
     )?;
     index_writer.add_document(doc)?;
-    let doc = schema.parse_document(
+    let doc = TantivyDocument::parse_json(
+        &schema,
         r#"{
             "ip": "192.168.0.80",
             "event_type": "checkout"
@@ -44,7 +46,8 @@ fn main() -> tantivy::Result<()> {
     index_writer.add_document(doc)?;
     // ### IPv6
     // Adding a document that contains an IPv6 address.
-    let doc = schema.parse_document(
+    let doc = TantivyDocument::parse_json(
+        &schema,
         r#"{
             "ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
             "event_type": "checkout"

@@ -10,7 +10,7 @@
 // ---
 // Importing tantivy...
 use tantivy::schema::*;
-use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
+use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};

 fn main() -> tantivy::Result<()> {
     // We first create a schema for the sake of the
@@ -24,7 +24,7 @@ fn main() -> tantivy::Result<()> {

     let index = Index::create_in_ram(schema);

-    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
     index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
     index_writer.add_document(doc!(title => "Of Mice and Men"))?;
     index_writer.add_document(doc!(title => "The modern Promotheus"))?;

@@ -7,7 +7,7 @@
 use tantivy::collector::{Count, TopDocs};
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
-use tantivy::Index;
+use tantivy::{Index, IndexWriter, TantivyDocument};

 fn main() -> tantivy::Result<()> {
     // # Defining the schema
@@ -20,8 +20,9 @@ fn main() -> tantivy::Result<()> {
     // # Indexing documents
     let index = Index::create_in_ram(schema.clone());

-    let mut index_writer = index.writer(50_000_000)?;
-    let doc = schema.parse_document(
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
+    let doc = TantivyDocument::parse_json(
+        &schema,
         r#"{
             "timestamp": "2022-02-22T23:20:50.53Z",
             "event_type": "click",
@@ -33,7 +34,8 @@ fn main() -> tantivy::Result<()> {
         }"#,
     )?;
     index_writer.add_document(doc)?;
-    let doc = schema.parse_document(
+    let doc = TantivyDocument::parse_json(
+        &schema,
         r#"{
             "timestamp": "2022-02-22T23:20:51.53Z",
             "event_type": "click",

@@ -1,7 +1,7 @@
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
-use tantivy::{doc, Index, ReloadPolicy, Result};
+use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result};
 use tempfile::TempDir;

 fn main() -> Result<()> {
@@ -17,7 +17,7 @@ fn main() -> Result<()> {

     let index = Index::create_in_dir(&index_path, schema)?;

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     index_writer.add_document(doc!(
         title => "The Old Man and the Sea",
@@ -67,8 +67,12 @@ fn main() -> Result<()> {
     let mut titles = top_docs
         .into_iter()
         .map(|(_score, doc_address)| {
-            let doc = searcher.doc(doc_address)?;
-            let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
+            let doc = searcher.doc::<TantivyDocument>(doc_address)?;
+            let title = doc
+                .get_first(title)
+                .and_then(|v| v.as_str())
+                .unwrap()
+                .to_owned();
             Ok(title)
         })
         .collect::<Result<Vec<_>>>()?;

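The accessor change in this hunk recurs across the examples: `get_first` now hands back a trait-level value, so the old `.unwrap().as_text()` chain becomes `.and_then(|v| v.as_str())` on the `Option`. Compacted into a hypothetical helper (the `Document` trait may also need to be in scope for `get_first`; the import paths here are assumptions):

use tantivy::schema::Field;
use tantivy::{Document, TantivyDocument}; // `Document` here is the trait

// Hypothetical helper, not part of the diff.
fn first_text(doc: &TantivyDocument, field: Field) -> Option<String> {
    doc.get_first(field) // Option over the field's first value
        .and_then(|v| v.as_str()) // replaces the old as_text()
        .map(str::to_owned)
}
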
@@ -13,7 +13,7 @@ use tantivy::collector::{Count, TopDocs};
 use tantivy::query::TermQuery;
 use tantivy::schema::*;
 use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
-use tantivy::{doc, Index, ReloadPolicy};
+use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
 use tempfile::TempDir;

 fn pre_tokenize_text(text: &str) -> Vec<Token> {
@@ -38,7 +38,7 @@ fn main() -> tantivy::Result<()> {

     let index = Index::create_in_dir(&index_path, schema.clone())?;

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     // We can create a document manually, by setting the fields
     // one by one in a Document object.
@@ -83,7 +83,7 @@ fn main() -> tantivy::Result<()> {
         }]
     }"#;

-    let short_man_doc = schema.parse_document(short_man_json)?;
+    let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?;

     index_writer.add_document(short_man_doc)?;

@@ -115,8 +115,8 @@ fn main() -> tantivy::Result<()> {
     // Note that the tokens are not stored along with the original text
     // in the document store
     for (_score, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("Document: {}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     // In contrary to the previous query, when we search for the "man" term we

@@ -10,7 +10,7 @@
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
-use tantivy::{doc, Index, Snippet, SnippetGenerator};
+use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator};
 use tempfile::TempDir;

 fn main() -> tantivy::Result<()> {
@@ -27,7 +27,7 @@ fn main() -> tantivy::Result<()> {
     // # Indexing documents
     let index = Index::create_in_dir(&index_path, schema)?;

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     // we'll only need one doc for this example.
     index_writer.add_document(doc!(
@@ -54,13 +54,10 @@ fn main() -> tantivy::Result<()> {
     let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

     for (score, doc_address) in top_docs {
-        let doc = searcher.doc(doc_address)?;
+        let doc = searcher.doc::<TantivyDocument>(doc_address)?;
         let snippet = snippet_generator.snippet_from_doc(&doc);
         println!("Document score {score}:");
-        println!(
-            "title: {}",
-            doc.get_first(title).unwrap().as_text().unwrap()
-        );
+        println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
         println!("snippet: {}", snippet.to_html());
         println!("custom highlighting: {}", highlight(snippet));
     }

@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::*;
-use tantivy::{doc, Index};
+use tantivy::{doc, Index, IndexWriter};

 fn main() -> tantivy::Result<()> {
     // this example assumes you understand the content in `basic_search`
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {

     index.tokenizers().register("stoppy", tokenizer);

-    let mut index_writer = index.writer(50_000_000)?;
+    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

     let title = schema.get_field("title").unwrap();
     let body = schema.get_field("body").unwrap();
@@ -105,9 +105,9 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

     for (score, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
         println!("\n==\nDocument score {score}:");
-        println!("{}", schema.to_json(&retrieved_doc));
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     Ok(())

@@ -6,8 +6,8 @@ use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, FAST, TEXT};
 use tantivy::{
-    doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
-    Warmer,
+    doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
+    SegmentReader, Warmer,
 };

 // This example shows how warmers can be used to
@@ -143,7 +143,7 @@ fn main() -> tantivy::Result<()> {
     const SNEAKERS: ProductId = 23222;

     let index = Index::create_in_ram(schema);
-    let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
+    let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
     writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
     writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
     writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;

@@ -9,7 +9,7 @@ use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_v
 use crate::aggregation::DistributedAggregationCollector;
 use crate::query::{AllQuery, TermQuery};
 use crate::schema::{IndexRecordOption, Schema, FAST};
-use crate::{Index, Term};
+use crate::{Index, IndexWriter, Term};

 fn get_avg_req(field_name: &str) -> Aggregation {
     serde_json::from_value(json!({
@@ -586,7 +586,7 @@ fn test_aggregation_on_json_object() {
     let json = schema_builder.add_json_field("json", FAST);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_for_tests().unwrap();
+    let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
     index_writer
         .add_document(doc!(json => json!({"color": "red"})))
         .unwrap();
@@ -630,7 +630,7 @@ fn test_aggregation_on_json_object_empty_columns() {
     let json = schema_builder.add_json_field("json", FAST);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_for_tests().unwrap();
+    let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
     // => Empty column when accessing color
     index_writer
         .add_document(doc!(json => json!({"price": 10.0})))
@@ -748,7 +748,7 @@ fn test_aggregation_on_json_object_mixed_types() {
     let json = schema_builder.add_json_field("json", FAST);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_for_tests().unwrap();
+    let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
     // => Segment with all values numeric
     index_writer
         .add_document(doc!(json => json!({"mixed_type": 10.0})))

@@ -252,7 +252,7 @@ pub mod tests {
     use crate::aggregation::tests::exec_request;
     use crate::indexer::NoMergePolicy;
     use crate::schema::{Schema, FAST, STRING};
-    use crate::Index;
+    use crate::{Index, IndexWriter, TantivyDocument};

     #[test]
     fn test_parse_into_millisecs() {
@@ -316,7 +316,7 @@ pub mod tests {
         index_writer.set_merge_policy(Box::new(NoMergePolicy));
         for values in segment_and_docs {
             for doc_str in values {
-                let doc = schema.parse_document(doc_str)?;
+                let doc = TantivyDocument::parse_json(&schema, doc_str)?;
                 index_writer.add_document(doc)?;
             }
             // writing the segment
@@ -328,7 +328,7 @@ pub mod tests {
             .searchable_segment_ids()
             .expect("Searchable segments failed.");
         if segment_ids.len() > 1 {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
             index_writer.merge(&segment_ids).wait()?;
             index_writer.wait_merging_threads()?;
         }

@@ -601,7 +601,7 @@ mod tests {
     use crate::aggregation::AggregationLimits;
     use crate::indexer::NoMergePolicy;
     use crate::schema::{Schema, FAST, STRING};
-    use crate::Index;
+    use crate::{Index, IndexWriter};

     #[test]
     fn terms_aggregation_test_single_segment() -> crate::Result<()> {
@@ -1473,7 +1473,7 @@ mod tests {
         let json = schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with empty json
         index_writer.add_document(doc!()).unwrap();
         index_writer.commit().unwrap();

@@ -117,7 +117,7 @@ mod tests {
     use crate::aggregation::agg_req::Aggregations;
     use crate::aggregation::tests::exec_request_with_query;
     use crate::schema::{Schema, FAST};
-    use crate::Index;
+    use crate::{Index, IndexWriter};

     #[test]
     fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
         let score = schema_builder.add_f64_field("score", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with all values numeric
         index_writer
             .add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
@@ -186,7 +186,7 @@ mod tests {
         let score = schema_builder.add_f64_field("score", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with all values numeric
         index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
         index_writer.add_document(doc!(score => 5.0))?;
@@ -231,7 +231,7 @@ mod tests {
         let score = schema_builder.add_f64_field("score", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();

         index_writer.add_document(doc!(score => 5.0))?;
         index_writer.commit().unwrap();
@@ -278,7 +278,7 @@ mod tests {
         let score = schema_builder.add_f64_field("score", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();

         index_writer.add_document(doc!(score => 5.0))?;
         index_writer.add_document(doc!(score => 5.0))?;
@@ -323,7 +323,7 @@ mod tests {
         let json = schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with all values numeric
         index_writer
             .add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -385,7 +385,7 @@ mod tests {
         let json = schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with all values numeric
         index_writer
             .add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -427,7 +427,7 @@ mod tests {
         let json = schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with all values numeric
         index_writer
             .add_document(doc!(json => json!({"mixed_type": 10.0})))

@@ -71,7 +71,7 @@ mod tests {
     use crate::aggregation::agg_req::Aggregations;
     use crate::aggregation::tests::exec_request_with_query;
     use crate::schema::{Schema, FAST};
-    use crate::Index;
+    use crate::{Index, IndexWriter};

     #[test]
     fn test_max_agg_with_missing() -> crate::Result<()> {
@@ -79,7 +79,7 @@ mod tests {
         let json = schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with empty json
         index_writer.add_document(doc!()).unwrap();
         index_writer.commit().unwrap();

@@ -88,7 +88,7 @@ mod tests {
     use crate::aggregation::AggregationCollector;
     use crate::query::AllQuery;
     use crate::schema::{NumericOptions, Schema};
-    use crate::Index;
+    use crate::{Index, IndexWriter};

     #[test]
     fn test_metric_aggregations() {
@@ -96,7 +96,7 @@ mod tests {
         let field_options = NumericOptions::default().set_fast();
         let field = schema_builder.add_f64_field("price", field_options);
         let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();

         for i in 0..3 {
             index_writer

@@ -300,7 +300,7 @@ mod tests {
     use crate::aggregation::AggregationCollector;
     use crate::query::{AllQuery, TermQuery};
     use crate::schema::{IndexRecordOption, Schema, FAST};
-    use crate::{Index, Term};
+    use crate::{Index, IndexWriter, Term};

     #[test]
     fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -494,7 +494,7 @@ mod tests {
         let json = schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with empty json
         index_writer.add_document(doc!()).unwrap();
         index_writer.commit().unwrap();
@@ -541,7 +541,7 @@ mod tests {
         let json = schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         // => Segment with empty json
         index_writer.add_document(doc!()).unwrap();
         index_writer.commit().unwrap();

@@ -319,7 +319,7 @@ mod tests {
     use crate::indexer::NoMergePolicy;
     use crate::query::{AllQuery, TermQuery};
     use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
-    use crate::{Index, Term};
+    use crate::{Index, IndexWriter, Term};

     pub fn get_test_index_with_num_docs(
         merge_segments: bool,
@@ -451,7 +451,7 @@ mod tests {
             .searchable_segment_ids()
             .expect("Searchable segments failed.");
         if segment_ids.len() > 1 {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
             index_writer.merge(&segment_ids).wait()?;
             index_writer.wait_merging_threads()?;
         }
@@ -565,7 +565,7 @@ mod tests {
         let segment_ids = index
             .searchable_segment_ids()
             .expect("Searchable segments failed.");
-        let mut index_writer = index.writer_for_tests()?;
+        let mut index_writer: IndexWriter = index.writer_for_tests()?;
         index_writer.merge(&segment_ids).wait()?;
         index_writer.wait_merging_threads()?;
     }

@@ -495,8 +495,8 @@ mod tests {
     use crate::collector::Count;
     use crate::core::Index;
     use crate::query::{AllQuery, QueryParser, TermQuery};
-    use crate::schema::{Document, Facet, FacetOptions, IndexRecordOption, Schema};
-    use crate::Term;
+    use crate::schema::{Facet, FacetOptions, IndexRecordOption, Schema, TantivyDocument};
+    use crate::{IndexWriter, Term};

     fn test_collapse_mapping_aux(
         facet_terms: &[&str],
@@ -559,7 +559,7 @@ mod tests {
         let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         index_writer
             .add_document(doc!(facet_field=>Facet::from("/facet/a")))
             .unwrap();
@@ -588,7 +588,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);

-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         let num_facets: usize = 3 * 4 * 5;
         let facets: Vec<Facet> = (0..num_facets)
             .map(|mut n| {
@@ -601,7 +601,7 @@ mod tests {
             })
             .collect();
         for i in 0..num_facets * 10 {
-            let mut doc = Document::new();
+            let mut doc = TantivyDocument::new();
             doc.add_facet(facet_field, facets[i % num_facets].clone());
             index_writer.add_document(doc).unwrap();
         }
@@ -732,24 +732,25 @@ mod tests {
         let index = Index::create_in_ram(schema);

         let uniform = Uniform::new_inclusive(1, 100_000);
-        let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
-            .into_iter()
-            .flat_map(|(c, count)| {
-                let facet = Facet::from(&format!("/facet/{}", c));
-                let doc = doc!(facet_field => facet);
-                iter::repeat(doc).take(count)
-            })
-            .map(|mut doc| {
-                doc.add_facet(
-                    facet_field,
-                    &format!("/facet/{}", thread_rng().sample(uniform)),
-                );
-                doc
-            })
-            .collect();
+        let mut docs: Vec<TantivyDocument> =
+            vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
+                .into_iter()
+                .flat_map(|(c, count)| {
+                    let facet = Facet::from(&format!("/facet/{}", c));
+                    let doc = doc!(facet_field => facet);
+                    iter::repeat(doc).take(count)
+                })
+                .map(|mut doc| {
+                    doc.add_facet(
+                        facet_field,
+                        &format!("/facet/{}", thread_rng().sample(uniform)),
+                    );
+                    doc
+                })
+                .collect();
         docs[..].shuffle(&mut thread_rng());

-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         for doc in docs {
             index_writer.add_document(doc).unwrap();
         }
@@ -780,7 +781,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);

-        let docs: Vec<Document> = vec![("b", 2), ("a", 2), ("c", 4)]
+        let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
            .into_iter()
            .flat_map(|(c, count)| {
                let facet = Facet::from(&format!("/facet/{}", c));
@@ -828,7 +829,7 @@ mod bench {
     use crate::collector::FacetCollector;
     use crate::query::AllQuery;
     use crate::schema::{Facet, Schema, INDEXED};
-    use crate::Index;
+    use crate::{Index, IndexWriter};

     #[bench]
     fn bench_facet_collector(b: &mut Bencher) {
@@ -847,7 +848,7 @@ mod bench {
         // 40425 docs
         docs[..].shuffle(&mut thread_rng());

-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
         for doc in docs {
             index_writer.add_document(doc).unwrap();
         }

@@ -7,7 +7,9 @@ use crate::query::{AllQuery, QueryParser};
 use crate::schema::{Schema, FAST, TEXT};
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::OffsetDateTime;
-use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
+use crate::{
+    doc, DateTime, DocAddress, DocId, Index, Score, Searcher, SegmentOrdinal, TantivyDocument,
+};

 pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
     compute_score: true,
@@ -280,8 +282,8 @@ fn make_test_searcher() -> crate::Result<Searcher> {
     let schema = Schema::builder().build();
     let index = Index::create_in_ram(schema);
     let mut index_writer = index.writer_for_tests()?;
-    index_writer.add_document(Document::default())?;
-    index_writer.add_document(Document::default())?;
+    index_writer.add_document(TantivyDocument::default())?;
+    index_writer.add_document(TantivyDocument::default())?;
     index_writer.commit()?;
     Ok(index.reader()?.searcher())
 }

@@ -19,6 +19,7 @@ use crate::error::{DataCorruption, TantivyError};
 use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
 use crate::indexer::segment_updater::save_metas;
 use crate::reader::{IndexReader, IndexReaderBuilder};
+use crate::schema::document::Document;
 use crate::schema::{Field, FieldType, Schema};
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::IndexWriter;
@@ -184,11 +185,11 @@ impl IndexBuilder {
     ///
     /// It expects an originally empty directory, and will not run any GC operation.
     #[doc(hidden)]
-    pub fn single_segment_index_writer(
+    pub fn single_segment_index_writer<D: Document>(
         self,
         dir: impl Into<Box<dyn Directory>>,
         mem_budget: usize,
-    ) -> crate::Result<SingleSegmentIndexWriter> {
+    ) -> crate::Result<SingleSegmentIndexWriter<D>> {
         let index = self.create(dir)?;
         let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
         Ok(index_simple_writer)
@@ -531,11 +532,11 @@ impl Index {
     /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
     /// If the memory arena per thread is too small or too big, returns
     /// `TantivyError::InvalidArgument`
-    pub fn writer_with_num_threads(
+    pub fn writer_with_num_threads<D: Document>(
         &self,
         num_threads: usize,
         overall_memory_budget_in_bytes: usize,
-    ) -> crate::Result<IndexWriter> {
+    ) -> crate::Result<IndexWriter<D>> {
         let directory_lock = self
             .directory
             .acquire_lock(&INDEX_WRITER_LOCK)
@@ -564,7 +565,7 @@ impl Index {
     /// That index writer only simply has a single thread and a memory budget of 15 MB.
     /// Using a single thread gives us a deterministic allocation of DocId.
     #[cfg(test)]
-    pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
+    pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<D>> {
         self.writer_with_num_threads(1, 15_000_000)
     }

@@ -579,7 +580,10 @@ impl Index {
     /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
     /// If the memory arena per thread is too small or too big, returns
     /// `TantivyError::InvalidArgument`
-    pub fn writer(&self, memory_budget_in_bytes: usize) -> crate::Result<IndexWriter> {
+    pub fn writer<D: Document>(
+        &self,
+        memory_budget_in_bytes: usize,
+    ) -> crate::Result<IndexWriter<D>> {
         let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
         let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
         if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {

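This is the hunk that forces the annotations seen throughout the examples and tests above: `writer`, `writer_with_num_threads`, and `writer_for_tests` are now generic over `D: Document`. Two equivalent ways for callers to pin the type, assuming `TantivyDocument` is the default parameter, as the updated call sites suggest:

use tantivy::{Index, IndexWriter, TantivyDocument};

fn open_writers(index: &Index) -> tantivy::Result<()> {
    // Via an annotation, relying on the default generic parameter:
    let writer: IndexWriter = index.writer(50_000_000)?;
    drop(writer); // only one live writer per index at a time
    // Or spelled out with a turbofish:
    let writer = index.writer::<TantivyDocument>(50_000_000)?;
    drop(writer);
    Ok(())
}
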
@@ -5,6 +5,7 @@ use rustc_hash::FxHashMap;

 use crate::fastfield::FastValue;
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
+use crate::schema::document::{DocValue, ReferenceValue};
 use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
 use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
 use crate::time::format_description::well_known::Rfc3339;
@@ -64,9 +65,9 @@ impl IndexingPositionsPerPath {
     }
 }

-pub(crate) fn index_json_values<'a>(
+pub(crate) fn index_json_values<'a, V: DocValue<'a>>(
     doc: DocId,
-    json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
+    json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
     text_analyzer: &mut TextAnalyzer,
     expand_dots_enabled: bool,
     term_buffer: &mut Term,
@@ -75,11 +76,11 @@ pub(crate) fn index_json_values<'a>(
 ) -> crate::Result<()> {
     let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
     let mut positions_per_path: IndexingPositionsPerPath = Default::default();
-    for json_value_res in json_values {
-        let json_value = json_value_res?;
-        index_json_object(
+    for json_visitor_res in json_visitors {
+        let json_visitor = json_visitor_res?;
+        index_json_object::<V>(
             doc,
-            json_value,
+            json_visitor,
             text_analyzer,
             &mut json_term_writer,
             postings_writer,
@@ -90,20 +91,20 @@ pub(crate) fn index_json_values<'a>(
     Ok(())
 }

-fn index_json_object(
+fn index_json_object<'a, V: DocValue<'a>>(
     doc: DocId,
-    json_value: &serde_json::Map<String, serde_json::Value>,
+    json_visitor: V::ObjectIter,
     text_analyzer: &mut TextAnalyzer,
     json_term_writer: &mut JsonTermWriter,
     postings_writer: &mut dyn PostingsWriter,
     ctx: &mut IndexingContext,
     positions_per_path: &mut IndexingPositionsPerPath,
 ) {
-    for (json_path_segment, json_value) in json_value {
+    for (json_path_segment, json_value_visitor) in json_visitor {
         json_term_writer.push_path_segment(json_path_segment);
         index_json_value(
             doc,
-            json_value,
+            json_value_visitor,
             text_analyzer,
             json_term_writer,
             postings_writer,
@@ -114,9 +115,9 @@ fn index_json_object(
     }
 }

-fn index_json_value(
+fn index_json_value<'a, V: DocValue<'a>>(
     doc: DocId,
-    json_value: &serde_json::Value,
+    json_value: ReferenceValue<'a, V>,
     text_analyzer: &mut TextAnalyzer,
     json_term_writer: &mut JsonTermWriter,
     postings_writer: &mut dyn PostingsWriter,
@@ -124,43 +125,56 @@ fn index_json_value(
     positions_per_path: &mut IndexingPositionsPerPath,
 ) {
     match json_value {
-        serde_json::Value::Null => {}
-        serde_json::Value::Bool(val_bool) => {
-            json_term_writer.set_fast_value(*val_bool);
+        ReferenceValue::Null => {}
+        ReferenceValue::Str(val) => {
+            let mut token_stream = text_analyzer.token_stream(val);
+
+            // TODO: make sure the chain position works out.
+            json_term_writer.close_path_and_set_type(Type::Str);
+            let indexing_position = positions_per_path.get_position(json_term_writer.term());
+            postings_writer.index_text(
+                doc,
+                &mut *token_stream,
+                json_term_writer.term_buffer,
+                ctx,
+                indexing_position,
+            );
+        }
+        ReferenceValue::U64(val) => {
+            json_term_writer.set_fast_value(val);
             postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
         }
-        serde_json::Value::Number(number) => {
-            if let Some(number_i64) = number.as_i64() {
-                json_term_writer.set_fast_value(number_i64);
-            } else if let Some(number_u64) = number.as_u64() {
-                json_term_writer.set_fast_value(number_u64);
-            } else if let Some(number_f64) = number.as_f64() {
-                json_term_writer.set_fast_value(number_f64);
-            }
+        ReferenceValue::I64(val) => {
+            json_term_writer.set_fast_value(val);
             postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
         }
-        serde_json::Value::String(text) => match infer_type_from_str(text) {
-            TextOrDateTime::Text(text) => {
-                let mut token_stream = text_analyzer.token_stream(text);
-                // TODO make sure the chain position works out.
-                json_term_writer.close_path_and_set_type(Type::Str);
-                let indexing_position = positions_per_path.get_position(json_term_writer.term());
-                postings_writer.index_text(
-                    doc,
-                    &mut *token_stream,
-                    json_term_writer.term_buffer,
-                    ctx,
-                    indexing_position,
-                );
-            }
-            TextOrDateTime::DateTime(dt) => {
-                json_term_writer.set_fast_value(DateTime::from_utc(dt));
-                postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
-            }
-        },
-        serde_json::Value::Array(arr) => {
-            for val in arr {
-                index_json_value(
+        ReferenceValue::F64(val) => {
+            json_term_writer.set_fast_value(val);
+            postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
+        }
+        ReferenceValue::Bool(val) => {
+            json_term_writer.set_fast_value(val);
+            postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
+        }
+        ReferenceValue::Facet(_) => {
+            unimplemented!("Facet support in dynamic fields is not yet implemented")
+        }
+        ReferenceValue::IpAddr(_) => {
+            unimplemented!("IP address support in dynamic fields is not yet implemented")
+        }
+        ReferenceValue::Date(val) => {
+            json_term_writer.set_fast_value(val);
+            postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
+        }
+        ReferenceValue::PreTokStr(_) => {
+            unimplemented!("Pre-tokenized string support in dynamic fields is not yet implemented")
+        }
+        ReferenceValue::Bytes(_) => {
+            unimplemented!("Bytes support in dynamic fields is not yet implemented")
+        }
+        ReferenceValue::Array(elements) => {
+            for val in elements {
+                index_json_value::<V::ChildValue>(
                     doc,
                     val,
                     text_analyzer,
@@ -171,10 +185,10 @@ fn index_json_value(
                 );
             }
         }
-        serde_json::Value::Object(map) => {
-            index_json_object(
+        ReferenceValue::Object(object) => {
+            index_json_object::<V>(
                 doc,
-                map,
object,
|
||||
text_analyzer,
|
||||
json_term_writer,
|
||||
postings_writer,
|
||||
@@ -185,21 +199,6 @@ fn index_json_value(
|
||||
}
|
||||
}
|
||||
|
||||
enum TextOrDateTime<'a> {
|
||||
Text(&'a str),
|
||||
DateTime(OffsetDateTime),
|
||||
}
|
||||
|
||||
fn infer_type_from_str(text: &str) -> TextOrDateTime {
|
||||
match OffsetDateTime::parse(text, &Rfc3339) {
|
||||
Ok(dt) => {
|
||||
let dt_utc = dt.to_offset(UtcOffset::UTC);
|
||||
TextOrDateTime::DateTime(dt_utc)
|
||||
}
|
||||
Err(_) => TextOrDateTime::Text(text),
|
||||
}
|
||||
}
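
The string handling above first tries to parse the value as an RFC 3339 timestamp and only falls back to plain text, which is how a value such as "1985-04-12T23:20:50.52Z" in a dynamic JSON field gets indexed as a date rather than a string. A minimal standalone sketch of the same inference, using the `time` crate as the code above does (the helper name is illustrative):

    use time::format_description::well_known::Rfc3339;
    use time::{OffsetDateTime, UtcOffset};

    // Returns the parsed UTC timestamp when `text` is valid RFC 3339.
    fn try_parse_rfc3339(text: &str) -> Option<OffsetDateTime> {
        OffsetDateTime::parse(text, &Rfc3339)
            .ok()
            .map(|dt| dt.to_offset(UtcOffset::UTC))
    }

    fn main() {
        assert!(try_parse_rfc3339("1985-04-12T23:20:50.52Z").is_some());
        assert!(try_parse_rfc3339("not a date").is_none());
    }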

// Tries to infer a JSON type from a string.
pub fn convert_to_fast_value_and_get_term(
    json_term_writer: &mut JsonTermWriter,

@@ -5,7 +5,8 @@ use std::{fmt, io};
use crate::collector::Collector;
use crate::core::{Executor, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::{Document, Schema, Term};
use crate::schema::document::Document;
use crate::schema::{Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
@@ -83,7 +84,7 @@ impl Searcher {
    ///
    /// The searcher uses the segment ordinal to route the
    /// request to the right `Segment`.
    pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
    pub fn doc<D: Document>(&self, doc_address: DocAddress) -> crate::Result<D> {
        let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
        store_reader.get(doc_address.doc_id)
    }
@@ -103,7 +104,7 @@ impl Searcher {

    /// Fetches a document in an asynchronous manner.
    #[cfg(feature = "quickwit")]
    pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
    pub async fn doc_async<D: Document>(&self, doc_address: DocAddress) -> crate::Result<D> {
        let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
        store_reader.get_async(doc_address.doc_id).await
    }
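
`Searcher::doc` (and `doc_async`) now deserializes into whichever document type the caller requests, so call sites pick the concrete type with a turbofish or an annotation, as the updated tests further down do. A minimal sketch (assuming an existing `searcher` and `doc_address`):

    // Turbofish form, matching the updated tests:
    let doc = searcher.doc::<TantivyDocument>(doc_address)?;
    // An annotation on the binding works equally well:
    let doc2: TantivyDocument = searcher.doc(doc_address)?;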

@@ -355,7 +355,7 @@ impl fmt::Debug for SegmentReader {
mod test {
    use crate::core::Index;
    use crate::schema::{Schema, Term, STORED, TEXT};
    use crate::DocId;
    use crate::{DocId, IndexWriter};

    #[test]
    fn test_num_alive() -> crate::Result<()> {
@@ -366,7 +366,7 @@ mod test {
        let name = schema.get_field("name").unwrap();

        {
            let mut index_writer = index.writer_for_tests()?;
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => "tantivy"))?;
            index_writer.add_document(doc!(name => "horse"))?;
            index_writer.add_document(doc!(name => "jockey"))?;
@@ -392,7 +392,7 @@ mod test {
        let name = schema.get_field("name").unwrap();

        {
            let mut index_writer = index.writer_for_tests()?;
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => "tantivy"))?;
            index_writer.add_document(doc!(name => "horse"))?;
            index_writer.add_document(doc!(name => "jockey"))?;
@@ -402,7 +402,7 @@ mod test {
        }

        {
            let mut index_writer2 = index.writer(50_000_000)?;
            let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
            index_writer2.delete_term(Term::from_field_text(name, "horse"));
            index_writer2.delete_term(Term::from_field_text(name, "cap"));

@@ -1,16 +1,20 @@
use std::marker::PhantomData;

use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};

#[doc(hidden)]
pub struct SingleSegmentIndexWriter {
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
    segment_writer: SegmentWriter,
    segment: Segment,
    opstamp: Opstamp,
    _phantom: PhantomData<D>,
}

impl SingleSegmentIndexWriter {
impl<D: Document> SingleSegmentIndexWriter<D> {
    pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
        let segment = index.new_segment();
        let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
@@ -18,6 +22,7 @@ impl SingleSegmentIndexWriter {
            segment_writer,
            segment,
            opstamp: 0,
            _phantom: PhantomData,
        })
    }

@@ -25,7 +30,7 @@ impl SingleSegmentIndexWriter {
        self.segment_writer.mem_usage()
    }

    pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
    pub fn add_document(&mut self, document: D) -> crate::Result<()> {
        let opstamp = self.opstamp;
        self.opstamp += 1;
        self.segment_writer
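
A note on the `PhantomData` field introduced above: `SingleSegmentIndexWriter<D>` never stores a value of type `D` (documents only pass through `add_document`), so a zero-sized marker is required to keep the type parameter attached to the struct. A minimal sketch of the same pattern, with a hypothetical stand-in type:

    use std::marker::PhantomData;

    // `D` appears only in method signatures, so `PhantomData<D>` carries it.
    struct TypedWriter<D> {
        opstamp: u64,
        _phantom: PhantomData<D>,
    }

    impl<D> TypedWriter<D> {
        fn add_document(&mut self, _document: D) {
            self.opstamp += 1;
        }
    }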

@@ -5,8 +5,8 @@ use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{
    Directory, Document, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, SegmentId,
    Term,
    Directory, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
    SegmentId, TantivyDocument, Term,
};

#[test]
@@ -159,7 +159,7 @@ mod mmap_specific {
    let schema = throw_away_schema();
    let field = schema.get_field("num_likes").unwrap();
    let mut index = Index::create_from_tempdir(schema)?;
    let mut writer = index.writer_for_tests()?;
    let mut writer: IndexWriter = index.writer_for_tests()?;
    writer.commit()?;
    let reader = index
        .reader_builder()
@@ -208,7 +208,7 @@ fn test_index_on_commit_reload_policy_aux(
        .watch(WatchCallback::new(move || {
            let _ = sender.send(());
        }));
    let mut writer = index.writer_for_tests()?;
    let mut writer: IndexWriter = index.writer_for_tests()?;
    assert_eq!(reader.searcher().num_docs(), 0);
    writer.add_document(doc!(field=>1u64))?;
    writer.commit().unwrap();
@@ -242,7 +242,7 @@ fn garbage_collect_works_as_intended() -> crate::Result<()> {
    let field = schema.get_field("num_likes").unwrap();
    let index = Index::create(directory.clone(), schema, IndexSettings::default())?;

    let mut writer = index.writer_with_num_threads(1, 32_000_000).unwrap();
    let mut writer: IndexWriter = index.writer_with_num_threads(1, 32_000_000).unwrap();
    for _seg in 0..8 {
        for i in 0u64..1_000u64 {
            writer.add_document(doc!(field => i))?;
@@ -306,7 +306,7 @@ fn test_merging_segment_update_docfreq() {
    let id_field = schema_builder.add_text_field("id", STRING);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut writer = index.writer_for_tests().unwrap();
    let mut writer: IndexWriter = index.writer_for_tests().unwrap();
    writer.set_merge_policy(Box::new(NoMergePolicy));
    for _ in 0..5 {
        writer.add_document(doc!(text_field=>"hello")).unwrap();
@@ -317,13 +317,13 @@ fn test_merging_segment_update_docfreq() {
    writer
        .add_document(doc!(text_field=>"hello", id_field=>"TO_BE_DELETED"))
        .unwrap();
    writer.add_document(Document::default()).unwrap();
    writer.add_document(TantivyDocument::default()).unwrap();
    writer.commit().unwrap();
    for _ in 0..7 {
        writer.add_document(doc!(text_field=>"hello")).unwrap();
    }
    writer.add_document(Document::default()).unwrap();
    writer.add_document(Document::default()).unwrap();
    writer.add_document(TantivyDocument::default()).unwrap();
    writer.add_document(TantivyDocument::default()).unwrap();
    writer.delete_term(Term::from_field_text(id_field, "TO_BE_DELETED"));
    writer.commit().unwrap();

@@ -533,7 +533,7 @@ mod tests {
    use super::*;
    use crate::indexer::LogMergePolicy;
    use crate::schema::{Schema, SchemaBuilder, TEXT};
    use crate::{Index, IndexSettings, ReloadPolicy};
    use crate::{Index, IndexSettings, IndexWriter, ReloadPolicy};

    #[test]
    fn test_open_non_existent_path() {
@@ -645,7 +645,7 @@ mod tests {
        let index =
            Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();

        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        let mut log_merge_policy = LogMergePolicy::default();
        log_merge_policy.set_min_num_segments(3);
        index_writer.set_merge_policy(Box::new(log_merge_policy));

src/error.rs
@@ -11,6 +11,7 @@ use crate::directory::error::{
    Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::document::DeserializeError;
use crate::{query, schema};

/// Represents a `DataCorruption` error.
@@ -106,6 +107,9 @@ pub enum TantivyError {
    /// e.g. a data structure is incorrectly initialized.
    #[error("Internal error: '{0}'")]
    InternalError(String),
    #[error("Deserialize error: {0}")]
    /// An error occurred while attempting to deserialize a document.
    DeserializeError(DeserializeError),
}

impl From<io::Error> for TantivyError {
@@ -176,3 +180,9 @@ impl From<rayon::ThreadPoolBuildError> for TantivyError {
        TantivyError::SystemError(error.to_string())
    }
}

impl From<DeserializeError> for TantivyError {
    fn from(error: DeserializeError) -> TantivyError {
        TantivyError::DeserializeError(error)
    }
}
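
The `From<DeserializeError>` impl is what lets document deserialization failures bubble up through `?` in any function returning `crate::Result`. A minimal sketch of the effect (the helper is illustrative only):

    // `res?` converts a `DeserializeError` into
    // `TantivyError::DeserializeError` via the `From` impl above.
    fn to_tantivy_result<T>(res: Result<T, DeserializeError>) -> crate::Result<T> {
        Ok(res?)
    }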

@@ -62,8 +62,9 @@ impl FacetReader {

#[cfg(test)]
mod tests {
    use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
    use crate::{DocAddress, Document, Index};
    use crate::schema::document::DocValue;
    use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
    use crate::{DocAddress, Index, IndexWriter, TantivyDocument};

    #[test]
    fn test_facet_only_indexed() {
@@ -71,7 +72,7 @@ mod tests {
        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))
            .unwrap();
@@ -85,8 +86,10 @@ mod tests {
        let mut facet = Facet::default();
        facet_reader.facet_from_ord(0, &mut facet).unwrap();
        assert_eq!(facet.to_path_string(), "/a/b");
        let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
        let value = doc.get_first(facet_field).and_then(Value::as_facet);
        let doc = searcher
            .doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
            .unwrap();
        let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
        assert_eq!(value, None);
    }

@@ -96,7 +99,7 @@ mod tests {
        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap()))
            .unwrap();
@@ -142,8 +145,8 @@ mod tests {
        let mut facet_ords = Vec::new();
        facet_ords.extend(facet_reader.facet_ords(0u32));
        assert_eq!(&facet_ords, &[0u64]);
        let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
        let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
        let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
        let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet());
        assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
        Ok(())
    }
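
Note the accessor change in these tests: with the old concrete `Value` enum, a function path like `Value::as_facet` could be passed to `and_then` directly; with the `DocValue` trait, `as_facet` is a method on the borrowed value, so call sites use a closure instead. A minimal sketch (assuming a retrieved `doc` and a `facet_field`):

    // Old: doc.get_first(facet_field).and_then(Value::as_facet)
    // New: the accessor is invoked on the trait value.
    let facet = doc.get_first(facet_field).and_then(|v| v.as_facet());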
@@ -156,7 +159,7 @@ mod tests {
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.add_document(Document::default())?;
        index_writer.add_document(TantivyDocument::default())?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
@@ -176,8 +179,8 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(Document::default())?;
        index_writer.add_document(Document::default())?;
        index_writer.add_document(TantivyDocument::default())?;
        index_writer.add_document(TantivyDocument::default())?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();

@@ -90,12 +90,12 @@ mod tests {
    use crate::directory::{Directory, RamDirectory, WritePtr};
    use crate::merge_policy::NoMergePolicy;
    use crate::schema::{
        Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
        Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument,
        TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
    };
    use crate::time::OffsetDateTime;
    use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
    use crate::{DateOptions, DateTimePrecision, Index, SegmentId, SegmentReader};
    use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader};

    pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
        let mut schema_builder = Schema::builder();
@@ -271,7 +271,7 @@ mod tests {
        let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
        let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
        for i in -100i64..10_000i64 {
            let mut doc = Document::default();
            let mut doc = TantivyDocument::default();
            doc.add_i64(i64_field, i);
            fast_field_writers.add_document(&doc).unwrap();
        }
@@ -312,7 +312,7 @@ mod tests {
        {
            let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
            let doc = Document::default();
            let doc = TantivyDocument::default();
            fast_field_writers.add_document(&doc).unwrap();
            fast_field_writers.serialize(&mut write, None).unwrap();
            write.terminate().unwrap();
@@ -345,7 +345,7 @@ mod tests {
        {
            let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
            let doc = Document::default();
            let doc = TantivyDocument::default();
            fast_field_writers.add_document(&doc).unwrap();
            fast_field_writers.serialize(&mut write, None).unwrap();
            write.terminate().unwrap();
@@ -416,7 +416,7 @@ mod tests {
        let date_field = schema_builder.add_date_field("date", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer.set_merge_policy(Box::new(NoMergePolicy));
        index_writer
            .add_document(doc!(date_field => DateTime::from_utc(OffsetDateTime::now_utc())))
@@ -452,7 +452,7 @@ mod tests {

        {
            // first segment
            let mut index_writer = index.writer_for_tests().unwrap();
            let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            index_writer
                .add_document(doc!(
@@ -506,7 +506,7 @@ mod tests {

        {
            // second segment
            let mut index_writer = index.writer_for_tests().unwrap();
            let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();

            index_writer
                .add_document(doc!(
@@ -537,7 +537,7 @@ mod tests {
        // Merging the segments
        {
            let segment_ids = index.searchable_segment_ids().unwrap();
            let mut index_writer = index.writer_for_tests().unwrap();
            let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
            index_writer.merge(&segment_ids).wait().unwrap();
            index_writer.wait_merging_threads().unwrap();
        }
@@ -662,7 +662,7 @@ mod tests {
        // Merging the segments
        {
            let segment_ids = index.searchable_segment_ids()?;
            let mut index_writer = index.writer_for_tests()?;
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.merge(&segment_ids).wait()?;
            index_writer.wait_merging_threads()?;
        }
@@ -824,7 +824,7 @@ mod tests {
        {
            let mut write: WritePtr = directory.open_write(path).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
            let doc = Document::default();
            let doc = TantivyDocument::default();
            fast_field_writers.add_document(&doc).unwrap();
            fast_field_writers.serialize(&mut write, None).unwrap();
            write.terminate().unwrap();
@@ -846,7 +846,7 @@ mod tests {
        assert_eq!(col.get_val(0), true);
    }

    fn get_index(docs: &[crate::Document], schema: &Schema) -> crate::Result<RamDirectory> {
    fn get_index(docs: &[crate::TantivyDocument], schema: &Schema) -> crate::Result<RamDirectory> {
        let directory: RamDirectory = RamDirectory::create();
        {
            let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
@@ -888,7 +888,7 @@ mod tests {
        let field = schema_builder.add_date_field("field", date_options);
        let schema = schema_builder.build();

        let docs: Vec<Document> = times.iter().map(|time| doc!(field=>*time)).collect();
        let docs: Vec<TantivyDocument> = times.iter().map(|time| doc!(field=>*time)).collect();

        let directory = get_index(&docs[..], &schema).unwrap();
        let path = Path::new("test");
@@ -962,11 +962,15 @@ mod tests {
        let ip_field = schema_builder.add_u64_field("ip", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        let ip_addr = Ipv6Addr::new(1, 2, 3, 4, 5, 1, 2, 3);
        index_writer.add_document(Document::default()).unwrap();
        index_writer
            .add_document(TantivyDocument::default())
            .unwrap();
        index_writer.add_document(doc!(ip_field=>ip_addr)).unwrap();
        index_writer.add_document(Document::default()).unwrap();
        index_writer
            .add_document(TantivyDocument::default())
            .unwrap();
        index_writer.commit().unwrap();
        let searcher = index.reader().unwrap().searcher();
        let fastfields = searcher.segment_reader(0u32).fast_fields();
@@ -1086,7 +1090,7 @@ mod tests {
        let json = schema_builder.add_json_field("json", json_option);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(json => json!({"attr.age": 32})))
            .unwrap();
@@ -1112,7 +1116,7 @@ mod tests {
        let json = schema_builder.add_json_field("json", json_option);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(json => json!({"age": 32})))
            .unwrap();
@@ -1139,7 +1143,7 @@ mod tests {
        let json = schema_builder.add_json_field("json", json_option);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(json => json!({"attr.age": 32})))
            .unwrap();
@@ -1162,7 +1166,7 @@ mod tests {
        let field_with_dot = schema_builder.add_i64_field("field.with.dot", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(field_with_dot => 32i64))
            .unwrap();
@@ -1184,7 +1188,7 @@ mod tests {
        let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(json_field=> json!({"attr": {"age": 32}}), shadowing_json_field=>json!({"age": 33})))
            .unwrap();
@@ -1215,7 +1219,7 @@ mod tests {

        let mut index = Index::create_in_ram(schema);
        index.set_fast_field_tokenizers(ff_tokenizer_manager);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(text_field => "Test1 test2"))
            .unwrap();
@@ -1244,7 +1248,7 @@ mod tests {
        let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(log_field => "info"))
            .unwrap();
@@ -1277,7 +1281,7 @@ mod tests {
        let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(json_field=> json!({"attr.age": 32}), shadowing_json_field=>json!({"age": 33})))
            .unwrap();

@@ -357,7 +357,7 @@ mod tests {
    use columnar::ColumnType;

    use crate::schema::{JsonObjectOptions, Schema, FAST};
    use crate::{Document, Index};
    use crate::{Index, IndexWriter, TantivyDocument};

    #[test]
    fn test_fast_field_reader_resolve_with_dynamic_internal() {
@@ -373,8 +373,10 @@ mod tests {
        let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(Document::default()).unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(TantivyDocument::default())
            .unwrap();
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
@@ -443,7 +445,7 @@ mod tests {
        let json = schema_builder.add_json_field("json", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(id=> 1u64, json => json!({"foo": 42})))
            .unwrap();

@@ -5,8 +5,9 @@ use common::replace_in_place;
use tokenizer_api::Token;

use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{DocValue, Document, ReferenceValue};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError};

@@ -117,114 +118,115 @@ impl FastFieldsWriter {
    }

    /// Indexes all of the fastfields of a new document.
    pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
    pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
        let doc_id = self.num_docs;
        for field_value in doc.field_values() {
            if let Some(field_name) =
                &self.fast_field_names[field_value.field().field_id() as usize]
            {
                match &field_value.value {
                    Value::U64(u64_val) => {
                        self.columnar_writer.record_numerical(
                            doc_id,
                            field_name.as_str(),
                            NumericalValue::from(*u64_val),
                        );
                    }
                    Value::I64(i64_val) => {
                        self.columnar_writer.record_numerical(
                            doc_id,
                            field_name.as_str(),
                            NumericalValue::from(*i64_val),
                        );
                    }
                    Value::F64(f64_val) => {
                        self.columnar_writer.record_numerical(
                            doc_id,
                            field_name.as_str(),
                            NumericalValue::from(*f64_val),
                        );
                    }
                    Value::Str(text_val) => {
                        if let Some(tokenizer) =
                            &mut self.per_field_tokenizer[field_value.field().field_id() as usize]
                        {
                            let mut token_stream = tokenizer.token_stream(text_val);
                            token_stream.process(&mut |token: &Token| {
                                self.columnar_writer.record_str(
                                    doc_id,
                                    field_name.as_str(),
                                    &token.text,
                                );
                            })
                        } else {
                            self.columnar_writer
                                .record_str(doc_id, field_name.as_str(), text_val);
                        }
                    }
                    Value::Bytes(bytes_val) => {
                        self.columnar_writer
                            .record_bytes(doc_id, field_name.as_str(), bytes_val);
                    }
                    Value::PreTokStr(pre_tok) => {
                        for token in &pre_tok.tokens {
                            self.columnar_writer.record_str(
                                doc_id,
                                field_name.as_str(),
                                &token.text,
                            );
                        }
                    }
                    Value::Bool(bool_val) => {
                        self.columnar_writer
                            .record_bool(doc_id, field_name.as_str(), *bool_val);
                    }
                    Value::Date(datetime) => {
                        let date_precision =
                            self.date_precisions[field_value.field().field_id() as usize];
                        let truncated_datetime = datetime.truncate(date_precision);
                        self.columnar_writer.record_datetime(
                            doc_id,
                            field_name.as_str(),
                            truncated_datetime,
                        );
                    }
                    Value::Facet(facet) => {
                        self.columnar_writer.record_str(
                            doc_id,
                            field_name.as_str(),
                            facet.encoded_str(),
                        );
                    }
                    Value::JsonObject(json_obj) => {
                        let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
                        self.json_path_buffer.clear();
                        self.json_path_buffer.push_str(field_name);
        for (field, value) in doc.iter_fields_and_values() {
            let value_access = value as D::Value<'_>;

                        let text_analyzer =
                            &mut self.per_field_tokenizer[field_value.field().field_id() as usize];

                        record_json_obj_to_columnar_writer(
                            doc_id,
                            json_obj,
                            expand_dots,
                            JSON_DEPTH_LIMIT,
                            &mut self.json_path_buffer,
                            &mut self.columnar_writer,
                            text_analyzer,
                        );
                    }
                    Value::IpAddr(ip_addr) => {
                        self.columnar_writer
                            .record_ip_addr(doc_id, field_name.as_str(), *ip_addr);
                    }
                }
            }
            self.add_doc_value(doc_id, field, value_access.as_value())?;
        }
        self.num_docs += 1;
        Ok(())
    }

    fn add_doc_value<'a, V: DocValue<'a>>(
        &mut self,
        doc_id: DocId,
        field: Field,
        value: ReferenceValue<'a, V>,
    ) -> crate::Result<()> {
        let field_name = match &self.fast_field_names[field.field_id() as usize] {
            None => return Ok(()),
            Some(name) => name,
        };

        match value {
            ReferenceValue::Null => {}
            ReferenceValue::Str(val) => {
                if let Some(tokenizer) = &mut self.per_field_tokenizer[field.field_id() as usize] {
                    let mut token_stream = tokenizer.token_stream(val);
                    token_stream.process(&mut |token: &Token| {
                        self.columnar_writer
                            .record_str(doc_id, field_name, &token.text);
                    })
                } else {
                    self.columnar_writer.record_str(doc_id, field_name, val);
                }
            }
            ReferenceValue::U64(val) => {
                self.columnar_writer.record_numerical(
                    doc_id,
                    field_name,
                    NumericalValue::from(val),
                );
            }
            ReferenceValue::I64(val) => {
                self.columnar_writer.record_numerical(
                    doc_id,
                    field_name,
                    NumericalValue::from(val),
                );
            }
            ReferenceValue::F64(val) => {
                self.columnar_writer.record_numerical(
                    doc_id,
                    field_name,
                    NumericalValue::from(val),
                );
            }
            ReferenceValue::Date(val) => {
                let date_precision = self.date_precisions[field.field_id() as usize];
                let truncated_datetime = val.truncate(date_precision);
                self.columnar_writer
                    .record_datetime(doc_id, field_name, truncated_datetime);
            }
            ReferenceValue::Facet(val) => {
                self.columnar_writer
                    .record_str(doc_id, field_name, val.encoded_str());
            }
            ReferenceValue::Bytes(val) => {
                self.columnar_writer.record_bytes(doc_id, field_name, val);
            }
            ReferenceValue::IpAddr(val) => {
                self.columnar_writer.record_ip_addr(doc_id, field_name, val);
            }
            ReferenceValue::Bool(val) => {
                self.columnar_writer.record_bool(doc_id, field_name, val);
            }
            ReferenceValue::PreTokStr(val) => {
                for token in &val.tokens {
                    self.columnar_writer
                        .record_str(doc_id, field_name, &token.text);
                }
            }
            ReferenceValue::Array(val) => {
                // TODO: Check this is the correct behaviour we want.
                for value in val {
                    self.add_doc_value(doc_id, field, value)?;
                }
            }
            ReferenceValue::Object(val) => {
                let expand_dots = self.expand_dots[field.field_id() as usize];
                self.json_path_buffer.clear();
                self.json_path_buffer.push_str(field_name);

                let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize];

                record_json_obj_to_columnar_writer::<V>(
                    doc_id,
                    val,
                    expand_dots,
                    JSON_DEPTH_LIMIT,
                    &mut self.json_path_buffer,
                    &mut self.columnar_writer,
                    text_analyzer,
                );
            }
        }

        Ok(())
    }
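
Because `add_document` is now generic over any `D: Document`, code that feeds the fast field writer can stay generic as well and let the concrete document type flow through. A minimal sketch of such a caller (the helper name is illustrative):

    // Index every fast field of a batch of documents, for any
    // document type implementing the `Document` trait.
    fn add_all<D: Document>(writer: &mut FastFieldsWriter, docs: &[D]) -> crate::Result<()> {
        for doc in docs {
            writer.add_document(doc)?;
        }
        Ok(())
    }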

    /// Serializes all of the `FastFieldWriter`s by pushing them in
    /// order to the fast field serializer.
    pub fn serialize(
@@ -241,31 +243,16 @@ impl FastFieldsWriter {
    }
}

#[inline]
fn columnar_numerical_value(json_number: &serde_json::Number) -> Option<NumericalValue> {
    if let Some(num_i64) = json_number.as_i64() {
        return Some(num_i64.into());
    }
    if let Some(num_u64) = json_number.as_u64() {
        return Some(num_u64.into());
    }
    if let Some(num_f64) = json_number.as_f64() {
        return Some(num_f64.into());
    }
    // This can happen with arbitrary precision... but we do not handle it.
    None
}

fn record_json_obj_to_columnar_writer(
fn record_json_obj_to_columnar_writer<'a, V: DocValue<'a>>(
    doc: DocId,
    json_obj: &serde_json::Map<String, serde_json::Value>,
    json_visitor: V::ObjectIter,
    expand_dots: bool,
    remaining_depth_limit: usize,
    json_path_buffer: &mut String,
    columnar_writer: &mut columnar::ColumnarWriter,
    tokenizer: &mut Option<TextAnalyzer>,
) {
    for (key, child) in json_obj {
    for (key, child) in json_visitor {
        let len_path = json_path_buffer.len();
        if !json_path_buffer.is_empty() {
            json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
@@ -295,9 +282,9 @@ fn record_json_obj_to_columnar_writer(
    }
}

fn record_json_value_to_columnar_writer(
fn record_json_value_to_columnar_writer<'a, V: DocValue<'a>>(
    doc: DocId,
    json_val: &serde_json::Value,
    json_val: ReferenceValue<'a, V>,
    expand_dots: bool,
    mut remaining_depth_limit: usize,
    json_path_writer: &mut String,
@@ -308,31 +295,63 @@ fn record_json_value_to_columnar_writer(
        return;
    }
    remaining_depth_limit -= 1;

    match json_val {
        serde_json::Value::Null => {
            // TODO handle null
        }
        serde_json::Value::Bool(bool_val) => {
            columnar_writer.record_bool(doc, json_path_writer, *bool_val);
        }
        serde_json::Value::Number(json_number) => {
            if let Some(numerical_value) = columnar_numerical_value(json_number) {
                columnar_writer.record_numerical(doc, json_path_writer.as_str(), numerical_value);
            }
        }
        serde_json::Value::String(text) => {
        ReferenceValue::Null => {} // TODO: Handle null
        ReferenceValue::Str(val) => {
            if let Some(text_analyzer) = tokenizer.as_mut() {
                let mut token_stream = text_analyzer.token_stream(text);
                let mut token_stream = text_analyzer.token_stream(val);
                token_stream.process(&mut |token| {
                    columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
                })
            } else {
                columnar_writer.record_str(doc, json_path_writer.as_str(), text);
                columnar_writer.record_str(doc, json_path_writer.as_str(), val);
            }
        }
        serde_json::Value::Array(arr) => {
            for el in arr {
                record_json_value_to_columnar_writer(
        ReferenceValue::U64(val) => {
            columnar_writer.record_numerical(
                doc,
                json_path_writer.as_str(),
                NumericalValue::from(val),
            );
        }
        ReferenceValue::I64(val) => {
            columnar_writer.record_numerical(
                doc,
                json_path_writer.as_str(),
                NumericalValue::from(val),
            );
        }
        ReferenceValue::F64(val) => {
            columnar_writer.record_numerical(
                doc,
                json_path_writer.as_str(),
                NumericalValue::from(val),
            );
        }
        ReferenceValue::Bool(val) => {
            columnar_writer.record_bool(doc, json_path_writer, val);
        }
        ReferenceValue::Date(val) => {
            columnar_writer.record_datetime(doc, json_path_writer.as_str(), val);
        }
        ReferenceValue::Facet(_) => {
            unimplemented!("Facet support in dynamic fields is not yet implemented")
        }
        ReferenceValue::Bytes(_) => {
            // TODO: This can be re added once it is added to the JSON Utils section as well.
            // columnar_writer.record_bytes(doc, json_path_writer.as_str(), val);
            unimplemented!("Bytes support in dynamic fields is not yet implemented")
        }
        ReferenceValue::IpAddr(_) => {
            unimplemented!("IP address support in dynamic fields is not yet implemented")
        }
        ReferenceValue::PreTokStr(_) => {
            unimplemented!("Pre-tokenized string support in dynamic fields is not yet implemented")
        }
        ReferenceValue::Array(elements) => {
            for el in elements {
                record_json_value_to_columnar_writer::<V::ChildValue>(
                    doc,
                    el,
                    expand_dots,
@@ -343,10 +362,10 @@ fn record_json_value_to_columnar_writer(
                );
            }
        }
        serde_json::Value::Object(json_obj) => {
            record_json_obj_to_columnar_writer(
        ReferenceValue::Object(object) => {
            record_json_obj_to_columnar_writer::<V>(
                doc,
                json_obj,
                object,
                expand_dots,
                remaining_depth_limit,
                json_path_writer,
@@ -363,6 +382,7 @@ mod tests {

    use super::record_json_value_to_columnar_writer;
    use crate::fastfield::writer::JSON_DEPTH_LIMIT;
    use crate::schema::document::DocValue;
    use crate::DocId;

    fn test_columnar_from_jsons_aux(
@@ -374,7 +394,7 @@ mod tests {
        for (doc, json_doc) in json_docs.iter().enumerate() {
            record_json_value_to_columnar_writer(
                doc as u32,
                json_doc,
                json_doc.as_value(),
                expand_dots,
                JSON_DEPTH_LIMIT,
                &mut json_path,

@@ -4,7 +4,7 @@ use rand::{thread_rng, Rng};

use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher};
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};

fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
    assert!(searcher.segment_readers().len() < 20);
@@ -12,7 +12,7 @@ fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
    for segment_reader in searcher.segment_readers() {
        let store_reader = segment_reader.get_store_reader(1)?;
        for doc_id in 0..segment_reader.max_doc() {
            let _doc = store_reader.get(doc_id)?;
            let _doc: TantivyDocument = store_reader.get(doc_id)?;
        }
    }
    Ok(())
@@ -31,7 +31,8 @@ fn test_functional_store() -> crate::Result<()> {

    let mut rng = thread_rng();

    let mut index_writer = index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
    let mut index_writer: IndexWriter =
        index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;

    let mut doc_set: Vec<u64> = Vec::new();

@@ -91,7 +92,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {

    let mut rng = thread_rng();

    let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
    let mut index_writer: IndexWriter = index.writer_with_num_threads(3, 120_000_000)?;

    let mut committed_docs: HashSet<u64> = HashSet::new();
    let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -114,7 +115,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
            index_writer.delete_term(doc_id_term);
        } else {
            uncommitted_docs.insert(random_val);
            let mut doc = Document::new();
            let mut doc = TantivyDocument::new();
            doc.add_u64(id_field, random_val);
            for i in 1u64..10u64 {
                doc.add_u64(multiples_field, random_val * i);
@@ -166,7 +167,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {

    let mut rng = thread_rng();

    let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
    let mut index_writer: IndexWriter = index.writer_with_num_threads(3, 120_000_000)?;

    let mut committed_docs: HashSet<u64> = HashSet::new();
    let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -189,7 +190,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
            index_writer.delete_term(doc_id_term);
        } else {
            uncommitted_docs.insert(random_val);
            let mut doc = Document::new();
            let mut doc = TantivyDocument::new();
            doc.add_u64(id_field, random_val);
            for i in 1u64..10u64 {
                doc.add_u64(multiples_field, random_val * i);

@@ -158,6 +158,7 @@ mod tests_indexsorting {
    use crate::indexer::doc_id_mapping::DocIdMapping;
    use crate::indexer::NoMergePolicy;
    use crate::query::QueryParser;
    use crate::schema::document::DocValue;
    use crate::schema::{Schema, *};
    use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};

@@ -308,16 +309,16 @@ mod tests_indexsorting {
    {
        assert_eq!(
            searcher
                .doc(DocAddress::new(0, 0))?
                .doc::<TantivyDocument>(DocAddress::new(0, 0))?
                .get_first(my_string_field),
            None
        );
        assert_eq!(
            searcher
                .doc(DocAddress::new(0, 3))?
                .doc::<TantivyDocument>(DocAddress::new(0, 3))?
                .get_first(my_string_field)
                .unwrap()
                .as_text(),
                .as_str(),
            Some("blublub")
        );
    }
@@ -337,13 +338,13 @@ mod tests_indexsorting {
    {
        assert_eq!(
            searcher
                .doc(DocAddress::new(0, 0))?
                .doc::<TantivyDocument>(DocAddress::new(0, 0))?
                .get_first(my_string_field)
                .unwrap()
                .as_text(),
                .as_str(),
            Some("blublub")
        );
        let doc = searcher.doc(DocAddress::new(0, 4))?;
        let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
        assert_eq!(doc.get_first(my_string_field), None);
    }
    // sort by field desc
@@ -360,9 +361,9 @@ mod tests_indexsorting {
    let my_string_field = index.schema().get_field("string_field").unwrap();
    let searcher = index.reader()?.searcher();
    {
        let doc = searcher.doc(DocAddress::new(0, 4))?;
        let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
        assert_eq!(
            doc.get_first(my_string_field).unwrap().as_text(),
            doc.get_first(my_string_field).unwrap().as_str(),
            Some("blublub")
        );
    }

@@ -20,7 +20,8 @@ use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
use crate::query::{EnableScoring, Query, TermQuery};
use crate::schema::{Document, IndexRecordOption, Term};
use crate::schema::document::Document;
use crate::schema::{IndexRecordOption, TantivyDocument, Term};
use crate::{FutureResult, Opstamp};

// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
@@ -50,7 +51,7 @@ fn error_in_index_worker_thread(context: &str) -> TantivyError {
/// indexing queue.
/// Each indexing thread builds its own independent [`Segment`], via
/// a `SegmentWriter` object.
pub struct IndexWriter {
pub struct IndexWriter<D: Document = TantivyDocument> {
    // the lock is just used to bind the
    // lifetime of the lock with that of the IndexWriter.
    _directory_lock: Option<DirectoryLock>,
@@ -62,8 +63,8 @@ pub struct IndexWriter {

    workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,

    index_writer_status: IndexWriterStatus,
    operation_sender: AddBatchSender,
    index_writer_status: IndexWriterStatus<D>,
    operation_sender: AddBatchSender<D>,

    segment_updater: SegmentUpdater,

@@ -164,10 +165,10 @@ pub(crate) fn advance_deletes(
    Ok(())
}

fn index_documents(
fn index_documents<D: Document>(
    memory_budget: usize,
    segment: Segment,
    grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
    grouped_document_iterator: &mut dyn Iterator<Item = AddBatch<D>>,
    segment_updater: &SegmentUpdater,
    mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
@@ -247,7 +248,7 @@ fn apply_deletes(
    })
}

impl IndexWriter {
impl<D: Document> IndexWriter<D> {
    /// Create a new index writer. Attempts to acquire a lockfile.
    ///
    /// The lockfile should be deleted on drop, but it is possible
@@ -267,7 +268,7 @@ impl IndexWriter {
        num_threads: usize,
        memory_budget_in_bytes_per_thread: usize,
        directory_lock: DirectoryLock,
    ) -> crate::Result<IndexWriter> {
    ) -> crate::Result<Self> {
        if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
            let err_msg = format!(
                "The memory arena in bytes per thread needs to be at least \
@@ -281,7 +282,7 @@ impl IndexWriter {
            );
            return Err(TantivyError::InvalidArgument(err_msg));
        }
        let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
        let (document_sender, document_receiver) =
            crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);

        let delete_queue = DeleteQueue::new();
@@ -293,7 +294,7 @@ impl IndexWriter {
        let segment_updater =
            SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;

        let mut index_writer = IndexWriter {
        let mut index_writer = Self {
            _directory_lock: Some(directory_lock),

            memory_budget_in_bytes_per_thread,
@@ -375,7 +376,7 @@ impl IndexWriter {
        self.index.new_segment()
    }

    fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
    fn operation_receiver(&self) -> crate::Result<AddBatchReceiver<D>> {
        self.index_writer_status
            .operation_receiver()
            .ok_or_else(|| {
@@ -525,7 +526,7 @@ impl IndexWriter {
    ///
    /// Returns the former segment_ready channel.
    fn recreate_document_channel(&mut self) {
        let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
        let (document_sender, document_receiver) =
            crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
        self.operation_sender = document_sender;
        self.index_writer_status = IndexWriterStatus::from(document_receiver);
@@ -552,7 +553,7 @@ impl IndexWriter {
            .take()
            .expect("The IndexWriter does not have any lock. This is a bug, please report.");

        let new_index_writer: IndexWriter = IndexWriter::new(
        let new_index_writer = IndexWriter::new(
            &self.index,
            self.num_threads,
            self.memory_budget_in_bytes_per_thread,
@@ -598,7 +599,7 @@ impl IndexWriter {
    /// It is also possible to add a payload to the `commit`
    /// using this API.
    /// See [`PreparedCommit::set_payload()`].
    pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
    pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
        // Here, because we join all of the worker threads,
        // all of the segment update for this commit have been
        // sent.
@@ -707,7 +708,7 @@ impl IndexWriter {
    /// The opstamp is an increasing `u64` that can
    /// be used by the client to align commits with its own
    /// document queue.
    pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
    pub fn add_document(&self, document: D) -> crate::Result<Opstamp> {
        let opstamp = self.stamper.stamp();
        self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
        Ok(opstamp)
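
Since `IndexWriter<D>` defaults `D` to `TantivyDocument`, `add_document` keeps its familiar shape at call sites; only the binding's annotation changes. A minimal sketch (assuming an `index`, a `title` text field, and a function returning `tantivy::Result`):

    let mut writer: IndexWriter = index.writer(50_000_000)?;
    // `doc!` builds a `TantivyDocument`, matching the writer's default parameter.
    writer.add_document(doc!(title => "of mice and men"))?;
    writer.commit()?;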
@@ -744,7 +745,7 @@ impl IndexWriter {
    /// visible to readers only after calling `commit()`.
    pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
    where
        I: IntoIterator<Item = UserOperation>,
        I: IntoIterator<Item = UserOperation<D>>,
        I::IntoIter: ExactSizeIterator,
    {
        let user_operations_it = user_operations.into_iter();
@@ -778,7 +779,7 @@ impl IndexWriter {
        Ok(batch_opstamp)
    }

    fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
    fn send_add_documents_batch(&self, add_ops: AddBatch<D>) -> crate::Result<()> {
        if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
            Ok(())
        } else {
@@ -787,7 +788,7 @@ impl IndexWriter {
    }
}

impl Drop for IndexWriter {
impl<D: Document> Drop for IndexWriter<D> {
    fn drop(&mut self) {
        self.segment_updater.kill();
        self.drop_sender();
@@ -814,13 +815,15 @@ mod tests {
    use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
    use crate::indexer::NoMergePolicy;
    use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
    use crate::schema::document::DocValue;
    use crate::schema::{
        self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
        TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
    };
    use crate::store::DOCSTORE_CACHE_CAPACITY;
    use crate::{
        DateTime, DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term,
        DateTime, DocAddress, Index, IndexSettings, IndexSortByField, IndexWriter, Order,
        ReloadPolicy, TantivyDocument, Term,
    };

    const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
@@ -852,7 +855,7 @@ mod tests {
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());

        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(text_field => "hello1"))
            .unwrap();
@@ -905,7 +908,7 @@ mod tests {
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .unwrap();
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        let a_term = Term::from_field_text(text_field, "a");
        let b_term = Term::from_field_text(text_field, "b");
        let operations = vec![
@@ -943,7 +946,7 @@ mod tests {
    fn test_empty_operations_group() {
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        let index_writer = index.writer_for_tests().unwrap();
        let index_writer: IndexWriter = index.writer_for_tests().unwrap();
        let operations1 = vec![];
        let batch_opstamp1 = index_writer.run(operations1).unwrap();
        assert_eq!(batch_opstamp1, 0u64);
@@ -956,8 +959,8 @@ mod tests {
    fn test_lockfile_stops_duplicates() {
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        let _index_writer = index.writer_for_tests().unwrap();
        match index.writer_for_tests() {
        let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
        match index.writer_for_tests::<TantivyDocument>() {
            Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
            _ => panic!("Expected a `LockFailure` error"),
        }
@@ -967,8 +970,8 @@ mod tests {
    fn test_lockfile_already_exists_error_msg() {
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        let _index_writer = index.writer_for_tests().unwrap();
        match index.writer_for_tests() {
        let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
        match index.writer_for_tests::<TantivyDocument>() {
            Err(err) => {
                let err_msg = err.to_string();
                assert!(err_msg.contains("already an `IndexWriter`"));
@@ -981,7 +984,7 @@ mod tests {
    fn test_set_merge_policy() {
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        let index_writer = index.writer_for_tests().unwrap();
        let index_writer: IndexWriter = index.writer_for_tests().unwrap();
        assert_eq!(
            format!("{:?}", index_writer.get_merge_policy()),
            "LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \
@@ -1000,11 +1003,11 @@ mod tests {
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        {
            let _index_writer = index.writer_for_tests().unwrap();
            let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
            // the lock should be released when the
            // index_writer leaves the scope.
        }
        let _index_writer_two = index.writer_for_tests().unwrap();
        let _index_writer_two: IndexWriter = index.writer_for_tests().unwrap();
    }

    #[test]
@@ -1056,7 +1059,7 @@ mod tests {
            reader.searcher().doc_freq(&term_a).unwrap()
        };
        // writing the segment
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(text_field=>"a"))?;
        index_writer.commit()?;
        // this should create 1 segment
@@ -1096,7 +1099,7 @@ mod tests {
            reader.searcher().doc_freq(&term_a).unwrap()
        };
        // writing the segment
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(text_field=>"a"))?;
        index_writer.commit()?;
        index_writer.add_document(doc!(text_field=>"a"))?;
@@ -1382,7 +1385,7 @@ mod tests {
    fn test_delete_all_documents_empty_index() {
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index
        let mut index_writer: IndexWriter = index
            .writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
            .unwrap();
        let clear = index_writer.delete_all_documents();
@@ -1395,7 +1398,7 @@ mod tests {
    fn test_delete_all_documents_index_twice() {
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index
        let mut index_writer: IndexWriter = index
            .writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
            .unwrap();
        let clear = index_writer.delete_all_documents();
@@ -1415,7 +1418,7 @@ mod tests {
        let schema = schema_builder.build();

        let index = Index::builder().schema(schema).create_in_ram().unwrap();
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(text_field => "one"))
            .unwrap();
@@ -1777,7 +1780,7 @@ mod tests {
        let num_segments_before_merge = searcher.segment_readers().len();
        if force_end_merge {
            index_writer.wait_merging_threads()?;
            let mut index_writer = index.writer_for_tests()?;
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            let segment_ids = index
                .searchable_segment_ids()
                .expect("Searchable segments failed.");
@@ -1973,14 +1976,14 @@ mod tests {
            .get_store_reader(DOCSTORE_CACHE_CAPACITY)
            .unwrap();
        // test store iterator
        for doc in store_reader.iter(segment_reader.alive_bitset()) {
        for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
|
||||
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
|
||||
assert!(expected_ids_and_num_occurrences.contains_key(&id));
|
||||
}
|
||||
// test store random access
|
||||
for doc_id in segment_reader.doc_ids_alive() {
|
||||
let id = store_reader
|
||||
.get(doc_id)
|
||||
.get::<TantivyDocument>(doc_id)
|
||||
.unwrap()
|
||||
.get_first(id_field)
|
||||
.unwrap()
|
||||
@@ -1989,7 +1992,7 @@ mod tests {
|
||||
assert!(expected_ids_and_num_occurrences.contains_key(&id));
|
||||
if id_exists(id) {
|
||||
let id2 = store_reader
|
||||
.get(doc_id)
|
||||
.get::<TantivyDocument>(doc_id)
|
||||
.unwrap()
|
||||
.get_first(multi_numbers)
|
||||
.unwrap()
|
||||
@@ -1997,13 +2000,13 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(id, id2);
|
||||
let bool = store_reader
|
||||
.get(doc_id)
|
||||
.get::<TantivyDocument>(doc_id)
|
||||
.unwrap()
|
||||
.get_first(bool_field)
|
||||
.unwrap()
|
||||
.as_bool()
|
||||
.unwrap();
|
||||
let doc = store_reader.get(doc_id).unwrap();
|
||||
let doc = store_reader.get::<TantivyDocument>(doc_id).unwrap();
|
||||
let mut bool2 = doc.get_all(multi_bools);
|
||||
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
|
||||
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
|
||||
@@ -2543,7 +2546,7 @@ mod tests {
|
||||
// Merge
|
||||
{
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
@@ -2585,7 +2588,7 @@ mod tests {
|
||||
// Merge
|
||||
{
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
|
||||
@@ -2,13 +2,15 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::AddBatchReceiver;
|
||||
use crate::schema::document::Document;
|
||||
use crate::TantivyDocument;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct IndexWriterStatus {
|
||||
inner: Arc<Inner>,
|
||||
pub(crate) struct IndexWriterStatus<D: Document = TantivyDocument> {
|
||||
inner: Arc<Inner<D>>,
|
||||
}
|
||||
|
||||
impl IndexWriterStatus {
|
||||
impl<D: Document> IndexWriterStatus<D> {
|
||||
/// Returns true iff the index writer is alive.
|
||||
pub fn is_alive(&self) -> bool {
|
||||
self.inner.as_ref().is_alive()
|
||||
@@ -16,7 +18,7 @@ impl IndexWriterStatus {
|
||||
|
||||
/// Returns a copy of the operation receiver.
|
||||
/// If the index writer was killed, returns `None`.
|
||||
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
|
||||
pub fn operation_receiver(&self) -> Option<AddBatchReceiver<D>> {
|
||||
let rlock = self
|
||||
.inner
|
||||
.receive_channel
|
||||
@@ -27,19 +29,19 @@ impl IndexWriterStatus {
|
||||
|
||||
/// Create an index writer bomb.
|
||||
/// If dropped, the index writer status will be killed.
|
||||
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
|
||||
pub(crate) fn create_bomb(&self) -> IndexWriterBomb<D> {
|
||||
IndexWriterBomb {
|
||||
inner: Some(self.inner.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Inner {
|
||||
struct Inner<D: Document> {
|
||||
is_alive: AtomicBool,
|
||||
receive_channel: RwLock<Option<AddBatchReceiver>>,
|
||||
receive_channel: RwLock<Option<AddBatchReceiver<D>>>,
|
||||
}
|
||||
|
||||
impl Inner {
|
||||
impl<D: Document> Inner<D> {
|
||||
fn is_alive(&self) -> bool {
|
||||
self.is_alive.load(Ordering::Relaxed)
|
||||
}
|
||||
@@ -53,8 +55,8 @@ impl Inner {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<AddBatchReceiver> for IndexWriterStatus {
|
||||
fn from(receiver: AddBatchReceiver) -> Self {
|
||||
impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> {
|
||||
fn from(receiver: AddBatchReceiver<D>) -> Self {
|
||||
IndexWriterStatus {
|
||||
inner: Arc::new(Inner {
|
||||
is_alive: AtomicBool::new(true),
|
||||
@@ -66,11 +68,11 @@ impl From<AddBatchReceiver> for IndexWriterStatus {
|
||||
|
||||
/// If dropped, the index writer will be killed.
|
||||
/// To prevent this, clients can call `.defuse()`.
|
||||
pub(crate) struct IndexWriterBomb {
|
||||
inner: Option<Arc<Inner>>,
|
||||
pub(crate) struct IndexWriterBomb<D: Document> {
|
||||
inner: Option<Arc<Inner<D>>>,
|
||||
}
|
||||
|
||||
impl IndexWriterBomb {
|
||||
impl<D: Document> IndexWriterBomb<D> {
|
||||
/// Defuses the bomb.
|
||||
///
|
||||
/// This is the only way to drop the bomb without killing
|
||||
@@ -80,7 +82,7 @@ impl IndexWriterBomb {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IndexWriterBomb {
|
||||
impl<D: Document> Drop for IndexWriterBomb<D> {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
inner.kill();
|
||||
|
||||
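A minimal, self-contained sketch of the kill-on-drop guard generalized above
(std-only stand-ins; `Bomb` and `alive` are invented names, not the actual
tantivy internals):

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

struct Bomb {
    alive: Option<Arc<AtomicBool>>,
}

impl Bomb {
    // Consuming the guard disarms it before `Drop` can run the kill path.
    fn defuse(mut self) {
        self.alive = None;
    }
}

impl Drop for Bomb {
    fn drop(&mut self) {
        // An un-defused guard flips the shared flag on the way out.
        if let Some(alive) = self.alive.take() {
            alive.store(false, Ordering::Relaxed);
        }
    }
}

fn main() {
    let alive = Arc::new(AtomicBool::new(true));
    let bomb = Bomb { alive: Some(alive.clone()) };
    bomb.defuse(); // defused: the flag stays true
    assert!(alive.load(Ordering::Relaxed));
    {
        let _bomb = Bomb { alive: Some(alive.clone()) };
        // dropped here without `defuse()` -> the status is "killed"
    }
    assert!(!alive.load(Ordering::Relaxed));
}
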
@@ -753,9 +753,10 @@ mod tests {
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::core::Index;
|
||||
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
|
||||
use crate::schema::document::DocValue;
|
||||
use crate::schema::{
|
||||
Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term, TextFieldIndexing,
|
||||
INDEXED, TEXT,
|
||||
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
|
||||
TextFieldIndexing, INDEXED, TEXT,
|
||||
};
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{
|
||||
@@ -817,7 +818,7 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
@@ -866,30 +867,24 @@ mod tests {
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 0))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 1))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("a b c"));
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 2))?;
|
||||
assert_eq!(
|
||||
doc.get_first(text_field).unwrap().as_text(),
|
||||
Some("a b c d")
|
||||
);
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 3))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 4))?;
|
||||
assert_eq!(
|
||||
doc.get_first(text_field).unwrap().as_text(),
|
||||
Some("a b c g")
|
||||
);
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g"));
|
||||
}
|
||||
|
||||
{
|
||||
@@ -1300,10 +1295,10 @@ mod tests {
|
||||
let reader = index.reader().unwrap();
|
||||
let mut int_val = 0;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
let index_doc =
|
||||
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
|
||||
let mut doc = Document::default();
|
||||
let mut doc = TantivyDocument::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
@@ -1384,7 +1379,7 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
@@ -1406,7 +1401,7 @@ mod tests {
|
||||
|
||||
// Deleting one term
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
||||
let facet_term = Term::from_facet(facet_field, &facet);
|
||||
index_writer.delete_term(facet_term);
|
||||
@@ -1431,7 +1426,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
index_writer.add_document(doc!(int_field => 1u64))?;
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer.add_document(doc!(int_field => 1u64))?;
|
||||
@@ -1460,7 +1455,7 @@ mod tests {
|
||||
let reader = index.reader()?;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut doc = Document::default();
|
||||
let mut doc = TantivyDocument::default();
|
||||
doc.add_u64(int_field, 1);
|
||||
index_writer.add_document(doc.clone())?;
|
||||
index_writer.commit()?;
|
||||
@@ -1503,7 +1498,7 @@ mod tests {
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
|
||||
let mut doc = Document::default();
|
||||
let mut doc = TantivyDocument::default();
|
||||
for &val in int_vals {
|
||||
doc.add_u64(int_field, val);
|
||||
}
|
||||
@@ -1566,7 +1561,7 @@ mod tests {
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
@@ -1613,7 +1608,7 @@ mod tests {
|
||||
writer.set_merge_policy(Box::new(policy));
|
||||
|
||||
for i in 0..100 {
|
||||
let mut doc = Document::new();
|
||||
let mut doc = TantivyDocument::new();
|
||||
doc.add_f64(field, 42.0);
|
||||
doc.add_f64(multi_field, 0.24);
|
||||
doc.add_f64(multi_field, 0.27);
|
||||
|
||||
@@ -4,11 +4,15 @@ mod tests {
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::document::DocValue;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
TextFieldIndexing, TextOptions,
|
||||
};
|
||||
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
|
||||
use crate::{
|
||||
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings,
|
||||
TantivyDocument, Term,
|
||||
};
|
||||
|
||||
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
@@ -26,7 +30,7 @@ mod tests {
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
|
||||
.unwrap();
|
||||
@@ -45,7 +49,7 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
assert!(index_writer.merge(&segment_ids).wait().is_ok());
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
@@ -133,7 +137,7 @@ mod tests {
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
@@ -272,12 +276,16 @@ mod tests {
|
||||
} else {
|
||||
2
|
||||
};
|
||||
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
|
||||
let doc = searcher
|
||||
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
doc.get_first(my_text_field).unwrap().as_text(),
|
||||
doc.get_first(my_text_field).unwrap().as_str(),
|
||||
Some("blubber")
|
||||
);
|
||||
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
|
||||
let doc = searcher
|
||||
.doc::<TantivyDocument>(DocAddress::new(0, 0))
|
||||
.unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
|
||||
}
|
||||
}
|
||||
@@ -494,7 +502,7 @@ mod bench_sorted_index_merge {
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
|
||||
index_writer.add_document(doc!(int_field=>val)).unwrap();
|
||||
};
|
||||
|
||||
@@ -44,9 +44,9 @@ pub type DefaultMergePolicy = LogMergePolicy;
|
||||
// - all docs in the operation will happen on the same segment and continuous doc_ids.
|
||||
// - all operations in the group are committed at the same time, making the group
|
||||
// atomic.
|
||||
type AddBatch = SmallVec<[AddOperation; 4]>;
|
||||
type AddBatchSender = channel::Sender<AddBatch>;
|
||||
type AddBatchReceiver = channel::Receiver<AddBatch>;
|
||||
type AddBatch<D> = SmallVec<[AddOperation<D>; 4]>;
|
||||
type AddBatchSender<D> = channel::Sender<AddBatch<D>>;
|
||||
type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
|
||||
|
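The aliases above thread the document generic through the indexing channel. A
rough sketch of the same shape with std-only stand-ins (Vec for SmallVec, std
mpsc for the crossbeam-style channel, and a simplified AddOperation):

use std::sync::mpsc;

struct AddOperation<D> {
    opstamp: u64,
    document: D,
}

type AddBatch<D> = Vec<AddOperation<D>>;
type AddBatchSender<D> = mpsc::Sender<AddBatch<D>>;
type AddBatchReceiver<D> = mpsc::Receiver<AddBatch<D>>;

fn main() {
    // One batch travels the channel as a unit, which is what keeps a group
    // of operations on the same segment with contiguous doc ids.
    let (tx, rx): (AddBatchSender<String>, AddBatchReceiver<String>) = mpsc::channel();
    tx.send(vec![AddOperation { opstamp: 0, document: "doc".to_string() }]).unwrap();
    let batch = rx.recv().unwrap();
    assert_eq!(batch[0].opstamp, 0);
    assert_eq!(batch[0].document, "doc");
}
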
#[cfg(feature = "mmap")]
#[cfg(test)]
@@ -55,14 +55,14 @@ mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
use crate::{Index, IndexWriter, Term};

#[test]
fn test_advance_delete_bug() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -79,7 +79,7 @@ mod tests_mmap {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
@@ -110,7 +110,7 @@ mod tests_mmap {
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();

@@ -1,5 +1,6 @@
use crate::query::Weight;
use crate::schema::{Document, Term};
use crate::schema::document::Document;
use crate::schema::{TantivyDocument, Term};
use crate::Opstamp;

/// Timestamped Delete operation.
@@ -10,16 +11,16 @@ pub struct DeleteOperation {

/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation {
pub struct AddOperation<D: Document = TantivyDocument> {
pub opstamp: Opstamp,
pub document: Document,
pub document: D,
}

/// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation {
pub enum UserOperation<D: Document = TantivyDocument> {
/// Add operation
Add(Document),
Add(D),
/// Delete operation
Delete(Term),
}

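The `D: Document = TantivyDocument` defaults above are what keep existing call
sites compiling: a bare `AddOperation` or `UserOperation` still means the
concrete document type. A small sketch of the language feature with invented
stand-in names:

struct MyDefaultDoc;

enum UserOperation<D = MyDefaultDoc> {
    Add(D),
    Delete(u64), // stand-in for Term
}

fn main() {
    // No explicit parameter needed: `D` falls back to `MyDefaultDoc`.
    let _op: UserOperation = UserOperation::Add(MyDefaultDoc);
    // A custom document type can still be plugged in explicitly.
    let _custom: UserOperation<String> = UserOperation::Add("json".to_string());
    let _del: UserOperation = UserOperation::Delete(42);
}
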
@@ -1,16 +1,17 @@
use super::IndexWriter;
use crate::{FutureResult, Opstamp};
use crate::schema::document::Document;
use crate::{FutureResult, Opstamp, TantivyDocument};

/// A prepared commit
pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
pub struct PreparedCommit<'a, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<D>,
payload: Option<String>,
opstamp: Opstamp,
}

impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> {
PreparedCommit {
impl<'a, D: Document> PreparedCommit<'a, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<D>, opstamp: Opstamp) -> Self {
Self {
index_writer,
payload: None,
opstamp,

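A hedged usage sketch of the two-phase commit flow with the new generic,
assuming the post-change public API (the schema and field names here are
invented for illustration):

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // The annotation resolves `D` to the default `TantivyDocument`.
    let mut writer: IndexWriter = index.writer(15_000_000)?;
    writer.add_document(doc!(title => "two-phase commit"))?;
    // `prepare_commit` borrows the writer; nothing becomes visible
    // until `commit()` is called on the returned `PreparedCommit`.
    let prepared = writer.prepare_commit()?;
    let _opstamp = prepared.commit()?;
    Ok(())
}
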
@@ -13,10 +13,11 @@ use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::schema::document::{DocValue, Document, ReferenceValue};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
use crate::{DocId, Opstamp, SegmentComponent, TantivyError};

/// Computes the initial size of the hash table.
///
@@ -81,10 +82,7 @@ impl SegmentWriter {
/// the flushing behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
) -> crate::Result<SegmentWriter> {
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
@@ -113,7 +111,7 @@ impl SegmentWriter {
})
})
.collect::<Result<Vec<_>, _>>()?;
Ok(SegmentWriter {
Ok(Self {
max_doc: 0,
ctx: IndexingContext::new(table_size),
per_field_postings_writers,
@@ -164,18 +162,21 @@ impl SegmentWriter {
+ self.segment_serializer.mem_usage()
}

fn index_document(&mut self, doc: &Document) -> crate::Result<()> {
fn index_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
let doc_id = self.max_doc;

// TODO: Can this be optimised a bit?
let vals_grouped_by_field = doc
.field_values()
.iter()
.sorted_by_key(|el| el.field())
.group_by(|el| el.field());
.iter_fields_and_values()
.sorted_by_key(|(field, _)| *field)
.group_by(|(field, _)| *field);

for (field, field_values) in &vals_grouped_by_field {
let values = field_values.map(|field_value| field_value.value());
let values = field_values.map(|el| el.1);

let field_entry = self.schema.get_field_entry(field);
let make_schema_error = || {
crate::TantivyError::SchemaError(format!(
TantivyError::SchemaError(format!(
"Expected a {:?} for field {:?}",
field_entry.field_type().value_type(),
field_entry.name()
@@ -193,7 +194,10 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -209,19 +213,18 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
}
_ => {
continue;
}
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
} else {
continue;
};

assert!(term_buffer.is_empty());
@@ -240,7 +243,10 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
@@ -252,9 +258,13 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();

num_vals += 1;
let date_val = value.as_date().ok_or_else(make_schema_error)?;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
term_buffer
.set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
@@ -265,7 +275,10 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
@@ -277,7 +290,10 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -289,7 +305,10 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -301,7 +320,10 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -314,9 +336,17 @@ impl SegmentWriter {
FieldType::JsonObject(json_options) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
index_json_values(
let json_values_it = values.map(|value_access| {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();

match value {
ReferenceValue::Object(object_iter) => Ok(object_iter),
_ => Err(make_schema_error()),
}
});
index_json_values::<D::Value<'_>>(
doc_id,
json_values_it,
text_analyzer,
@@ -328,7 +358,10 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value in values {
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;

num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
term_buffer.set_ip_addr(ip_addr);
@@ -346,7 +379,10 @@ impl SegmentWriter {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
pub fn add_document<D: Document>(
&mut self,
add_operation: AddOperation<D>,
) -> crate::Result<()> {
let AddOperation { document, opstamp } = add_operation;
self.doc_opstamps.push(opstamp);
self.fast_field_writers.add_document(&document)?;
@@ -445,6 +481,7 @@ fn remap_and_write(

#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};

use tempfile::TempDir;
@@ -455,6 +492,7 @@ mod tests {
use crate::directory::RamDirectory;
use crate::postings::TermInfo;
use crate::query::PhraseQuery;
use crate::schema::document::DocValue;
use crate::schema::{
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
};
@@ -463,7 +501,8 @@ mod tests {
use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{
DateTime, Directory, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED,
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument,
Term, TERMINATED,
};

#[test]
@@ -480,7 +519,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
let pre_tokenized_text = PreTokenizedString {
text: String::from("A"),
tokens: vec![Token {
@@ -504,11 +543,11 @@ mod tests {
store_writer.close().unwrap();

let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get(0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap();

assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
}

#[test]
@@ -539,13 +578,13 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let doc = searcher
.doc(DocAddress {
.doc::<TantivyDocument>(DocAddress {
segment_ord: 0u32,
doc_id: 0u32,
})
.unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&schema.to_json(&doc),
&doc.to_json(&schema),
)
.unwrap()
.get("json")
@@ -675,10 +714,10 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
let json_val: serde_json::Map<String, serde_json::Value> =
let mut doc = TantivyDocument::default();
let json_val: BTreeMap<String, crate::schema::Value> =
serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
doc.add_json_object(json_field, json_val);
doc.add_object(json_field, json_val);
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
@@ -802,11 +841,10 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let doc = schema
.parse_document(r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
let doc = TantivyDocument::parse_json(&schema, r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
.unwrap();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
// On debug this did panic on the underflow
index_writer.commit().unwrap();
@@ -831,7 +869,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "roller-coaster".to_string(),
@@ -846,7 +884,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens.clone());
doc.add_pre_tokenized_text(text, tokens);
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -869,7 +907,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
@@ -894,7 +932,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -930,7 +968,7 @@ mod tests {
let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap();
let mut document = Document::default();
let mut document = TantivyDocument::default();
document.add_text(title, "The Old Man and the Sea");
index_writer.add_document(document).unwrap();
let error = index_writer.commit().unwrap_err();

src/lib.rs
@@ -21,7 +21,7 @@
//! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::*;
//! # use tantivy::{doc, DocAddress, Index, Score};
//! # use tantivy::{doc, DocAddress, Index, IndexWriter, Score};
//! #
//! # fn main() {
//! # // Let's create a temporary directory for the
@@ -53,7 +53,7 @@
//!
//! // Here we use a buffer of 100MB that will be split
//! // between indexing threads.
//! let mut index_writer = index.writer(100_000_000)?;
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
//!
//! // Let's index one document!
//! index_writer.add_document(doc!(
@@ -89,8 +89,8 @@
//!
//! for (_score, doc_address) in top_docs {
//! // Retrieve the actual content of documents given its `doc_address`.
//! let retrieved_doc = searcher.doc(doc_address)?;
//! println!("{}", schema.to_json(&retrieved_doc));
//! let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
//! println!("{}", retrieved_doc.to_json(&schema));
//! }
//!
//! # Ok(())
@@ -186,7 +186,7 @@ pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, Pr
pub use crate::postings::Postings;
#[allow(deprecated)]
pub use crate::schema::DatePrecision;
pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
pub use crate::schema::{DateOptions, DateTimePrecision, TantivyDocument, Term};

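A hedged sketch of the retrieval change shown in the docs above: `Searcher::doc`
now takes the document type as a generic parameter, and JSON rendering moves
from `Schema::to_json(&doc)` to `doc.to_json(&schema)` (the schema and field
names here are invented):

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let mut writer: IndexWriter = index.writer(15_000_000)?;
    writer.add_document(doc!(title => "Of Mice and Men"))?;
    writer.commit()?;
    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("mice")?;
    for (_score, addr) in searcher.search(&query, &TopDocs::with_limit(1))? {
        // The turbofish picks the concrete type the stored doc is decoded into.
        let doc = searcher.doc::<TantivyDocument>(addr)?;
        println!("{}", doc.to_json(&schema));
    }
    Ok(())
}
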
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
@@ -342,8 +342,9 @@ pub mod tests {
use crate::docset::{DocSet, TERMINATED};
use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery;
use crate::schema::document::DocValue;
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, Postings, ReloadPolicy};
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy};

pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
@@ -414,7 +415,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc)?;
@@ -436,7 +437,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -463,7 +464,7 @@ pub mod tests {
let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
let index_reader = index.reader()?;
@@ -485,7 +486,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(text_field=>"a b"))?;
@@ -528,7 +529,7 @@ pub mod tests {
.unwrap();
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -575,7 +576,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -612,7 +613,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
@@ -662,7 +663,7 @@ pub mod tests {
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -685,7 +686,7 @@ pub mod tests {
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val))?;
index_writer.commit()?;
@@ -709,7 +710,7 @@ pub mod tests {
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val))?;
index_writer.commit()?;
@@ -733,7 +734,7 @@ pub mod tests {
let absent_field = schema_builder.add_text_field("absent_text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?;
assert!(index_writer.commit().is_ok());
let reader = index.reader()?;
@@ -756,7 +757,7 @@ pub mod tests {
.try_into()?;

// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"))?;
index_writer.add_document(doc!(text_field=>"70"))?;
index_writer.add_document(doc!(text_field=>"34"))?;
@@ -781,7 +782,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
index_writer.commit()?;
}
@@ -813,7 +814,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -877,7 +878,7 @@ pub mod tests {
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -987,11 +988,11 @@ pub mod tests {
assert_eq!(document.len(), 3);
let values: Vec<&Value> = document.get_all(text_field).collect();
assert_eq!(values.len(), 2);
assert_eq!(values[0].as_text(), Some("tantivy"));
assert_eq!(values[1].as_text(), Some("some other value"));
assert_eq!(values[0].as_str(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value"));
let values: Vec<&Value> = document.get_all(other_text_field).collect();
assert_eq!(values.len(), 1);
assert_eq!(values[0].as_text(), Some("short"));
assert_eq!(values[0].as_str(), Some("short"));
}

#[test]
@@ -1005,7 +1006,7 @@ pub mod tests {
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
@@ -1071,7 +1072,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let index_reader = index.reader()?;

let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));

for doc_id in 0u64..DOC_COUNT {
@@ -1124,7 +1125,7 @@ pub mod tests {
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?;
let mut writer: IndexWriter = index.writer(50_000_000)?;
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?;

@@ -45,12 +45,12 @@
macro_rules! doc(
() => {
{
($crate::Document::default())
($crate::TantivyDocument::default())
}
}; // avoids a warning due to the useless `mut`.
($($field:expr => $value:expr),*) => {
{
let mut document = $crate::Document::default();
let mut document = $crate::TantivyDocument::default();
$(
document.add_field_value($field, $value);
)*

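A small usage sketch for the macro above: `doc!` now expands to a
`TantivyDocument` instead of the old `Document` struct (assumes the re-exports
shown earlier in this diff):

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, TantivyDocument};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();
    // Expands to a `TantivyDocument::default()` plus one `add_field_value` call.
    let document: TantivyDocument = doc!(title => "The Old Man and the Sea");
    assert_eq!(document.len(), 1);
}
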
||||
@@ -52,7 +52,7 @@ pub mod tests {
|
||||
Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT,
|
||||
};
|
||||
use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
|
||||
use crate::{DocId, HasLen, Score};
|
||||
use crate::{DocId, HasLen, IndexWriter, Score};
|
||||
|
||||
#[test]
|
||||
pub fn test_position_write() -> crate::Result<()> {
|
||||
@@ -432,7 +432,7 @@ pub mod tests {
|
||||
|
||||
// delete some of the documents
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.delete_term(term_0);
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
@@ -483,7 +483,7 @@ pub mod tests {
|
||||
|
||||
// delete everything else
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.delete_term(term_1);
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
@@ -568,8 +568,8 @@ mod bench {
|
||||
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::Intersection;
|
||||
use crate::schema::{Document, Field, IndexRecordOption, Schema, Term, STRING};
|
||||
use crate::{tests, DocSet, Index};
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, TantivyDocument, Term, STRING};
|
||||
use crate::{tests, DocSet, Index, IndexWriter};
|
||||
|
||||
pub static TERM_A: Lazy<Term> = Lazy::new(|| {
|
||||
let field = Field::from_field_id(0);
|
||||
@@ -598,9 +598,9 @@ mod bench {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let posting_list_size = 1_000_000;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
for _ in 0..posting_list_size {
|
||||
let mut doc = Document::default();
|
||||
let mut doc = TantivyDocument::default();
|
||||
if rng.gen_bool(1f64 / 15f64) {
|
||||
doc.add_text(text_field, "a");
|
||||
}
|
||||
|
||||
@@ -99,14 +99,14 @@ mod tests {
|
||||
use crate::docset::{DocSet, BUFFER_LEN, TERMINATED};
|
||||
use crate::query::{AllScorer, EnableScoring, Query};
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
fn create_test_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(field=>"aaa"))?;
|
||||
index_writer.add_document(doc!(field=>"bbb"))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
@@ -117,13 +117,13 @@ mod tests {
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::{Schema, STRING};
|
||||
use crate::Index;
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
fn create_index() -> crate::Result<Index> {
|
||||
let mut schema = Schema::builder();
|
||||
let title = schema.add_text_field("title", STRING);
|
||||
let index = Index::create_in_ram(schema.build());
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(title=>"abc"))?;
|
||||
index_writer.add_document(doc!(title=>"bcd"))?;
|
||||
index_writer.add_document(doc!(title=>"abcd"))?;
|
||||
|
||||
@@ -24,6 +24,7 @@ use crate::schema::{IndexRecordOption, Term};
|
||||
/// use tantivy::schema::{IndexRecordOption, Schema, TEXT};
|
||||
/// use tantivy::Term;
|
||||
/// use tantivy::Index;
|
||||
/// use tantivy::IndexWriter;
|
||||
///
|
||||
/// fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
@@ -32,7 +33,7 @@ use crate::schema::{IndexRecordOption, Term};
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(15_000_000)?;
|
||||
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ))?;
|
||||
|
||||
@@ -19,7 +19,7 @@ mod tests {
|
||||
TermQuery,
|
||||
};
|
||||
use crate::schema::*;
|
||||
use crate::{assert_nearly_equals, DocAddress, DocId, Index, Score};
|
||||
use crate::{assert_nearly_equals, DocAddress, DocId, Index, IndexWriter, Score};
|
||||
|
||||
fn aux_test_helper() -> crate::Result<(Index, Field)> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -28,7 +28,7 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text_field => "a b c"))?;
|
||||
index_writer.add_document(doc!(text_field => "a c"))?;
|
||||
index_writer.add_document(doc!(text_field => "b c"))?;
|
||||
@@ -224,7 +224,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text_field => "a b c"))?;
|
||||
index_writer.add_document(doc!(text_field => "a c"))?;
|
||||
index_writer.add_document(doc!(text_field => "b c"))?;
|
||||
@@ -297,7 +297,7 @@ mod tests {
|
||||
let text = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text=>"a"))?;
|
||||
index_writer.add_document(doc!(text=>"b"))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
@@ -136,14 +136,14 @@ mod tests {
|
||||
use super::BoostQuery;
|
||||
use crate::query::{AllQuery, Query};
|
||||
use crate::schema::Schema;
|
||||
use crate::{DocAddress, Document, Index};
|
||||
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
|
||||
|
||||
#[test]
|
||||
fn test_boost_query_explain() -> crate::Result<()> {
|
||||
let schema = Schema::builder().build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(Document::new())?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(TantivyDocument::new())?;
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
@@ -143,14 +143,14 @@ mod tests {
|
||||
use super::ConstScoreQuery;
|
||||
use crate::query::{AllQuery, Query};
|
||||
use crate::schema::Schema;
|
||||
use crate::{DocAddress, Document, Index};
|
||||
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
|
||||
|
||||
#[test]
|
||||
fn test_const_score_query_explain() -> crate::Result<()> {
|
||||
let schema = Schema::builder().build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(Document::new())?;
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(TantivyDocument::new())?;
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
@@ -15,6 +15,7 @@ use crate::{Score, Term};
/// use tantivy::schema::{IndexRecordOption, Schema, TEXT};
/// use tantivy::Term;
/// use tantivy::Index;
/// use tantivy::IndexWriter;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -23,7 +24,7 @@ use crate::{Score, Term};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of Girl",
/// ))?;

@@ -38,7 +38,7 @@ impl Automaton for DfaWrapper {
/// use tantivy::collector::{Count, TopDocs};
/// use tantivy::query::FuzzyTermQuery;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index, Term};
/// use tantivy::{doc, Index, IndexWriter, Term};
///
/// fn example() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -46,7 +46,7 @@ impl Automaton for DfaWrapper {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -188,7 +188,7 @@ mod test {
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Schema, STORED, TEXT};
use crate::{assert_nearly_equals, Index, Term};
use crate::{assert_nearly_equals, Index, IndexWriter, TantivyDocument, Term};

#[test]
pub fn test_fuzzy_json_path() -> crate::Result<()> {
@@ -202,7 +202,8 @@ mod test {

let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"attributes": {
"a": "japan"
@@ -210,7 +211,8 @@ mod test {
}"#,
)?;
index_writer.add_document(doc)?;
let doc = schema.parse_document(
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"attributes": {
"aa": "japan"
@@ -275,7 +277,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(
country_field => "japan",
))?;
@@ -324,7 +326,7 @@ mod test {
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(country_field => "japan"))?;
index_writer.commit()?;
let reader = index.reader()?;

@@ -1,11 +1,14 @@
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};

use tokenizer_api::Token;

use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::schema::document::{DocValue, Document};
use crate::schema::{Field, FieldType, IndexRecordOption, Term};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::{DocAddress, Result, Searcher, TantivyError};
use crate::{DocAddress, Result, Searcher, TantivyDocument, TantivyError};

#[derive(Debug, PartialEq)]
struct ScoreTerm {
@@ -90,10 +93,10 @@ impl MoreLikeThis {
}

/// Creates a [`BooleanQuery`] using a set of field values.
pub fn query_with_document_fields(
pub fn query_with_document_fields<'a, V: DocValue<'a>>(
&self,
searcher: &Searcher,
doc_fields: &[(Field, Vec<Value>)],
doc_fields: &[(Field, Vec<V>)],
) -> Result<BooleanQuery> {
let score_terms = self.retrieve_terms_from_doc_fields(searcher, doc_fields)?;
let query = self.create_query(score_terms);
@@ -126,26 +129,18 @@ impl MoreLikeThis {
searcher: &Searcher,
doc_address: DocAddress,
) -> Result<Vec<ScoreTerm>> {
let doc = searcher.doc(doc_address)?;
let field_to_values = doc
.get_sorted_field_values()
.iter()
.map(|(field, values)| {
(
*field,
values.iter().map(|v| (**v).clone()).collect::<Vec<Value>>(),
)
})
.collect::<Vec<_>>();
let doc = searcher.doc::<TantivyDocument>(doc_address)?;

let field_to_values = doc.get_sorted_field_values();
self.retrieve_terms_from_doc_fields(searcher, &field_to_values)
}

/// Finds terms for a more-like-this query.
/// field_to_field_values is a mapping from field to possible values of that field.
fn retrieve_terms_from_doc_fields(
fn retrieve_terms_from_doc_fields<'a, V: DocValue<'a>>(
&self,
searcher: &Searcher,
field_to_values: &[(Field, Vec<Value>)],
field_to_values: &[(Field, Vec<V>)],
) -> Result<Vec<ScoreTerm>> {
if field_to_values.is_empty() {
return Err(TantivyError::InvalidArgument(
@@ -164,11 +159,11 @@ impl MoreLikeThis {
/// Computes the frequency of values for a field while updating the term frequencies
/// Note: A FieldValue can be made up of multiple terms.
/// We are interested in extracting terms within FieldValue
fn add_term_frequencies(
fn add_term_frequencies<'a, V: DocValue<'a>>(
&self,
searcher: &Searcher,
field: Field,
values: &[Value],
values: &[V],
term_frequencies: &mut HashMap<Term, usize>,
) -> Result<()> {
let schema = searcher.schema();
@@ -184,11 +179,10 @@ impl MoreLikeThis {
FieldType::Facet(_) => {
let facets: Vec<&str> = values
.iter()
.map(|value| match value {
Value::Facet(ref facet) => Ok(facet.encoded_str()),
_ => Err(TantivyError::InvalidArgument(
"invalid field value".to_string(),
)),
.map(|value| {
value.as_facet().map(|f| f.encoded_str()).ok_or_else(|| {
TantivyError::InvalidArgument("invalid field value".to_string())
})
})
.collect::<Result<Vec<_>>>()?;
for fake_str in facets {
@@ -203,35 +197,31 @@ impl MoreLikeThis {
}
}
FieldType::Str(text_options) => {
let mut tokenizer_opt = text_options
.get_indexing_options()
.map(|options| options.tokenizer())
.and_then(|tokenizer_name| tokenizer_manager.get(tokenizer_name));

let sink = &mut |token: &Token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
};

// TODO: Validate these changes align with the HEAD branch.
for value in values {
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
Value::Str(ref text) => {
if let Some(mut tokenizer) = text_options
.get_indexing_options()
.map(|text_indexing_options| {
text_indexing_options.tokenizer().to_string()
})
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
{
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
_ => (),
if let Some(text) = value.as_str() {
let tokenizer = match &mut tokenizer_opt {
None => continue,
Some(tokenizer) => tokenizer,
};

let mut token_stream = tokenizer.token_stream(text);
token_stream.process(sink);
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(sink);
}
}
}
@@ -248,7 +238,7 @@ impl MoreLikeThis {
}
FieldType::Date(_) => {
for value in values {
let timestamp = value.as_date().ok_or_else(|| {
let timestamp = value.as_datetime().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
let term = Term::from_field_date(field, timestamp);

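The rewritten `FieldType::Str` branch above routes both raw text and pre-tokenized values through the single shared `sink` closure instead of duplicating the counting logic in each match arm. A minimal standalone sketch of that pattern, with whitespace splitting standing in for a real tantivy tokenizer:

use std::collections::HashMap;

// Every token source feeds the same sink, which accumulates term frequencies.
fn count_tokens<'a>(sources: impl IntoIterator<Item = &'a str>) -> HashMap<String, usize> {
    let mut term_frequencies: HashMap<String, usize> = HashMap::new();
    let mut sink = |token: &str| {
        *term_frequencies.entry(token.to_string()).or_insert(0) += 1;
    };
    for text in sources {
        // A real implementation would hand `sink` to the tokenizer's token stream.
        for token in text.split_whitespace() {
            sink(token);
        }
    }
    term_frequencies
}
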
@@ -1,3 +1,5 @@
use std::fmt::Debug;

use super::MoreLikeThis;
use crate::query::{EnableScoring, Query, Weight};
use crate::schema::{Field, Value};
@@ -28,9 +30,9 @@ pub struct MoreLikeThisQuery {
target: TargetDocument,
}

#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, Clone, PartialEq)]
enum TargetDocument {
DocumentAdress(DocAddress),
DocumentAddress(DocAddress),
DocumentFields(Vec<(Field, Vec<Value>)>),
}

@@ -51,14 +53,20 @@ impl Query for MoreLikeThisQuery {
}
};
match &self.target {
TargetDocument::DocumentAdress(doc_address) => self
TargetDocument::DocumentAddress(doc_address) => self
.mlt
.query_with_document(searcher, *doc_address)?
.weight(enable_scoring),
TargetDocument::DocumentFields(doc_fields) => self
.mlt
.query_with_document_fields(searcher, doc_fields)?
.weight(enable_scoring),
TargetDocument::DocumentFields(doc_fields) => {
let values = doc_fields
.iter()
.map(|(field, values)| (*field, values.iter().collect::<Vec<&Value>>()))
.collect::<Vec<_>>();

self.mlt
.query_with_document_fields(searcher, &values)?
.weight(enable_scoring)
}
}
}
}
@@ -156,7 +164,7 @@ impl MoreLikeThisQueryBuilder {
pub fn with_document(self, doc_address: DocAddress) -> MoreLikeThisQuery {
MoreLikeThisQuery {
mlt: self.mlt,
target: TargetDocument::DocumentAdress(doc_address),
target: TargetDocument::DocumentAddress(doc_address),
}
}

@@ -180,7 +188,7 @@ mod tests {
use super::{MoreLikeThisQuery, TargetDocument};
use crate::collector::TopDocs;
use crate::schema::{Schema, STORED, TEXT};
use crate::{DocAddress, Index};
use crate::{DocAddress, Index, IndexWriter};

fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -188,7 +196,7 @@ mod tests {
let body = schema_builder.add_text_field("body", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title => "aaa", body => "the old man and the sea"))?;
index_writer.add_document(doc!(title => "bbb", body => "an old man sailing on the sea"))?;
index_writer.add_document(doc!(title => "ccc", body=> "send this message to alice"))?;
@@ -236,7 +244,7 @@ mod tests {
);
assert_eq!(
query.target,
TargetDocument::DocumentAdress(DocAddress::new(1, 2))
TargetDocument::DocumentAddress(DocAddress::new(1, 2))
);
}

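For orientation, this is roughly how the builder above is used. The `MoreLikeThisQuery::builder()` entry point and the min-frequency setters are assumed from the unchanged parts of this file, so treat this as a hedged sketch rather than the exact test code:

use tantivy::query::MoreLikeThisQuery;
use tantivy::DocAddress;

// Target an already indexed document (segment 0, doc 1) for similarity search.
let query = MoreLikeThisQuery::builder()
    .with_min_doc_frequency(1)
    .with_min_term_frequency(1)
    .with_document(DocAddress::new(0, 1));
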
@@ -161,7 +161,7 @@ mod tests {
use crate::docset::TERMINATED;
use crate::query::{EnableScoring, PhrasePrefixQuery, Query};
use crate::schema::{Schema, TEXT};
use crate::{DocSet, Term};
use crate::{DocSet, IndexWriter, Term};

pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -169,7 +169,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc)?;

@@ -17,7 +17,7 @@ pub mod tests {
use crate::core::Index;
use crate::query::{EnableScoring, QueryParser, Weight};
use crate::schema::{Schema, Term, TEXT};
use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED};
use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};

pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -25,7 +25,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc)?;
@@ -135,7 +135,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
}
@@ -278,7 +278,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.add_document(doc!(text_field=>"b a"))?;
@@ -310,7 +310,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c d e f g h"))?;
index_writer.commit()?;
}
@@ -348,7 +348,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field=>json!({
"text": "elliot smith the happy who"
})))?;

@@ -41,14 +41,14 @@ use crate::{DateTime, DocId, Score};
/// use tantivy::collector::Count;
/// use tantivy::query::RangeQuery;
/// use tantivy::schema::{Schema, INDEXED};
/// use tantivy::{doc, Index};
/// use tantivy::{doc, Index, IndexWriter};
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 20_000_000)?;
/// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year {
@@ -474,8 +474,10 @@ mod tests {
use crate::collector::{Count, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT};
use crate::{doc, Index};
use crate::schema::{
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
};
use crate::{doc, Index, IndexWriter};

#[test]
fn test_range_query_simple() -> crate::Result<()> {
@@ -552,7 +554,7 @@ mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));

for i in 1..100 {
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
for j in 1..100 {
if i % j == 0 {
doc.add_i64(int_field, j as i64);
@@ -617,7 +619,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 60_000_000).unwrap();
let mut docs = Vec::new();
for i in 1..100 {
let mut doc = Document::new();
let mut doc = TantivyDocument::new();
for j in 1..100 {
if i % j == 0 {
doc.add_f64(float_field, j as f64);
@@ -722,7 +724,7 @@ mod tests {
let ip_addr_2 = IpAddr::from_str("127.0.0.20").unwrap().into_ipv6_addr();

{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for _ in 0..1_000 {
index_writer
.add_document(doc!(

@@ -88,7 +88,7 @@ pub mod tests {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::Index;
use crate::{Index, IndexWriter};

#[derive(Clone, Debug)]
pub struct Doc {
@@ -158,7 +158,7 @@ pub mod tests {
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
.into_iter()
.map(Ipv6Addr::from_u128)

@@ -141,7 +141,7 @@ pub mod tests {
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::query::{QueryParser, Weight};
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
use crate::{Index, TERMINATED};
use crate::{Index, IndexWriter, TERMINATED};

#[derive(Clone, Debug)]
pub struct Doc {
@@ -209,7 +209,7 @@ pub mod tests {
let field = schema_builder.add_u64_field("test_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
writer.add_document(doc!(field=>52_000u64)).unwrap();
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();

@@ -18,7 +18,7 @@ use crate::schema::Field;
/// use tantivy::collector::Count;
/// use tantivy::query::RegexQuery;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index, Term};
/// use tantivy::{doc, Index, IndexWriter, Term};
///
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -26,7 +26,7 @@ use crate::schema::Field;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -95,7 +95,7 @@ mod test {
use super::RegexQuery;
use crate::collector::TopDocs;
use crate::schema::{Field, Schema, TEXT};
use crate::{assert_nearly_equals, Index, IndexReader};
use crate::{assert_nearly_equals, Index, IndexReader, IndexWriter};

fn build_test_index() -> crate::Result<(IndexReader, Field)> {
let mut schema_builder = Schema::builder();
@@ -103,7 +103,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
))?;

@@ -116,7 +116,7 @@ mod tests {
use crate::collector::TopDocs;
use crate::query::{QueryParser, TermSetQuery};
use crate::schema::{Schema, TEXT};
use crate::{assert_nearly_equals, Index, Term};
use crate::{assert_nearly_equals, Index, IndexWriter, Term};

#[test]
pub fn test_term_set_query() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(
field1 => "doc1",
field2 => "val1",
@@ -233,7 +233,7 @@ mod tests {
schema_builder.add_text_field("field", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let field = schema.get_field("field").unwrap();
index_writer.add_document(doc!(
field => "val1",

@@ -14,7 +14,7 @@ mod tests {
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::{assert_nearly_equals, DocAddress, Index, Term, TERMINATED};
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};

#[test]
pub fn test_term_query_no_freq() -> crate::Result<()> {
@@ -24,7 +24,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let doc = doc!(text_field => "a");
index_writer.add_document(doc)?;
index_writer.commit()?;
@@ -50,7 +50,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc)?;
@@ -86,7 +86,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(
left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde",
right_field => "right1 right2",
@@ -133,7 +133,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.add_document(doc!(text_field=>"a c"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -151,7 +151,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
@@ -185,7 +185,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"a"))?;

@@ -20,14 +20,14 @@ use crate::Term;
/// use tantivy::collector::{Count, TopDocs};
/// use tantivy::query::TermQuery;
/// use tantivy::schema::{Schema, TEXT, IndexRecordOption};
/// use tantivy::{doc, Index, Term};
/// use tantivy::{doc, Index, IndexWriter, Term};
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(15_000_000)?;
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -139,7 +139,7 @@ mod tests {
use crate::collector::{Count, TopDocs};
use crate::query::{Query, QueryParser, TermQuery};
use crate::schema::{IndexRecordOption, IntoIpv6Addr, Schema, INDEXED, STORED};
use crate::{doc, Index, Term};
use crate::{doc, Index, IndexWriter, Term};

#[test]
fn search_ip_test() {
@@ -151,7 +151,7 @@ mod tests {
let ip_addr_2 = Ipv6Addr::from_u128(10);

{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(
ip_field => ip_addr_1

@@ -133,7 +133,8 @@ mod tests {
use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{
assert_nearly_equals, DocId, DocSet, Index, Score, Searcher, SegmentId, Term, TERMINATED,
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, SegmentId, Term,
TERMINATED,
};

#[test]
@@ -295,7 +296,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(3, 30_000_000)?;
let mut writer: IndexWriter = index.writer_with_num_threads(3, 30_000_000)?;
use rand::Rng;
let mut rng = rand::thread_rng();
writer.set_merge_policy(Box::new(NoMergePolicy));

@@ -1,282 +0,0 @@
use std::collections::{HashMap, HashSet};
use std::io::{self, Read, Write};
use std::mem;
use std::net::Ipv6Addr;

use common::{BinarySerializable, VInt};

use super::*;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;

/// Tantivy's Document is the object that can
/// be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
pub struct Document {
field_values: Vec<FieldValue>,
}

impl From<Vec<FieldValue>> for Document {
fn from(field_values: Vec<FieldValue>) -> Self {
Document { field_values }
}
}
impl PartialEq for Document {
fn eq(&self, other: &Document) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
let json_val = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
.or_default()
.insert(json_val);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values)
}
}

impl Eq for Document {}

impl IntoIterator for Document {
type Item = FieldValue;

type IntoIter = std::vec::IntoIter<FieldValue>;

fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
}
}

impl Document {
/// Creates a new, empty document object
pub fn new() -> Document {
Document::default()
}

/// Returns the number of `(field, value)` pairs.
pub fn len(&self) -> usize {
self.field_values.len()
}

/// Returns true if the document contains no fields.
pub fn is_empty(&self) -> bool {
self.field_values.is_empty()
}

/// Adding a facet to the document.
pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> {
let facet = Facet::from(path);
let value = Value::Facet(facet);
self.add_field_value(field, value);
}

/// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = Value::Str(text.to_string());
self.add_field_value(field, value);
}

/// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) {
self.add_field_value(field, pre_tokenized_text);
}

/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add_field_value(field, value);
}

/// Add a IP address field. Internally only Ipv6Addr is used.
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
self.add_field_value(field, value);
}

/// Add a i64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add_field_value(field, value);
}

/// Add a f64 field
pub fn add_f64(&mut self, field: Field, value: f64) {
self.add_field_value(field, value);
}

/// Add a bool field
pub fn add_bool(&mut self, field: Field, value: bool) {
self.add_field_value(field, value);
}

/// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value);
}

/// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add_field_value(field, value.into());
}

/// Add a JSON field
pub fn add_json_object(
&mut self,
field: Field,
json_object: serde_json::Map<String, serde_json::Value>,
) {
self.add_field_value(field, json_object);
}

/// Add a (field, value) to the document.
pub fn add_field_value<T: Into<Value>>(&mut self, field: Field, typed_val: T) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
self.field_values.push(field_value);
}

/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
}

/// Sort and groups the field_values by field.
///
/// The result of this method is not cached and is
/// computed on the fly when this method is called.
pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> {
let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
field_values.sort_by_key(|field_value| field_value.field());

let mut field_values_it = field_values.into_iter();

let first_field_value = if let Some(first_field_value) = field_values_it.next() {
first_field_value
} else {
return Vec::new();
};

let mut grouped_field_values = vec![];
let mut current_field = first_field_value.field();
let mut current_group = vec![first_field_value.value()];

for field_value in field_values_it {
if field_value.field() == current_field {
current_group.push(field_value.value());
} else {
grouped_field_values.push((
current_field,
mem::replace(&mut current_group, vec![field_value.value()]),
));
current_field = field_value.field();
}
}

grouped_field_values.push((current_field, current_group));
grouped_field_values
}

/// Returns all of the `FieldValue`s associated the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
self.field_values
.iter()
.filter(move |field_value| field_value.field() == field)
.map(FieldValue::value)
}

/// Returns the first `FieldValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<&Value> {
self.get_all(field).next()
}

/// Serializes stored field values.
pub fn serialize_stored<W: Write>(&self, schema: &Schema, writer: &mut W) -> io::Result<()> {
let stored_field_values = || {
self.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
};
let num_field_values = stored_field_values().count();

VInt(num_field_values as u64).serialize(writer)?;
for field_value in stored_field_values() {
match field_value {
FieldValue {
field,
value: Value::PreTokStr(pre_tokenized_text),
} => {
let field_value = FieldValue {
field: *field,
value: Value::Str(pre_tokenized_text.text.to_string()),
};
field_value.serialize(writer)?;
}
field_value => field_value.serialize(writer)?,
};
}
Ok(())
}
}

impl BinarySerializable for Document {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let field_values = self.field_values();
VInt(field_values.len() as u64).serialize(writer)?;
for field_value in field_values {
field_value.serialize(writer)?;
}
Ok(())
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let num_field_values = VInt::deserialize(reader)?.val() as usize;
let field_values = (0..num_field_values)
.map(|_| FieldValue::deserialize(reader))
.collect::<io::Result<Vec<FieldValue>>>()?;
Ok(Document::from(field_values))
}
}

#[cfg(test)]
mod tests {

use common::BinarySerializable;

use crate::schema::*;

#[test]
fn test_doc() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = Document::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}

#[test]
fn test_doc_serialization_issue() {
let mut doc = Document::default();
doc.add_json_object(
Field::from_field_id(0),
serde_json::json!({"key": 2u64})
.as_object()
.unwrap()
.clone(),
);
doc.add_text(Field::from_field_id(1), "hello");
assert_eq!(doc.field_values().len(), 2);
let mut payload: Vec<u8> = Vec::new();
doc.serialize(&mut payload).unwrap();
assert_eq!(payload.len(), 26);
Document::deserialize(&mut &payload[..]).unwrap();
}
}
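The `get_sorted_field_values` grouping above carries over to the new `TantivyDocument` introduced below. A small hedged illustration of its semantics (field names invented for the example, and it assumes the method stays public as it was here):

use tantivy::schema::{Schema, TantivyDocument, TEXT};

let mut schema_builder = Schema::builder();
let first = schema_builder.add_text_field("first", TEXT);
let second = schema_builder.add_text_field("second", TEXT);
let _schema = schema_builder.build();

let mut doc = TantivyDocument::new();
doc.add_text(second, "s1");
doc.add_text(first, "f1");
doc.add_text(first, "f2");

// Values come back grouped per field and ordered by field id:
// roughly [(first, ["f1", "f2"]), (second, ["s1"])].
let grouped = doc.get_sorted_field_values();
assert_eq!(grouped.len(), 2);
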
1029 src/schema/document/de.rs (new file; diff suppressed because it is too large)

310 src/schema/document/default_doc_type.rs (new file)
@@ -0,0 +1,310 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::net::Ipv6Addr;

use common::DateTime;
use serde_json::Map;

use crate::schema::document::{
DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
};
use crate::schema::field_type::ValueParsingError;
use crate::schema::field_value::FieldValueIter;
use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, Schema, Value};
use crate::tokenizer::PreTokenizedString;

/// Tantivy's Document is the object that can be indexed and then searched for.
/// It provides a default implementation of the `Document` trait.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
pub struct TantivyDocument {
field_values: Vec<FieldValue>,
}

impl Document for TantivyDocument {
type Value<'a> = &'a Value;
type FieldsValuesIter<'a> = FieldValueIter<'a>;

fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldValueIter(self.field_values.iter())
}
}

impl DocumentDeserialize for TantivyDocument {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut field_values = Vec::with_capacity(deserializer.size_hint());

while let Some((field, value)) = deserializer.next_field()? {
field_values.push(FieldValue::new(field, value));
}

Ok(Self { field_values })
}
}

impl From<Vec<FieldValue>> for TantivyDocument {
fn from(field_values: Vec<FieldValue>) -> Self {
Self { field_values }
}
}

impl PartialEq for TantivyDocument {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
let value = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values)
}
}

impl Eq for TantivyDocument {}

impl IntoIterator for TantivyDocument {
type Item = FieldValue;

type IntoIter = std::vec::IntoIter<FieldValue>;

fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
}
}

impl TantivyDocument {
/// Creates a new, empty document object
pub fn new() -> TantivyDocument {
TantivyDocument::default()
}

/// Returns the length of the document.
pub fn len(&self) -> usize {
self.field_values.len()
}

/// Adding a facet to the document.
pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> {
let facet = Facet::from(path);
let value = Value::Facet(facet);
self.add_field_value(field, value);
}

/// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = Value::Str(text.to_string());
self.add_field_value(field, value);
}

/// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) {
self.add_field_value(field, pre_tokenized_text);
}

/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add_field_value(field, value);
}

/// Add a IP address field. Internally only Ipv6Addr is used.
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
self.add_field_value(field, value);
}

/// Add a i64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add_field_value(field, value);
}

/// Add a f64 field
pub fn add_f64(&mut self, field: Field, value: f64) {
self.add_field_value(field, value);
}

/// Add a bool field
pub fn add_bool(&mut self, field: Field, value: bool) {
self.add_field_value(field, value);
}

/// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value);
}

/// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add_field_value(field, value.into());
}

/// Add a dynamic object field
pub fn add_object(&mut self, field: Field, object: BTreeMap<String, Value>) {
self.add_field_value(field, object);
}

/// Add a (field, value) to the document.
pub fn add_field_value<T: Into<Value>>(&mut self, field: Field, typed_val: T) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
self.field_values.push(field_value);
}

/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
}

/// Returns all of the `FieldValue`s associated the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
self.field_values
.iter()
.filter(move |field_value| field_value.field() == field)
.map(FieldValue::value)
}

/// Returns the first `FieldValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<&Value> {
self.get_all(field).next()
}

/// Create document from a named doc.
pub fn convert_named_doc(
schema: &Schema,
named_doc: NamedFieldDocument,
) -> Result<TantivyDocument, DocParsingError> {
let mut document = TantivyDocument::new();
for (field_name, values) in named_doc.0 {
if let Ok(field) = schema.get_field(&field_name) {
for value in values {
document.add_field_value(field, value);
}
}
}
Ok(document)
}

/// Create a named document from the doc.
pub fn to_named_doc(&self, schema: &Schema) -> NamedFieldDocument {
let mut field_map = BTreeMap::new();
for (field, field_values) in self.get_sorted_field_values() {
let field_name = schema.get_field_name(field);
let values: Vec<Value> = field_values.into_iter().cloned().collect();
field_map.insert(field_name.to_string(), values);
}
NamedFieldDocument(field_map)
}

/// Encode the document in JSON.
///
/// Encoding a document cannot fail.
pub fn to_json(&self, schema: &Schema) -> String {
serde_json::to_string(&self.to_named_doc(schema))
.expect("doc encoding failed. This is a bug")
}

/// Build a document object from a json-object.
pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<TantivyDocument, DocParsingError> {
let json_obj: Map<String, serde_json::Value> =
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
Self::from_json_object(schema, json_obj)
}

/// Build a document object from a json-object.
pub fn from_json_object(
schema: &Schema,
json_obj: Map<String, serde_json::Value>,
) -> Result<TantivyDocument, DocParsingError> {
let mut doc = TantivyDocument::default();
for (field_name, json_value) in json_obj {
if let Ok(field) = schema.get_field(&field_name) {
let field_entry = schema.get_field_entry(field);
let field_type = field_entry.field_type();
match json_value {
serde_json::Value::Array(json_items) => {
for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
}
}
_ => {
let value = field_type
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
}
}
}
}
Ok(doc)
}
}

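Taken together, the constructors and JSON helpers above support the multi-valued round trip promised in the type's doc comment. A minimal hedged sketch:

use tantivy::schema::{Schema, TantivyDocument, TEXT};

let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let schema = schema_builder.build();

// One field may appear more than once in a document.
let mut doc = TantivyDocument::new();
doc.add_text(title, "first value");
doc.add_text(title, "second value");
assert_eq!(doc.get_all(title).count(), 2);

// Parse a document from JSON, then encode it back.
let parsed = TantivyDocument::parse_json(&schema, r#"{"title": "hello"}"#)
    .expect("the example JSON matches the schema");
let _round_tripped: String = parsed.to_json(&schema);
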
/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug, Error, PartialEq)]
pub enum DocParsingError {
/// The payload given is not valid JSON.
#[error("The provided string is not valid JSON")]
InvalidJson(String),
/// One of the value nodes could not be parsed.
#[error("The field '{0:?}' could not be parsed: {1:?}")]
ValueError(String, ValueParsingError),
}

impl DocParsingError {
/// Builds an InvalidJson DocParsingError
fn invalid_json(invalid_json: &str) -> Self {
let sample = invalid_json.chars().take(20).collect();
DocParsingError::InvalidJson(sample)
}
}

#[cfg(test)]
mod tests {
use crate::schema::document::default_doc_type::TantivyDocument;
use crate::schema::*;

#[test]
fn test_doc() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = TantivyDocument::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}

// TODO: Should this be re-added with the serialize method
// technically this is no longer useful since the doc types
// do not implement BinarySerializable due to orphan rules.
// #[test]
// fn test_doc_serialization_issue() {
// let mut doc = Document::default();
// doc.add_json_object(
// Field::from_field_id(0),
// serde_json::json!({"key": 2u64})
// .as_object()
// .unwrap()
// .clone(),
// );
// doc.add_text(Field::from_field_id(1), "hello");
// assert_eq!(doc.field_values().len(), 2);
// let mut payload: Vec<u8> = Vec::new();
// doc_binary_wrappers::serialize(&doc, &mut payload).unwrap();
// assert_eq!(payload.len(), 26);
// doc_binary_wrappers::deserialize::<Document, _>(&mut &payload[..]).unwrap();
// }
}
207 src/schema/document/existing_type_impls.rs (new file)
@@ -0,0 +1,207 @@
//! Implementations of some of the core traits on various types to improve the ergonomics
//! of the API when providing custom documents.
//!
//! This allows users a bit more freedom and ergonomics if they want a simple API
//! and don't care about some of the more specialised types or only want to customise
//! part of the document structure.
use std::collections::{btree_map, hash_map, BTreeMap, HashMap};

use serde_json::Number;

use crate::schema::document::{
ArrayAccess, DeserializeError, DocValue, Document, DocumentDeserialize, DocumentDeserializer,
ObjectAccess, ReferenceValue, ValueDeserialize, ValueDeserializer, ValueVisitor,
};
use crate::schema::Field;

// Serde compatibility support.
impl<'a> DocValue<'a> for &'a serde_json::Value {
type ChildValue = Self;
type ArrayIter = JsonArrayIter<'a>;
type ObjectIter = JsonObjectIter<'a>;

fn as_value(&self) -> ReferenceValue<'a, Self> {
match self {
serde_json::Value::Null => ReferenceValue::Null,
serde_json::Value::Bool(value) => ReferenceValue::Bool(*value),
serde_json::Value::Number(number) => {
if let Some(val) = number.as_u64() {
ReferenceValue::U64(val)
} else if let Some(val) = number.as_i64() {
ReferenceValue::I64(val)
} else if let Some(val) = number.as_f64() {
ReferenceValue::F64(val)
} else {
panic!("Unsupported serde_json number {number}");
}
}
serde_json::Value::String(val) => ReferenceValue::Str(val),
serde_json::Value::Array(elements) => {
ReferenceValue::Array(JsonArrayIter(elements.iter()))
}
serde_json::Value::Object(object) => {
ReferenceValue::Object(JsonObjectIter(object.iter()))
}
}
}
}

impl ValueDeserialize for serde_json::Value {
fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
where D: ValueDeserializer<'de> {
struct SerdeValueVisitor;

impl ValueVisitor for SerdeValueVisitor {
type Value = serde_json::Value;

fn visit_null(&self) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Null)
}

fn visit_string(&self, val: String) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::String(val))
}

fn visit_u64(&self, val: u64) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Number(val.into()))
}

fn visit_i64(&self, val: i64) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Number(val.into()))
}

fn visit_f64(&self, val: f64) -> Result<Self::Value, DeserializeError> {
let num = Number::from_f64(val).ok_or_else(|| {
DeserializeError::custom(format!(
"serde_json::Value cannot deserialize float {val}"
))
})?;
Ok(serde_json::Value::Number(num))
}

fn visit_bool(&self, val: bool) -> Result<Self::Value, DeserializeError> {
Ok(serde_json::Value::Bool(val.into()))
}

fn visit_array<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ArrayAccess<'de> {
let mut elements = Vec::with_capacity(access.size_hint());

while let Some(value) = access.next_element()? {
elements.push(value);
}

Ok(serde_json::Value::Array(elements))
}

fn visit_object<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ObjectAccess<'de> {
let mut object = serde_json::Map::with_capacity(access.size_hint());

while let Some((key, value)) = access.next_entry()? {
object.insert(key, value);
}

Ok(serde_json::Value::Object(object))
}
}

deserializer.deserialize_any(SerdeValueVisitor)
}
}

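Both `as_value` and the `visit_*` methods above lean on serde_json's numeric accessors, preferring `u64`, then `i64`, then `f64`. That priority in isolation, as a hedged sketch:

// Mirrors the branch order used above: unsigned first, then signed, then float.
fn classify(number: &serde_json::Number) -> &'static str {
    if number.as_u64().is_some() {
        "u64"
    } else if number.as_i64().is_some() {
        "i64"
    } else if number.as_f64().is_some() {
        "f64"
    } else {
        "unsupported"
    }
}
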
/// A wrapper struct for an iterator producing [Value]s.
pub struct JsonArrayIter<'a>(pub(crate) std::slice::Iter<'a, serde_json::Value>);

impl<'a> Iterator for JsonArrayIter<'a> {
type Item = ReferenceValue<'a, &'a serde_json::Value>;

fn next(&mut self) -> Option<Self::Item> {
let value = self.0.next()?;
Some(value.as_value())
}
}

/// A wrapper struct for an iterator producing [Value]s.
pub struct JsonObjectIter<'a>(pub(crate) serde_json::map::Iter<'a>);

impl<'a> Iterator for JsonObjectIter<'a> {
type Item = (&'a str, ReferenceValue<'a, &'a serde_json::Value>);

fn next(&mut self) -> Option<Self::Item> {
let (key, value) = self.0.next()?;
Some((key, value.as_value()))
}
}

// Custom document types

// BTreeMap based documents
impl Document for BTreeMap<Field, crate::schema::Value> {
type Value<'a> = &'a crate::schema::Value;
type FieldsValuesIter<'a> = FieldCopyingIterator<
'a,
btree_map::Iter<'a, Field, crate::schema::Value>,
crate::schema::Value,
>;

fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldCopyingIterator(self.iter())
}
}
impl DocumentDeserialize for BTreeMap<Field, crate::schema::Value> {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut document = BTreeMap::new();

while let Some((field, value)) = deserializer.next_field()? {
document.insert(field, value);
}

Ok(document)
}
}

// HashMap based documents
impl Document for HashMap<Field, crate::schema::Value> {
type Value<'a> = &'a crate::schema::Value;
type FieldsValuesIter<'a> = FieldCopyingIterator<
'a,
hash_map::Iter<'a, Field, crate::schema::Value>,
crate::schema::Value,
>;

fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldCopyingIterator(self.iter())
}
}
impl DocumentDeserialize for HashMap<Field, crate::schema::Value> {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut document = HashMap::with_capacity(deserializer.size_hint());

while let Some((field, value)) = deserializer.next_field()? {
document.insert(field, value);
}

Ok(document)
}
}

pub struct FieldCopyingIterator<'a, I, V>(I)
where
V: 'a,
I: Iterator<Item = (&'a Field, &'a V)>;

impl<'a, I, V> Iterator for FieldCopyingIterator<'a, I, V>
where
V: 'a,
I: Iterator<Item = (&'a Field, &'a V)>,
{
type Item = (Field, &'a V);

fn next(&mut self) -> Option<Self::Item> {
let (field, value) = self.0.next()?;
Some((*field, value))
}
}
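Because of the impls above, a plain map can act as a document. A hedged sketch; the generic writer plumbing (an index writer parameterised over this document type) lives elsewhere in this PR and is only assumed here:

use std::collections::BTreeMap;
use tantivy::schema::{Field, Value};

// One value per field in a map-backed document.
let mut doc: BTreeMap<Field, Value> = BTreeMap::new();
doc.insert(Field::from_field_id(0), Value::Str("hello".to_string()));
doc.insert(Field::from_field_id(1), Value::U64(42));
// `doc` now satisfies the Document trait and can be handed to such a writer.
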
532 src/schema/document/mod.rs (new file)
@@ -0,0 +1,532 @@
|
||||
//! Document definition for Tantivy to index and store.
//!
//! A document and its values are defined by a couple of core traits:
//! - [Document] which describes your top-level document and its fields.
//! - [DocValue] which provides tantivy with a way to access the document's values in a common way
//!   without performing any additional allocations.
//! - [DocumentDeserialize] which implements the necessary code to deserialize the document from the
//!   doc store.
//!
//! Tantivy provides a few out-of-box implementations of these core traits for simple usage
//! if you don't want to implement them on a custom type yourself.
//!
//! # Out-of-box document implementations
//! - [TantivyDocument] the old document type used by Tantivy before the trait based approach was
//!   implemented. This type is still valid and provides all of the original behaviour you might
//!   expect.
//! - `BTreeMap<Field, Value>` a mapping of field_ids to their relevant schema value using a
//!   BTreeMap.
//! - `HashMap<Field, Value>` a mapping of field_ids to their relevant schema value using a HashMap.
//!
//! # Implementing your custom documents
//! In larger projects or higher-performance applications you often want to avoid the extra
//! overhead of converting your own types to the Tantivy [TantivyDocument] type; this can save a
//! significant amount of time when indexing by avoiding the additional allocations.
//!
//! ### Important Note
//! The implementor of the `Document` trait must be `'static` and safe to send across
//! thread boundaries.
//!
//! ## Reusing existing types
//! The API design of the document traits allows you to reuse as much or as little of the
//! existing trait implementations as you like, which can save quite a bit of boilerplate,
//! as shown by the following example.
//!
//! ## A basic custom document
//! ```
//! use std::collections::{btree_map, BTreeMap};
//! use tantivy::schema::{Document, Field};
//! use tantivy::schema::document::{DeserializeError, DocumentDeserialize, DocumentDeserializer};
//!
//! /// Our custom document to let us use a map of `serde_json::Value`s.
//! pub struct MyCustomDocument {
//!     // Tantivy provides trait implementations for common `serde_json` types.
//!     fields: BTreeMap<Field, serde_json::Value>
//! }
//!
//! impl Document for MyCustomDocument {
//!     // The value type produced by the `iter_fields_and_values` iterator.
//!     type Value<'a> = &'a serde_json::Value;
//!     // The iterator which is produced by `iter_fields_and_values`.
//!     // Often this is a simple new-type wrapper unless you like super long generics.
//!     type FieldsValuesIter<'a> = MyCustomIter<'a>;
//!
//!     /// Produces an iterator over the document fields and values.
//!     /// This method will be called multiple times; it's important
//!     /// not to do anything too heavy in this step, any heavy operations
//!     /// should be done beforehand and effectively cached.
//!     fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
//!         MyCustomIter(self.fields.iter())
//!     }
//! }
//!
//! // Our document must also provide a way to get the original doc
//! // back when it's deserialized from the doc store.
//! // The API for this is very similar to serde but a little bit
//! // more specialised, giving you access to types like IP addresses, datetime, etc...
//! impl DocumentDeserialize for MyCustomDocument {
//!     fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
//!     where D: DocumentDeserializer<'de>
//!     {
//!         // We're not going to implement the necessary logic for this example;
//!         // see the `Deserialization` section of implementing a custom document
//!         // for more information on how this works.
//!         unimplemented!()
//!     }
//! }
//!
//! /// Our custom iterator just helps us to avoid some messy generics.
//! pub struct MyCustomIter<'a>(btree_map::Iter<'a, Field, serde_json::Value>);
//! impl<'a> Iterator for MyCustomIter<'a> {
//!     // Here we can see our field-value pairs being produced by the iterator.
//!     // The value returned alongside the field is the same type as `Document::Value<'_>`.
//!     type Item = (Field, &'a serde_json::Value);
//!
//!     fn next(&mut self) -> Option<Self::Item> {
//!         let (field, value) = self.0.next()?;
//!         Some((*field, value))
//!     }
//! }
//! ```
//!
//! You may have noticed in this example that we haven't needed to implement any custom value
//! types; instead we've just used [serde_json::Value], for which tantivy provides an existing
//! implementation.
//!
//! ## Implementing custom values
//! Internally, Tantivy only works with `ReferenceValue`, an enum that tries to borrow
//! as much data as it can. In order to allow documents to return custom types, those types must
//! implement the `DocValue` trait, which provides a way for Tantivy to get a `ReferenceValue`
//! that it can then index and store.
//!
//! Values can just as easily be customised as documents by implementing the `DocValue` trait.
//!
//! The implementor of this trait should not own the data it's returning; instead it should just
//! hold references to the data held by the parent [Document], which can then be passed
//! on to the [ReferenceValue].
//!
//! This is why `DocValue` is implemented for `&'a serde_json::Value` and `&'a
//! tantivy::schema::Value` but not for their owned counterparts, as we cannot satisfy the lifetime
//! bounds necessary when indexing the documents.
//!
//! ### A note about returning values
//! The custom value type does not have to be the type stored by the document; instead, the
//! implementor of `DocValue` can just be used as a way to convert between the owned type
//! kept in the parent document and the value passed into Tantivy.
//!
//! ```
//! use tantivy::schema::document::ReferenceValue;
//! use tantivy::schema::DocValue;
//!
//! #[derive(Debug)]
//! /// Our custom value type which has 3 types, a string, float and bool.
//! #[allow(dead_code)]
//! pub enum MyCustomValue<'a> {
//!     // Our string data is owned by the parent document, instead we just
//!     // hold onto a reference of this data.
//!     String(&'a str),
//!     Float(f64),
//!     Bool(bool),
//! }
//!
//! impl<'a> DocValue<'a> for MyCustomValue<'a> {
//!     type ChildValue = Self;
//!     // We don't need to worry about these types here as we're not
//!     // working with nested types, but if we wanted to we would
//!     // define our two iterator types, a sequence of ReferenceValues
//!     // for the array iterator and a sequence of key-value pairs for objects.
//!     type ArrayIter = std::iter::Empty<ReferenceValue<'a, Self>>;
//!     type ObjectIter = std::iter::Empty<(&'a str, ReferenceValue<'a, Self>)>;
//!
//!     // The ReferenceValue which Tantivy can use.
//!     fn as_value(&self) -> ReferenceValue<'a, Self> {
//!         // We can support any type that Tantivy itself supports.
//!         match self {
//!             MyCustomValue::String(val) => ReferenceValue::Str(val),
//!             MyCustomValue::Float(val) => ReferenceValue::F64(*val),
//!             MyCustomValue::Bool(val) => ReferenceValue::Bool(*val),
//!         }
//!     }
//! }
//! ```
//!
//! TODO: Complete this section...

mod de;
mod default_doc_type;
mod existing_type_impls;
mod se;

use std::fmt::Debug;
use std::mem;
use std::net::Ipv6Addr;

pub(crate) use self::de::BinaryDocumentDeserializer;
pub use self::de::{
    ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess,
    ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor,
};
pub use self::default_doc_type::{DocParsingError, TantivyDocument};
pub(crate) use self::se::BinaryDocumentSerializer;
use super::*;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;

/// The core trait representing a document within the index.
pub trait Document: DocumentDeserialize + Send + Sync + 'static {
    /// The value of the field.
    type Value<'a>: DocValue<'a> + Clone
    where Self: 'a;

    /// The iterator over all of the fields and values within the doc.
    type FieldsValuesIter<'a>: Iterator<Item = (Field, Self::Value<'a>)>
    where Self: 'a;

    /// Get an iterator iterating over all fields and values in a document.
    fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_>;

    /// Sorts and groups the field_values by field.
    ///
    /// The result of this method is not cached and is
    /// computed on the fly when this method is called.
    fn get_sorted_field_values(&self) -> Vec<(Field, Vec<Self::Value<'_>>)> {
        let mut field_values: Vec<(Field, Self::Value<'_>)> =
            self.iter_fields_and_values().collect();
        field_values.sort_by_key(|(field, _)| *field);

        let mut field_values_it = field_values.into_iter();

        let first_field_value = if let Some(first_field_value) = field_values_it.next() {
            first_field_value
        } else {
            return Vec::new();
        };

        let mut grouped_field_values = vec![];
        let mut current_field = first_field_value.0;
        let mut current_group = vec![first_field_value.1];

        for (field, value) in field_values_it {
            if field == current_field {
                current_group.push(value);
            } else {
                grouped_field_values
                    .push((current_field, mem::replace(&mut current_group, vec![value])));
                current_field = field;
            }
        }

        grouped_field_values.push((current_field, current_group));
        grouped_field_values
    }
}
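To make the default grouping concrete, a sketch of the expected output, assuming illustrative field ids:

// Assuming title = Field(0) and body = Field(1), and a document whose
// iterator yields (title, "a"), (body, "x"), (title, "b"):
//
//   doc.get_sorted_field_values()
//       == vec![
//           (title, vec!["a", "b"]), // values of the same field are grouped
//           (body, vec!["x"]),
//       ];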

/// A single field value.
pub trait DocValue<'a>: Send + Sync + Debug {
    /// The child value type returned by this doc value.
    type ChildValue: DocValue<'a>;
    /// The iterator for walking through the elements within the array.
    type ArrayIter: Iterator<Item = ReferenceValue<'a, Self::ChildValue>>;
    /// The visitor walking through the key-value pairs within
    /// the object.
    type ObjectIter: Iterator<Item = (&'a str, ReferenceValue<'a, Self::ChildValue>)>;

    /// Returns the field value represented by an enum which borrows its data.
    fn as_value(&self) -> ReferenceValue<'a, Self>;

    #[inline]
    /// Returns if the value is `null` or not.
    fn is_null(&self) -> bool {
        matches!(self.as_value(), ReferenceValue::Null)
    }

    #[inline]
    /// If the Value is a String, returns the associated str. Returns None otherwise.
    fn as_str(&self) -> Option<&'a str> {
        if let ReferenceValue::Str(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a u64, returns the associated u64. Returns None otherwise.
    fn as_u64(&self) -> Option<u64> {
        if let ReferenceValue::U64(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is an i64, returns the associated i64. Returns None otherwise.
    fn as_i64(&self) -> Option<i64> {
        if let ReferenceValue::I64(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is an f64, returns the associated f64. Returns None otherwise.
    fn as_f64(&self) -> Option<f64> {
        if let ReferenceValue::F64(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a datetime, returns the associated datetime. Returns None otherwise.
    fn as_datetime(&self) -> Option<DateTime> {
        if let ReferenceValue::Date(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is an IP address, returns the associated IP. Returns None otherwise.
    fn as_ip_addr(&self) -> Option<Ipv6Addr> {
        if let ReferenceValue::IpAddr(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a bool, returns the associated bool. Returns None otherwise.
    fn as_bool(&self) -> Option<bool> {
        if let ReferenceValue::Bool(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a pre-tokenized string, returns the associated string. Returns None
    /// otherwise.
    fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
        if let ReferenceValue::PreTokStr(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a bytes value, returns the associated set of bytes. Returns None otherwise.
    fn as_bytes(&self) -> Option<&'a [u8]> {
        if let ReferenceValue::Bytes(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a facet, returns the associated facet. Returns None otherwise.
    fn as_facet(&self) -> Option<&'a Facet> {
        if let ReferenceValue::Facet(val) = self.as_value() {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// Returns true if the Value is an array.
    fn is_array(&self) -> bool {
        matches!(self.as_value(), ReferenceValue::Array(_))
    }

    #[inline]
    /// Returns true if the Value is an object.
    fn is_object(&self) -> bool {
        matches!(self.as_value(), ReferenceValue::Object(_))
    }
}

/// An enum representing a value for tantivy to index.
pub enum ReferenceValue<'a, V>
where V: DocValue<'a> + ?Sized
{
    /// A null value.
    Null,
    /// The str type is used for any text information.
    Str(&'a str),
    /// Unsigned 64-bit integer `u64`
    U64(u64),
    /// Signed 64-bit integer `i64`
    I64(i64),
    /// 64-bit float `f64`
    F64(f64),
    /// Date/time with nanosecond precision
    Date(DateTime),
    /// Facet
    Facet(&'a Facet),
    /// Arbitrarily sized byte array
    Bytes(&'a [u8]),
    /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
    IpAddr(Ipv6Addr),
    /// Bool value
    Bool(bool),
    /// Pre-tokenized str type.
    PreTokStr(&'a PreTokenizedString),
    /// An array containing multiple values.
    Array(V::ArrayIter),
    /// A nested / dynamic object.
    Object(V::ObjectIter),
}

impl<'a, V> ReferenceValue<'a, V>
where V: DocValue<'a>
{
    #[inline]
    /// Returns if the value is `null` or not.
    pub fn is_null(&self) -> bool {
        matches!(self, Self::Null)
    }

    #[inline]
    /// If the Value is a String, returns the associated str. Returns None otherwise.
    pub fn as_str(&self) -> Option<&'a str> {
        if let Self::Str(val) = self {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a u64, returns the associated u64. Returns None otherwise.
    pub fn as_u64(&self) -> Option<u64> {
        if let Self::U64(val) = self {
            Some(*val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is an i64, returns the associated i64. Returns None otherwise.
    pub fn as_i64(&self) -> Option<i64> {
        if let Self::I64(val) = self {
            Some(*val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is an f64, returns the associated f64. Returns None otherwise.
    pub fn as_f64(&self) -> Option<f64> {
        if let Self::F64(val) = self {
            Some(*val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a datetime, returns the associated datetime. Returns None otherwise.
    pub fn as_datetime(&self) -> Option<DateTime> {
        if let Self::Date(val) = self {
            Some(*val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is an IP address, returns the associated IP. Returns None otherwise.
    pub fn as_ip_addr(&self) -> Option<Ipv6Addr> {
        if let Self::IpAddr(val) = self {
            Some(*val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a bool, returns the associated bool. Returns None otherwise.
    pub fn as_bool(&self) -> Option<bool> {
        if let Self::Bool(val) = self {
            Some(*val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a pre-tokenized string, returns the associated string. Returns None
    /// otherwise.
    pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> {
        if let Self::PreTokStr(val) = self {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a bytes value, returns the associated set of bytes. Returns None otherwise.
    pub fn as_bytes(&self) -> Option<&'a [u8]> {
        if let Self::Bytes(val) = self {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// If the Value is a facet, returns the associated facet. Returns None otherwise.
    pub fn as_facet(&self) -> Option<&'a Facet> {
        if let Self::Facet(val) = self {
            Some(val)
        } else {
            None
        }
    }

    #[inline]
    /// Returns true if the Value is an array.
    pub fn is_array(&self) -> bool {
        matches!(self, Self::Array(_))
    }

    #[inline]
    /// Returns true if the Value is an object.
    pub fn is_object(&self) -> bool {
        matches!(self, Self::Object(_))
    }
}
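A usage sketch of these accessors: extracting the first string value for a given field from any `Document` implementation (the helper name is illustrative):

use tantivy::schema::{DocValue, Document, Field};

// Pulls the first string value for `field` out of any document type,
// using only the Document and DocValue trait surface shown above.
fn first_str<'a, D: Document>(doc: &'a D, field: Field) -> Option<&'a str> {
    doc.iter_fields_and_values()
        .find(|(f, _)| *f == field)
        .and_then(|(_, value)| value.as_str())
}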

pub(crate) mod type_codes {
    pub const TEXT_CODE: u8 = 0;
    pub const U64_CODE: u8 = 1;
    pub const I64_CODE: u8 = 2;
    pub const HIERARCHICAL_FACET_CODE: u8 = 3;
    pub const BYTES_CODE: u8 = 4;
    pub const DATE_CODE: u8 = 5;
    pub const F64_CODE: u8 = 6;
    pub const EXT_CODE: u8 = 7;
    // Replaced by the `OBJECT_CODE`.
    // -- pub const JSON_OBJ_CODE: u8 = 8;
    pub const BOOL_CODE: u8 = 9;
    pub const IP_CODE: u8 = 10;
    pub const NULL_CODE: u8 = 11;
    pub const ARRAY_CODE: u8 = 12;
    pub const OBJECT_CODE: u8 = 13;

    // Extended type codes
    pub const TOK_STR_EXT_CODE: u8 = 0;
}
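The serializer in `se.rs` below pairs each code with the `BinarySerializable` form of the payload. A sketch of the resulting layout for two cases, assuming the VInt-length-prefixed string encoding from the `common` crate (byte values shown for illustration only):

// ReferenceValue::Str("abc") serializes as:
//
//   [0x00]            TEXT_CODE
//   [0x03]            VInt length of the UTF-8 payload
//   [0x61 0x62 0x63]  "abc"
//
// A pre-tokenized string uses the extension code space instead:
//
//   [0x07]            EXT_CODE
//   [0x00]            TOK_STR_EXT_CODE
//   [...]             the serialized PreTokenizedString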
760
src/schema/document/se.rs
Normal file
@@ -0,0 +1,760 @@
use std::borrow::Cow;
use std::io;
use std::io::Write;

use columnar::MonotonicallyMappableToU128;
use common::{f64_to_u64, BinarySerializable, VInt};

use crate::schema::document::{type_codes, DocValue, Document, ReferenceValue};
use crate::schema::Schema;

/// A serializer writing documents which implement [`Document`] to a provided writer.
pub struct BinaryDocumentSerializer<'se, W> {
    writer: &'se mut W,
    schema: &'se Schema,
}

impl<'se, W> BinaryDocumentSerializer<'se, W>
where W: Write
{
    /// Creates a new serializer with a provided writer.
    pub(crate) fn new(writer: &'se mut W, schema: &'se Schema) -> Self {
        Self { writer, schema }
    }

    /// Attempts to serialize a given document and write the output
    /// to the writer.
    pub(crate) fn serialize_doc<D>(&mut self, doc: &D) -> io::Result<()>
    where D: Document {
        let stored_field_values = || {
            doc.iter_fields_and_values()
                .filter(|(field, _)| self.schema.get_field_entry(*field).is_stored())
        };
        let num_field_values = stored_field_values().count();
        let mut actual_length = 0;

        VInt(num_field_values as u64).serialize(self.writer)?;
        for (field, value_access) in stored_field_values() {
            field.serialize(self.writer)?;

            let mut serializer = BinaryValueSerializer::new(self.writer);
            match value_access.as_value() {
                ReferenceValue::PreTokStr(pre_tokenized_text) => {
                    serializer.serialize_value(ReferenceValue::Str::<&'_ crate::schema::Value>(
                        &pre_tokenized_text.text,
                    ))?;
                }
                _ => {
                    serializer.serialize_value(value_access.as_value())?;
                }
            }

            actual_length += 1;
        }

        if num_field_values != actual_length {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "Unexpected number of entries written to serializer, expected {} entries, got \
                     {} entries",
                    num_field_values, actual_length,
                ),
            ));
        }

        Ok(())
    }
}
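Putting the pieces together, a sketch of what `serialize_doc` emits for a document with a single stored text field (the field id is written with its own `BinarySerializable` impl, so exact widths depend on that encoding):

// For a schema where only `name` is stored, serialize_doc writes:
//
//   [VInt(1)]          number of stored field values
//   [field id]         the `name` field, via Field::serialize
//   [TEXT_CODE] [...]  the value, via BinaryValueSerializer
//
// Note that PreTokStr values are stored as a plain Str (see the match
// above): only the text survives the round trip through the doc store.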

/// A serializer for a single value.
pub struct BinaryValueSerializer<'se, W> {
    writer: &'se mut W,
}

impl<'se, W> BinaryValueSerializer<'se, W>
where W: Write
{
    /// Creates a new serializer with a provided writer.
    pub(crate) fn new(writer: &'se mut W) -> Self {
        Self { writer }
    }

    /// Attempts to serialize a given value and write the output
    /// to the writer.
    pub(crate) fn serialize_value<'a, V>(
        &mut self,
        value: ReferenceValue<'a, V>,
    ) -> io::Result<()>
    where
        V: DocValue<'a>,
    {
        match value {
            ReferenceValue::Null => self.write_type_code(type_codes::NULL_CODE),
            ReferenceValue::Str(val) => {
                self.write_type_code(type_codes::TEXT_CODE)?;

                let temp_val = Cow::Borrowed(val);
                temp_val.serialize(self.writer)
            }
            ReferenceValue::U64(val) => {
                self.write_type_code(type_codes::U64_CODE)?;

                val.serialize(self.writer)
            }
            ReferenceValue::I64(val) => {
                self.write_type_code(type_codes::I64_CODE)?;

                val.serialize(self.writer)
            }
            ReferenceValue::F64(val) => {
                self.write_type_code(type_codes::F64_CODE)?;

                f64_to_u64(val).serialize(self.writer)
            }
            ReferenceValue::Date(val) => {
                self.write_type_code(type_codes::DATE_CODE)?;
                val.serialize(self.writer)
            }
            ReferenceValue::Facet(val) => {
                self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;

                val.serialize(self.writer)
            }
            ReferenceValue::Bytes(val) => {
                self.write_type_code(type_codes::BYTES_CODE)?;

                let temp_val = Cow::Borrowed(val);
                temp_val.serialize(self.writer)
            }
            ReferenceValue::IpAddr(val) => {
                self.write_type_code(type_codes::IP_CODE)?;

                val.to_u128().serialize(self.writer)
            }
            ReferenceValue::Bool(val) => {
                self.write_type_code(type_codes::BOOL_CODE)?;

                val.serialize(self.writer)
            }
            ReferenceValue::PreTokStr(val) => {
                self.write_type_code(type_codes::EXT_CODE)?;
                self.write_type_code(type_codes::TOK_STR_EXT_CODE)?;

                val.serialize(self.writer)
            }
            ReferenceValue::Array(elements) => {
                self.write_type_code(type_codes::ARRAY_CODE)?;

                // Somewhat unfortunate that we do this here; however, writing the
                // length at the end of the stream complicates things quite considerably.
                let elements: Vec<ReferenceValue<'_, V::ChildValue>> = elements.collect();

                let mut serializer = BinaryArraySerializer::begin(elements.len(), self.writer)?;

                for value in elements {
                    serializer.serialize_value(value)?;
                }

                serializer.end()
            }
            ReferenceValue::Object(object) => {
                self.write_type_code(type_codes::OBJECT_CODE)?;

                // Somewhat unfortunate that we do this here; however, writing the
                // length at the end of the stream complicates things quite considerably.
                let entries: Vec<(&str, ReferenceValue<'_, V::ChildValue>)> = object.collect();

                let mut serializer = BinaryObjectSerializer::begin(entries.len(), self.writer)?;

                for (key, value) in entries {
                    serializer.serialize_entry(key, value)?;
                }

                serializer.end()
            }
        }
    }

    fn write_type_code(&mut self, code: u8) -> io::Result<()> {
        code.serialize(self.writer)
    }
}

/// A serializer for writing a sequence of values to a writer.
pub struct BinaryArraySerializer<'se, W> {
    writer: &'se mut W,
    expected_length: usize,
    actual_length: usize,
}

impl<'se, W> BinaryArraySerializer<'se, W>
where W: Write
{
    /// Creates a new array serializer and writes the length of the array to the writer.
    pub(crate) fn begin(length: usize, writer: &'se mut W) -> io::Result<Self> {
        VInt(length as u64).serialize(writer)?;

        Ok(Self {
            writer,
            expected_length: length,
            actual_length: 0,
        })
    }

    /// Attempts to serialize a given value and write the output
    /// to the writer.
    pub(crate) fn serialize_value<'a, V>(
        &mut self,
        value: ReferenceValue<'a, V>,
    ) -> io::Result<()>
    where
        V: DocValue<'a>,
    {
        let mut serializer = BinaryValueSerializer::new(self.writer);
        serializer.serialize_value(value)?;

        self.actual_length += 1;
        Ok(())
    }

    /// Finishes writing the array to the writer and validates it.
    pub(crate) fn end(self) -> io::Result<()> {
        if self.expected_length != self.actual_length {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "Unexpected number of entries written to serializer, expected {} entries, got \
                     {} entries",
                    self.expected_length, self.actual_length,
                ),
            ));
        }
        Ok(())
    }
}

/// A serializer for writing a set of key-value pairs to a writer.
pub struct BinaryObjectSerializer<'se, W> {
    inner: BinaryArraySerializer<'se, W>,
    expected_length: usize,
    actual_length: usize,
}

impl<'se, W> BinaryObjectSerializer<'se, W>
where W: Write
{
    /// Creates a new object serializer and writes the length of the object to the writer.
    pub(crate) fn begin(length: usize, writer: &'se mut W) -> io::Result<Self> {
        // We mul by 2 here to count the keys and values separately:
        // [("a", 1), ("b", 2)] is actually stored as ["a", 1, "b", 2]
        let inner = BinaryArraySerializer::begin(length * 2, writer)?;

        Ok(Self {
            inner,
            expected_length: length,
            actual_length: 0,
        })
    }

    /// Attempts to serialize a given key-value entry and write the output
    /// to the writer.
    pub(crate) fn serialize_entry<'a, V>(
        &mut self,
        key: &'a str,
        value: ReferenceValue<'a, V>,
    ) -> io::Result<()>
    where
        V: DocValue<'a>,
    {
        // Keys and values are stored inline with one another.
        // Technically this isn't the *most* optimal way of storing the objects
        // as we could avoid writing the extra byte per key. But the gain is
        // largely not worth it for the extra complexity it brings.
        self.inner
            .serialize_value(ReferenceValue::<'a, V>::Str(key))?;
        self.inner.serialize_value(value)?;

        self.actual_length += 1;
        Ok(())
    }

    /// Finishes writing the object to the writer and validates it.
    pub(crate) fn end(self) -> io::Result<()> {
        if self.expected_length != self.actual_length {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "Unexpected number of entries written to serializer, expected {} entries, got \
                     {} entries",
                    self.expected_length, self.actual_length,
                ),
            ));
        }

        // This should never fail if the above statement is valid.
        self.inner.end()?;

        Ok(())
    }
}
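To illustrate the `length * 2` flattening used above, a sketch of the bytes produced for a two-entry object (codes refer to `type_codes`; the VInt is shown symbolically):

// {"a": 1, "b": 2} is emitted by BinaryObjectSerializer as:
//
//   [OBJECT_CODE]
//   [VInt(4)]          2 keys + 2 values
//   [TEXT_CODE] "a"
//   [U64_CODE]  1
//   [TEXT_CODE] "b"
//   [U64_CODE]  2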

#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;

    use common::DateTime;
    use serde_json::Number;
    use tokenizer_api::Token;

    use super::*;
    use crate::schema::document::existing_type_impls::{JsonArrayIter, JsonObjectIter};
    use crate::schema::{Facet, Field, FAST, STORED, TEXT};
    use crate::tokenizer::PreTokenizedString;

    fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
        let mut writer = Vec::new();

        let mut serializer = BinaryValueSerializer::new(&mut writer);
        serializer.serialize_value(value).expect("Serialize value");

        writer
    }

    /// A macro for defining the expected binary representation
    /// of the serialized values in a somewhat human-readable way.
    macro_rules! binary_repr {
        ($( $type_code:expr $(, $ext_code:expr)? => $value:expr $(,)?)*) => {{
            let mut writer = Vec::new();

            $(
                $type_code.serialize(&mut writer).unwrap();

                $(
                    $ext_code.serialize(&mut writer).unwrap();
                )?

                $value.serialize(&mut writer).unwrap();
            )*

            writer
        }};
        (collection $code:expr, length $len:expr, $( $type_code:expr $(, $ext_code:expr)? => $value:expr $(,)?)*) => {{
            let mut writer = Vec::new();

            $code.serialize(&mut writer).unwrap();
            VInt($len as u64).serialize(&mut writer).unwrap();

            $(
                $type_code.serialize(&mut writer).unwrap();

                $(
                    $ext_code.serialize(&mut writer).unwrap();
                )?

                $value.serialize(&mut writer).unwrap();
            )*

            writer
        }};
    }

    #[test]
    fn test_simple_value_serialize() {
        let result = serialize_value(ReferenceValue::Null);
        let expected = binary_repr!(
            type_codes::NULL_CODE => (),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let result = serialize_value(ReferenceValue::Str("hello, world"));
        let expected = binary_repr!(
            type_codes::TEXT_CODE => String::from("hello, world"),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let result = serialize_value(ReferenceValue::U64(123));
        let expected = binary_repr!(
            type_codes::U64_CODE => 123u64,
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let result = serialize_value(ReferenceValue::I64(-123));
        let expected = binary_repr!(
            type_codes::I64_CODE => -123i64,
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let result = serialize_value(ReferenceValue::F64(123.3845));
        let expected = binary_repr!(
            type_codes::F64_CODE => f64_to_u64(123.3845f64),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let result = serialize_value(ReferenceValue::Bool(false));
        let expected = binary_repr!(
            type_codes::BOOL_CODE => false,
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let result = serialize_value(ReferenceValue::Date(DateTime::MAX));
        let expected = binary_repr!(
            type_codes::DATE_CODE => DateTime::MAX,
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let facet = Facet::from_text("/hello/world").unwrap();
        let result = serialize_value(ReferenceValue::Facet(&facet));
        let expected = binary_repr!(
            type_codes::HIERARCHICAL_FACET_CODE => Facet::from_text("/hello/world").unwrap(),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let pre_tok_str = PreTokenizedString {
            text: "hello, world".to_string(),
            tokens: vec![Token::default(), Token::default()],
        };
        let result = serialize_value(ReferenceValue::PreTokStr(&pre_tok_str));
        let expected = binary_repr!(
            type_codes::EXT_CODE, type_codes::TOK_STR_EXT_CODE => pre_tok_str,
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );
    }

    #[test]
    fn test_array_serialize() {
        let elements = vec![serde_json::Value::Null, serde_json::Value::Null];
        let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
        let expected = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length elements.len(),
            type_codes::NULL_CODE => (),
            type_codes::NULL_CODE => (),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let elements = vec![
            serde_json::Value::String("Hello, world".into()),
            serde_json::Value::String("Some demo".into()),
        ];
        let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
        let expected = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length elements.len(),
            type_codes::TEXT_CODE => String::from("Hello, world"),
            type_codes::TEXT_CODE => String::from("Some demo"),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let elements = vec![];
        let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
        let expected = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length elements.len(),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let elements = vec![
            serde_json::Value::Null,
            serde_json::Value::String("Hello, world".into()),
            serde_json::Value::Number(12345.into()),
        ];
        let result = serialize_value(ReferenceValue::Array(JsonArrayIter(elements.iter())));
        let expected = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length elements.len(),
            type_codes::NULL_CODE => (),
            type_codes::TEXT_CODE => String::from("Hello, world"),
            type_codes::U64_CODE => 12345u64,
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );
    }

    #[test]
    fn test_object_serialize() {
        let mut object = serde_json::Map::new();
        object.insert(
            "my-first-key".into(),
            serde_json::Value::String("Hello".into()),
        );
        object.insert("my-second-key".into(), serde_json::Value::Null);
        object.insert(
            "my-third-key".into(),
            serde_json::Value::Number(Number::from_f64(123.0).unwrap()),
        );
        let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
        let expected = binary_repr!(
            collection type_codes::OBJECT_CODE,
            length object.len() * 2, // To account for keys counting towards the length
            type_codes::TEXT_CODE => String::from("my-first-key"),
            type_codes::TEXT_CODE => String::from("Hello"),
            type_codes::TEXT_CODE => String::from("my-second-key"),
            type_codes::NULL_CODE => (),
            type_codes::TEXT_CODE => String::from("my-third-key"),
            type_codes::F64_CODE => f64_to_u64(123.0),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let object = serde_json::Map::new();
        let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
        let expected = binary_repr!(
            collection type_codes::OBJECT_CODE,
            length object.len(),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        let mut object = serde_json::Map::new();
        object.insert("my-first-key".into(), serde_json::Value::Null);
        object.insert("my-second-key".into(), serde_json::Value::Null);
        object.insert("my-third-key".into(), serde_json::Value::Null);
        let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));
        let expected = binary_repr!(
            collection type_codes::OBJECT_CODE,
            length object.len() * 2, // To account for keys counting towards the length
            type_codes::TEXT_CODE => String::from("my-first-key"),
            type_codes::NULL_CODE => (),
            type_codes::TEXT_CODE => String::from("my-second-key"),
            type_codes::NULL_CODE => (),
            type_codes::TEXT_CODE => String::from("my-third-key"),
            type_codes::NULL_CODE => (),
        );
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );
    }

    #[test]
    fn test_nested_serialize() {
        let mut object = serde_json::Map::new();
        object.insert(
            "my-array".into(),
            serde_json::Value::Array(vec![
                serde_json::Value::Null,
                serde_json::Value::String(String::from("bobby of the sea")),
            ]),
        );
        object.insert(
            "my-object".into(),
            serde_json::Value::Object(
                vec![
                    (
                        "inner-1".to_string(),
                        serde_json::Value::Number((-123i64).into()),
                    ),
                    (
                        "inner-2".to_string(),
                        serde_json::Value::String(String::from("bobby of the sea 2")),
                    ),
                ]
                .into_iter()
                .collect(),
            ),
        );
        let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));

        let mut expected = Vec::new();
        let header = binary_repr!(
            collection type_codes::OBJECT_CODE,
            length object.len() * 2,
        );
        expected.extend_from_slice(&header);
        expected
            .extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("my-array")));
        let nested_array = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length 2,
            type_codes::NULL_CODE => (),
            type_codes::TEXT_CODE => String::from("bobby of the sea"),
        );
        expected.extend_from_slice(&nested_array);
        expected
            .extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("my-object")));
        let nested_object = binary_repr!(
            collection type_codes::OBJECT_CODE,
            length 4, // 2 keys, 2 values
            type_codes::TEXT_CODE => String::from("inner-1"),
            type_codes::I64_CODE => -123i64,
            type_codes::TEXT_CODE => String::from("inner-2"),
            type_codes::TEXT_CODE => String::from("bobby of the sea 2"),
        );
        expected.extend_from_slice(&nested_object);
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );

        // Some more extreme nesting that might behave weirdly
        let mut object = serde_json::Map::new();
        object.insert(
            "my-array".into(),
            serde_json::Value::Array(vec![serde_json::Value::Array(vec![
                serde_json::Value::Array(vec![]),
                serde_json::Value::Array(vec![serde_json::Value::Null]),
            ])]),
        );
        let result = serialize_value(ReferenceValue::Object(JsonObjectIter(object.iter())));

        let mut expected = Vec::new();
        let header = binary_repr!(
            collection type_codes::OBJECT_CODE,
            length object.len() * 2,
        );
        expected.extend_from_slice(&header);
        expected
            .extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("my-array")));
        let nested_array = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length 1,
        );
        expected.extend_from_slice(&nested_array);
        let nested_array = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length 2,
        );
        expected.extend_from_slice(&nested_array);
        let nested_array = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length 0,
        );
        expected.extend_from_slice(&nested_array);
        let nested_array = binary_repr!(
            collection type_codes::ARRAY_CODE,
            length 1,
            type_codes::NULL_CODE => (),
        );
        expected.extend_from_slice(&nested_array);
        assert_eq!(
            result, expected,
            "Expected serialized value to match the binary representation"
        );
    }

    fn serialize_doc<D: Document>(doc: &D, schema: &Schema) -> Vec<u8> {
        let mut writer = Vec::new();

        let mut serializer = BinaryDocumentSerializer::new(&mut writer, schema);
        serializer.serialize_doc(doc).expect("Serialize value");

        writer
    }

    /// A helper macro for generating the expected binary representation of the document.
    macro_rules! expected_doc_data {
        (length $len:expr) => {{
            let mut writer = Vec::new();
            VInt($len as u64).serialize(&mut writer).unwrap();
            writer
        }};
        (length $len:expr, $( $field_id:expr => $value:expr $(,)?)*) => {{
            let mut writer = Vec::new();

            VInt($len as u64).serialize(&mut writer).unwrap();
            $(
                $field_id.serialize(&mut writer).unwrap();
                $value.serialize(&mut writer).unwrap();
            )*

            writer
        }};
    }

    #[test]
    fn test_document_serialize() {
        let mut builder = Schema::builder();
        let name = builder.add_text_field("name", TEXT | STORED);
        let age = builder.add_u64_field("age", FAST | STORED);
        let schema = builder.build();

        let mut document = BTreeMap::new();
        document.insert(name, crate::schema::Value::Str("ChillFish8".into()));
        document.insert(age, crate::schema::Value::U64(20));

        let result = serialize_doc(&document, &schema);
        let mut expected = expected_doc_data!(length document.len());
        name.serialize(&mut expected).unwrap();
        expected
            .extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("ChillFish8")));
        age.serialize(&mut expected).unwrap();
        expected.extend_from_slice(&binary_repr!(type_codes::U64_CODE => 20u64));
        assert_eq!(
            result, expected,
            "Expected serialized document to match the binary representation"
        );

        let mut builder = Schema::builder();
        let name = builder.add_text_field("name", TEXT | STORED);
        // This should be skipped when serializing.
        let age = builder.add_u64_field("age", FAST);
        let schema = builder.build();

        let mut document = BTreeMap::new();
        document.insert(name, crate::schema::Value::Str("ChillFish8".into()));
        document.insert(age, crate::schema::Value::U64(20));

        let result = serialize_doc(&document, &schema);
        let mut expected = expected_doc_data!(length 1);
        name.serialize(&mut expected).unwrap();
        expected
            .extend_from_slice(&binary_repr!(type_codes::TEXT_CODE => String::from("ChillFish8")));
        assert_eq!(
            result, expected,
            "Expected serialized document to match the binary representation"
        );

        let builder = Schema::builder();
        let schema = builder.build();
        let document = BTreeMap::<Field, crate::schema::Value>::new();
        let result = serialize_doc(&document, &schema);
        let expected = expected_doc_data!(length document.len());
        assert_eq!(
            result, expected,
            "Expected serialized document to match the binary representation"
        );
    }
}

@@ -481,7 +481,7 @@ impl FieldType {
                })
            }
        }
        FieldType::JsonObject(_) => Ok(Value::JsonObject(json_map)),
        FieldType::JsonObject(_) => Ok(Value::from(json_map)),
        _ => Err(ValueParsingError::TypeError {
            expected: self.value_type().name(),
            json: JsonValue::Object(json_map),
@@ -538,27 +538,27 @@ mod tests {
    use crate::schema::{NumericOptions, Schema, TextOptions, Type, Value, COERCE, INDEXED};
    use crate::time::{Date, Month, PrimitiveDateTime, Time};
    use crate::tokenizer::{PreTokenizedString, Token};
    use crate::{DateTime, Document};
    use crate::{DateTime, TantivyDocument};

    #[test]
    fn test_to_string_coercion() {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("id", COERCE);
        let schema = schema_builder.build();
        let doc = schema.parse_document(r#"{"id": 100}"#).unwrap();
        let doc = TantivyDocument::parse_json(&schema, r#"{"id": 100}"#).unwrap();
        assert_eq!(
            &Value::Str("100".to_string()),
            doc.get_first(text_field).unwrap()
        );

        let doc = schema.parse_document(r#"{"id": true}"#).unwrap();
        let doc = TantivyDocument::parse_json(&schema, r#"{"id": true}"#).unwrap();
        assert_eq!(
            &Value::Str("true".to_string()),
            doc.get_first(text_field).unwrap()
        );

        // Not sure if this null coercion is the best approach
        let doc = schema.parse_document(r#"{"id": null}"#).unwrap();
        let doc = TantivyDocument::parse_json(&schema, r#"{"id": null}"#).unwrap();
        assert_eq!(
            &Value::Str("null".to_string()),
            doc.get_first(text_field).unwrap()
@@ -573,7 +573,7 @@ mod tests {
        let f64_field = schema_builder.add_f64_field("f64", COERCE);
        let schema = schema_builder.build();
        let doc_json = r#"{"i64": "100", "u64": "100", "f64": "100"}"#;
        let doc = schema.parse_document(doc_json).unwrap();
        let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
        assert_eq!(&Value::I64(100), doc.get_first(i64_field).unwrap());
        assert_eq!(&Value::U64(100), doc.get_first(u64_field).unwrap());
        assert_eq!(&Value::F64(100.0), doc.get_first(f64_field).unwrap());
@@ -585,11 +585,11 @@ mod tests {
        let bool_field = schema_builder.add_bool_field("bool", COERCE);
        let schema = schema_builder.build();
        let doc_json = r#"{"bool": "true"}"#;
        let doc = schema.parse_document(doc_json).unwrap();
        let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
        assert_eq!(&Value::Bool(true), doc.get_first(bool_field).unwrap());

        let doc_json = r#"{"bool": "false"}"#;
        let doc = schema.parse_document(doc_json).unwrap();
        let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
        assert_eq!(&Value::Bool(false), doc.get_first(bool_field).unwrap());
    }

@@ -600,20 +600,17 @@ mod tests {
        schema_builder.add_u64_field("u64", NumericOptions::default());
        schema_builder.add_f64_field("f64", NumericOptions::default());
        let schema = schema_builder.build();
        assert!(schema
            .parse_document(r#"{"u64": "100"}"#)
        assert!(TantivyDocument::parse_json(&schema, r#"{"u64": "100"}"#)
            .unwrap_err()
            .to_string()
            .contains("a u64"));

        assert!(schema
            .parse_document(r#"{"i64": "100"}"#)
        assert!(TantivyDocument::parse_json(&schema, r#"{"i64": "100"}"#)
            .unwrap_err()
            .to_string()
            .contains("a i64"));

        assert!(schema
            .parse_document(r#"{"f64": "100"}"#)
        assert!(TantivyDocument::parse_json(&schema, r#"{"f64": "100"}"#)
            .unwrap_err()
            .to_string()
            .contains("a f64"));
@@ -625,7 +622,7 @@ mod tests {
        let date_field = schema_builder.add_date_field("date", INDEXED);
        let schema = schema_builder.build();
        let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
        let doc = schema.parse_document(doc_json).unwrap();
        let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
        let date = doc.get_first(date_field).unwrap();
        // Time zone is converted to UTC
        assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
@@ -633,7 +630,7 @@ mod tests {

    #[test]
    fn test_serialize_json_date() {
        let mut doc = Document::new();
        let mut doc = TantivyDocument::new();
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field("date", INDEXED);
        let schema = schema_builder.build();
@@ -641,7 +638,7 @@ mod tests {
        let naive_time = Time::from_hms(13, 20, 0).unwrap();
        let date_time = PrimitiveDateTime::new(naive_date, naive_time);
        doc.add_date(date_field, DateTime::from_primitive(date_time));
        let doc_json = schema.to_json(&doc);
        let doc_json = doc.to_json(&schema);
        assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00Z"]}"#);
    }

@@ -1,7 +1,3 @@
use std::io::{self, Read, Write};

use common::BinarySerializable;

use crate::schema::{Field, Value};

/// `FieldValue` holds together a `Field` and its `Value`.
@@ -35,15 +31,16 @@ impl From<FieldValue> for Value {
    }
}

impl BinarySerializable for FieldValue {
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        self.field.serialize(writer)?;
        self.value.serialize(writer)
    }
/// A helper wrapper for creating standard iterators
/// out of the fields iterator trait.
pub struct FieldValueIter<'a>(pub(crate) std::slice::Iter<'a, FieldValue>);

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let field = Field::deserialize(reader)?;
        let value = Value::deserialize(reader)?;
        Ok(FieldValue { field, value })
impl<'a> Iterator for FieldValueIter<'a> {
    type Item = (Field, &'a Value);

    fn next(&mut self) -> Option<Self::Item> {
        self.0
            .next()
            .map(|field_value| (field_value.field, &field_value.value))
    }
}

@@ -106,7 +106,7 @@
//! let schema = schema_builder.build();
//! ```

mod document;
pub mod document;
mod facet;
mod facet_options;
mod schema;
@@ -134,7 +134,7 @@ pub use self::bytes_options::BytesOptions;
#[allow(deprecated)]
pub use self::date_time_options::DatePrecision;
pub use self::date_time_options::{DateOptions, DateTimePrecision, DATE_TIME_PRECISION_INDEXED};
pub use self::document::Document;
pub use self::document::{DocParsingError, DocValue, Document, TantivyDocument};
pub(crate) use self::facet::FACET_SEP_BYTE;
pub use self::facet::{Facet, FacetParseError};
pub use self::facet_options::FacetOptions;
@@ -150,7 +150,7 @@ pub use self::named_field_document::NamedFieldDocument;
#[allow(deprecated)]
pub use self::numeric_options::IntOptions;
pub use self::numeric_options::NumericOptions;
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
pub use self::schema::{Schema, SchemaBuilder};
pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;
|
||||
|
||||
@@ -1,16 +1,14 @@
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use serde::de::{SeqAccess, Visitor};
|
||||
use serde::ser::SerializeSeq;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde_json::{self, Value as JsonValue};
|
||||
|
||||
use super::ip_options::IpAddrOptions;
|
||||
use super::*;
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Tantivy has a very strict schema.
|
||||
@@ -317,78 +315,6 @@ impl Schema {
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))
|
||||
}
|
||||
|
||||
/// Create document from a named doc.
|
||||
pub fn convert_named_doc(
|
||||
&self,
|
||||
named_doc: NamedFieldDocument,
|
||||
) -> Result<Document, DocParsingError> {
|
||||
let mut document = Document::new();
|
||||
for (field_name, values) in named_doc.0 {
|
||||
if let Ok(field) = self.get_field(&field_name) {
|
||||
for value in values {
|
||||
document.add_field_value(field, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(document)
|
||||
}
|
||||
|
||||
/// Create a named document from the doc.
|
||||
pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
|
||||
let mut field_map = BTreeMap::new();
|
||||
for (field, field_values) in doc.get_sorted_field_values() {
|
||||
let field_name = self.get_field_name(field);
|
||||
let values: Vec<Value> = field_values.into_iter().cloned().collect();
|
||||
field_map.insert(field_name.to_string(), values);
|
||||
}
|
||||
NamedFieldDocument(field_map)
|
||||
}
|
||||
|
||||
/// Encode the schema in JSON.
|
||||
///
|
||||
/// Encoding a document cannot fail.
|
||||
pub fn to_json(&self, doc: &Document) -> String {
|
||||
serde_json::to_string(&self.to_named_doc(doc)).expect("doc encoding failed. This is a bug")
|
||||
}
|
||||
|
||||
/// Build a document object from a json-object.
|
||||
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
|
||||
let json_obj: serde_json::Map<String, JsonValue> =
|
||||
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
|
||||
self.json_object_to_doc(json_obj)
|
||||
}
|
||||
|
||||
/// Build a document object from a json-object.
|
||||
pub fn json_object_to_doc(
|
||||
&self,
|
||||
json_obj: serde_json::Map<String, JsonValue>,
|
||||
) -> Result<Document, DocParsingError> {
|
||||
let mut doc = Document::default();
|
||||
for (field_name, json_value) in json_obj {
|
||||
if let Ok(field) = self.get_field(&field_name) {
|
||||
let field_entry = self.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
match json_value {
|
||||
JsonValue::Array(json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = field_type
|
||||
.value_from_json(json_item)
|
||||
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
|
||||
doc.add_field_value(field, value);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let value = field_type
|
||||
.value_from_json(json_value)
|
||||
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
|
||||
doc.add_field_value(field, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(doc)
|
||||
}
|
||||
|
||||
/// Searches for a full_path in the schema, returning the field name and a JSON path.
|
||||
///
|
||||
/// This function works by checking if the field exists for the exact given full_path.
|
||||
@@ -478,26 +404,6 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may happen when deserializing
|
||||
/// a document from JSON.
|
||||
#[derive(Debug, Error, PartialEq)]
|
||||
pub enum DocParsingError {
|
||||
/// The payload given is not valid JSON.
|
||||
#[error("The provided string is not valid JSON")]
|
||||
InvalidJson(String),
|
||||
/// One of the value node could not be parsed.
|
||||
#[error("The field '{0:?}' could not be parsed: {1:?}")]
|
||||
ValueError(String, ValueParsingError),
|
||||
}
|
||||
|
||||
impl DocParsingError {
|
||||
/// Builds a NotJson DocParsingError
|
||||
fn invalid_json(invalid_json: &str) -> Self {
|
||||
let sample = invalid_json.chars().take(20).collect();
|
||||
DocParsingError::InvalidJson(sample)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {

@@ -507,6 +413,7 @@ mod tests {
use pretty_assertions::assert_eq;
use serde_json;

use crate::schema::document::DocValue;
use crate::schema::field_type::ValueParsingError;
use crate::schema::schema::DocParsingError::InvalidJson;
use crate::schema::*;
@@ -675,9 +582,9 @@ mod tests {
"ip": "127.0.0.1",
"is_read": true
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();

let doc_serdeser = schema.parse_document(&schema.to_json(&doc)).unwrap();
let doc_serdeser = TantivyDocument::parse_json(&schema, &doc.to_json(&schema)).unwrap();
assert_eq!(doc, doc_serdeser);
}

@@ -691,26 +598,26 @@ mod tests {
let doc_json = r#"{
"ip": "127.0.0.1"
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap();
assert_eq!(value["ip"][0], "127.0.0.1");

// Special case IpV6 loopback. We don't want to map that to IPv4
let doc_json = r#"{
"ip": "::1"
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();

let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap();
assert_eq!(value["ip"][0], "::1");

// testing ip address of every router in the world
let doc_json = r#"{
"ip": "192.168.0.1"
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();

let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap();
assert_eq!(value["ip"][0], "192.168.0.1");
}

@@ -729,9 +636,8 @@ mod tests {
"val".to_string(),
vec![Value::from(14u64), Value::from(-1i64)],
);
let doc = schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap();
let doc =
TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();
assert_eq!(
doc.get_all(title).collect::<Vec<_>>(),
vec![
@@ -753,9 +659,7 @@ mod tests {
"title".to_string(),
vec![Value::from("title1"), Value::from("title2")],
);
schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap();
TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();
}
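As the two hunks above show, convert_named_doc moves from a method on Schema to an associated function on TantivyDocument. A hedged sketch of the new call shape (assumes a schema with a text field named "title" is in scope):

use std::collections::BTreeMap;

let mut named_doc_map = BTreeMap::new();
named_doc_map.insert("title".to_string(), vec![Value::from("my title")]);
// New call shape: the schema is passed explicitly.
let doc = TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();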

#[test]
@@ -771,27 +675,27 @@ mod tests {
let score_field = schema_builder.add_f64_field("score", score_options);
let schema = schema_builder.build();
{
let doc = schema.parse_document("{}").unwrap();
let doc = TantivyDocument::parse_json(&schema, "{}").unwrap();
assert!(doc.field_values().is_empty());
}
{
let doc = schema
.parse_document(
r#"{
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10,
"score": 80.5
}"#,
)
.unwrap();
)
.unwrap();
assert_eq!(
doc.get_first(title_field).unwrap().as_text(),
doc.get_first(title_field).unwrap().as_str(),
Some("my title")
);
assert_eq!(
doc.get_first(author_field).unwrap().as_text(),
doc.get_first(author_field).unwrap().as_str(),
Some("fulmicoton")
);
assert_eq!(doc.get_first(count_field).unwrap().as_u64(), Some(4));
@@ -799,7 +703,8 @@ mod tests {
assert_eq!(doc.get_first(score_field).unwrap().as_f64(), Some(80.5f64));
}
{
let res = schema.parse_document(
let res = TantivyDocument::parse_json(
&schema,
r#"{
"thisfieldisnotdefinedintheschema": "my title",
"title": "my title",
@@ -813,7 +718,8 @@ mod tests {
assert!(res.is_ok());
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -832,7 +738,8 @@ mod tests {
);
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -850,7 +757,8 @@ mod tests {
);
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -868,7 +776,8 @@ mod tests {
));
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",
@@ -887,11 +796,12 @@ mod tests {
}
{
// Short JSON, under the 20 char take.
let json_err = schema.parse_document(r#"{"count": 50,}"#);
let json_err = TantivyDocument::parse_json(&schema, r#"{"count": 50,}"#);
assert_matches!(json_err, Err(InvalidJson(_)));
}
{
let json_err = schema.parse_document(
let json_err = TantivyDocument::parse_json(
&schema,
r#"{
"title": "my title",
"author": "fulmicoton",

@@ -1,12 +1,17 @@
use std::collections::{btree_map, BTreeMap};
use std::fmt;
use std::net::Ipv6Addr;

use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Map;
use serde::de::{MapAccess, SeqAccess};
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

use crate::schema::document::{
ArrayAccess, DeserializeError, DocValue, ObjectAccess, ReferenceValue, ValueDeserialize,
ValueDeserializer, ValueVisitor,
};
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
@@ -15,6 +20,8 @@ use crate::DateTime;
/// It is an enum over all of the possible field types.
#[derive(Debug, Clone, PartialEq)]
pub enum Value {
/// A null value.
Null,
/// The str type is used for any text information.
Str(String),
/// Pre-tokenized str type,
@@ -33,18 +40,127 @@ pub enum Value {
Facet(Facet),
/// Arbitrarily sized byte array
Bytes(Vec<u8>),
/// Json object value.
JsonObject(serde_json::Map<String, serde_json::Value>),
/// A set of values.
Array(Vec<Self>),
/// Dynamic object value.
Object(BTreeMap<String, Self>),
/// IpV6 Address. Internally there is no IpV4; IPv4 addresses need to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
}
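The two new variants make Value self-recursive, so nested JSON-like data can be represented without reaching for serde_json. A small sketch of building a nested value by hand:

use std::collections::BTreeMap;

let mut obj = BTreeMap::new();
obj.insert("count".to_string(), Value::U64(4));
// An array holding a string and a nested object, all as plain Values.
let nested = Value::Array(vec![Value::Str("a".to_string()), Value::Object(obj)]);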

impl<'a> DocValue<'a> for &'a Value {
type ChildValue = Self;
type ArrayIter = ArrayIter<'a>;
type ObjectIter = ObjectMapIter<'a>;

fn as_value(&self) -> ReferenceValue<'a, Self> {
match self {
Value::Null => ReferenceValue::Null,
Value::Str(val) => ReferenceValue::Str(val),
Value::PreTokStr(val) => ReferenceValue::PreTokStr(val),
Value::U64(val) => ReferenceValue::U64(*val),
Value::I64(val) => ReferenceValue::I64(*val),
Value::F64(val) => ReferenceValue::F64(*val),
Value::Bool(val) => ReferenceValue::Bool(*val),
Value::Date(val) => ReferenceValue::Date(*val),
Value::Facet(val) => ReferenceValue::Facet(val),
Value::Bytes(val) => ReferenceValue::Bytes(val),
Value::IpAddr(val) => ReferenceValue::IpAddr(*val),
Value::Array(array) => ReferenceValue::Array(ArrayIter(array.iter())),
Value::Object(object) => ReferenceValue::Object(ObjectMapIter(object.iter())),
}
}
}

impl ValueDeserialize for Value {
fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
where D: ValueDeserializer<'de> {
struct Visitor;

impl ValueVisitor for Visitor {
type Value = Value;

fn visit_null(&self) -> Result<Self::Value, DeserializeError> {
Ok(Value::Null)
}

fn visit_string(&self, val: String) -> Result<Self::Value, DeserializeError> {
Ok(Value::Str(val))
}

fn visit_u64(&self, val: u64) -> Result<Self::Value, DeserializeError> {
Ok(Value::U64(val))
}

fn visit_i64(&self, val: i64) -> Result<Self::Value, DeserializeError> {
Ok(Value::I64(val))
}

fn visit_f64(&self, val: f64) -> Result<Self::Value, DeserializeError> {
Ok(Value::F64(val))
}

fn visit_bool(&self, val: bool) -> Result<Self::Value, DeserializeError> {
Ok(Value::Bool(val))
}

fn visit_datetime(&self, val: DateTime) -> Result<Self::Value, DeserializeError> {
Ok(Value::Date(val))
}

fn visit_ip_address(&self, val: Ipv6Addr) -> Result<Self::Value, DeserializeError> {
Ok(Value::IpAddr(val))
}

fn visit_facet(&self, val: Facet) -> Result<Self::Value, DeserializeError> {
Ok(Value::Facet(val))
}

fn visit_bytes(&self, val: Vec<u8>) -> Result<Self::Value, DeserializeError> {
Ok(Value::Bytes(val))
}

fn visit_pre_tokenized_string(
&self,
val: PreTokenizedString,
) -> Result<Self::Value, DeserializeError> {
Ok(Value::PreTokStr(val))
}

fn visit_array<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ArrayAccess<'de> {
let mut elements = Vec::with_capacity(access.size_hint());

while let Some(value) = access.next_element()? {
elements.push(value);
}

Ok(Value::Array(elements))
}

fn visit_object<'de, A>(&self, mut access: A) -> Result<Self::Value, DeserializeError>
where A: ObjectAccess<'de> {
let mut elements = BTreeMap::new();

while let Some((key, value)) = access.next_entry()? {
elements.insert(key, value);
}

Ok(Value::Object(elements))
}
}

deserializer.deserialize_any(Visitor)
}
}

impl Eq for Value {}

impl Serialize for Value {
impl serde::Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
where S: serde::Serializer {
match *self {
Value::Null => serializer.serialize_unit(),
Value::Str(ref v) => serializer.serialize_str(v),
Value::PreTokStr(ref v) => v.serialize(serializer),
Value::U64(u) => serializer.serialize_u64(u),
@@ -54,31 +170,36 @@ impl Serialize for Value {
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)),
Value::JsonObject(ref obj) => obj.serialize(serializer),
Value::IpAddr(ref obj) => {
Value::Object(ref obj) => obj.serialize(serializer),
Value::IpAddr(ref ip_v6) => {
// Ensure IpV4 addresses get serialized as IpV4, but excluding IpV6 loopback.
if let Some(ip_v4) = obj.to_ipv4_mapped() {
if let Some(ip_v4) = ip_v6.to_ipv4_mapped() {
ip_v4.serialize(serializer)
} else {
obj.serialize(serializer)
ip_v6.serialize(serializer)
}
}
Value::Array(ref array) => array.serialize(serializer),
}
}
}
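One consequence of the to_ipv4_mapped branch above: an address stored in IPv4-mapped IPv6 form serializes back as dotted-quad IPv4, while genuine IPv6 addresses, including the ::1 loopback, keep IPv6 notation. A sketch under those assumptions:

use std::net::{Ipv4Addr, Ipv6Addr};

// IPv4-mapped address -> serialized as IPv4.
let mapped = Value::IpAddr(Ipv4Addr::new(192, 168, 0, 1).to_ipv6_mapped());
assert_eq!(serde_json::to_string(&mapped).unwrap(), r#""192.168.0.1""#);

// Plain IPv6 loopback is not IPv4-mapped -> stays IPv6.
let loopback = Value::IpAddr(Ipv6Addr::LOCALHOST);
assert_eq!(serde_json::to_string(&loopback).unwrap(), r#""::1""#);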

impl<'de> Deserialize<'de> for Value {
impl<'de> serde::Deserialize<'de> for Value {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> {
where D: serde::Deserializer<'de> {
struct ValueVisitor;

impl<'de> Visitor<'de> for ValueVisitor {
impl<'de> serde::de::Visitor<'de> for ValueVisitor {
type Value = Value;

fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
formatter.write_str("a string or u32")
}

fn visit_bool<E>(self, v: bool) -> Result<Self::Value, E> {
Ok(Value::Bool(v))
}

fn visit_i64<E>(self, v: i64) -> Result<Self::Value, E> {
Ok(Value::I64(v))
}
@@ -91,10 +212,6 @@ impl<'de> Deserialize<'de> for Value {
Ok(Value::F64(v))
}

fn visit_bool<E>(self, v: bool) -> Result<Self::Value, E> {
Ok(Value::Bool(v))
}

fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
Ok(Value::Str(v.to_owned()))
}
@@ -102,130 +219,39 @@ impl<'de> Deserialize<'de> for Value {
fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
Ok(Value::Str(v))
}

fn visit_unit<E>(self) -> Result<Self::Value, E>
where E: serde::de::Error {
Ok(Value::Null)
}

fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where A: SeqAccess<'de> {
let mut elements = Vec::with_capacity(seq.size_hint().unwrap_or_default());

while let Some(value) = seq.next_element()? {
elements.push(value);
}

Ok(Value::Array(elements))
}

fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
where A: MapAccess<'de> {
let mut object = BTreeMap::new();

while let Some((key, value)) = map.next_entry()? {
object.insert(key, value);
}

Ok(Value::Object(object))
}
}

deserializer.deserialize_any(ValueVisitor)
}
}

impl Value {
/// Returns the text value, provided the value is of the `Str` type.
/// (Returns `None` if the value is not of the `Str` type).
pub fn as_text(&self) -> Option<&str> {
if let Value::Str(text) = self {
Some(text)
} else {
None
}
}

/// Returns the facet value, provided the value is of the `Facet` type.
/// (Returns `None` if the value is not of the `Facet` type).
pub fn as_facet(&self) -> Option<&Facet> {
if let Value::Facet(facet) = self {
Some(facet)
} else {
None
}
}

/// Returns the tokenized text, provided the value is of the `PreTokStr` type.
/// (Returns `None` if the value is not of the `PreTokStr` type.)
pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
if let Value::PreTokStr(tokenized_text) = self {
Some(tokenized_text)
} else {
None
}
}

/// Returns the u64-value, provided the value is of the `U64` type.
/// (Returns `None` if the value is not of the `U64` type)
pub fn as_u64(&self) -> Option<u64> {
if let Value::U64(val) = self {
Some(*val)
} else {
None
}
}

/// Returns the i64-value, provided the value is of the `I64` type.
///
/// Returns `None` if the value is not of type `I64`.
pub fn as_i64(&self) -> Option<i64> {
if let Value::I64(val) = self {
Some(*val)
} else {
None
}
}

/// Returns the f64-value, provided the value is of the `F64` type.
///
/// Returns `None` if the value is not of type `F64`.
pub fn as_f64(&self) -> Option<f64> {
if let Value::F64(value) = self {
Some(*value)
} else {
None
}
}

/// Returns the bool value, provided the value is of the `Bool` type.
///
/// Returns `None` if the value is not of type `Bool`.
pub fn as_bool(&self) -> Option<bool> {
if let Value::Bool(value) = self {
Some(*value)
} else {
None
}
}

/// Returns the Date-value, provided the value is of the `Date` type.
///
/// Returns `None` if the value is not of type `Date`.
pub fn as_date(&self) -> Option<DateTime> {
if let Value::Date(date) = self {
Some(*date)
} else {
None
}
}

/// Returns the Bytes-value, provided the value is of the `Bytes` type.
///
/// Returns `None` if the value is not of type `Bytes`.
pub fn as_bytes(&self) -> Option<&[u8]> {
if let Value::Bytes(bytes) = self {
Some(bytes)
} else {
None
}
}

/// Returns the json object, provided the value is of the `JsonObject` type.
///
/// Returns `None` if the value is not of type `JsonObject`.
pub fn as_json(&self) -> Option<&Map<String, serde_json::Value>> {
if let Value::JsonObject(json) = self {
Some(json)
} else {
None
}
}

/// Returns the ip addr, provided the value is of the `Ip` type.
/// (Returns None if the value is not of the `Ip` type)
pub fn as_ip_addr(&self) -> Option<Ipv6Addr> {
if let Value::IpAddr(val) = self {
Some(*val)
} else {
None
}
}
}
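All of these as_* accessors are strict: they return Some only on an exact variant match and never coerce between numeric types. A short sketch:

let value = Value::U64(42);
assert_eq!(value.as_u64(), Some(42));
assert_eq!(value.as_i64(), None); // no cross-type coercion
assert_eq!(Value::Str("ok".to_string()).as_text(), Some("ok"));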

impl From<String> for Value {
fn from(s: String) -> Value {
Value::Str(s)
@@ -298,188 +324,93 @@ impl From<PreTokenizedString> for Value {
}
}

impl From<serde_json::Map<String, serde_json::Value>> for Value {
fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
Value::JsonObject(json_object)
impl From<BTreeMap<String, Value>> for Value {
fn from(object: BTreeMap<String, Value>) -> Value {
Value::Object(object)
}
}

fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if (b'0'..=b'9').contains(&first_byte) {
return true;
}
}

false
}

impl From<serde_json::Value> for Value {
fn from(json_value: serde_json::Value) -> Value {
match json_value {
serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
_ => {
panic!("Expected a json object.");
fn from(value: serde_json::Value) -> Self {
match value {
serde_json::Value::Null => Self::Null,
serde_json::Value::Bool(val) => Self::Bool(val),
serde_json::Value::Number(number) => {
if let Some(val) = number.as_i64() {
Self::I64(val)
} else if let Some(val) = number.as_u64() {
Self::U64(val)
} else if let Some(val) = number.as_f64() {
Self::F64(val)
} else {
panic!("Unsupported serde_json number {number}");
}
}
serde_json::Value::String(text) => {
if can_be_rfc3339_date_time(&text) {
match OffsetDateTime::parse(&text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(time::UtcOffset::UTC);
Self::Date(DateTime::from_utc(dt_utc))
}
Err(_) => Self::Str(text),
}
} else {
Self::Str(text)
}
}
serde_json::Value::Array(elements) => {
let converted_elements = elements.into_iter().map(Self::from).collect();
Self::Array(converted_elements)
}
serde_json::Value::Object(object) => Self::from(object),
}
}
}
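The conversion above is structure-preserving, with one subtlety: a JSON string whose first byte is an ASCII digit is first tried as an RFC 3339 timestamp and becomes Value::Date on success, falling back to Value::Str otherwise. A sketch using serde_json::json!:

// "born" parses as RFC 3339 -> Value::Date; "tags" -> Value::Array of Value::Str.
let value = Value::from(serde_json::json!({
    "name": "droopy",
    "born": "1943-03-27T00:00:00Z",
    "tags": ["dog", "cartoon"]
}));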

mod binary_serialize {
use std::io::{self, Read, Write};
use std::net::Ipv6Addr;
impl From<serde_json::Map<String, serde_json::Value>> for Value {
fn from(map: serde_json::Map<String, serde_json::Value>) -> Self {
let mut object = BTreeMap::new();

use columnar::MonotonicallyMappableToU128;
use common::{f64_to_u64, u64_to_f64, BinarySerializable};

use super::Value;
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;

const TEXT_CODE: u8 = 0;
const U64_CODE: u8 = 1;
const I64_CODE: u8 = 2;
const HIERARCHICAL_FACET_CODE: u8 = 3;
const BYTES_CODE: u8 = 4;
const DATE_CODE: u8 = 5;
const F64_CODE: u8 = 6;
const EXT_CODE: u8 = 7;
const JSON_OBJ_CODE: u8 = 8;
const BOOL_CODE: u8 = 9;
const IP_CODE: u8 = 10;

// extended types

const TOK_STR_CODE: u8 = 0;

impl BinarySerializable for Value {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
match *self {
Value::Str(ref text) => {
TEXT_CODE.serialize(writer)?;
text.serialize(writer)
}
Value::PreTokStr(ref tok_str) => {
EXT_CODE.serialize(writer)?;
TOK_STR_CODE.serialize(writer)?;
if let Ok(text) = serde_json::to_string(tok_str) {
text.serialize(writer)
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to dump Value::PreTokStr(_) to json.",
))
}
}
Value::U64(ref val) => {
U64_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::I64(ref val) => {
I64_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::F64(ref val) => {
F64_CODE.serialize(writer)?;
f64_to_u64(*val).serialize(writer)
}
Value::Bool(ref val) => {
BOOL_CODE.serialize(writer)?;
val.serialize(writer)
}
Value::Date(ref val) => {
DATE_CODE.serialize(writer)?;
let timestamp_micros = val.into_timestamp_micros();
timestamp_micros.serialize(writer)
}
Value::Facet(ref facet) => {
HIERARCHICAL_FACET_CODE.serialize(writer)?;
facet.serialize(writer)
}
Value::Bytes(ref bytes) => {
BYTES_CODE.serialize(writer)?;
bytes.serialize(writer)
}
Value::JsonObject(ref map) => {
JSON_OBJ_CODE.serialize(writer)?;
serde_json::to_writer(writer, &map)?;
Ok(())
}
Value::IpAddr(ref ip) => {
IP_CODE.serialize(writer)?;
ip.to_u128().serialize(writer)
}
}
for (key, value) in map {
object.insert(key, Value::from(value));
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let type_code = u8::deserialize(reader)?;
match type_code {
TEXT_CODE => {
let text = String::deserialize(reader)?;
Ok(Value::Str(text))
}
U64_CODE => {
let value = u64::deserialize(reader)?;
Ok(Value::U64(value))
}
I64_CODE => {
let value = i64::deserialize(reader)?;
Ok(Value::I64(value))
}
F64_CODE => {
let value = u64_to_f64(u64::deserialize(reader)?);
Ok(Value::F64(value))
}
BOOL_CODE => {
let value = bool::deserialize(reader)?;
Ok(Value::Bool(value))
}
DATE_CODE => {
let timestamp_micros = i64::deserialize(reader)?;
Ok(Value::Date(DateTime::from_timestamp_micros(
timestamp_micros,
)))
}
HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
EXT_CODE => {
let ext_type_code = u8::deserialize(reader)?;
match ext_type_code {
TOK_STR_CODE => {
let str_val = String::deserialize(reader)?;
if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
{
Ok(Value::PreTokStr(value))
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to parse string data as Value::PreTokStr(_).",
))
}
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"No extended field type is associated with code {ext_type_code:?}"
),
)),
}
}
JSON_OBJ_CODE => {
// As explained in
// https://docs.serde.rs/serde_json/fn.from_reader.html
//
// `T::from_reader(..)` expects EOF after reading the object,
// which is not what we want here.
//
// For this reason we need to create our own `Deserializer`.
let mut de = serde_json::Deserializer::from_reader(reader);
let json_map = <serde_json::Map::<String, serde_json::Value> as serde::Deserialize>::deserialize(&mut de)?;
Ok(Value::JsonObject(json_map))
}
IP_CODE => {
let value = u128::deserialize(reader)?;
Ok(Value::IpAddr(Ipv6Addr::from_u128(value)))
}
Value::Object(object)
}
}

_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("No field type is associated with code {type_code:?}"),
)),
}
}
/// A wrapper type for iterating over a serde_json array producing reference values.
pub struct ArrayIter<'a>(std::slice::Iter<'a, Value>);

impl<'a> Iterator for ArrayIter<'a> {
type Item = ReferenceValue<'a, &'a Value>;

fn next(&mut self) -> Option<Self::Item> {
let value = self.0.next()?;
Some(value.as_value())
}
}

/// A wrapper type for iterating over a serde_json object producing reference values.
pub struct ObjectMapIter<'a>(btree_map::Iter<'a, String, Value>);

impl<'a> Iterator for ObjectMapIter<'a> {
type Item = (&'a str, ReferenceValue<'a, &'a Value>);

fn next(&mut self) -> Option<Self::Item> {
let (key, value) = self.0.next()?;
Some((key.as_str(), value.as_value()))
}
}

@@ -489,7 +420,7 @@ mod tests {
use crate::schema::{BytesOptions, Schema};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, Document};
use crate::{DateTime, TantivyDocument};

#[test]
fn test_parse_bytes_doc() {
@@ -497,9 +428,9 @@ mod tests {
let bytes_options = BytesOptions::default();
let bytes_field = schema_builder.add_bytes_field("my_bytes", bytes_options);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_bytes(bytes_field, "this is a test".as_bytes());
let json_string = schema.to_json(&doc);
let json_string = doc.to_json(&schema);
assert_eq!(json_string, r#"{"my_bytes":["dGhpcyBpcyBhIHRlc3Q="]}"#);
}

@@ -509,9 +440,9 @@ mod tests {
let bytes_options = BytesOptions::default();
let bytes_field = schema_builder.add_bytes_field("my_bytes", bytes_options);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_bytes(bytes_field, "".as_bytes());
let json_string = schema.to_json(&doc);
let json_string = doc.to_json(&schema);
assert_eq!(json_string, r#"{"my_bytes":[""]}"#);
}

@@ -521,12 +452,12 @@ mod tests {
let bytes_options = BytesOptions::default();
let bytes_field = schema_builder.add_bytes_field("my_bytes", bytes_options);
let schema = schema_builder.build();
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_bytes(
bytes_field,
"A bigger test I guess\nspanning on multiple lines\nhoping this will work".as_bytes(),
);
let json_string = schema.to_json(&doc);
let json_string = doc.to_json(&schema);
assert_eq!(
json_string,
r#"{"my_bytes":["QSBiaWdnZXIgdGVzdCBJIGd1ZXNzCnNwYW5uaW5nIG9uIG11bHRpcGxlIGxpbmVzCmhvcGluZyB0aGlzIHdpbGwgd29yaw=="]}"#

@@ -5,9 +5,10 @@ use std::ops::Range;
use htmlescape::encode_minimal;

use crate::query::Query;
use crate::schema::{Field, Value};
use crate::schema::document::{DocValue, Document};
use crate::schema::Field;
use crate::tokenizer::{TextAnalyzer, Token};
use crate::{Document, Score, Searcher, Term};
use crate::{Score, Searcher, Term};

const DEFAULT_MAX_NUM_CHARS: usize = 150;

@@ -359,13 +360,21 @@ impl SnippetGenerator {
///
/// This method extracts the text associated with the `SnippetGenerator`'s field
/// and computes a snippet.
pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
let text: String = doc
.get_all(self.field)
.flat_map(Value::as_text)
.collect::<Vec<&str>>()
.join(" ");
self.snippet(&text)
pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Snippet {
let mut text = String::new();
for (field, value) in doc.iter_fields_and_values() {
let value = value as D::Value<'_>;
if field != self.field {
continue;
}

if let Some(val) = value.as_str() {
text.push(' ');
text.push_str(val);
}
}

self.snippet(text.trim())
}
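With the new generic bound, callers choose the concrete document type. A hedged usage sketch; the snippet_generator, searcher, and doc_address variables are assumed to be in scope:

// Fetch with an explicit document type, then build the snippet.
let doc: TantivyDocument = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("{}", snippet.to_html());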

/// Generates a snippet for the given text.

@@ -293,7 +293,7 @@ mod test {
use crate::core::Index;
use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT};
use crate::space_usage::PerFieldSpaceUsage;
use crate::Term;
use crate::{IndexWriter, Term};

#[test]
fn test_empty() {
@@ -447,7 +447,7 @@ mod test {
let index = Index::create_in_ram(schema);

{
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(name => 1u64))?;
index_writer.add_document(doc!(name => 2u64))?;
index_writer.add_document(doc!(name => 3u64))?;
@@ -456,7 +456,7 @@ mod test {
}

{
let mut index_writer2 = index.writer(50_000_000)?;
let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_u64(name, 2u64));
index_writer2.delete_term(Term::from_field_u64(name, 3u64));
// ok, now we should have a deleted doc

@@ -48,7 +48,7 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::schema::{SchemaBuilder, STORED, TEXT};
use crate::store::index::Checkpoint;
use crate::{DocAddress, DocId, Index, Term};
use crate::{DocAddress, DocId, Index, IndexWriter, TantivyDocument, Term};

#[test]
fn test_skip_index_empty() -> io::Result<()> {
@@ -129,7 +129,7 @@ mod tests {
let body = schema_builder.add_text_field("body", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let long_text: String = "abcdefghijklmnopqrstuvwxyz".repeat(1_000);
for _ in 0..20 {
@@ -149,7 +149,7 @@ mod tests {
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 30);
for i in 0..searcher.num_docs() as u32 {
let _doc = searcher.doc(DocAddress::new(0u32, i))?;
let _doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, i))?;
}
Ok(())
}

@@ -59,8 +59,11 @@ pub mod tests {
use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::AliveBitSet;
use crate::schema::{self, Document, Schema, TextFieldIndexing, TextOptions, STORED, TEXT};
use crate::{Index, Term};
use crate::schema::document::DocValue;
use crate::schema::{
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT,
};
use crate::{Index, IndexWriter, Term};

const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad \
@@ -88,7 +91,7 @@ pub mod tests {
let mut store_writer =
StoreWriter::new(writer, compressor, blocksize, separate_thread).unwrap();
for i in 0..num_docs {
let mut doc = Document::default();
let mut doc = TantivyDocument::default();
doc.add_field_value(field_body, LOREM.to_string());
doc.add_field_value(field_title, format!("Doc {i}"));
store_writer.store(&doc, &schema).unwrap();
@@ -117,18 +120,21 @@ pub mod tests {
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_text()
.as_str()
.unwrap(),
format!("Doc {i}")
);
}

for (_, doc) in store.iter(Some(&alive_bitset)).enumerate() {
for (_, doc) in store
.iter::<TantivyDocument>(Some(&alive_bitset))
.enumerate()
{
let doc = doc?;
let title_content = doc.get_first(field_title).unwrap().as_text().unwrap();
let title_content = doc.get_first(field_title).unwrap().as_str().unwrap();
if !title_content.starts_with("Doc ") {
panic!("unexpected title_content {title_content}");
}
@@ -162,17 +168,17 @@ pub mod tests {
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_text()
.as_str()
.unwrap(),
format!("Doc {i}")
);
}
for (i, doc) in store.iter(None).enumerate() {
for (i, doc) in store.iter::<TantivyDocument>(None).enumerate() {
assert_eq!(
*doc?.get_first(field_title).unwrap().as_text().unwrap(),
*doc?.get_first(field_title).unwrap().as_str().unwrap(),
format!("Doc {i}")
);
}
@@ -222,7 +228,7 @@ pub mod tests {
let index = index_builder.create_in_ram()?;

{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=> "deleteme"))?;
index_writer.add_document(doc!(text_field=> "deletemenot"))?;
index_writer.add_document(doc!(text_field=> "deleteme"))?;
@@ -236,9 +242,9 @@ pub mod tests {
let searcher = index.reader()?.searcher();
let reader = searcher.segment_reader(0);
let store = reader.get_store_reader(10)?;
for doc in store.iter(reader.alive_bitset()) {
for doc in store.iter::<TantivyDocument>(reader.alive_bitset()) {
assert_eq!(
*doc?.get_first(text_field).unwrap().as_text().unwrap(),
*doc?.get_first(text_field).unwrap().as_str().unwrap(),
"deletemenot".to_string()
);
}
@@ -258,7 +264,7 @@ pub mod tests {
let mut index = index_builder.create_in_ram().unwrap();
index.settings_mut().docstore_compression = Compressor::Lz4;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// put enough data to create enough blocks in the doc store to be considered for stacking
for _ in 0..200 {
index_writer.add_document(doc!(text_field=> LOREM))?;
@@ -284,7 +290,7 @@ pub mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -294,9 +300,12 @@ pub mod tests {
let reader = searcher.segment_readers().iter().last().unwrap();
let store = reader.get_store_reader(10).unwrap();

for doc in store.iter(reader.alive_bitset()).take(50) {
for doc in store
.iter::<TantivyDocument>(reader.alive_bitset())
.take(50)
{
assert_eq!(
*doc?.get_first(text_field).unwrap().as_text().unwrap(),
*doc?.get_first(text_field).and_then(|v| v.as_str()).unwrap(),
LOREM.to_string()
);
}
@@ -331,7 +340,7 @@ pub mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -355,6 +364,7 @@ mod bench {
use super::tests::write_lorem_ipsum_store;
use crate::directory::{Directory, RamDirectory};
use crate::store::{Compressor, StoreReader};
use crate::TantivyDocument;

#[bench]
#[cfg(feature = "mmap")]
@@ -386,6 +396,6 @@ mod bench {
);
let store_file = directory.open_read(path).unwrap();
let store = StoreReader::open(store_file, 10).unwrap();
b.iter(|| store.iter(None).collect::<Vec<_>>());
b.iter(|| store.iter::<TantivyDocument>(None).collect::<Vec<_>>());
}
}

@@ -14,7 +14,7 @@ use super::Decompressor;
use crate::directory::FileSlice;
use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::schema::Document;
use crate::schema::document::{BinaryDocumentDeserializer, Document};
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
@@ -198,9 +198,12 @@ impl StoreReader {
///
/// It should not be called to score documents
/// for instance.
pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
pub fn get<D: Document>(&self, doc_id: DocId) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes(doc_id)?;
Ok(Document::deserialize(&mut doc_bytes)?)

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}

/// Returns raw bytes of a given document.
@@ -232,13 +235,16 @@ impl StoreReader {
/// Iterator over all Documents in their order as they are stored in the doc store.
/// Use this if you want to extract all Documents from the doc store.
/// The `alive_bitset` has to be forwarded from the `SegmentReader` or the results may be wrong.
pub fn iter<'a: 'b, 'b>(
pub fn iter<'a: 'b, 'b, D: Document>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<Document>> + 'b {
) -> impl Iterator<Item = crate::Result<D>> + 'b {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
Ok(Document::deserialize(&mut doc_bytes)?)

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
})
}

@@ -364,9 +370,12 @@ impl StoreReader {
}

/// Fetches a document asynchronously. Async version of [`get`](Self::get).
pub async fn get_async(&self, doc_id: DocId) -> crate::Result<Document> {
pub async fn get_async<D: Document>(&self, doc_id: DocId) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes_async(doc_id).await?;
Ok(Document::deserialize(&mut doc_bytes)?)

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}
}
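Callers of the store now pick the concrete document type at the call site, via a type annotation or a turbofish. A sketch assuming a store_reader and doc_id in scope:

let doc: TantivyDocument = store_reader.get(doc_id)?;
// Or, on an async runtime:
let doc = store_reader.get_async::<TantivyDocument>(doc_id).await?;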

@@ -376,15 +385,16 @@ mod tests {

use super::*;
use crate::directory::RamDirectory;
use crate::schema::{Document, Field};
use crate::schema::document::DocValue;
use crate::schema::{Field, TantivyDocument};
use crate::store::tests::write_lorem_ipsum_store;
use crate::store::Compressor;
use crate::Directory;

const BLOCK_SIZE: usize = 16_384;

fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_text())
fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_str())
}

#[test]

@@ -5,7 +5,8 @@ use common::BinarySerializable;
use super::compressors::Compressor;
use super::StoreReader;
use crate::directory::WritePtr;
use crate::schema::{Document, Schema};
use crate::schema::document::{BinaryDocumentSerializer, Document};
use crate::schema::Schema;
use crate::store::store_compressor::BlockCompressor;
use crate::DocId;

@@ -95,9 +96,12 @@ impl StoreWriter {
///
/// The document id is implicitly the current number
/// of documents.
pub fn store(&mut self, document: &Document, schema: &Schema) -> io::Result<()> {
pub fn store<D: Document>(&mut self, document: &D, schema: &Schema) -> io::Result<()> {
self.doc_pos.push(self.current_block.len() as u32);
document.serialize_stored(schema, &mut self.current_block)?;

let mut serializer = BinaryDocumentSerializer::new(&mut self.current_block, schema);
serializer.serialize_doc(document)?;

self.num_docs_in_current_block += 1;
self.check_flush_block()?;
Ok(())

@@ -1,11 +1,13 @@
use std::cmp::Ordering;
use std::io;
use std::io::{Read, Write};

use serde::{Deserialize, Serialize};
use common::BinarySerializable;

use crate::tokenizer::{Token, TokenStream};

/// Struct representing pre-tokenized text
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Eq, PartialEq)]
pub struct PreTokenizedString {
/// Original text
pub text: String,
@@ -25,6 +27,32 @@ impl PartialOrd for PreTokenizedString {
}
}

impl BinarySerializable for PreTokenizedString {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
if let Ok(text) = serde_json::to_string(self) {
<String as BinarySerializable>::serialize(&text, writer)
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to dump PreTokenizedString to json.",
))
}
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let json_text = <String as BinarySerializable>::deserialize(reader)?;

if let Ok(value) = serde_json::from_str(&json_text) {
Ok(value)
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to parse string data as PreTokenizedString.",
))
}
}
}
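The encoding above is just the JSON form of the struct behind a length-prefixed string, so it round-trips losslessly. A sketch, assuming the BinarySerializable trait import shown in this diff and a token-less string for brevity:

use common::BinarySerializable;

// "tokens: vec![]" assumes the struct's token list field; empty for brevity.
let pre_tok = PreTokenizedString { text: "hello".to_string(), tokens: vec![] };
let mut buffer: Vec<u8> = Vec::new();
pre_tok.serialize(&mut buffer).unwrap();
let decoded = PreTokenizedString::deserialize(&mut &buffer[..]).unwrap();
assert_eq!(pre_tok, decoded);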

/// [`TokenStream`] implementation which wraps [`PreTokenizedString`]
pub struct PreTokenizedStream {
tokenized_string: PreTokenizedString,

@@ -2,7 +2,7 @@ use std::path::Path;

use tantivy::directory::{Directory, ManagedDirectory, RamDirectory, TerminatingWrite};
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};
use tantivy::{doc, Index, IndexWriter, Term};

#[test]
fn test_failpoints_managed_directory_gc_if_delete_fails() {
@@ -45,7 +45,7 @@ fn test_write_commit_fails() -> tantivy::Result<()> {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());

let mut index_writer = index.writer_with_num_threads(1, 15_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
@@ -75,7 +75,7 @@ fn test_fail_on_flush_segment() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_with_num_threads(1, 15_000_000)?;
let index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
fail::cfg("FieldSerializer::close_term", "return(simulatederror)").unwrap();
for i in 0..100_000 {
if index_writer
@@ -94,7 +94,7 @@ fn test_fail_on_flush_segment_but_one_worker_remains() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_with_num_threads(2, 30_000_000)?;
let index_writer: IndexWriter = index.writer_with_num_threads(2, 30_000_000)?;
fail::cfg("FieldSerializer::close_term", "1*return(simulatederror)").unwrap();
for i in 0..100_000 {
if index_writer
@@ -113,7 +113,7 @@ fn test_fail_on_commit_segment() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 15_000_000)?;
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
fail::cfg("FieldSerializer::close_term", "return(simulatederror)").unwrap();
for i in 0..10 {
index_writer