Compare commits

...

6 Commits

Author           SHA1        Message                                                      Date
trinity-1686a    bcff3eb2d2  try with custom Cow<str>                                     2023-01-11 16:02:52 +01:00
trinity-1686a    85f2588875  implement add_borrowed_values on Document                    2022-12-23 16:16:22 +01:00
trinity-1686a    db6cf65d53  make Document support Yoked inner values                     2022-12-22 17:52:53 +01:00
trinity-1686a    654aa7f42c  allow Value to borrow                                        2022-12-22 15:43:13 +01:00
François Massot  951a898633  Update bench.                                                2022-10-30 14:12:07 +01:00
François Massot  003722d831  Add bench to reproduce performance drop on array of texts.  2022-10-29 02:54:07 +02:00
12 changed files with 100490 additions and 187 deletions

View File

@@ -61,6 +61,7 @@ measure_time = "0.8.2"
 ciborium = { version = "0.2", optional = true}
 async-trait = "0.1.53"
 arc-swap = "1.5.0"
+yoke = { version = "0.6.2", features = ["derive"] }

 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
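
The new yoke dependency is the basis for the Document changes below: a Yoke bundles a borrowed view together with the owned storage (the "cart") it borrows from, so the pair can move around as a single value. A minimal sketch of that pattern, with illustrative names that are not part of this diff:

    use std::sync::Arc;

    use yoke::Yoke;

    // Bundle a borrowed `&str` view with the `Arc<String>` it points into.
    // `attach_to_cart` hands the closure a reference bound to the cart's
    // lifetime; the resulting Yoke guarantees the cart outlives the view.
    fn first_word_yoke(storage: String) -> Yoke<&'static str, Arc<String>> {
        Yoke::attach_to_cart(Arc::new(storage), |s: &String| {
            s.split(' ').next().unwrap_or("")
        })
    }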

benches/hdfs_with_array.json (new file, 100,000 lines)

File diff suppressed because it is too large.

View File

@@ -1,116 +1,159 @@
 use criterion::{criterion_group, criterion_main, Criterion};
+use itertools::Itertools;
 use pprof::criterion::{Output, PProfProfiler};
-use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
-use tantivy::Index;
+use serde_json::{self, Value as JsonValue};
+use tantivy::directory::RamDirectory;
+use tantivy::schema::{
+    FieldValue, TextFieldIndexing, TextOptions, Value, INDEXED, STORED, STRING, TEXT,
+};
+use tantivy::{Document, Index, IndexBuilder};

 const HDFS_LOGS: &str = include_str!("hdfs.json");
-const NUM_REPEATS: usize = 2;
+const NUM_REPEATS: usize = 20;

 pub fn hdfs_index_benchmark(c: &mut Criterion) {
-    let schema = {
-        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
-        schema_builder.add_u64_field("timestamp", INDEXED);
-        schema_builder.add_text_field("body", TEXT);
-        schema_builder.add_text_field("severity", STRING);
-        schema_builder.build()
-    };
-    let schema_with_store = {
-        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
-        schema_builder.add_u64_field("timestamp", INDEXED | STORED);
-        schema_builder.add_text_field("body", TEXT | STORED);
-        schema_builder.add_text_field("severity", STRING | STORED);
-        schema_builder.build()
-    };
-    let dynamic_schema = {
-        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
-        schema_builder.add_json_field("json", TEXT);
-        schema_builder.build()
-    };
+    let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+    let text_indexing_options = TextFieldIndexing::default()
+        .set_tokenizer("default")
+        .set_fieldnorms(false)
+        .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
+    let mut text_options = TextOptions::default().set_indexing_options(text_indexing_options);
+    let text_field = schema_builder.add_text_field("body", text_options);
+    let schema = schema_builder.build();
+
+    // prepare doc
+    let mut documents_no_array = Vec::new();
+    let mut documents_with_array = Vec::new();
+    for doc_json in HDFS_LOGS.trim().split("\n") {
+        let json_obj: serde_json::Map<String, JsonValue> = serde_json::from_str(doc_json).unwrap();
+        let text = json_obj.get("body").unwrap().as_str().unwrap();
+        let mut doc_no_array = Document::new();
+        doc_no_array.add_text(text_field, text);
+        documents_no_array.push(doc_no_array);
+        let mut doc_with_array = Document::new();
+        doc_with_array.add_borrowed_values(text.to_owned(), |text| {
+            text.split(' ')
+                .map(|text| FieldValue::new(text_field, text.into()))
+                .collect()
+        });
+        documents_with_array.push(doc_with_array);
+    }

     let mut group = c.benchmark_group("index-hdfs");
     group.sample_size(20);
     group.bench_function("index-hdfs-no-commit", |b| {
         b.iter(|| {
-            let index = Index::create_in_ram(schema.clone());
-            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let ram_directory = RamDirectory::create();
+            let mut index_writer = IndexBuilder::new()
+                .schema(schema.clone())
+                .single_segment_index_writer(ram_directory, 100_000_000)
+                .unwrap();
             for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split("\n") {
-                    let doc = schema.parse_document(doc_json).unwrap();
+                let documents_cloned = documents_no_array.clone();
+                for doc in documents_cloned {
                     index_writer.add_document(doc).unwrap();
                 }
             }
         })
     });
-    group.bench_function("index-hdfs-with-commit", |b| {
+    group.bench_function("index-hdfs-with-array-no-commit", |b| {
         b.iter(|| {
-            let index = Index::create_in_ram(schema.clone());
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            let ram_directory = RamDirectory::create();
+            let mut index_writer = IndexBuilder::new()
+                .schema(schema.clone())
+                .single_segment_index_writer(ram_directory, 100_000_000)
+                .unwrap();
             for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split("\n") {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
-            }
-            index_writer.commit().unwrap();
-        })
-    });
-    group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
-        b.iter(|| {
-            let index = Index::create_in_ram(schema_with_store.clone());
-            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split("\n") {
-                    let doc = schema.parse_document(doc_json).unwrap();
+                let documents_with_array_cloned = documents_with_array.clone();
+                for doc in documents_with_array_cloned {
                     index_writer.add_document(doc).unwrap();
                 }
             }
         })
     });
-    group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
-        b.iter(|| {
-            let index = Index::create_in_ram(schema_with_store.clone());
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split("\n") {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
-            }
-            index_writer.commit().unwrap();
-        })
-    });
-    group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
-        b.iter(|| {
-            let index = Index::create_in_ram(dynamic_schema.clone());
-            let json_field = dynamic_schema.get_field("json").unwrap();
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split("\n") {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
-            }
-            index_writer.commit().unwrap();
-        })
-    });
-    group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
-        b.iter(|| {
-            let index = Index::create_in_ram(dynamic_schema.clone());
-            let json_field = dynamic_schema.get_field("json").unwrap();
-            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split("\n") {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
-            }
-            index_writer.commit().unwrap();
-        })
-    });
+    // group.bench_function("index-hdfs-with-commit", |b| {
+    //     b.iter(|| {
+    //         let ram_directory = RamDirectory::create();
+    //         let mut index_writer = IndexBuilder::new()
+    //             .schema(schema.clone())
+    //             .single_segment_index_writer(ram_directory, 100_000_000)
+    //             .unwrap();
+    //         for _ in 0..NUM_REPEATS {
+    //             for doc_json in HDFS_LOGS.trim().split("\n") {
+    //                 let doc = schema.parse_document(doc_json).unwrap();
+    //                 index_writer.add_document(doc).unwrap();
+    //             }
+    //         }
+    //         index_writer.commit().unwrap();
+    //     })
+    // });
+    // group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
+    //     b.iter(|| {
+    //         let ram_directory = RamDirectory::create();
+    //         let mut index_writer = IndexBuilder::new()
+    //             .schema(schema.clone())
+    //             .single_segment_index_writer(ram_directory, 100_000_000)
+    //             .unwrap();
+    //         for _ in 0..NUM_REPEATS {
+    //             for doc_json in HDFS_LOGS.trim().split("\n") {
+    //                 let doc = schema.parse_document(doc_json).unwrap();
+    //                 index_writer.add_document(doc).unwrap();
+    //             }
+    //         }
+    //     })
+    // });
+    // group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
+    //     b.iter(|| {
+    //         let ram_directory = RamDirectory::create();
+    //         let mut index_writer = IndexBuilder::new()
+    //             .schema(schema.clone())
+    //             .single_segment_index_writer(ram_directory, 100_000_000)
+    //             .unwrap();
+    //         for _ in 0..NUM_REPEATS {
+    //             for doc_json in HDFS_LOGS.trim().split("\n") {
+    //                 let doc = schema.parse_document(doc_json).unwrap();
+    //                 index_writer.add_document(doc).unwrap();
+    //             }
+    //         }
+    //         index_writer.commit().unwrap();
+    //     })
+    // });
+    // group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
+    //     b.iter(|| {
+    //         let ram_directory = RamDirectory::create();
+    //         let mut index_writer = IndexBuilder::new()
+    //             .schema(schema.clone())
+    //             .single_segment_index_writer(ram_directory, 100_000_000)
+    //             .unwrap();
+    //         for _ in 0..NUM_REPEATS {
+    //             for doc_json in HDFS_LOGS.trim().split("\n") {
+    //                 let json_val: serde_json::Map<String, serde_json::Value> =
+    //                     serde_json::from_str(doc_json).unwrap();
+    //                 let doc = tantivy::doc!(json_field=>json_val);
+    //                 index_writer.add_document(doc).unwrap();
+    //             }
+    //         }
+    //         index_writer.commit().unwrap();
+    //     })
+    // });
+    // group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
+    //     b.iter(|| {
+    //         let ram_directory = RamDirectory::create();
+    //         let mut index_writer = IndexBuilder::new()
+    //             .schema(schema.clone())
+    //             .single_segment_index_writer(ram_directory, 100_000_000)
+    //             .unwrap();
+    //         for _ in 0..NUM_REPEATS {
+    //             for doc_json in HDFS_LOGS.trim().split("\n") {
+    //                 let json_val: serde_json::Map<String, serde_json::Value> =
+    //                     serde_json::from_str(doc_json).unwrap();
+    //                 let doc = tantivy::doc!(json_field=>json_val);
+    //                 index_writer.add_document(doc).unwrap();
+    //             }
+    //         }
+    //         index_writer.commit().unwrap();
+    //     })
+    //});
 }

 criterion_group! {

View File

@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::io::{Read, Write};
 use std::{fmt, io};
@@ -210,6 +211,23 @@ impl BinarySerializable for String {
     }
 }

+impl<'a> BinarySerializable for Cow<'a, str> {
+    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        let data: &[u8] = self.as_bytes();
+        VInt(data.len() as u64).serialize(writer)?;
+        writer.write_all(data)
+    }
+
+    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
+        let string_length = VInt::deserialize(reader)?.val() as usize;
+        let mut result = String::with_capacity(string_length);
+        reader
+            .take(string_length as u64)
+            .read_to_string(&mut result)?;
+        Ok(Cow::Owned(result))
+    }
+}
+
 #[cfg(test)]
 pub mod test {
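
A quick round-trip of the new impl (a sketch; assumes this crate's BinarySerializable and VInt are in scope, as in the file above):

    use std::borrow::Cow;

    fn cow_roundtrip() -> std::io::Result<()> {
        let value: Cow<str> = Cow::Borrowed("hello");
        let mut buffer: Vec<u8> = Vec::new();
        // same wire format as the existing `String` impl:
        // a VInt byte length followed by the raw UTF-8 bytes
        value.serialize(&mut buffer)?;
        let restored = Cow::<str>::deserialize(&mut buffer.as_slice())?;
        assert_eq!(restored, "hello"); // always comes back as Cow::Owned
        Ok(())
    }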

View File

@@ -158,7 +158,6 @@ impl SegmentWriter {
         let doc_id = self.max_doc;
         let vals_grouped_by_field = doc
             .field_values()
-            .iter()
             .sorted_by_key(|el| el.field())
             .group_by(|el| el.field());
         for (field, field_values) in &vals_grouped_by_field {
@@ -502,9 +501,17 @@ mod tests {
         let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
         let doc = reader.get(0).unwrap();

-        assert_eq!(doc.field_values().len(), 2);
-        assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
-        assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
+        assert_eq!(doc.value_count(), 2);
+        let mut field_value_iter = doc.field_values();
+        assert_eq!(
+            field_value_iter.next().unwrap().value().as_text(),
+            Some("A")
+        );
+        assert_eq!(
+            field_value_iter.next().unwrap().value().as_text(),
+            Some("title")
+        );
+        assert!(field_value_iter.next().is_none());
     }

     #[test]
@@ -833,20 +840,23 @@ mod tests {
         // This is a bit of a contrived example.
         let tokens = PreTokenizedString {
             text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
-            tokens: vec![Token { // Not the last token, yet ends after the last token.
-                offset_from: 0,
-                offset_to: 14,
-                position: 0,
-                text: "long_token".to_string(),
-                position_length: 3,
-            },
-            Token {
-                offset_from: 0,
-                offset_to: 14,
-                position: 1,
-                text: "short".to_string(),
-                position_length: 1,
-            }],
+            tokens: vec![
+                Token {
+                    // Not the last token, yet ends after the last token.
+                    offset_from: 0,
+                    offset_to: 14,
+                    position: 0,
+                    text: "long_token".to_string(),
+                    position_length: 3,
+                },
+                Token {
+                    offset_from: 0,
+                    offset_to: 14,
+                    position: 1,
+                    text: "short".to_string(),
+                    position_length: 1,
+                },
+            ],
         };
         doc.add_pre_tokenized_text(text, tokens);
         doc.add_text(text, "hello");

View File

@@ -31,7 +31,7 @@ pub struct MoreLikeThisQuery {
 #[derive(Debug, PartialEq, Clone)]
 enum TargetDocument {
     DocumentAdress(DocAddress),
-    DocumentFields(Vec<(Field, Vec<Value>)>),
+    DocumentFields(Vec<(Field, Vec<Value<'static>>)>),
 }

 impl MoreLikeThisQuery {
@@ -160,7 +160,10 @@ impl MoreLikeThisQueryBuilder {
     /// that will be used to compose the resulting query.
     /// This interface is meant to be used when you want to provide your own set of fields
     /// not necessarily from a specific document.
-    pub fn with_document_fields(self, doc_fields: Vec<(Field, Vec<Value>)>) -> MoreLikeThisQuery {
+    pub fn with_document_fields(
+        self,
+        doc_fields: Vec<(Field, Vec<Value<'static>>)>,
+    ) -> MoreLikeThisQuery {
         MoreLikeThisQuery {
             mlt: self.mlt,
             target: TargetDocument::DocumentFields(doc_fields),
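
Owned values already satisfy the new Value<'static> bound, so existing call sites should keep working; roughly (a sketch, assuming MoreLikeThisQuery::builder() as the entry point):

    use tantivy::query::MoreLikeThisQuery;
    use tantivy::schema::{Field, Value};

    fn build_query() -> MoreLikeThisQuery {
        // owned strings produce Value<'static>, which satisfies the new bound
        let doc_fields: Vec<(Field, Vec<Value<'static>>)> = vec![(
            Field::from_field_id(0),
            vec![Value::from("some query text".to_string())],
        )];
        MoreLikeThisQuery::builder().with_document_fields(doc_fields)
    }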

View File

@@ -1,35 +1,105 @@
 use std::collections::{HashMap, HashSet};
 use std::io::{self, Read, Write};
-use std::mem;
 use std::net::Ipv6Addr;
+use std::sync::Arc;
+use std::{fmt, mem};

 use common::{BinarySerializable, VInt};
+use itertools::Either;
+use yoke::erased::ErasedArcCart;
+use yoke::Yoke;

 use super::*;
+use crate::schema::value::MaybeOwnedString;
 use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;

+/// A group of FieldValue sharing an underlying storage
+///
+/// Or a single owned FieldValue.
+#[derive(Clone)]
+enum FieldValueGroup {
+    Single(FieldValue<'static>),
+    Group(Yoke<VecFieldValue<'static>, ErasedArcCart>),
+}
+
+// this NewType is required to make it possible to yoke a vec with non 'static inner values.
+#[derive(yoke::Yokeable, Clone)]
+struct VecFieldValue<'a>(Vec<FieldValue<'a>>);
+
+impl<'a> std::ops::Deref for VecFieldValue<'a> {
+    type Target = Vec<FieldValue<'a>>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<'a> From<Vec<FieldValue<'a>>> for VecFieldValue<'a> {
+    fn from(field_values: Vec<FieldValue>) -> VecFieldValue {
+        VecFieldValue(field_values)
+    }
+}
+
+impl FieldValueGroup {
+    fn iter(&self) -> impl Iterator<Item = &FieldValue> {
+        match self {
+            FieldValueGroup::Single(field_value) => Either::Left(std::iter::once(field_value)),
+            FieldValueGroup::Group(field_values) => Either::Right(field_values.get().iter()),
+        }
+    }
+
+    fn count(&self) -> usize {
+        match self {
+            FieldValueGroup::Single(_) => 1,
+            FieldValueGroup::Group(field_values) => field_values.get().len(),
+        }
+    }
+}
+
+impl From<Vec<FieldValue<'static>>> for FieldValueGroup {
+    fn from(field_values: Vec<FieldValue<'static>>) -> FieldValueGroup {
+        FieldValueGroup::Group(
+            Yoke::new_always_owned(field_values.into())
+                .wrap_cart_in_arc()
+                .erase_arc_cart(),
+        )
+    }
+}
+
 /// Tantivy's Document is the object that can
 /// be indexed and then searched for.
 ///
 /// Documents are fundamentally a collection of unordered couples `(field, value)`.
 /// In this list, one field may appear more than once.
-#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
+#[derive(Clone, Default)]
+// TODO bring back Ser/De and Debug
+//#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
+//#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))]
 pub struct Document {
-    field_values: Vec<FieldValue>,
+    field_values: Vec<FieldValueGroup>,
 }

-impl From<Vec<FieldValue>> for Document {
-    fn from(field_values: Vec<FieldValue>) -> Self {
+impl fmt::Debug for Document {
+    fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result {
+        todo!()
+    }
+}
+
+impl From<Vec<FieldValue<'static>>> for Document {
+    fn from(field_values: Vec<FieldValue<'static>>) -> Self {
+        let field_values = vec![field_values.into()];
         Document { field_values }
     }
 }

 impl PartialEq for Document {
     fn eq(&self, other: &Document) -> bool {
         // super slow, but only here for tests
-        let convert_to_comparable_map = |field_values: &[FieldValue]| {
+        let convert_to_comparable_map = |field_values| {
             let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
-            for field_value in field_values.iter() {
+            for field_value in field_values {
+                // for some reason rustc fails to guess the type
+                let field_value: &FieldValue = field_value;
                 let json_val = serde_json::to_string(field_value.value()).unwrap();
                 field_value_set
                     .entry(field_value.field())
@@ -39,9 +109,9 @@ impl PartialEq for Document {
             field_value_set
         };
         let self_field_values: HashMap<Field, HashSet<String>> =
-            convert_to_comparable_map(&self.field_values);
+            convert_to_comparable_map(self.field_values());
         let other_field_values: HashMap<Field, HashSet<String>> =
-            convert_to_comparable_map(&other.field_values);
+            convert_to_comparable_map(other.field_values());
         self_field_values.eq(&other_field_values)
     }
 }
@@ -49,12 +119,13 @@ impl PartialEq for Document {
 impl Eq for Document {}

 impl IntoIterator for Document {
-    type Item = FieldValue;
+    type Item = FieldValue<'static>;

-    type IntoIter = std::vec::IntoIter<FieldValue>;
+    type IntoIter = std::vec::IntoIter<FieldValue<'static>>;

     fn into_iter(self) -> Self::IntoIter {
-        self.field_values.into_iter()
+        todo!()
+        // self.field_values.into_iter()
     }
 }
@@ -84,7 +155,7 @@ impl Document {
     /// Add a text field.
     pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
-        let value = Value::Str(text.to_string());
+        let value = Value::Str(MaybeOwnedString::from_string(text.to_string()));
         self.add_field_value(field, value);
     }
@@ -138,15 +209,35 @@ impl Document {
     }

     /// Add a (field, value) to the document.
-    pub fn add_field_value<T: Into<Value>>(&mut self, field: Field, typed_val: T) {
+    pub fn add_field_value<T: Into<Value<'static>>>(&mut self, field: Field, typed_val: T) {
         let value = typed_val.into();
         let field_value = FieldValue { field, value };
-        self.field_values.push(field_value);
+        self.field_values.push(FieldValueGroup::Single(field_value));
+    }
+
+    /// Add multiple borrowed values, also taking the container they're borrowing from
+    // TODO add a try_ variant?
+    pub fn add_borrowed_values<T, F>(&mut self, storage: T, f: F)
+    where
+        T: Send + Sync + 'static,
+        F: FnOnce(&T) -> Vec<FieldValue>,
+    {
+        let yoke =
+            Yoke::attach_to_cart(Arc::new(storage), |storage| f(storage).into()).erase_arc_cart();
+        self.field_values.push(FieldValueGroup::Group(yoke));
     }

     /// field_values accessor
-    pub fn field_values(&self) -> &[FieldValue] {
-        &self.field_values
+    pub fn field_values(&self) -> impl Iterator<Item = &FieldValue> {
+        self.field_values.iter().flat_map(|group| group.iter())
+    }
+
+    /// Return the total number of values
+    ///
+    /// More efficient than calling `self.field_values().count()`
+    pub fn value_count(&self) -> usize {
+        self.field_values.iter().map(|group| group.count()).sum()
     }

     /// Sort and groups the field_values by field.
@@ -154,7 +245,7 @@ impl Document {
     /// The result of this method is not cached and is
     /// computed on the fly when this method is called.
     pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> {
-        let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
+        let mut field_values: Vec<&FieldValue> = self.field_values().collect();
         field_values.sort_by_key(|field_value| field_value.field());

         let mut field_values_it = field_values.into_iter();
@@ -189,6 +280,7 @@ impl Document {
     pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
         self.field_values
             .iter()
+            .flat_map(|group| group.iter())
             .filter(move |field_value| field_value.field() == field)
             .map(FieldValue::value)
     }
@@ -202,7 +294,6 @@ impl Document {
     pub fn serialize_stored<W: Write>(&self, schema: &Schema, writer: &mut W) -> io::Result<()> {
         let stored_field_values = || {
             self.field_values()
-                .iter()
                 .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
         };
         let num_field_values = stored_field_values().count();
@@ -216,7 +307,9 @@ impl Document {
                 } => {
                     let field_value = FieldValue {
                         field: *field,
-                        value: Value::Str(pre_tokenized_text.text.to_string()),
+                        value: Value::Str(MaybeOwnedString::from_string(
+                            pre_tokenized_text.text.to_string(),
+                        )),
                     };
                     field_value.serialize(writer)?;
                 }
@@ -230,7 +323,7 @@ impl Document {
 impl BinarySerializable for Document {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
         let field_values = self.field_values();
-        VInt(field_values.len() as u64).serialize(writer)?;
+        VInt(self.value_count() as u64).serialize(writer)?;
         for field_value in field_values {
             field_value.serialize(writer)?;
         }
@@ -259,7 +352,7 @@ mod tests {
         let text_field = schema_builder.add_text_field("title", TEXT);
         let mut doc = Document::default();
         doc.add_text(text_field, "My title");
-        assert_eq!(doc.field_values().len(), 1);
+        assert_eq!(doc.value_count(), 1);
     }

     #[test]
@@ -273,7 +366,7 @@ mod tests {
                 .clone(),
         );
         doc.add_text(Field::from_field_id(1), "hello");
-        assert_eq!(doc.field_values().len(), 2);
+        assert_eq!(doc.value_count(), 2);

         let mut payload: Vec<u8> = Vec::new();
         doc.serialize(&mut payload).unwrap();
         assert_eq!(payload.len(), 26);
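
Taken together, one owned buffer can now back many borrowed field values, which is exactly what the benchmark above measures. A condensed sketch of the API on this branch:

    use tantivy::schema::{FieldValue, SchemaBuilder, TEXT};
    use tantivy::Document;

    fn borrowed_doc() -> Document {
        let mut schema_builder = SchemaBuilder::new();
        let body = schema_builder.add_text_field("body", TEXT);

        let mut doc = Document::new();
        // the owned String is the storage; the FieldValues returned by the
        // closure borrow word slices from it, and the Yoke keeps both together
        doc.add_borrowed_values("a b c".to_string(), |text| {
            text.split(' ')
                .map(|word| FieldValue::new(body, word.into()))
                .collect()
        });
        assert_eq!(doc.value_count(), 3);
        assert_eq!(doc.field_values().count(), 3);
        doc
    }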

View File

@@ -9,6 +9,7 @@ use super::ip_options::IpAddrOptions;
 use super::{Cardinality, IntoIpv6Addr};
 use crate::schema::bytes_options::BytesOptions;
 use crate::schema::facet_options::FacetOptions;
+use crate::schema::value::MaybeOwnedString;
 use crate::schema::{
     DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing,
     TextOptions, Value,
@@ -329,7 +330,7 @@ impl FieldType {
     /// Tantivy will not try to cast values.
     /// For instance, If the json value is the integer `3` and the
     /// target field is a `Str`, this method will return an Error.
-    pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
+    pub fn value_from_json(&self, json: JsonValue) -> Result<Value<'static>, ValueParsingError> {
         match json {
             JsonValue::String(field_text) => {
                 match self {
@@ -341,7 +342,7 @@ impl FieldType {
                     })?;
                     Ok(DateTime::from_utc(dt_with_fixed_tz).into())
                 }
-                FieldType::Str(_) => Ok(Value::Str(field_text)),
+                FieldType::Str(_) => Ok(Value::Str(MaybeOwnedString::from_string(field_text))),
                 FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
                     Err(ValueParsingError::TypeError {
                         expected: "an integer",
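
For a text field, JSON parsing now yields an owned string value; roughly (a sketch):

    use tantivy::schema::{FieldType, TextOptions};

    fn parse_text_value() {
        let field_type = FieldType::Str(TextOptions::default());
        let value = field_type
            .value_from_json(serde_json::json!("hello"))
            .unwrap();
        // a Value<'static>: it owns its string and outlives the parsed JSON
        assert_eq!(value.as_text(), Some("hello"));
    }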

View File

@@ -7,12 +7,13 @@ use crate::schema::{Field, Value};
 /// `FieldValue` holds together a `Field` and its `Value`.
 #[allow(missing_docs)]
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-pub struct FieldValue {
+#[serde(bound(deserialize = "'a: 'de, 'de: 'a"))]
+pub struct FieldValue<'a> {
     pub field: Field,
-    pub value: Value,
+    pub value: Value<'a>,
 }

-impl FieldValue {
+impl<'a> FieldValue<'a> {
     /// Constructor
     pub fn new(field: Field, value: Value) -> FieldValue {
         FieldValue { field, value }
@@ -29,13 +30,13 @@ impl FieldValue {
     }
 }

-impl From<FieldValue> for Value {
-    fn from(field_value: FieldValue) -> Self {
+impl<'a> From<FieldValue<'a>> for Value<'a> {
+    fn from(field_value: FieldValue<'a>) -> Self {
         field_value.value
     }
 }

-impl BinarySerializable for FieldValue {
+impl<'a> BinarySerializable for FieldValue<'a> {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
         self.field.serialize(writer)?;
         self.value.serialize(writer)

View File

@@ -10,4 +10,5 @@ use crate::schema::Value;
 /// A `NamedFieldDocument` is a simple representation of a document
 /// as a `BTreeMap<String, Vec<Value>>`.
 #[derive(Debug, Deserialize, Serialize)]
-pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);
+#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))]
+pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value<'static>>>);

View File

@@ -308,7 +308,11 @@ impl Schema {
         let mut field_map = BTreeMap::new();
         for (field, field_values) in doc.get_sorted_field_values() {
             let field_name = self.get_field_name(field);
-            let values: Vec<Value> = field_values.into_iter().cloned().collect();
+            let values: Vec<Value> = field_values
+                .into_iter()
+                .cloned()
+                .map(Value::into_owned)
+                .collect();
             field_map.insert(field_name.to_string(), values);
         }
         NamedFieldDocument(field_map)
@@ -338,20 +342,21 @@ impl Schema {
             if let Some(field) = self.get_field(&field_name) {
                 let field_entry = self.get_field_entry(field);
                 let field_type = field_entry.field_type();
+                // TODO rewrite this with shared allocation?
                 match json_value {
                     JsonValue::Array(json_items) => {
                         for json_item in json_items {
                             let value = field_type
                                 .value_from_json(json_item)
                                 .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
-                            doc.add_field_value(field, value);
+                            doc.add_field_value(field, value.into_owned());
                         }
                     }
                     _ => {
                         let value = field_type
                             .value_from_json(json_value)
                             .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
-                        doc.add_field_value(field, value);
+                        doc.add_field_value(field, value.into_owned());
                     }
                 }
             }
@@ -706,7 +711,7 @@ mod tests {
         let schema = schema_builder.build();
         {
             let doc = schema.parse_document("{}").unwrap();
-            assert!(doc.field_values().is_empty());
+            assert_eq!(doc.value_count(), 0);
         }
         {
             let doc = schema

View File

@@ -1,6 +1,7 @@
 use std::fmt;
 use std::net::Ipv6Addr;

+pub use not_safe::MaybeOwnedString;
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_json::Map;
@@ -12,9 +13,9 @@ use crate::DateTime;
 /// Value represents the value of a any field.
 /// It is an enum over all over all of the possible field type.
 #[derive(Debug, Clone, PartialEq)]
-pub enum Value {
+pub enum Value<'a> {
     /// The str type is used for any text information.
-    Str(String),
+    Str(MaybeOwnedString<'a>),
     /// Pre-tokenized str type,
     PreTokStr(PreTokenizedString),
     /// Unsigned 64-bits Integer `u64`
@@ -30,16 +31,38 @@ pub enum Value {
     /// Facet
     Facet(Facet),
     /// Arbitrarily sized byte array
+    // TODO allow Cow<'a, [u8]>
     Bytes(Vec<u8>),
     /// Json object value.
+    // TODO allow Cow keys and borrowed values
     JsonObject(serde_json::Map<String, serde_json::Value>),
     /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
     IpAddr(Ipv6Addr),
 }

-impl Eq for Value {}
+impl<'a> Value<'a> {
+    /// Convert a borrowing [`Value`] to an owning one.
+    pub fn into_owned(self) -> Value<'static> {
+        use Value::*;
+        match self {
+            Str(val) => Str(MaybeOwnedString::from_string(val.into_string())),
+            PreTokStr(val) => PreTokStr(val),
+            U64(val) => U64(val),
+            I64(val) => I64(val),
+            F64(val) => F64(val),
+            Bool(val) => Bool(val),
+            Date(val) => Date(val),
+            Facet(val) => Facet(val),
+            Bytes(val) => Bytes(val),
+            JsonObject(val) => JsonObject(val),
+            IpAddr(val) => IpAddr(val),
+        }
+    }
+}
+
+impl<'a> Eq for Value<'a> {}

-impl Serialize for Value {
+impl<'a> Serialize for Value<'a> {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where S: Serializer {
         match *self {
@@ -65,13 +88,13 @@ impl Serialize for Value {
     }
 }

-impl<'de> Deserialize<'de> for Value {
+impl<'de> Deserialize<'de> for Value<'de> {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where D: Deserializer<'de> {
         struct ValueVisitor;

         impl<'de> Visitor<'de> for ValueVisitor {
-            type Value = Value;
+            type Value = Value<'de>;

             fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
                 formatter.write_str("a string or u32")
@@ -93,12 +116,13 @@ impl<'de> Deserialize<'de> for Value {
                 Ok(Value::Bool(v))
             }

+            // TODO add visit_borrowed_str
             fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
-                Ok(Value::Str(v.to_owned()))
+                Ok(Value::Str(MaybeOwnedString::from_string(v.to_owned())))
             }

             fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
-                Ok(Value::Str(v))
+                Ok(Value::Str(MaybeOwnedString::from_string(v)))
             }
         }
@@ -106,7 +130,7 @@ impl<'de> Deserialize<'de> for Value {
     }
 }

-impl Value {
+impl<'a> Value<'a> {
     /// Returns the text value, provided the value is of the `Str` type.
     /// (Returns `None` if the value is not of the `Str` type).
     pub fn as_text(&self) -> Option<&str> {
@@ -224,86 +248,87 @@ impl Value {
     }
 }

-impl From<String> for Value {
-    fn from(s: String) -> Value {
-        Value::Str(s)
+impl From<String> for Value<'static> {
+    fn from(s: String) -> Value<'static> {
+        Value::Str(MaybeOwnedString::from_string(s))
     }
 }

-impl From<Ipv6Addr> for Value {
-    fn from(v: Ipv6Addr) -> Value {
+impl From<Ipv6Addr> for Value<'static> {
+    fn from(v: Ipv6Addr) -> Value<'static> {
         Value::IpAddr(v)
     }
 }

-impl From<u64> for Value {
-    fn from(v: u64) -> Value {
+impl From<u64> for Value<'static> {
+    fn from(v: u64) -> Value<'static> {
         Value::U64(v)
     }
 }

-impl From<i64> for Value {
-    fn from(v: i64) -> Value {
+impl From<i64> for Value<'static> {
+    fn from(v: i64) -> Value<'static> {
         Value::I64(v)
     }
 }

-impl From<f64> for Value {
-    fn from(v: f64) -> Value {
+impl From<f64> for Value<'static> {
+    fn from(v: f64) -> Value<'static> {
         Value::F64(v)
     }
 }

-impl From<bool> for Value {
+impl From<bool> for Value<'static> {
     fn from(b: bool) -> Self {
         Value::Bool(b)
     }
 }

-impl From<DateTime> for Value {
-    fn from(dt: DateTime) -> Value {
+impl From<DateTime> for Value<'static> {
+    fn from(dt: DateTime) -> Value<'static> {
         Value::Date(dt)
     }
 }

-impl<'a> From<&'a str> for Value {
-    fn from(s: &'a str) -> Value {
-        Value::Str(s.to_string())
+impl<'a> From<&'a str> for Value<'a> {
+    fn from(s: &'a str) -> Value<'a> {
+        Value::Str(MaybeOwnedString::from_str(s))
     }
 }

-impl<'a> From<&'a [u8]> for Value {
-    fn from(bytes: &'a [u8]) -> Value {
+// TODO change lifetime to 'a
+impl<'a> From<&'a [u8]> for Value<'static> {
+    fn from(bytes: &'a [u8]) -> Value<'static> {
         Value::Bytes(bytes.to_vec())
     }
 }

-impl From<Facet> for Value {
-    fn from(facet: Facet) -> Value {
+impl From<Facet> for Value<'static> {
+    fn from(facet: Facet) -> Value<'static> {
         Value::Facet(facet)
     }
 }

-impl From<Vec<u8>> for Value {
-    fn from(bytes: Vec<u8>) -> Value {
+impl From<Vec<u8>> for Value<'static> {
+    fn from(bytes: Vec<u8>) -> Value<'static> {
         Value::Bytes(bytes)
     }
 }

-impl From<PreTokenizedString> for Value {
-    fn from(pretokenized_string: PreTokenizedString) -> Value {
+impl From<PreTokenizedString> for Value<'static> {
+    fn from(pretokenized_string: PreTokenizedString) -> Value<'static> {
         Value::PreTokStr(pretokenized_string)
     }
 }

-impl From<serde_json::Map<String, serde_json::Value>> for Value {
-    fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
+impl From<serde_json::Map<String, serde_json::Value>> for Value<'static> {
+    fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value<'static> {
         Value::JsonObject(json_object)
     }
 }

-impl From<serde_json::Value> for Value {
-    fn from(json_value: serde_json::Value) -> Value {
+impl From<serde_json::Value> for Value<'static> {
+    fn from(json_value: serde_json::Value) -> Value<'static> {
         match json_value {
             serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
             _ => {
@@ -320,7 +345,7 @@ mod binary_serialize {
     use common::{f64_to_u64, u64_to_f64, BinarySerializable};
     use fastfield_codecs::MonotonicallyMappableToU128;

-    use super::Value;
+    use super::{MaybeOwnedString, Value};
     use crate::schema::Facet;
     use crate::tokenizer::PreTokenizedString;
     use crate::DateTime;
@@ -341,12 +366,13 @@ mod binary_serialize {
     const TOK_STR_CODE: u8 = 0;

-    impl BinarySerializable for Value {
+    impl<'a> BinarySerializable for Value<'a> {
         fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
             match *self {
                 Value::Str(ref text) => {
                     TEXT_CODE.serialize(writer)?;
-                    text.serialize(writer)
+                    // TODO impl trait for MaybeOwnedString
+                    text.as_str().to_owned().serialize(writer)
                 }
                 Value::PreTokStr(ref tok_str) => {
                     EXT_CODE.serialize(writer)?;
@@ -408,7 +434,7 @@ mod binary_serialize {
             match type_code {
                 TEXT_CODE => {
                     let text = String::deserialize(reader)?;
-                    Ok(Value::Str(text))
+                    Ok(Value::Str(MaybeOwnedString::from_string(text)))
                 }
                 U64_CODE => {
                     let value = u64::deserialize(reader)?;
@@ -550,3 +576,104 @@ mod tests {
         assert_eq!(serialized_value_json, r#""1996-12-20T01:39:57Z""#);
     }
 }
+
+mod not_safe {
+    use std::ops::Deref;
+
+    union Ref<'a, T: ?Sized> {
+        shared: &'a T,
+        uniq: &'a mut T,
+    }
+
+    pub struct MaybeOwnedString<'a> {
+        string: Ref<'a, str>,
+        capacity: usize,
+    }
+
+    impl<'a> MaybeOwnedString<'a> {
+        pub fn from_str(string: &'a str) -> MaybeOwnedString<'a> {
+            MaybeOwnedString {
+                string: Ref { shared: string },
+                capacity: 0,
+            }
+        }
+
+        pub fn from_string(mut string: String) -> MaybeOwnedString<'static> {
+            string.shrink_to_fit(); // <= actually important for safety, todo use the Vec .as_ptr instead
+            let mut s = std::mem::ManuallyDrop::new(string);
+            let ptr = s.as_mut_ptr();
+            let len = s.len();
+            let capacity = s.capacity();
+            let string = unsafe {
+                std::str::from_utf8_unchecked_mut(std::slice::from_raw_parts_mut(ptr, len))
+            };
+            MaybeOwnedString {
+                string: Ref { uniq: string },
+                capacity,
+            }
+        }
+
+        pub fn into_string(mut self) -> String {
+            if self.capacity != 0 {
+                let string = unsafe { &mut self.string.uniq };
+                unsafe {
+                    return String::from_raw_parts(string.as_mut_ptr(), self.len(), self.capacity);
+                };
+            }
+            self.deref().to_owned()
+        }
+
+        pub fn as_str(&self) -> &str {
+            self.deref()
+        }
+    }
+
+    impl<'a> Deref for MaybeOwnedString<'a> {
+        type Target = str;
+
+        #[inline]
+        fn deref(&self) -> &str {
+            unsafe { self.string.shared }
+        }
+    }
+
+    impl<'a> Drop for MaybeOwnedString<'a> {
+        fn drop(&mut self) {
+            // if capacity is 0, either it's an empty String so there is no dealloc to do, or it's
+            // borrowed
+            if self.capacity != 0 {
+                let string = unsafe { &mut self.string.uniq };
+                unsafe { String::from_raw_parts(string.as_mut_ptr(), self.len(), self.capacity) };
+            }
+        }
+    }
+
+    impl<'a> Clone for MaybeOwnedString<'a> {
+        fn clone(&self) -> Self {
+            if self.capacity == 0 {
+                MaybeOwnedString {
+                    string: Ref {
+                        shared: unsafe { self.string.shared },
+                    },
+                    capacity: 0,
+                }
+            } else {
+                MaybeOwnedString::from_string(self.deref().to_owned())
+            }
+        }
+    }
+
+    impl<'a> std::fmt::Debug for MaybeOwnedString<'a> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            f.write_str(self.deref())
+        }
+    }
+
+    impl<'a> PartialEq for MaybeOwnedString<'a> {
+        fn eq(&self, other: &Self) -> bool {
+            self.deref() == other.deref()
+        }
+    }
+}
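
The invariant the union encodes: capacity == 0 means the string is borrowed (or is an empty owned string, so there is nothing to deallocate either way), while a non-zero capacity means the buffer is owned and Drop rebuilds the String to free it. A small sketch of the intended semantics:

    fn maybe_owned_demo() {
        // borrowed: capacity == 0, so Drop frees nothing
        let borrowed = MaybeOwnedString::from_str("hello");
        assert_eq!(borrowed.as_str(), "hello");

        // owned: the String's buffer is adopted without copying
        let owned = MaybeOwnedString::from_string(String::from("world"));
        assert_eq!(owned.as_str(), "world");

        // into_string is meant to hand the allocation back when owned
        let roundtripped: String = owned.into_string();
        assert_eq!(roundtripped, "world");
    }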
}