Compare commits

...

6 Commits

Author SHA1 Message Date
trinity-1686a
bcff3eb2d2 try with custom Cow<str> 2023-01-11 16:02:52 +01:00
trinity-1686a
85f2588875 implement add_borrowed_values on Document 2022-12-23 16:16:22 +01:00
trinity-1686a
db6cf65d53 make Document support Yoked inner values 2022-12-22 17:52:53 +01:00
trinity-1686a
654aa7f42c allow Value to borrow 2022-12-22 15:43:13 +01:00
François Massot
951a898633 Update bench. 2022-10-30 14:12:07 +01:00
François Massot
003722d831 Add bench to reproduce performance drop on array of texts. 2022-10-29 02:54:07 +02:00
12 changed files with 100490 additions and 187 deletions

View File

@@ -61,6 +61,7 @@ measure_time = "0.8.2"
ciborium = { version = "0.2", optional = true}
async-trait = "0.1.53"
arc-swap = "1.5.0"
yoke = { version = "0.6.2", features = ["derive"] }
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

100000
benches/hdfs_with_array.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,116 +1,159 @@
use criterion::{criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
use serde_json::{self, Value as JsonValue};
use tantivy::directory::RamDirectory;
use tantivy::schema::{
FieldValue, TextFieldIndexing, TextOptions, Value, INDEXED, STORED, STRING, TEXT,
};
use tantivy::{Document, Index, IndexBuilder};
const HDFS_LOGS: &str = include_str!("hdfs.json");
const NUM_REPEATS: usize = 2;
const NUM_REPEATS: usize = 20;
pub fn hdfs_index_benchmark(c: &mut Criterion) {
let schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED);
schema_builder.add_text_field("body", TEXT);
schema_builder.add_text_field("severity", STRING);
schema_builder.build()
};
let schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED | STORED);
schema_builder.add_text_field("body", TEXT | STORED);
schema_builder.add_text_field("severity", STRING | STORED);
schema_builder.build()
};
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT);
schema_builder.build()
};
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
let text_indexing_options = TextFieldIndexing::default()
.set_tokenizer("default")
.set_fieldnorms(false)
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
let mut text_options = TextOptions::default().set_indexing_options(text_indexing_options);
let text_field = schema_builder.add_text_field("body", text_options);
let schema = schema_builder.build();
// prepare doc
let mut documents_no_array = Vec::new();
let mut documents_with_array = Vec::new();
for doc_json in HDFS_LOGS.trim().split("\n") {
let json_obj: serde_json::Map<String, JsonValue> = serde_json::from_str(doc_json).unwrap();
let text = json_obj.get("body").unwrap().as_str().unwrap();
let mut doc_no_array = Document::new();
doc_no_array.add_text(text_field, text);
documents_no_array.push(doc_no_array);
let mut doc_with_array = Document::new();
doc_with_array.add_borrowed_values(text.to_owned(), |text| {
text.split(' ')
.map(|text| FieldValue::new(text_field, text.into()))
.collect()
});
documents_with_array.push(doc_with_array);
}
let mut group = c.benchmark_group("index-hdfs");
group.sample_size(20);
group.bench_function("index-hdfs-no-commit", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let ram_directory = RamDirectory::create();
let mut index_writer = IndexBuilder::new()
.schema(schema.clone())
.single_segment_index_writer(ram_directory, 100_000_000)
.unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
let documents_cloned = documents_no_array.clone();
for doc in documents_cloned {
index_writer.add_document(doc).unwrap();
}
}
})
});
group.bench_function("index-hdfs-with-commit", |b| {
group.bench_function("index-hdfs-with-array-no-commit", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
let ram_directory = RamDirectory::create();
let mut index_writer = IndexBuilder::new()
.schema(schema.clone())
.single_segment_index_writer(ram_directory, 100_000_000)
.unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
let documents_with_array_cloned = documents_with_array.clone();
for doc in documents_with_array_cloned {
index_writer.add_document(doc).unwrap();
}
}
})
});
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split("\n") {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
// group.bench_function("index-hdfs-with-commit", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let doc = schema.parse_document(doc_json).unwrap();
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
// });
// group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let doc = schema.parse_document(doc_json).unwrap();
// index_writer.add_document(doc).unwrap();
// }
// }
// })
// });
// group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let doc = schema.parse_document(doc_json).unwrap();
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
// });
// group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let json_val: serde_json::Map<String, serde_json::Value> =
// serde_json::from_str(doc_json).unwrap();
// let doc = tantivy::doc!(json_field=>json_val);
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
// });
// group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
// b.iter(|| {
// let ram_directory = RamDirectory::create();
// let mut index_writer = IndexBuilder::new()
// .schema(schema.clone())
// .single_segment_index_writer(ram_directory, 100_000_000)
// .unwrap();
// for _ in 0..NUM_REPEATS {
// for doc_json in HDFS_LOGS.trim().split("\n") {
// let json_val: serde_json::Map<String, serde_json::Value> =
// serde_json::from_str(doc_json).unwrap();
// let doc = tantivy::doc!(json_field=>json_val);
// index_writer.add_document(doc).unwrap();
// }
// }
// index_writer.commit().unwrap();
// })
//});
}
criterion_group! {

View File

@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::io::{Read, Write};
use std::{fmt, io};
@@ -210,6 +211,23 @@ impl BinarySerializable for String {
}
}
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
#[cfg(test)]
pub mod test {

View File

@@ -158,7 +158,6 @@ impl SegmentWriter {
let doc_id = self.max_doc;
let vals_grouped_by_field = doc
.field_values()
.iter()
.sorted_by_key(|el| el.field())
.group_by(|el| el.field());
for (field, field_values) in &vals_grouped_by_field {
@@ -502,9 +501,17 @@ mod tests {
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get(0).unwrap();
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
assert_eq!(doc.value_count(), 2);
let mut field_value_iter = doc.field_values();
assert_eq!(
field_value_iter.next().unwrap().value().as_text(),
Some("A")
);
assert_eq!(
field_value_iter.next().unwrap().value().as_text(),
Some("title")
);
assert!(field_value_iter.next().is_none());
}
#[test]
@@ -833,20 +840,23 @@ mod tests {
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
tokens: vec![Token { // Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
}],
tokens: vec![
Token {
// Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
},
],
};
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");

View File

@@ -31,7 +31,7 @@ pub struct MoreLikeThisQuery {
#[derive(Debug, PartialEq, Clone)]
enum TargetDocument {
DocumentAdress(DocAddress),
DocumentFields(Vec<(Field, Vec<Value>)>),
DocumentFields(Vec<(Field, Vec<Value<'static>>)>),
}
impl MoreLikeThisQuery {
@@ -160,7 +160,10 @@ impl MoreLikeThisQueryBuilder {
/// that will be used to compose the resulting query.
/// This interface is meant to be used when you want to provide your own set of fields
/// not necessarily from a specific document.
pub fn with_document_fields(self, doc_fields: Vec<(Field, Vec<Value>)>) -> MoreLikeThisQuery {
pub fn with_document_fields(
self,
doc_fields: Vec<(Field, Vec<Value<'static>>)>,
) -> MoreLikeThisQuery {
MoreLikeThisQuery {
mlt: self.mlt,
target: TargetDocument::DocumentFields(doc_fields),

View File

@@ -1,35 +1,105 @@
use std::collections::{HashMap, HashSet};
use std::io::{self, Read, Write};
use std::mem;
use std::net::Ipv6Addr;
use std::sync::Arc;
use std::{fmt, mem};
use common::{BinarySerializable, VInt};
use itertools::Either;
use yoke::erased::ErasedArcCart;
use yoke::Yoke;
use super::*;
use crate::schema::value::MaybeOwnedString;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
/// A group of FieldValue sharing an underlying storage
///
/// Or a single owned FieldValue.
#[derive(Clone)]
enum FieldValueGroup {
Single(FieldValue<'static>),
Group(Yoke<VecFieldValue<'static>, ErasedArcCart>),
}
// this NewType is required to make it possible to yoke a vec with non 'static inner values.
#[derive(yoke::Yokeable, Clone)]
struct VecFieldValue<'a>(Vec<FieldValue<'a>>);
impl<'a> std::ops::Deref for VecFieldValue<'a> {
type Target = Vec<FieldValue<'a>>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<'a> From<Vec<FieldValue<'a>>> for VecFieldValue<'a> {
fn from(field_values: Vec<FieldValue>) -> VecFieldValue {
VecFieldValue(field_values)
}
}
impl FieldValueGroup {
fn iter(&self) -> impl Iterator<Item = &FieldValue> {
match self {
FieldValueGroup::Single(field_value) => Either::Left(std::iter::once(field_value)),
FieldValueGroup::Group(field_values) => Either::Right(field_values.get().iter()),
}
}
fn count(&self) -> usize {
match self {
FieldValueGroup::Single(_) => 1,
FieldValueGroup::Group(field_values) => field_values.get().len(),
}
}
}
impl From<Vec<FieldValue<'static>>> for FieldValueGroup {
fn from(field_values: Vec<FieldValue<'static>>) -> FieldValueGroup {
FieldValueGroup::Group(
Yoke::new_always_owned(field_values.into())
.wrap_cart_in_arc()
.erase_arc_cart(),
)
}
}
/// Tantivy's Document is the object that can
/// be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
#[derive(Clone, Default)]
// TODO bring back Ser/De and Debug
//#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
//#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))]
pub struct Document {
field_values: Vec<FieldValue>,
field_values: Vec<FieldValueGroup>,
}
impl From<Vec<FieldValue>> for Document {
fn from(field_values: Vec<FieldValue>) -> Self {
impl fmt::Debug for Document {
fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result {
todo!()
}
}
impl From<Vec<FieldValue<'static>>> for Document {
fn from(field_values: Vec<FieldValue<'static>>) -> Self {
let field_values = vec![field_values.into()];
Document { field_values }
}
}
impl PartialEq for Document {
fn eq(&self, other: &Document) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let convert_to_comparable_map = |field_values| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
for field_value in field_values {
// for some reason rustc fails to guess the type
let field_value: &FieldValue = field_value;
let json_val = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
@@ -39,9 +109,9 @@ impl PartialEq for Document {
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
convert_to_comparable_map(self.field_values());
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
convert_to_comparable_map(other.field_values());
self_field_values.eq(&other_field_values)
}
}
@@ -49,12 +119,13 @@ impl PartialEq for Document {
impl Eq for Document {}
impl IntoIterator for Document {
type Item = FieldValue;
type Item = FieldValue<'static>;
type IntoIter = std::vec::IntoIter<FieldValue>;
type IntoIter = std::vec::IntoIter<FieldValue<'static>>;
fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
todo!()
// self.field_values.into_iter()
}
}
@@ -84,7 +155,7 @@ impl Document {
/// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = Value::Str(text.to_string());
let value = Value::Str(MaybeOwnedString::from_string(text.to_string()));
self.add_field_value(field, value);
}
@@ -138,15 +209,35 @@ impl Document {
}
/// Add a (field, value) to the document.
pub fn add_field_value<T: Into<Value>>(&mut self, field: Field, typed_val: T) {
pub fn add_field_value<T: Into<Value<'static>>>(&mut self, field: Field, typed_val: T) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
self.field_values.push(field_value);
self.field_values.push(FieldValueGroup::Single(field_value));
}
/// Add multiple borrowed values, also taking the container they're borrowing from
// TODO add a try_ variant?
pub fn add_borrowed_values<T, F>(&mut self, storage: T, f: F)
where
T: Send + Sync + 'static,
F: FnOnce(&T) -> Vec<FieldValue>,
{
let yoke =
Yoke::attach_to_cart(Arc::new(storage), |storage| f(storage).into()).erase_arc_cart();
self.field_values.push(FieldValueGroup::Group(yoke));
}
/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
pub fn field_values(&self) -> impl Iterator<Item = &FieldValue> {
self.field_values.iter().flat_map(|group| group.iter())
}
/// Return the total number of values
///
/// More efficient than calling `self.field_values().count()`
pub fn value_count(&self) -> usize {
self.field_values.iter().map(|group| group.count()).sum()
}
/// Sort and groups the field_values by field.
@@ -154,7 +245,7 @@ impl Document {
/// The result of this method is not cached and is
/// computed on the fly when this method is called.
pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> {
let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
let mut field_values: Vec<&FieldValue> = self.field_values().collect();
field_values.sort_by_key(|field_value| field_value.field());
let mut field_values_it = field_values.into_iter();
@@ -189,6 +280,7 @@ impl Document {
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
self.field_values
.iter()
.flat_map(|group| group.iter())
.filter(move |field_value| field_value.field() == field)
.map(FieldValue::value)
}
@@ -202,7 +294,6 @@ impl Document {
pub fn serialize_stored<W: Write>(&self, schema: &Schema, writer: &mut W) -> io::Result<()> {
let stored_field_values = || {
self.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
};
let num_field_values = stored_field_values().count();
@@ -216,7 +307,9 @@ impl Document {
} => {
let field_value = FieldValue {
field: *field,
value: Value::Str(pre_tokenized_text.text.to_string()),
value: Value::Str(MaybeOwnedString::from_string(
pre_tokenized_text.text.to_string(),
)),
};
field_value.serialize(writer)?;
}
@@ -230,7 +323,7 @@ impl Document {
impl BinarySerializable for Document {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let field_values = self.field_values();
VInt(field_values.len() as u64).serialize(writer)?;
VInt(self.value_count() as u64).serialize(writer)?;
for field_value in field_values {
field_value.serialize(writer)?;
}
@@ -259,7 +352,7 @@ mod tests {
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = Document::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
assert_eq!(doc.value_count(), 1);
}
#[test]
@@ -273,7 +366,7 @@ mod tests {
.clone(),
);
doc.add_text(Field::from_field_id(1), "hello");
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.value_count(), 2);
let mut payload: Vec<u8> = Vec::new();
doc.serialize(&mut payload).unwrap();
assert_eq!(payload.len(), 26);

View File

@@ -9,6 +9,7 @@ use super::ip_options::IpAddrOptions;
use super::{Cardinality, IntoIpv6Addr};
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::value::MaybeOwnedString;
use crate::schema::{
DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing,
TextOptions, Value,
@@ -329,7 +330,7 @@ impl FieldType {
/// Tantivy will not try to cast values.
/// For instance, If the json value is the integer `3` and the
/// target field is a `Str`, this method will return an Error.
pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
pub fn value_from_json(&self, json: JsonValue) -> Result<Value<'static>, ValueParsingError> {
match json {
JsonValue::String(field_text) => {
match self {
@@ -341,7 +342,7 @@ impl FieldType {
})?;
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
}
FieldType::Str(_) => Ok(Value::Str(field_text)),
FieldType::Str(_) => Ok(Value::Str(MaybeOwnedString::from_string(field_text))),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
Err(ValueParsingError::TypeError {
expected: "an integer",

View File

@@ -7,12 +7,13 @@ use crate::schema::{Field, Value};
/// `FieldValue` holds together a `Field` and its `Value`.
#[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct FieldValue {
#[serde(bound(deserialize = "'a: 'de, 'de: 'a"))]
pub struct FieldValue<'a> {
pub field: Field,
pub value: Value,
pub value: Value<'a>,
}
impl FieldValue {
impl<'a> FieldValue<'a> {
/// Constructor
pub fn new(field: Field, value: Value) -> FieldValue {
FieldValue { field, value }
@@ -29,13 +30,13 @@ impl FieldValue {
}
}
impl From<FieldValue> for Value {
fn from(field_value: FieldValue) -> Self {
impl<'a> From<FieldValue<'a>> for Value<'a> {
fn from(field_value: FieldValue<'a>) -> Self {
field_value.value
}
}
impl BinarySerializable for FieldValue {
impl<'a> BinarySerializable for FieldValue<'a> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.field.serialize(writer)?;
self.value.serialize(writer)

View File

@@ -10,4 +10,5 @@ use crate::schema::Value;
/// A `NamedFieldDocument` is a simple representation of a document
/// as a `BTreeMap<String, Vec<Value>>`.
#[derive(Debug, Deserialize, Serialize)]
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);
#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))]
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value<'static>>>);

View File

@@ -308,7 +308,11 @@ impl Schema {
let mut field_map = BTreeMap::new();
for (field, field_values) in doc.get_sorted_field_values() {
let field_name = self.get_field_name(field);
let values: Vec<Value> = field_values.into_iter().cloned().collect();
let values: Vec<Value> = field_values
.into_iter()
.cloned()
.map(Value::into_owned)
.collect();
field_map.insert(field_name.to_string(), values);
}
NamedFieldDocument(field_map)
@@ -338,20 +342,21 @@ impl Schema {
if let Some(field) = self.get_field(&field_name) {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
// TODO rewrite this with shared allocation?
match json_value {
JsonValue::Array(json_items) => {
for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
doc.add_field_value(field, value.into_owned());
}
}
_ => {
let value = field_type
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
doc.add_field_value(field, value.into_owned());
}
}
}
@@ -706,7 +711,7 @@ mod tests {
let schema = schema_builder.build();
{
let doc = schema.parse_document("{}").unwrap();
assert!(doc.field_values().is_empty());
assert_eq!(doc.value_count(), 0);
}
{
let doc = schema

View File

@@ -1,6 +1,7 @@
use std::fmt;
use std::net::Ipv6Addr;
pub use not_safe::MaybeOwnedString;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Map;
@@ -12,9 +13,9 @@ use crate::DateTime;
/// Value represents the value of a any field.
/// It is an enum over all over all of the possible field type.
#[derive(Debug, Clone, PartialEq)]
pub enum Value {
pub enum Value<'a> {
/// The str type is used for any text information.
Str(String),
Str(MaybeOwnedString<'a>),
/// Pre-tokenized str type,
PreTokStr(PreTokenizedString),
/// Unsigned 64-bits Integer `u64`
@@ -30,16 +31,38 @@ pub enum Value {
/// Facet
Facet(Facet),
/// Arbitrarily sized byte array
// TODO allow Cow<'a, [u8]>
Bytes(Vec<u8>),
/// Json object value.
// TODO allow Cow keys and borrowed values
JsonObject(serde_json::Map<String, serde_json::Value>),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
}
impl Eq for Value {}
impl<'a> Value<'a> {
/// Convert a borrowing [`Value`] to an owning one.
pub fn into_owned(self) -> Value<'static> {
use Value::*;
match self {
Str(val) => Str(MaybeOwnedString::from_string(val.into_string())),
PreTokStr(val) => PreTokStr(val),
U64(val) => U64(val),
I64(val) => I64(val),
F64(val) => F64(val),
Bool(val) => Bool(val),
Date(val) => Date(val),
Facet(val) => Facet(val),
Bytes(val) => Bytes(val),
JsonObject(val) => JsonObject(val),
IpAddr(val) => IpAddr(val),
}
}
}
impl Serialize for Value {
impl<'a> Eq for Value<'a> {}
impl<'a> Serialize for Value<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
match *self {
@@ -65,13 +88,13 @@ impl Serialize for Value {
}
}
impl<'de> Deserialize<'de> for Value {
impl<'de> Deserialize<'de> for Value<'de> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> {
struct ValueVisitor;
impl<'de> Visitor<'de> for ValueVisitor {
type Value = Value;
type Value = Value<'de>;
fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
formatter.write_str("a string or u32")
@@ -93,12 +116,13 @@ impl<'de> Deserialize<'de> for Value {
Ok(Value::Bool(v))
}
// TODO add visit_borrowed_str
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
Ok(Value::Str(v.to_owned()))
Ok(Value::Str(MaybeOwnedString::from_string(v.to_owned())))
}
fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
Ok(Value::Str(v))
Ok(Value::Str(MaybeOwnedString::from_string(v)))
}
}
@@ -106,7 +130,7 @@ impl<'de> Deserialize<'de> for Value {
}
}
impl Value {
impl<'a> Value<'a> {
/// Returns the text value, provided the value is of the `Str` type.
/// (Returns `None` if the value is not of the `Str` type).
pub fn as_text(&self) -> Option<&str> {
@@ -224,86 +248,87 @@ impl Value {
}
}
impl From<String> for Value {
fn from(s: String) -> Value {
Value::Str(s)
impl From<String> for Value<'static> {
fn from(s: String) -> Value<'static> {
Value::Str(MaybeOwnedString::from_string(s))
}
}
impl From<Ipv6Addr> for Value {
fn from(v: Ipv6Addr) -> Value {
impl From<Ipv6Addr> for Value<'static> {
fn from(v: Ipv6Addr) -> Value<'static> {
Value::IpAddr(v)
}
}
impl From<u64> for Value {
fn from(v: u64) -> Value {
impl From<u64> for Value<'static> {
fn from(v: u64) -> Value<'static> {
Value::U64(v)
}
}
impl From<i64> for Value {
fn from(v: i64) -> Value {
impl From<i64> for Value<'static> {
fn from(v: i64) -> Value<'static> {
Value::I64(v)
}
}
impl From<f64> for Value {
fn from(v: f64) -> Value {
impl From<f64> for Value<'static> {
fn from(v: f64) -> Value<'static> {
Value::F64(v)
}
}
impl From<bool> for Value {
impl From<bool> for Value<'static> {
fn from(b: bool) -> Self {
Value::Bool(b)
}
}
impl From<DateTime> for Value {
fn from(dt: DateTime) -> Value {
impl From<DateTime> for Value<'static> {
fn from(dt: DateTime) -> Value<'static> {
Value::Date(dt)
}
}
impl<'a> From<&'a str> for Value {
fn from(s: &'a str) -> Value {
Value::Str(s.to_string())
impl<'a> From<&'a str> for Value<'a> {
fn from(s: &'a str) -> Value<'a> {
Value::Str(MaybeOwnedString::from_str(s))
}
}
impl<'a> From<&'a [u8]> for Value {
fn from(bytes: &'a [u8]) -> Value {
// TODO change lifetime to 'a
impl<'a> From<&'a [u8]> for Value<'static> {
fn from(bytes: &'a [u8]) -> Value<'static> {
Value::Bytes(bytes.to_vec())
}
}
impl From<Facet> for Value {
fn from(facet: Facet) -> Value {
impl From<Facet> for Value<'static> {
fn from(facet: Facet) -> Value<'static> {
Value::Facet(facet)
}
}
impl From<Vec<u8>> for Value {
fn from(bytes: Vec<u8>) -> Value {
impl From<Vec<u8>> for Value<'static> {
fn from(bytes: Vec<u8>) -> Value<'static> {
Value::Bytes(bytes)
}
}
impl From<PreTokenizedString> for Value {
fn from(pretokenized_string: PreTokenizedString) -> Value {
impl From<PreTokenizedString> for Value<'static> {
fn from(pretokenized_string: PreTokenizedString) -> Value<'static> {
Value::PreTokStr(pretokenized_string)
}
}
impl From<serde_json::Map<String, serde_json::Value>> for Value {
fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
impl From<serde_json::Map<String, serde_json::Value>> for Value<'static> {
fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value<'static> {
Value::JsonObject(json_object)
}
}
impl From<serde_json::Value> for Value {
fn from(json_value: serde_json::Value) -> Value {
impl From<serde_json::Value> for Value<'static> {
fn from(json_value: serde_json::Value) -> Value<'static> {
match json_value {
serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
_ => {
@@ -320,7 +345,7 @@ mod binary_serialize {
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
use fastfield_codecs::MonotonicallyMappableToU128;
use super::Value;
use super::{MaybeOwnedString, Value};
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
@@ -341,12 +366,13 @@ mod binary_serialize {
const TOK_STR_CODE: u8 = 0;
impl BinarySerializable for Value {
impl<'a> BinarySerializable for Value<'a> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
match *self {
Value::Str(ref text) => {
TEXT_CODE.serialize(writer)?;
text.serialize(writer)
// TODO impl trait for MaybeOwnedString
text.as_str().to_owned().serialize(writer)
}
Value::PreTokStr(ref tok_str) => {
EXT_CODE.serialize(writer)?;
@@ -408,7 +434,7 @@ mod binary_serialize {
match type_code {
TEXT_CODE => {
let text = String::deserialize(reader)?;
Ok(Value::Str(text))
Ok(Value::Str(MaybeOwnedString::from_string(text)))
}
U64_CODE => {
let value = u64::deserialize(reader)?;
@@ -550,3 +576,104 @@ mod tests {
assert_eq!(serialized_value_json, r#""1996-12-20T01:39:57Z""#);
}
}
mod not_safe {
use std::ops::Deref;
union Ref<'a, T: ?Sized> {
shared: &'a T,
uniq: &'a mut T,
}
pub struct MaybeOwnedString<'a> {
string: Ref<'a, str>,
capacity: usize,
}
impl<'a> MaybeOwnedString<'a> {
pub fn from_str(string: &'a str) -> MaybeOwnedString<'a> {
MaybeOwnedString {
string: Ref { shared: string },
capacity: 0,
}
}
pub fn from_string(mut string: String) -> MaybeOwnedString<'static> {
string.shrink_to_fit(); // <= actually important for safety, todo use the Vec .as_ptr instead
let mut s = std::mem::ManuallyDrop::new(string);
let ptr = s.as_mut_ptr();
let len = s.len();
let capacity = s.capacity();
let string = unsafe {
std::str::from_utf8_unchecked_mut(std::slice::from_raw_parts_mut(ptr, len))
};
MaybeOwnedString {
string: Ref { uniq: string },
capacity,
}
}
pub fn into_string(mut self) -> String {
if self.capacity != 0 {
let string = unsafe { &mut self.string.uniq };
unsafe {
return String::from_raw_parts(string.as_mut_ptr(), self.len(), self.capacity);
};
}
self.deref().to_owned()
}
pub fn as_str(&self) -> &str {
self.deref()
}
}
impl<'a> Deref for MaybeOwnedString<'a> {
type Target = str;
#[inline]
fn deref(&self) -> &str {
unsafe { self.string.shared }
}
}
impl<'a> Drop for MaybeOwnedString<'a> {
fn drop(&mut self) {
// if capacity is 0, either it's an empty String so there is no dealloc to do, or it's
// borrowed
if self.capacity != 0 {
let string = unsafe { &mut self.string.uniq };
unsafe { String::from_raw_parts(string.as_mut_ptr(), self.len(), self.capacity) };
}
}
}
impl<'a> Clone for MaybeOwnedString<'a> {
fn clone(&self) -> Self {
if self.capacity == 0 {
MaybeOwnedString {
string: Ref {
shared: unsafe { self.string.shared },
},
capacity: 0,
}
} else {
MaybeOwnedString::from_string(self.deref().to_owned())
}
}
}
impl<'a> std::fmt::Debug for MaybeOwnedString<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.deref())
}
}
impl<'a> PartialEq for MaybeOwnedString<'a> {
fn eq(&self, other: &Self) -> bool {
self.deref() == other.deref()
}
}
}