make Document support Yoked inner values

This commit is contained in:
trinity-1686a
2022-12-22 17:52:53 +01:00
parent 654aa7f42c
commit db6cf65d53
4 changed files with 104 additions and 36 deletions

View File

@@ -61,6 +61,7 @@ measure_time = "0.8.2"
ciborium = { version = "0.2", optional = true}
async-trait = "0.1.53"
arc-swap = "1.5.0"
yoke = "0.6.2"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

View File

@@ -158,7 +158,6 @@ impl SegmentWriter {
let doc_id = self.max_doc;
let vals_grouped_by_field = doc
.field_values()
.iter()
.sorted_by_key(|el| el.field())
.group_by(|el| el.field());
for (field, field_values) in &vals_grouped_by_field {
@@ -502,9 +501,17 @@ mod tests {
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get(0).unwrap();
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
assert_eq!(doc.value_count(), 2);
let mut field_value_iter = doc.field_values();
assert_eq!(
field_value_iter.next().unwrap().value().as_text(),
Some("A")
);
assert_eq!(
field_value_iter.next().unwrap().value().as_text(),
Some("title")
);
assert!(field_value_iter.next().is_none());
}
#[test]
@@ -833,20 +840,23 @@ mod tests {
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
tokens: vec![Token { // Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
}],
tokens: vec![
Token {
// Not the last token, yet ends after the last token.
offset_from: 0,
offset_to: 14,
position: 0,
text: "long_token".to_string(),
position_length: 3,
},
Token {
offset_from: 0,
offset_to: 14,
position: 1,
text: "short".to_string(),
position_length: 1,
},
],
};
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");

View File

@@ -1,37 +1,86 @@
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::io::{self, Read, Write};
use std::mem;
use std::net::Ipv6Addr;
use std::{fmt, mem};
use common::{BinarySerializable, VInt};
use itertools::Either;
use yoke::erased::ErasedArcCart;
use yoke::Yoke;
use super::*;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
/// A group of FieldValue sharing an underlying storage
///
/// Or a single owned FieldValue.
///
/// `Document` stores a list of these groups and flattens them whenever its
/// values are iterated over or counted.
#[derive(Clone)]
enum FieldValueGroup {
/// Exactly one owned value.
Single(FieldValue<'static>),
/// Several values held together inside a `Yoke` whose cart type is erased
/// (`ErasedArcCart`).
Group(Yoke<Vec<FieldValue<'static>>, ErasedArcCart>),
}
impl FieldValueGroup {
    /// Iterates over the values of this group, borrowing them from `self`.
    ///
    /// A `Single` yields its one value; a `Group` yields every value of the
    /// yoked vector, in the vector's order.
    fn iter(&self) -> impl Iterator<Item = &FieldValue> {
        match self {
            Self::Group(yoked) => Either::Right(yoked.get().iter()),
            Self::Single(single) => Either::Left(std::iter::once(single)),
        }
    }

    /// Number of values in this group, obtained without iterating over them.
    fn count(&self) -> usize {
        match self {
            Self::Group(yoked) => yoked.get().len(),
            Self::Single(_) => 1,
        }
    }
}
/// Builds a `Group` out of a vector of owned values.
impl From<Vec<FieldValue<'static>>> for FieldValueGroup {
    fn from(field_values: Vec<FieldValue<'static>>) -> FieldValueGroup {
        // The vector is fully owned; it is wrapped in a yoke, and the yoke's
        // cart is put in an `Arc` and type-erased to match the variant's type.
        let owned = Yoke::new_always_owned(field_values);
        let erased = owned.wrap_cart_in_arc().erase_arc_cart();
        Self::Group(erased)
    }
}
/// Tantivy's Document is the object that can
/// be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))]
#[derive(Clone, Default)]
// TODO bring back Ser/De and Debug
//#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
//#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))]
pub struct Document {
field_values: Vec<FieldValue<'static>>,
field_values: Vec<FieldValueGroup>,
}
impl fmt::Debug for Document {
fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result {
todo!()
}
}
impl From<Vec<FieldValue<'static>>> for Document {
fn from(field_values: Vec<FieldValue<'static>>) -> Self {
let field_values = vec![field_values.into()];
Document { field_values }
}
}
impl PartialEq for Document {
fn eq(&self, other: &Document) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let convert_to_comparable_map = |field_values| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
for field_value in field_values {
// for some reason rustc fails to guess the type
let field_value: &FieldValue = field_value;
let json_val = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
@@ -41,9 +90,9 @@ impl PartialEq for Document {
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
convert_to_comparable_map(self.field_values());
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
convert_to_comparable_map(other.field_values());
self_field_values.eq(&other_field_values)
}
}
@@ -56,7 +105,8 @@ impl IntoIterator for Document {
type IntoIter = std::vec::IntoIter<FieldValue<'static>>;
fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
todo!()
// self.field_values.into_iter()
}
}
@@ -143,12 +193,19 @@ impl Document {
pub fn add_field_value<T: Into<Value<'static>>>(&mut self, field: Field, typed_val: T) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
self.field_values.push(field_value);
self.field_values.push(FieldValueGroup::Single(field_value));
}
/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
pub fn field_values(&self) -> impl Iterator<Item = &FieldValue> {
self.field_values.iter().flat_map(|group| group.iter())
}
/// Returns how many values this document holds in total.
///
/// More efficient than calling `self.field_values().count()`: each group
/// reports its own size instead of every value being visited.
pub fn value_count(&self) -> usize {
    self.field_values
        .iter()
        .fold(0, |total, group| total + group.count())
}
/// Sort and groups the field_values by field.
@@ -156,7 +213,7 @@ impl Document {
/// The result of this method is not cached and is
/// computed on the fly when this method is called.
pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> {
let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
let mut field_values: Vec<&FieldValue> = self.field_values().collect();
field_values.sort_by_key(|field_value| field_value.field());
let mut field_values_it = field_values.into_iter();
@@ -191,6 +248,7 @@ impl Document {
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &Value> {
    // Flatten the groups first, then keep only the values attached to `field`.
    let flattened = self.field_values.iter().flat_map(|group| group.iter());
    flattened
        .filter(move |candidate| candidate.field() == field)
        .map(FieldValue::value)
}
@@ -204,7 +262,6 @@ impl Document {
pub fn serialize_stored<W: Write>(&self, schema: &Schema, writer: &mut W) -> io::Result<()> {
let stored_field_values = || {
self.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
};
let num_field_values = stored_field_values().count();
@@ -232,7 +289,7 @@ impl Document {
impl BinarySerializable for Document {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let field_values = self.field_values();
VInt(field_values.len() as u64).serialize(writer)?;
VInt(self.value_count() as u64).serialize(writer)?;
for field_value in field_values {
field_value.serialize(writer)?;
}
@@ -261,7 +318,7 @@ mod tests {
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = Document::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
assert_eq!(doc.value_count(), 1);
}
#[test]
@@ -275,7 +332,7 @@ mod tests {
.clone(),
);
doc.add_text(Field::from_field_id(1), "hello");
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.value_count(), 2);
let mut payload: Vec<u8> = Vec::new();
doc.serialize(&mut payload).unwrap();
assert_eq!(payload.len(), 26);

View File

@@ -711,7 +711,7 @@ mod tests {
let schema = schema_builder.build();
{
let doc = schema.parse_document("{}").unwrap();
assert!(doc.field_values().is_empty());
assert_eq!(doc.value_count(), 0);
}
{
let doc = schema