From db6cf65d5303bf9b876167ea25721dfc620efdbd Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Thu, 22 Dec 2022 17:52:53 +0100 Subject: [PATCH] make Document support Yoked inner values --- Cargo.toml | 1 + src/indexer/segment_writer.rs | 46 +++++++++++------- src/schema/document.rs | 91 ++++++++++++++++++++++++++++------- src/schema/schema.rs | 2 +- 4 files changed, 104 insertions(+), 36 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7f285446e..08e6bed22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,7 @@ measure_time = "0.8.2" ciborium = { version = "0.2", optional = true} async-trait = "0.1.53" arc-swap = "1.5.0" +yoke = "0.6.2" [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c22e9dc56..5633b1d79 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -158,7 +158,6 @@ impl SegmentWriter { let doc_id = self.max_doc; let vals_grouped_by_field = doc .field_values() - .iter() .sorted_by_key(|el| el.field()) .group_by(|el| el.field()); for (field, field_values) in &vals_grouped_by_field { @@ -502,9 +501,17 @@ mod tests { let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); let doc = reader.get(0).unwrap(); - assert_eq!(doc.field_values().len(), 2); - assert_eq!(doc.field_values()[0].value().as_text(), Some("A")); - assert_eq!(doc.field_values()[1].value().as_text(), Some("title")); + assert_eq!(doc.value_count(), 2); + let mut field_value_iter = doc.field_values(); + assert_eq!( + field_value_iter.next().unwrap().value().as_text(), + Some("A") + ); + assert_eq!( + field_value_iter.next().unwrap().value().as_text(), + Some("title") + ); + assert!(field_value_iter.next().is_none()); } #[test] @@ -833,20 +840,23 @@ mod tests { // This is a bit of a contrived example. let tokens = PreTokenizedString { text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life. - tokens: vec![Token { // Not the last token, yet ends after the last token. - offset_from: 0, - offset_to: 14, - position: 0, - text: "long_token".to_string(), - position_length: 3, - }, - Token { - offset_from: 0, - offset_to: 14, - position: 1, - text: "short".to_string(), - position_length: 1, - }], + tokens: vec![ + Token { + // Not the last token, yet ends after the last token. + offset_from: 0, + offset_to: 14, + position: 0, + text: "long_token".to_string(), + position_length: 3, + }, + Token { + offset_from: 0, + offset_to: 14, + position: 1, + text: "short".to_string(), + position_length: 1, + }, + ], }; doc.add_pre_tokenized_text(text, tokens); doc.add_text(text, "hello"); diff --git a/src/schema/document.rs b/src/schema/document.rs index 14f220d4d..322a1e37e 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,37 +1,86 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::io::{self, Read, Write}; -use std::mem; use std::net::Ipv6Addr; +use std::{fmt, mem}; use common::{BinarySerializable, VInt}; +use itertools::Either; +use yoke::erased::ErasedArcCart; +use yoke::Yoke; use super::*; use crate::tokenizer::PreTokenizedString; use crate::DateTime; +/// A group of FieldValue sharing an underlying storage +/// +/// Or a single owned FieldValue. +#[derive(Clone)] +enum FieldValueGroup { + Single(FieldValue<'static>), + Group(Yoke>, ErasedArcCart>), +} + +impl FieldValueGroup { + fn iter(&self) -> impl Iterator { + match self { + FieldValueGroup::Single(field_value) => Either::Left(std::iter::once(field_value)), + FieldValueGroup::Group(field_values) => Either::Right(field_values.get().iter()), + } + } + + fn count(&self) -> usize { + match self { + FieldValueGroup::Single(_) => 1, + FieldValueGroup::Group(field_values) => field_values.get().len(), + } + } +} + +impl From>> for FieldValueGroup { + fn from(field_values: Vec>) -> FieldValueGroup { + FieldValueGroup::Group( + Yoke::new_always_owned(field_values) + .wrap_cart_in_arc() + .erase_arc_cart(), + ) + } +} + /// Tantivy's Document is the object that can /// be indexed and then searched for. /// /// Documents are fundamentally a collection of unordered couples `(field, value)`. /// In this list, one field may appear more than once. -#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)] -#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))] +#[derive(Clone, Default)] +// TODO bring back Ser/De and Debug +//#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)] +//#[serde(bound(deserialize = "'static: 'de, 'de: 'static"))] pub struct Document { - field_values: Vec>, + field_values: Vec, +} + +impl fmt::Debug for Document { + fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result { + todo!() + } } impl From>> for Document { fn from(field_values: Vec>) -> Self { + let field_values = vec![field_values.into()]; Document { field_values } } } impl PartialEq for Document { fn eq(&self, other: &Document) -> bool { // super slow, but only here for tests - let convert_to_comparable_map = |field_values: &[FieldValue]| { + let convert_to_comparable_map = |field_values| { let mut field_value_set: HashMap> = Default::default(); - for field_value in field_values.iter() { + for field_value in field_values { + // for some reason rustc fails to guess the type + let field_value: &FieldValue = field_value; let json_val = serde_json::to_string(field_value.value()).unwrap(); field_value_set .entry(field_value.field()) @@ -41,9 +90,9 @@ impl PartialEq for Document { field_value_set }; let self_field_values: HashMap> = - convert_to_comparable_map(&self.field_values); + convert_to_comparable_map(self.field_values()); let other_field_values: HashMap> = - convert_to_comparable_map(&other.field_values); + convert_to_comparable_map(other.field_values()); self_field_values.eq(&other_field_values) } } @@ -56,7 +105,8 @@ impl IntoIterator for Document { type IntoIter = std::vec::IntoIter>; fn into_iter(self) -> Self::IntoIter { - self.field_values.into_iter() + todo!() + // self.field_values.into_iter() } } @@ -143,12 +193,19 @@ impl Document { pub fn add_field_value>>(&mut self, field: Field, typed_val: T) { let value = typed_val.into(); let field_value = FieldValue { field, value }; - self.field_values.push(field_value); + self.field_values.push(FieldValueGroup::Single(field_value)); } /// field_values accessor - pub fn field_values(&self) -> &[FieldValue] { - &self.field_values + pub fn field_values(&self) -> impl Iterator { + self.field_values.iter().flat_map(|group| group.iter()) + } + + /// Return the total number of values + /// + /// More efficient than calling `self.field_values().count()` + pub fn value_count(&self) -> usize { + self.field_values.iter().map(|group| group.count()).sum() } /// Sort and groups the field_values by field. @@ -156,7 +213,7 @@ impl Document { /// The result of this method is not cached and is /// computed on the fly when this method is called. pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> { - let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect(); + let mut field_values: Vec<&FieldValue> = self.field_values().collect(); field_values.sort_by_key(|field_value| field_value.field()); let mut field_values_it = field_values.into_iter(); @@ -191,6 +248,7 @@ impl Document { pub fn get_all(&self, field: Field) -> impl Iterator { self.field_values .iter() + .flat_map(|group| group.iter()) .filter(move |field_value| field_value.field() == field) .map(FieldValue::value) } @@ -204,7 +262,6 @@ impl Document { pub fn serialize_stored(&self, schema: &Schema, writer: &mut W) -> io::Result<()> { let stored_field_values = || { self.field_values() - .iter() .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored()) }; let num_field_values = stored_field_values().count(); @@ -232,7 +289,7 @@ impl Document { impl BinarySerializable for Document { fn serialize(&self, writer: &mut W) -> io::Result<()> { let field_values = self.field_values(); - VInt(field_values.len() as u64).serialize(writer)?; + VInt(self.value_count() as u64).serialize(writer)?; for field_value in field_values { field_value.serialize(writer)?; } @@ -261,7 +318,7 @@ mod tests { let text_field = schema_builder.add_text_field("title", TEXT); let mut doc = Document::default(); doc.add_text(text_field, "My title"); - assert_eq!(doc.field_values().len(), 1); + assert_eq!(doc.value_count(), 1); } #[test] @@ -275,7 +332,7 @@ mod tests { .clone(), ); doc.add_text(Field::from_field_id(1), "hello"); - assert_eq!(doc.field_values().len(), 2); + assert_eq!(doc.value_count(), 2); let mut payload: Vec = Vec::new(); doc.serialize(&mut payload).unwrap(); assert_eq!(payload.len(), 26); diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 37722e7c4..0fe84f83d 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -711,7 +711,7 @@ mod tests { let schema = schema_builder.build(); { let doc = schema.parse_document("{}").unwrap(); - assert!(doc.field_values().is_empty()); + assert_eq!(doc.value_count(), 0); } { let doc = schema