diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 8c5fd919e..8be186a85 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -1082,7 +1082,7 @@ mod tests { #[test] fn test_fast_field_in_json_field_expand_dots_disabled() { let mut schema_builder = Schema::builder(); - let json_option = JsonObjectOptions::default().set_fast(); + let json_option = JsonObjectOptions::default().set_fast(None); let json = schema_builder.add_json_field("json", json_option); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -1105,11 +1105,36 @@ mod tests { assert_eq!(&vals, &[32]) } + #[test] + fn test_fast_field_in_json_field_with_tokenizer() { + let mut schema_builder = Schema::builder(); + let json_option = JsonObjectOptions::default().set_fast(Some("default")); + let json = schema_builder.add_json_field("json", json_option); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests().unwrap(); + index_writer + .add_document(doc!(json => json!({"age": 32}))) + .unwrap(); + index_writer + .add_document(doc!(json => json!({"age": "NEW"}))) + .unwrap(); + + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let fast_fields = searcher.segment_reader(0u32).fast_fields(); + + let ff_str = fast_fields.str("json.age").unwrap().unwrap(); + let mut output = String::new(); + ff_str.ord_to_str(0, &mut output).unwrap(); + assert_eq!(output, "new"); + } + #[test] fn test_fast_field_in_json_field_expand_dots_enabled() { let mut schema_builder = Schema::builder(); let json_option = JsonObjectOptions::default() - .set_fast() + .set_fast(None) .set_expand_dots_enabled(); let json = schema_builder.add_json_field("json", json_option); let schema = schema_builder.build(); @@ -1246,7 +1271,7 @@ mod tests { fn test_shadowing_fast_field_with_expand_dots() { let mut schema_builder = Schema::builder(); let json_option = JsonObjectOptions::default() - 
.set_fast() + .set_fast(None) .set_expand_dots_enabled(); let json_field = schema_builder.add_json_field("jsonfield", json_option.clone()); let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option); diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 45564170c..6d3cef5f7 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -346,7 +346,7 @@ mod tests { schema_builder.add_json_field( "json_expand_dots_enabled", JsonObjectOptions::default() - .set_fast() + .set_fast(None) .set_expand_dots_enabled(), ); let dynamic_field = schema_builder.add_json_field("_dyna", FAST); diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 8f23b1d12..a1a97bdb4 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -46,7 +46,7 @@ impl FastFieldsWriter { .take(schema.num_fields()) .collect(); let mut expand_dots = vec![false; schema.num_fields()]; - let mut per_field_tokenizer = vec![None; schema.num_fields()]; + let mut per_field_tokenizer: Vec> = vec![None; schema.num_fields()]; // TODO see other types for (field_id, field_entry) in schema.fields() { if !field_entry.field_type().is_fast() { @@ -58,6 +58,15 @@ impl FastFieldsWriter { date_precisions[field_id.field_id() as usize] = date_options.get_precision(); } if let FieldType::JsonObject(json_object_options) = field_entry.field_type() { + if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() { + let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| { + TantivyError::InvalidArgument(format!( + "Tokenizer {tokenizer_name:?} not found" + )) + })?; + per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer); + } + expand_dots[field_id.field_id() as usize] = json_object_options.is_expand_dots_enabled(); } @@ -137,10 +146,10 @@ impl FastFieldsWriter { ); } Value::Str(text_val) => { - if let Some(text_analyzer) = + if let Some(tokenizer) = 
&self.per_field_tokenizer[field_value.field().field_id() as usize] { - let mut token_stream = text_analyzer.token_stream(text_val); + let mut token_stream = tokenizer.token_stream(text_val); token_stream.process(&mut |token: &Token| { self.columnar_writer.record_str( doc_id, @@ -191,6 +200,10 @@ impl FastFieldsWriter { let expand_dots = self.expand_dots[field_value.field().field_id() as usize]; self.json_path_buffer.clear(); self.json_path_buffer.push_str(field_name); + + let text_analyzer = + &self.per_field_tokenizer[field_value.field().field_id() as usize]; + record_json_obj_to_columnar_writer( doc_id, json_obj, @@ -198,6 +211,7 @@ impl FastFieldsWriter { JSON_DEPTH_LIMIT, &mut self.json_path_buffer, &mut self.columnar_writer, + text_analyzer, ); } Value::IpAddr(ip_addr) => { @@ -249,6 +263,7 @@ fn record_json_obj_to_columnar_writer( remaining_depth_limit: usize, json_path_buffer: &mut String, columnar_writer: &mut columnar::ColumnarWriter, + tokenizer: &Option, ) { for (key, child) in json_obj { let len_path = json_path_buffer.len(); @@ -273,6 +288,7 @@ fn record_json_obj_to_columnar_writer( remaining_depth_limit, json_path_buffer, columnar_writer, + tokenizer, ); // popping our sub path. 
json_path_buffer.truncate(len_path); @@ -286,6 +302,7 @@ fn record_json_value_to_columnar_writer( mut remaining_depth_limit: usize, json_path_writer: &mut String, columnar_writer: &mut columnar::ColumnarWriter, + tokenizer: &Option, ) { if remaining_depth_limit == 0 { return; @@ -304,7 +321,14 @@ fn record_json_value_to_columnar_writer( } } serde_json::Value::String(text) => { - columnar_writer.record_str(doc, json_path_writer.as_str(), text); + if let Some(text_analyzer) = tokenizer { + let mut token_stream = text_analyzer.token_stream(text); + token_stream.process(&mut |token| { + columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text); + }) + } else { + columnar_writer.record_str(doc, json_path_writer.as_str(), text); + } } serde_json::Value::Array(arr) => { for el in arr { @@ -315,6 +339,7 @@ fn record_json_value_to_columnar_writer( remaining_depth_limit, json_path_writer, columnar_writer, + tokenizer, ); } } @@ -326,6 +351,7 @@ fn record_json_value_to_columnar_writer( remaining_depth_limit, json_path_writer, columnar_writer, + tokenizer, ); } } @@ -353,6 +379,7 @@ mod tests { JSON_DEPTH_LIMIT, &mut json_path, &mut columnar_writer, + &None, ); } let mut buffer = Vec::new(); diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs index 1f7653cfb..eee3618a8 100644 --- a/src/schema/json_object_options.rs +++ b/src/schema/json_object_options.rs @@ -2,19 +2,20 @@ use std::ops::BitOr; use serde::{Deserialize, Serialize}; +use super::text_options::{FastFieldTextOptions, TokenizerName}; use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag}; use crate::schema::{TextFieldIndexing, TextOptions}; /// The `JsonObjectOptions` make it possible to /// configure how a json object field should be indexed and stored. 
-#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] pub struct JsonObjectOptions { stored: bool, // If set to some, int, date, f64 and text will be indexed. // Text will use the TextFieldIndexing setting for indexing. indexing: Option, - // Store all field as fast fields. - fast: bool, + // Store all field as fast fields with an optional tokenizer for text. + fast: FastFieldTextOptions, /// tantivy will generate pathes to the different nodes of the json object /// both in: /// - the inverted index (for the terms) @@ -57,7 +58,21 @@ impl JsonObjectOptions { /// Returns true if and only if the json object fields are /// to be treated as fast fields. pub fn is_fast(&self) -> bool { - self.fast + matches!(self.fast, FastFieldTextOptions::IsEnabled(true)) + || matches!( + &self.fast, + FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ } + ) + } + + /// Returns the name of the fast field tokenizer, or `None` if no tokenizer is configured. + pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> { + match &self.fast { + FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None, + FastFieldTextOptions::EnabledWithTokenizer { + with_tokenizer: tokenizer, + } => Some(tokenizer.name()), + } } /// Returns `true` iff dots in json keys should be expanded. @@ -99,10 +114,31 @@ impl JsonObjectOptions { self } - /// Sets the field as a fast field + /// Set the field as a fast field. + /// + /// Fast fields are designed for random access. + /// Access times are similar to a random lookup in an array. + /// Text fast fields will have the term ids stored in the fast field. + /// + /// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be + /// stored as is, which is equivalent to the "raw" tokenizer. The tokenizer can be used to apply + /// normalization like lower case. 
+ /// The passed tokenizer_name must be available on the fast field tokenizer manager. + /// `Index::fast_field_tokenizer`. + /// + /// The original text can be retrieved via + /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term) + /// from the dictionary. #[must_use] - pub fn set_fast(mut self) -> Self { - self.fast = true; + pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self { + if let Some(tokenizer) = tokenizer_name { + let tokenizer = TokenizerName::from_name(tokenizer); + self.fast = FastFieldTextOptions::EnabledWithTokenizer { + with_tokenizer: tokenizer, + } + } else { + self.fast = FastFieldTextOptions::IsEnabled(true); + } self } @@ -119,7 +155,7 @@ impl From for JsonObjectOptions { JsonObjectOptions { stored: true, indexing: None, - fast: false, + fast: FastFieldTextOptions::default(), expand_dots_enabled: false, } } @@ -130,7 +166,7 @@ impl From for JsonObjectOptions { JsonObjectOptions { stored: false, indexing: None, - fast: true, + fast: FastFieldTextOptions::IsEnabled(true), expand_dots_enabled: false, } } @@ -172,7 +208,7 @@ impl From for JsonObjectOptions { JsonObjectOptions { stored: text_options.is_stored(), indexing: text_options.get_indexing_options().cloned(), - fast: text_options.is_fast(), + fast: text_options.fast, expand_dots_enabled: false, } } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index cd1a04a22..4519fb59a 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -16,7 +16,7 @@ pub struct TextOptions { #[serde(default)] stored: bool, #[serde(default)] - fast: FastFieldOptions, + pub(crate) fast: FastFieldTextOptions, #[serde(default)] #[serde(skip_serializing_if = "is_false")] /// coerce values into string if they are not of type string @@ -26,7 +26,7 @@ pub struct TextOptions { #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(untagged)] /// Enum to control how the fast field setting of a text field. 
-enum FastFieldOptions { +pub(crate) enum FastFieldTextOptions { /// Flag to enable/disable IsEnabled(bool), /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager. @@ -34,35 +34,34 @@ enum FastFieldOptions { EnabledWithTokenizer { with_tokenizer: TokenizerName }, } -impl Default for FastFieldOptions { +impl Default for FastFieldTextOptions { fn default() -> Self { - FastFieldOptions::IsEnabled(false) + FastFieldTextOptions::IsEnabled(false) } } -impl BitOr for FastFieldOptions { - type Output = FastFieldOptions; +impl BitOr for FastFieldTextOptions { + type Output = FastFieldTextOptions; - fn bitor(self, other: FastFieldOptions) -> FastFieldOptions { + fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions { match (self, other) { ( - FastFieldOptions::EnabledWithTokenizer { + FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: tokenizer, }, _, ) | ( _, - FastFieldOptions::EnabledWithTokenizer { + FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: tokenizer, }, - ) => FastFieldOptions::EnabledWithTokenizer { + ) => FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: tokenizer, }, - (FastFieldOptions::IsEnabled(true), _) | (_, FastFieldOptions::IsEnabled(true)) => { - FastFieldOptions::IsEnabled(true) - } - (_, FastFieldOptions::IsEnabled(false)) => FastFieldOptions::IsEnabled(false), + (FastFieldTextOptions::IsEnabled(true), _) + | (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true), + (_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false), } } } @@ -84,18 +83,18 @@ impl TextOptions { /// Returns true if and only if the value is a fast field. 
pub fn is_fast(&self) -> bool { - matches!(self.fast, FastFieldOptions::IsEnabled(true)) + matches!(self.fast, FastFieldTextOptions::IsEnabled(true)) || matches!( &self.fast, - FastFieldOptions::EnabledWithTokenizer { with_tokenizer: _ } + FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ } ) } /// Returns true if and only if the value is a fast field. pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> { match &self.fast { - FastFieldOptions::IsEnabled(true) | FastFieldOptions::IsEnabled(false) => None, - FastFieldOptions::EnabledWithTokenizer { + FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None, + FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: tokenizer, } => Some(tokenizer.name()), } @@ -125,11 +124,11 @@ impl TextOptions { pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions { if let Some(tokenizer) = tokenizer_name { let tokenizer = TokenizerName::from_name(tokenizer); - self.fast = FastFieldOptions::EnabledWithTokenizer { + self.fast = FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: tokenizer, } } else { - self.fast = FastFieldOptions::IsEnabled(true); + self.fast = FastFieldTextOptions::IsEnabled(true); } self } @@ -173,10 +172,10 @@ impl TokenizerName { pub const fn from_static(name: &'static str) -> Self { TokenizerName(Cow::Borrowed(name)) } - fn from_name(name: &str) -> Self { + pub(crate) fn from_name(name: &str) -> Self { TokenizerName(Cow::Owned(name.to_string())) } - fn name(&self) -> &str { + pub(crate) fn name(&self) -> &str { &self.0 } } @@ -264,7 +263,7 @@ pub const STRING: TextOptions = TextOptions { record: IndexRecordOption::Basic, }), stored: false, - fast: FastFieldOptions::IsEnabled(false), + fast: FastFieldTextOptions::IsEnabled(false), coerce: false, }; @@ -277,7 +276,7 @@ pub const TEXT: TextOptions = TextOptions { }), stored: false, coerce: false, - fast: FastFieldOptions::IsEnabled(false), + fast: 
FastFieldTextOptions::IsEnabled(false), }; impl> BitOr for TextOptions { @@ -305,7 +304,7 @@ impl From for TextOptions { TextOptions { indexing: None, stored: true, - fast: FastFieldOptions::IsEnabled(false), + fast: FastFieldTextOptions::default(), coerce: false, } } @@ -316,7 +315,7 @@ impl From for TextOptions { TextOptions { indexing: None, stored: false, - fast: FastFieldOptions::IsEnabled(false), + fast: FastFieldTextOptions::default(), coerce: true, } } @@ -327,7 +326,7 @@ impl From for TextOptions { TextOptions { indexing: None, stored: false, - fast: FastFieldOptions::IsEnabled(true), + fast: FastFieldTextOptions::IsEnabled(true), coerce: false, } } @@ -346,7 +345,7 @@ where #[cfg(test)] mod tests { - use crate::schema::text_options::{FastFieldOptions, TokenizerName}; + use crate::schema::text_options::{FastFieldTextOptions, TokenizerName}; use crate::schema::*; #[test] @@ -398,7 +397,7 @@ mod tests { let options: TextOptions = serde_json::from_str(json).unwrap(); assert_eq!( options.fast, - FastFieldOptions::EnabledWithTokenizer { + FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: TokenizerName::from_static("default") } ); @@ -406,7 +405,7 @@ mod tests { serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); assert_eq!( options.fast, - FastFieldOptions::EnabledWithTokenizer { + FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: TokenizerName::from_static("default") } ); @@ -415,18 +414,18 @@ mod tests { "fast": true } "#; let options: TextOptions = serde_json::from_str(json).unwrap(); - assert_eq!(options.fast, FastFieldOptions::IsEnabled(true)); + assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true)); let options: TextOptions = serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); - assert_eq!(options.fast, FastFieldOptions::IsEnabled(true)); + assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true)); let json = r#" { "fast": false } "#; let options: TextOptions = 
serde_json::from_str(json).unwrap(); - assert_eq!(options.fast, FastFieldOptions::IsEnabled(false)); + assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false)); let options: TextOptions = serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); - assert_eq!(options.fast, FastFieldOptions::IsEnabled(false)); + assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false)); } } diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 7e1394076..65b7815c8 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -1,6 +1,6 @@ /// The tokenizer module contains all of the tools used to process /// text in `tantivy`. -use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer}; +use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer}; use crate::tokenizer::empty_tokenizer::EmptyTokenizer; @@ -9,6 +9,31 @@ pub struct TextAnalyzer { tokenizer: Box, } +/// A boxable `Tokenizer`, with its `TokenStream` type erased. +trait BoxableTokenizer: 'static + Send + Sync { + /// Creates a boxed token stream for a given `str`. + fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>; + /// Clone this tokenizer. 
+ fn box_clone(&self) -> Box; +} + +impl BoxableTokenizer for T { + fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + self.token_stream(text).into() + } + fn box_clone(&self) -> Box { + Box::new(self.clone()) + } +} + +impl Clone for TextAnalyzer { + fn clone(&self) -> Self { + TextAnalyzer { + tokenizer: self.tokenizer.box_clone(), + } + } +} + impl Default for TextAnalyzer { fn default() -> TextAnalyzer { TextAnalyzer::from(EmptyTokenizer) @@ -33,14 +58,6 @@ impl TextAnalyzer { } } -impl Clone for TextAnalyzer { - fn clone(&self) -> Self { - TextAnalyzer { - tokenizer: self.tokenizer.box_clone(), - } - } -} - /// Builder helper for [`TextAnalyzer`] pub struct TextAnalyzerBuilder { tokenizer: T, diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs index d1497a946..f43f8b1d6 100644 --- a/tokenizer-api/src/lib.rs +++ b/tokenizer-api/src/lib.rs @@ -49,23 +49,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync { fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>; } -/// A boxable `Tokenizer`, with its `TokenStream` type erased. -pub trait BoxableTokenizer: 'static + Send + Sync { - /// Creates a boxed token stream for a given `str`. - fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>; - /// Clone this tokenizer. - fn box_clone(&self) -> Box; -} - -impl BoxableTokenizer for T { - fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - self.token_stream(text).into() - } - fn box_clone(&self) -> Box { - Box::new(self.clone()) - } -} - /// Simple wrapper of `Box`. pub struct BoxTokenStream<'a>(Box);