enable tokenizer on json fields (#2053)

* enable tokenizer on json fields

enable tokenizer on json fields for type text

* Avoid making the tokenizer within the TextAnalyzer pub(crate)

* Moving BoxableTokenizer to tantivy.

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
This commit is contained in:
PSeitz
2023-05-24 16:47:39 +08:00
committed by GitHub
parent 4be6f83b0a
commit e56addc63e
7 changed files with 165 additions and 78 deletions

View File

@@ -1082,7 +1082,7 @@ mod tests {
#[test] #[test]
fn test_fast_field_in_json_field_expand_dots_disabled() { fn test_fast_field_in_json_field_expand_dots_disabled() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast(); let json_option = JsonObjectOptions::default().set_fast(None);
let json = schema_builder.add_json_field("json", json_option); let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -1105,11 +1105,36 @@ mod tests {
assert_eq!(&vals, &[32]) assert_eq!(&vals, &[32])
} }
// Checks a JSON field that is declared fast with a tokenizer name ("default"):
// the text recorded in the fast field is expected to be the tokenizer output,
// not the raw input string.
#[test]
fn test_fast_field_in_json_field_with_tokenizer() {
let mut schema_builder = Schema::builder();
// `Some("default")` selects the fast field tokenizer by name.
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// The same JSON key holds a number in the first document...
index_writer
.add_document(doc!(json => json!({"age": 32})))
.unwrap();
// ...and an upper-case string in the second one.
index_writer
.add_document(doc!(json => json!({"age": "NEW"})))
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fast_fields = searcher.segment_reader(0u32).fast_fields();
// Look up the string column stored under the path "json.age".
let ff_str = fast_fields.str("json.age").unwrap().unwrap();
let mut output = String::new();
// Resolve term ordinal 0 back to its text.
ff_str.ord_to_str(0, &mut output).unwrap();
// The document contained "NEW"; the stored term is expected to be "new".
assert_eq!(output, "new");
}
#[test] #[test]
fn test_fast_field_in_json_field_expand_dots_enabled() { fn test_fast_field_in_json_field_expand_dots_enabled() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default() let json_option = JsonObjectOptions::default()
.set_fast() .set_fast(None)
.set_expand_dots_enabled(); .set_expand_dots_enabled();
let json = schema_builder.add_json_field("json", json_option); let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -1246,7 +1271,7 @@ mod tests {
fn test_shadowing_fast_field_with_expand_dots() { fn test_shadowing_fast_field_with_expand_dots() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default() let json_option = JsonObjectOptions::default()
.set_fast() .set_fast(None)
.set_expand_dots_enabled(); .set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone()); let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option); let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);

View File

@@ -346,7 +346,7 @@ mod tests {
schema_builder.add_json_field( schema_builder.add_json_field(
"json_expand_dots_enabled", "json_expand_dots_enabled",
JsonObjectOptions::default() JsonObjectOptions::default()
.set_fast() .set_fast(None)
.set_expand_dots_enabled(), .set_expand_dots_enabled(),
); );
let dynamic_field = schema_builder.add_json_field("_dyna", FAST); let dynamic_field = schema_builder.add_json_field("_dyna", FAST);

View File

@@ -46,7 +46,7 @@ impl FastFieldsWriter {
.take(schema.num_fields()) .take(schema.num_fields())
.collect(); .collect();
let mut expand_dots = vec![false; schema.num_fields()]; let mut expand_dots = vec![false; schema.num_fields()];
let mut per_field_tokenizer = vec![None; schema.num_fields()]; let mut per_field_tokenizer: Vec<Option<TextAnalyzer>> = vec![None; schema.num_fields()];
// TODO see other types // TODO see other types
for (field_id, field_entry) in schema.fields() { for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() { if !field_entry.field_type().is_fast() {
@@ -58,6 +58,15 @@ impl FastFieldsWriter {
date_precisions[field_id.field_id() as usize] = date_options.get_precision(); date_precisions[field_id.field_id() as usize] = date_options.get_precision();
} }
if let FieldType::JsonObject(json_object_options) = field_entry.field_type() { if let FieldType::JsonObject(json_object_options) = field_entry.field_type() {
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer {tokenizer_name:?} not found"
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
}
expand_dots[field_id.field_id() as usize] = expand_dots[field_id.field_id() as usize] =
json_object_options.is_expand_dots_enabled(); json_object_options.is_expand_dots_enabled();
} }
@@ -137,10 +146,10 @@ impl FastFieldsWriter {
); );
} }
Value::Str(text_val) => { Value::Str(text_val) => {
if let Some(text_analyzer) = if let Some(tokenizer) =
&self.per_field_tokenizer[field_value.field().field_id() as usize] &self.per_field_tokenizer[field_value.field().field_id() as usize]
{ {
let mut token_stream = text_analyzer.token_stream(text_val); let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| { token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str( self.columnar_writer.record_str(
doc_id, doc_id,
@@ -191,6 +200,10 @@ impl FastFieldsWriter {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize]; let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear(); self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name); self.json_path_buffer.push_str(field_name);
let text_analyzer =
&self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer( record_json_obj_to_columnar_writer(
doc_id, doc_id,
json_obj, json_obj,
@@ -198,6 +211,7 @@ impl FastFieldsWriter {
JSON_DEPTH_LIMIT, JSON_DEPTH_LIMIT,
&mut self.json_path_buffer, &mut self.json_path_buffer,
&mut self.columnar_writer, &mut self.columnar_writer,
text_analyzer,
); );
} }
Value::IpAddr(ip_addr) => { Value::IpAddr(ip_addr) => {
@@ -249,6 +263,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize, remaining_depth_limit: usize,
json_path_buffer: &mut String, json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter, columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
) { ) {
for (key, child) in json_obj { for (key, child) in json_obj {
let len_path = json_path_buffer.len(); let len_path = json_path_buffer.len();
@@ -273,6 +288,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit, remaining_depth_limit,
json_path_buffer, json_path_buffer,
columnar_writer, columnar_writer,
tokenizer,
); );
// popping our sub path. // popping our sub path.
json_path_buffer.truncate(len_path); json_path_buffer.truncate(len_path);
@@ -286,6 +302,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize, mut remaining_depth_limit: usize,
json_path_writer: &mut String, json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter, columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
) { ) {
if remaining_depth_limit == 0 { if remaining_depth_limit == 0 {
return; return;
@@ -304,7 +321,14 @@ fn record_json_value_to_columnar_writer(
} }
} }
serde_json::Value::String(text) => { serde_json::Value::String(text) => {
columnar_writer.record_str(doc, json_path_writer.as_str(), text); if let Some(text_analyzer) = tokenizer {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
} }
serde_json::Value::Array(arr) => { serde_json::Value::Array(arr) => {
for el in arr { for el in arr {
@@ -315,6 +339,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit, remaining_depth_limit,
json_path_writer, json_path_writer,
columnar_writer, columnar_writer,
tokenizer,
); );
} }
} }
@@ -326,6 +351,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit, remaining_depth_limit,
json_path_writer, json_path_writer,
columnar_writer, columnar_writer,
tokenizer,
); );
} }
} }
@@ -353,6 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT, JSON_DEPTH_LIMIT,
&mut json_path, &mut json_path,
&mut columnar_writer, &mut columnar_writer,
&None,
); );
} }
let mut buffer = Vec::new(); let mut buffer = Vec::new();

View File

@@ -2,19 +2,20 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::text_options::{FastFieldTextOptions, TokenizerName};
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag}; use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
use crate::schema::{TextFieldIndexing, TextOptions}; use crate::schema::{TextFieldIndexing, TextOptions};
/// The `JsonObjectOptions` make it possible to /// The `JsonObjectOptions` make it possible to
/// configure how a json object field should be indexed and stored. /// configure how a json object field should be indexed and stored.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] #[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct JsonObjectOptions { pub struct JsonObjectOptions {
stored: bool, stored: bool,
// If set to some, int, date, f64 and text will be indexed. // If set to some, int, date, f64 and text will be indexed.
// Text will use the TextFieldIndexing setting for indexing. // Text will use the TextFieldIndexing setting for indexing.
indexing: Option<TextFieldIndexing>, indexing: Option<TextFieldIndexing>,
// Store all field as fast fields. // Store all field as fast fields with an optional tokenizer for text.
fast: bool, fast: FastFieldTextOptions,
/// tantivy will generate paths to the different nodes of the json object /// tantivy will generate paths to the different nodes of the json object
/// both in: /// both in:
/// - the inverted index (for the terms) /// - the inverted index (for the terms)
@@ -57,7 +58,21 @@ impl JsonObjectOptions {
/// Returns true if and only if the json object fields are /// Returns true if and only if the json object fields are
/// to be treated as fast fields. /// to be treated as fast fields.
pub fn is_fast(&self) -> bool { pub fn is_fast(&self) -> bool {
self.fast matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the name of the tokenizer configured for the fast field, if any.
///
/// Returns `None` when the fast field setting is a plain boolean flag
/// (enabled or disabled), i.e. when no tokenizer name was given.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
}
} }
/// Returns `true` iff dots in json keys should be expanded. /// Returns `true` iff dots in json keys should be expanded.
@@ -99,10 +114,31 @@ impl JsonObjectOptions {
self self
} }
/// Sets the field as a fast field /// Set the field as a fast field.
///
/// Fast fields are designed for random access.
/// Access times are similar to a random lookup in an array.
/// Text fast fields will have the term ids stored in the fast field.
///
/// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be
/// stored as is, which is equivalent to using the "raw" tokenizer. The tokenizer can be used to
/// apply normalization such as lowercasing.
/// The passed `tokenizer_name` must be available on the fast field tokenizer manager
/// (see `Index::fast_field_tokenizer`).
///
/// The original text can be retrieved via
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use] #[must_use]
pub fn set_fast(mut self) -> Self { pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
self.fast = true; if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
self self
} }
@@ -119,7 +155,7 @@ impl From<StoredFlag> for JsonObjectOptions {
JsonObjectOptions { JsonObjectOptions {
stored: true, stored: true,
indexing: None, indexing: None,
fast: false, fast: FastFieldTextOptions::default(),
expand_dots_enabled: false, expand_dots_enabled: false,
} }
} }
@@ -130,7 +166,7 @@ impl From<FastFlag> for JsonObjectOptions {
JsonObjectOptions { JsonObjectOptions {
stored: false, stored: false,
indexing: None, indexing: None,
fast: true, fast: FastFieldTextOptions::IsEnabled(true),
expand_dots_enabled: false, expand_dots_enabled: false,
} }
} }
@@ -172,7 +208,7 @@ impl From<TextOptions> for JsonObjectOptions {
JsonObjectOptions { JsonObjectOptions {
stored: text_options.is_stored(), stored: text_options.is_stored(),
indexing: text_options.get_indexing_options().cloned(), indexing: text_options.get_indexing_options().cloned(),
fast: text_options.is_fast(), fast: text_options.fast,
expand_dots_enabled: false, expand_dots_enabled: false,
} }
} }

View File

@@ -16,7 +16,7 @@ pub struct TextOptions {
#[serde(default)] #[serde(default)]
stored: bool, stored: bool,
#[serde(default)] #[serde(default)]
fast: FastFieldOptions, pub(crate) fast: FastFieldTextOptions,
#[serde(default)] #[serde(default)]
#[serde(skip_serializing_if = "is_false")] #[serde(skip_serializing_if = "is_false")]
/// coerce values into string if they are not of type string /// coerce values into string if they are not of type string
@@ -26,7 +26,7 @@ pub struct TextOptions {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)] #[serde(untagged)]
/// Enum to control how the fast field setting of a text field. /// Enum to control how the fast field setting of a text field.
enum FastFieldOptions { pub(crate) enum FastFieldTextOptions {
/// Flag to enable/disable /// Flag to enable/disable
IsEnabled(bool), IsEnabled(bool),
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager. /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
@@ -34,35 +34,34 @@ enum FastFieldOptions {
EnabledWithTokenizer { with_tokenizer: TokenizerName }, EnabledWithTokenizer { with_tokenizer: TokenizerName },
} }
impl Default for FastFieldOptions { impl Default for FastFieldTextOptions {
fn default() -> Self { fn default() -> Self {
FastFieldOptions::IsEnabled(false) FastFieldTextOptions::IsEnabled(false)
} }
} }
impl BitOr<FastFieldOptions> for FastFieldOptions { impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
type Output = FastFieldOptions; type Output = FastFieldTextOptions;
fn bitor(self, other: FastFieldOptions) -> FastFieldOptions { fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
match (self, other) { match (self, other) {
( (
FastFieldOptions::EnabledWithTokenizer { FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer, with_tokenizer: tokenizer,
}, },
_, _,
) )
| ( | (
_, _,
FastFieldOptions::EnabledWithTokenizer { FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer, with_tokenizer: tokenizer,
}, },
) => FastFieldOptions::EnabledWithTokenizer { ) => FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer, with_tokenizer: tokenizer,
}, },
(FastFieldOptions::IsEnabled(true), _) | (_, FastFieldOptions::IsEnabled(true)) => { (FastFieldTextOptions::IsEnabled(true), _)
FastFieldOptions::IsEnabled(true) | (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
} (_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
(_, FastFieldOptions::IsEnabled(false)) => FastFieldOptions::IsEnabled(false),
} }
} }
} }
@@ -84,18 +83,18 @@ impl TextOptions {
/// Returns true if and only if the value is a fast field. /// Returns true if and only if the value is a fast field.
pub fn is_fast(&self) -> bool { pub fn is_fast(&self) -> bool {
matches!(self.fast, FastFieldOptions::IsEnabled(true)) matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!( || matches!(
&self.fast, &self.fast,
FastFieldOptions::EnabledWithTokenizer { with_tokenizer: _ } FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
) )
} }
/// Returns the name of the fast field tokenizer, or `None` if no tokenizer is configured. /// Returns the name of the fast field tokenizer, or `None` if no tokenizer is configured.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> { pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast { match &self.fast {
FastFieldOptions::IsEnabled(true) | FastFieldOptions::IsEnabled(false) => None, FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldOptions::EnabledWithTokenizer { FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer, with_tokenizer: tokenizer,
} => Some(tokenizer.name()), } => Some(tokenizer.name()),
} }
@@ -125,11 +124,11 @@ impl TextOptions {
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions { pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
if let Some(tokenizer) = tokenizer_name { if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer); let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldOptions::EnabledWithTokenizer { self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer, with_tokenizer: tokenizer,
} }
} else { } else {
self.fast = FastFieldOptions::IsEnabled(true); self.fast = FastFieldTextOptions::IsEnabled(true);
} }
self self
} }
@@ -173,10 +172,10 @@ impl TokenizerName {
pub const fn from_static(name: &'static str) -> Self { pub const fn from_static(name: &'static str) -> Self {
TokenizerName(Cow::Borrowed(name)) TokenizerName(Cow::Borrowed(name))
} }
fn from_name(name: &str) -> Self { pub(crate) fn from_name(name: &str) -> Self {
TokenizerName(Cow::Owned(name.to_string())) TokenizerName(Cow::Owned(name.to_string()))
} }
fn name(&self) -> &str { pub(crate) fn name(&self) -> &str {
&self.0 &self.0
} }
} }
@@ -264,7 +263,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic, record: IndexRecordOption::Basic,
}), }),
stored: false, stored: false,
fast: FastFieldOptions::IsEnabled(false), fast: FastFieldTextOptions::IsEnabled(false),
coerce: false, coerce: false,
}; };
@@ -277,7 +276,7 @@ pub const TEXT: TextOptions = TextOptions {
}), }),
stored: false, stored: false,
coerce: false, coerce: false,
fast: FastFieldOptions::IsEnabled(false), fast: FastFieldTextOptions::IsEnabled(false),
}; };
impl<T: Into<TextOptions>> BitOr<T> for TextOptions { impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -305,7 +304,7 @@ impl From<StoredFlag> for TextOptions {
TextOptions { TextOptions {
indexing: None, indexing: None,
stored: true, stored: true,
fast: FastFieldOptions::IsEnabled(false), fast: FastFieldTextOptions::default(),
coerce: false, coerce: false,
} }
} }
@@ -316,7 +315,7 @@ impl From<CoerceFlag> for TextOptions {
TextOptions { TextOptions {
indexing: None, indexing: None,
stored: false, stored: false,
fast: FastFieldOptions::IsEnabled(false), fast: FastFieldTextOptions::default(),
coerce: true, coerce: true,
} }
} }
@@ -327,7 +326,7 @@ impl From<FastFlag> for TextOptions {
TextOptions { TextOptions {
indexing: None, indexing: None,
stored: false, stored: false,
fast: FastFieldOptions::IsEnabled(true), fast: FastFieldTextOptions::IsEnabled(true),
coerce: false, coerce: false,
} }
} }
@@ -346,7 +345,7 @@ where
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::schema::text_options::{FastFieldOptions, TokenizerName}; use crate::schema::text_options::{FastFieldTextOptions, TokenizerName};
use crate::schema::*; use crate::schema::*;
#[test] #[test]
@@ -398,7 +397,7 @@ mod tests {
let options: TextOptions = serde_json::from_str(json).unwrap(); let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!( assert_eq!(
options.fast, options.fast,
FastFieldOptions::EnabledWithTokenizer { FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default") with_tokenizer: TokenizerName::from_static("default")
} }
); );
@@ -406,7 +405,7 @@ mod tests {
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!( assert_eq!(
options.fast, options.fast,
FastFieldOptions::EnabledWithTokenizer { FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default") with_tokenizer: TokenizerName::from_static("default")
} }
); );
@@ -415,18 +414,18 @@ mod tests {
"fast": true "fast": true
} "#; } "#;
let options: TextOptions = serde_json::from_str(json).unwrap(); let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true)); assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let options: TextOptions = let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true)); assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let json = r#" { let json = r#" {
"fast": false "fast": false
} "#; } "#;
let options: TextOptions = serde_json::from_str(json).unwrap(); let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false)); assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
let options: TextOptions = let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false)); assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
} }
} }

View File

@@ -1,6 +1,6 @@
/// The tokenizer module contains all of the tools used to process /// The tokenizer module contains all of the tools used to process
/// text in `tantivy`. /// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer}; use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer; use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
@@ -9,6 +9,31 @@ pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>, tokenizer: Box<dyn BoxableTokenizer>,
} }
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
///
/// The trait carries no visibility modifier, so it is private to this module.
/// `TextAnalyzer` stores its tokenizer as a `Box<dyn BoxableTokenizer>`.
trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
///
/// Returns the copy as a new boxed trait object, so that a
/// `Box<dyn BoxableTokenizer>` can be duplicated.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}
// Blanket implementation: every type implementing `Tokenizer` is also a
// `BoxableTokenizer`.
impl<T: Tokenizer> BoxableTokenizer for T {
// Builds the tokenizer's own token stream and converts it into a
// `BoxTokenStream` via `into()`.
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
self.token_stream(text).into()
}
// Clones the concrete tokenizer and boxes the copy.
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
}
}
// `TextAnalyzer` holds a boxed trait object, so `Clone` is written by hand
// and delegates to `BoxableTokenizer::box_clone`.
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
}
}
}
impl Default for TextAnalyzer { impl Default for TextAnalyzer {
fn default() -> TextAnalyzer { fn default() -> TextAnalyzer {
TextAnalyzer::from(EmptyTokenizer) TextAnalyzer::from(EmptyTokenizer)
@@ -33,14 +58,6 @@ impl TextAnalyzer {
} }
} }
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
}
}
}
/// Builder helper for [`TextAnalyzer`] /// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T> { pub struct TextAnalyzerBuilder<T> {
tokenizer: T, tokenizer: T,

View File

@@ -49,23 +49,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>; fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
} }
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
pub trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
self.token_stream(text).into()
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`. /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>); pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);