mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
enable tokenizer on json fields (#2053)
* Enable tokenizer on json fields: enable the tokenizer on json fields for type text.
* Avoid making the tokenizer within the TextAnalyzer pub(crate).
* Move BoxableTokenizer to tantivy.
---------
Co-authored-by: Paul Masurel <paul@quickwit.io>
This commit is contained in:
@@ -1082,7 +1082,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_fast_field_in_json_field_expand_dots_disabled() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default().set_fast();
|
||||
let json_option = JsonObjectOptions::default().set_fast(None);
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -1105,11 +1105,36 @@ mod tests {
|
||||
assert_eq!(&vals, &[32])
|
||||
}
|
||||
|
||||
// Checks that a JSON field declared fast with the "default" tokenizer records the
// tokenized form of a text value: the string "NEW" is indexed under `age`, and the
// term read back from the `json.age` string column is expected to be "new".
#[test]
|
||||
fn test_fast_field_in_json_field_with_tokenizer() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
// The first document stores a number under `age`, the second one a string.
index_writer
|
||||
.add_document(doc!(json => json!({"age": 32})))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"age": "NEW"})))
|
||||
.unwrap();
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let fast_fields = searcher.segment_reader(0u32).fast_fields();
|
||||
|
||||
let ff_str = fast_fields.str("json.age").unwrap().unwrap();
|
||||
let mut output = String::new();
|
||||
// Resolve term ordinal 0 of the string column back to its text.
ff_str.ord_to_str(0, &mut output).unwrap();
|
||||
assert_eq!(output, "new");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fast_field_in_json_field_expand_dots_enabled() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(None)
|
||||
.set_expand_dots_enabled();
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
@@ -1246,7 +1271,7 @@ mod tests {
|
||||
fn test_shadowing_fast_field_with_expand_dots() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(None)
|
||||
.set_expand_dots_enabled();
|
||||
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
|
||||
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
|
||||
|
||||
@@ -346,7 +346,7 @@ mod tests {
|
||||
schema_builder.add_json_field(
|
||||
"json_expand_dots_enabled",
|
||||
JsonObjectOptions::default()
|
||||
.set_fast()
|
||||
.set_fast(None)
|
||||
.set_expand_dots_enabled(),
|
||||
);
|
||||
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
|
||||
|
||||
@@ -46,7 +46,7 @@ impl FastFieldsWriter {
|
||||
.take(schema.num_fields())
|
||||
.collect();
|
||||
let mut expand_dots = vec![false; schema.num_fields()];
|
||||
let mut per_field_tokenizer = vec![None; schema.num_fields()];
|
||||
let mut per_field_tokenizer: Vec<Option<TextAnalyzer>> = vec![None; schema.num_fields()];
|
||||
// TODO see other types
|
||||
for (field_id, field_entry) in schema.fields() {
|
||||
if !field_entry.field_type().is_fast() {
|
||||
@@ -58,6 +58,15 @@ impl FastFieldsWriter {
|
||||
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
|
||||
}
|
||||
if let FieldType::JsonObject(json_object_options) = field_entry.field_type() {
|
||||
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
|
||||
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"Tokenizer {tokenizer_name:?} not found"
|
||||
))
|
||||
})?;
|
||||
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
|
||||
}
|
||||
|
||||
expand_dots[field_id.field_id() as usize] =
|
||||
json_object_options.is_expand_dots_enabled();
|
||||
}
|
||||
@@ -137,10 +146,10 @@ impl FastFieldsWriter {
|
||||
);
|
||||
}
|
||||
Value::Str(text_val) => {
|
||||
if let Some(text_analyzer) =
|
||||
if let Some(tokenizer) =
|
||||
&self.per_field_tokenizer[field_value.field().field_id() as usize]
|
||||
{
|
||||
let mut token_stream = text_analyzer.token_stream(text_val);
|
||||
let mut token_stream = tokenizer.token_stream(text_val);
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
self.columnar_writer.record_str(
|
||||
doc_id,
|
||||
@@ -191,6 +200,10 @@ impl FastFieldsWriter {
|
||||
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
|
||||
self.json_path_buffer.clear();
|
||||
self.json_path_buffer.push_str(field_name);
|
||||
|
||||
let text_analyzer =
|
||||
&self.per_field_tokenizer[field_value.field().field_id() as usize];
|
||||
|
||||
record_json_obj_to_columnar_writer(
|
||||
doc_id,
|
||||
json_obj,
|
||||
@@ -198,6 +211,7 @@ impl FastFieldsWriter {
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut self.json_path_buffer,
|
||||
&mut self.columnar_writer,
|
||||
text_analyzer,
|
||||
);
|
||||
}
|
||||
Value::IpAddr(ip_addr) => {
|
||||
@@ -249,6 +263,7 @@ fn record_json_obj_to_columnar_writer(
|
||||
remaining_depth_limit: usize,
|
||||
json_path_buffer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &Option<TextAnalyzer>,
|
||||
) {
|
||||
for (key, child) in json_obj {
|
||||
let len_path = json_path_buffer.len();
|
||||
@@ -273,6 +288,7 @@ fn record_json_obj_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_buffer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
);
|
||||
// popping our sub path.
|
||||
json_path_buffer.truncate(len_path);
|
||||
@@ -286,6 +302,7 @@ fn record_json_value_to_columnar_writer(
|
||||
mut remaining_depth_limit: usize,
|
||||
json_path_writer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &Option<TextAnalyzer>,
|
||||
) {
|
||||
if remaining_depth_limit == 0 {
|
||||
return;
|
||||
@@ -304,7 +321,14 @@ fn record_json_value_to_columnar_writer(
|
||||
}
|
||||
}
|
||||
serde_json::Value::String(text) => {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
|
||||
if let Some(text_analyzer) = tokenizer {
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
|
||||
})
|
||||
} else {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for el in arr {
|
||||
@@ -315,6 +339,7 @@ fn record_json_value_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_writer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -326,6 +351,7 @@ fn record_json_value_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_writer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -353,6 +379,7 @@ mod tests {
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut json_path,
|
||||
&mut columnar_writer,
|
||||
&None,
|
||||
);
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
@@ -2,19 +2,20 @@ use std::ops::BitOr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::text_options::{FastFieldTextOptions, TokenizerName};
|
||||
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
|
||||
use crate::schema::{TextFieldIndexing, TextOptions};
|
||||
|
||||
/// The `JsonObjectOptions` make it possible to
|
||||
/// configure how a json object field should be indexed and stored.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct JsonObjectOptions {
|
||||
stored: bool,
|
||||
// If set to some, int, date, f64 and text will be indexed.
|
||||
// Text will use the TextFieldIndexing setting for indexing.
|
||||
indexing: Option<TextFieldIndexing>,
|
||||
// Store all field as fast fields.
|
||||
fast: bool,
|
||||
// Store all fields as fast fields, with an optional tokenizer for text.
|
||||
fast: FastFieldTextOptions,
|
||||
/// tantivy will generate paths to the different nodes of the json object
|
||||
/// both in:
|
||||
/// - the inverted index (for the terms)
|
||||
@@ -57,7 +58,21 @@ impl JsonObjectOptions {
|
||||
/// Returns true if and only if the json object fields are
|
||||
/// to be treated as fast fields.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast
|
||||
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|
||||
|| matches!(
|
||||
&self.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the name of the tokenizer configured for the fast field, if any.
///
/// Yields `Some(name)` only for `FastFieldTextOptions::EnabledWithTokenizer`; both
/// `IsEnabled(true)` and `IsEnabled(false)` yield `None`.
|
||||
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
||||
match &self.fast {
|
||||
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
} => Some(tokenizer.name()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` iff dots in json keys should be expanded.
|
||||
@@ -99,10 +114,31 @@ impl JsonObjectOptions {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the field as a fast field
|
||||
/// Set the field as a fast field.
|
||||
///
|
||||
/// Fast fields are designed for random access.
|
||||
/// Access times are similar to a random lookup in an array.
|
||||
/// Text fast fields will have the term ids stored in the fast field.
|
||||
///
|
||||
/// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be
|
||||
/// stored as is, which is equivalent to the "raw" tokenizer. The tokenizer can be used to apply
|
||||
/// normalization like lower case.
|
||||
/// The passed tokenizer_name must be available on the fast field tokenizer manager.
|
||||
/// See `Index::fast_field_tokenizer`.
|
||||
///
|
||||
/// The original text can be retrieved via
|
||||
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
|
||||
/// from the dictionary.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self) -> Self {
|
||||
self.fast = true;
|
||||
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
|
||||
if let Some(tokenizer) = tokenizer_name {
|
||||
let tokenizer = TokenizerName::from_name(tokenizer);
|
||||
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
}
|
||||
} else {
|
||||
self.fast = FastFieldTextOptions::IsEnabled(true);
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
@@ -119,7 +155,7 @@ impl From<StoredFlag> for JsonObjectOptions {
|
||||
JsonObjectOptions {
|
||||
stored: true,
|
||||
indexing: None,
|
||||
fast: false,
|
||||
fast: FastFieldTextOptions::default(),
|
||||
expand_dots_enabled: false,
|
||||
}
|
||||
}
|
||||
@@ -130,7 +166,7 @@ impl From<FastFlag> for JsonObjectOptions {
|
||||
JsonObjectOptions {
|
||||
stored: false,
|
||||
indexing: None,
|
||||
fast: true,
|
||||
fast: FastFieldTextOptions::IsEnabled(true),
|
||||
expand_dots_enabled: false,
|
||||
}
|
||||
}
|
||||
@@ -172,7 +208,7 @@ impl From<TextOptions> for JsonObjectOptions {
|
||||
JsonObjectOptions {
|
||||
stored: text_options.is_stored(),
|
||||
indexing: text_options.get_indexing_options().cloned(),
|
||||
fast: text_options.is_fast(),
|
||||
fast: text_options.fast,
|
||||
expand_dots_enabled: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ pub struct TextOptions {
|
||||
#[serde(default)]
|
||||
stored: bool,
|
||||
#[serde(default)]
|
||||
fast: FastFieldOptions,
|
||||
pub(crate) fast: FastFieldTextOptions,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "is_false")]
|
||||
/// coerce values into string if they are not of type string
|
||||
@@ -26,7 +26,7 @@ pub struct TextOptions {
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
/// Enum to control the fast field setting of a text field.
|
||||
enum FastFieldOptions {
|
||||
pub(crate) enum FastFieldTextOptions {
|
||||
/// Flag to enable/disable
|
||||
IsEnabled(bool),
|
||||
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
|
||||
@@ -34,35 +34,34 @@ enum FastFieldOptions {
|
||||
EnabledWithTokenizer { with_tokenizer: TokenizerName },
|
||||
}
|
||||
|
||||
impl Default for FastFieldOptions {
|
||||
impl Default for FastFieldTextOptions {
|
||||
fn default() -> Self {
|
||||
FastFieldOptions::IsEnabled(false)
|
||||
FastFieldTextOptions::IsEnabled(false)
|
||||
}
|
||||
}
|
||||
|
||||
impl BitOr<FastFieldOptions> for FastFieldOptions {
|
||||
type Output = FastFieldOptions;
|
||||
impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
|
||||
type Output = FastFieldTextOptions;
|
||||
|
||||
fn bitor(self, other: FastFieldOptions) -> FastFieldOptions {
|
||||
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
|
||||
match (self, other) {
|
||||
(
|
||||
FastFieldOptions::EnabledWithTokenizer {
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
_,
|
||||
)
|
||||
| (
|
||||
_,
|
||||
FastFieldOptions::EnabledWithTokenizer {
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
) => FastFieldOptions::EnabledWithTokenizer {
|
||||
) => FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
(FastFieldOptions::IsEnabled(true), _) | (_, FastFieldOptions::IsEnabled(true)) => {
|
||||
FastFieldOptions::IsEnabled(true)
|
||||
}
|
||||
(_, FastFieldOptions::IsEnabled(false)) => FastFieldOptions::IsEnabled(false),
|
||||
(FastFieldTextOptions::IsEnabled(true), _)
|
||||
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
|
||||
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -84,18 +83,18 @@ impl TextOptions {
|
||||
|
||||
/// Returns true if and only if the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
matches!(self.fast, FastFieldOptions::IsEnabled(true))
|
||||
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|
||||
|| matches!(
|
||||
&self.fast,
|
||||
FastFieldOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the name of the fast field tokenizer if one is configured, `None` otherwise.
|
||||
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
||||
match &self.fast {
|
||||
FastFieldOptions::IsEnabled(true) | FastFieldOptions::IsEnabled(false) => None,
|
||||
FastFieldOptions::EnabledWithTokenizer {
|
||||
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
} => Some(tokenizer.name()),
|
||||
}
|
||||
@@ -125,11 +124,11 @@ impl TextOptions {
|
||||
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
|
||||
if let Some(tokenizer) = tokenizer_name {
|
||||
let tokenizer = TokenizerName::from_name(tokenizer);
|
||||
self.fast = FastFieldOptions::EnabledWithTokenizer {
|
||||
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
}
|
||||
} else {
|
||||
self.fast = FastFieldOptions::IsEnabled(true);
|
||||
self.fast = FastFieldTextOptions::IsEnabled(true);
|
||||
}
|
||||
self
|
||||
}
|
||||
@@ -173,10 +172,10 @@ impl TokenizerName {
|
||||
pub const fn from_static(name: &'static str) -> Self {
|
||||
TokenizerName(Cow::Borrowed(name))
|
||||
}
|
||||
fn from_name(name: &str) -> Self {
|
||||
pub(crate) fn from_name(name: &str) -> Self {
|
||||
TokenizerName(Cow::Owned(name.to_string()))
|
||||
}
|
||||
fn name(&self) -> &str {
|
||||
pub(crate) fn name(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
@@ -264,7 +263,7 @@ pub const STRING: TextOptions = TextOptions {
|
||||
record: IndexRecordOption::Basic,
|
||||
}),
|
||||
stored: false,
|
||||
fast: FastFieldOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::IsEnabled(false),
|
||||
coerce: false,
|
||||
};
|
||||
|
||||
@@ -277,7 +276,7 @@ pub const TEXT: TextOptions = TextOptions {
|
||||
}),
|
||||
stored: false,
|
||||
coerce: false,
|
||||
fast: FastFieldOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::IsEnabled(false),
|
||||
};
|
||||
|
||||
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
||||
@@ -305,7 +304,7 @@ impl From<StoredFlag> for TextOptions {
|
||||
TextOptions {
|
||||
indexing: None,
|
||||
stored: true,
|
||||
fast: FastFieldOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::default(),
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
@@ -316,7 +315,7 @@ impl From<CoerceFlag> for TextOptions {
|
||||
TextOptions {
|
||||
indexing: None,
|
||||
stored: false,
|
||||
fast: FastFieldOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::default(),
|
||||
coerce: true,
|
||||
}
|
||||
}
|
||||
@@ -327,7 +326,7 @@ impl From<FastFlag> for TextOptions {
|
||||
TextOptions {
|
||||
indexing: None,
|
||||
stored: false,
|
||||
fast: FastFieldOptions::IsEnabled(true),
|
||||
fast: FastFieldTextOptions::IsEnabled(true),
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
@@ -346,7 +345,7 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::schema::text_options::{FastFieldOptions, TokenizerName};
|
||||
use crate::schema::text_options::{FastFieldTextOptions, TokenizerName};
|
||||
use crate::schema::*;
|
||||
|
||||
#[test]
|
||||
@@ -398,7 +397,7 @@ mod tests {
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldOptions::EnabledWithTokenizer {
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: TokenizerName::from_static("default")
|
||||
}
|
||||
);
|
||||
@@ -406,7 +405,7 @@ mod tests {
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldOptions::EnabledWithTokenizer {
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: TokenizerName::from_static("default")
|
||||
}
|
||||
);
|
||||
@@ -415,18 +414,18 @@ mod tests {
|
||||
"fast": true
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||
|
||||
let json = r#" {
|
||||
"fast": false
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/// The tokenizer module contains all of the tools used to process
|
||||
/// text in `tantivy`.
|
||||
use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer};
|
||||
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
|
||||
|
||||
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
||||
|
||||
@@ -9,6 +9,31 @@ pub struct TextAnalyzer {
|
||||
tokenizer: Box<dyn BoxableTokenizer>,
|
||||
}
|
||||
|
||||
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
///
/// `TextAnalyzer` stores its tokenizer as a `Box<dyn BoxableTokenizer>`; `box_clone`
/// is the hook that lets that boxed tokenizer be duplicated through the trait object.
|
||||
trait BoxableTokenizer: 'static + Send + Sync {
|
||||
/// Creates a boxed token stream for a given `str`.
|
||||
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
|
||||
/// Clones this tokenizer into a new box.
|
||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
|
||||
}
|
||||
|
||||
// Blanket implementation: every type implementing `Tokenizer` is a `BoxableTokenizer`.
impl<T: Tokenizer> BoxableTokenizer for T {
|
||||
// Runs the concrete tokenizer and converts its token stream into a `BoxTokenStream`.
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
self.token_stream(text).into()
|
||||
}
|
||||
// Clones the concrete tokenizer and boxes the copy.
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
// `TextAnalyzer` is cloned by hand: its boxed tokenizer is duplicated through
// `BoxableTokenizer::box_clone`.
impl Clone for TextAnalyzer {
|
||||
fn clone(&self) -> Self {
|
||||
TextAnalyzer {
|
||||
tokenizer: self.tokenizer.box_clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TextAnalyzer {
|
||||
fn default() -> TextAnalyzer {
|
||||
TextAnalyzer::from(EmptyTokenizer)
|
||||
@@ -33,14 +58,6 @@ impl TextAnalyzer {
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TextAnalyzer {
|
||||
fn clone(&self) -> Self {
|
||||
TextAnalyzer {
|
||||
tokenizer: self.tokenizer.box_clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder helper for [`TextAnalyzer`]
|
||||
pub struct TextAnalyzerBuilder<T> {
|
||||
tokenizer: T,
|
||||
|
||||
@@ -49,23 +49,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
|
||||
}
|
||||
|
||||
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
||||
pub trait BoxableTokenizer: 'static + Send + Sync {
|
||||
/// Creates a boxed token stream for a given `str`.
|
||||
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
|
||||
/// Clone this tokenizer.
|
||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> BoxableTokenizer for T {
|
||||
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
self.token_stream(text).into()
|
||||
}
|
||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
|
||||
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user