mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
enable tokenizer on json fields (#2053)
* enable tokenizer on json fields

  enable tokenizer on json fields for type text

* Avoid making the tokenizer within the TextAnalyzer pub(crate)

* Moving BoxableTokenizer to tantivy.

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
This commit is contained in:
@@ -1082,7 +1082,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_fast_field_in_json_field_expand_dots_disabled() {
|
fn test_fast_field_in_json_field_expand_dots_disabled() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let json_option = JsonObjectOptions::default().set_fast();
|
let json_option = JsonObjectOptions::default().set_fast(None);
|
||||||
let json = schema_builder.add_json_field("json", json_option);
|
let json = schema_builder.add_json_field("json", json_option);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -1105,11 +1105,36 @@ mod tests {
|
|||||||
assert_eq!(&vals, &[32])
|
assert_eq!(&vals, &[32])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_fast_field_in_json_field_with_tokenizer() {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
|
||||||
|
let json = schema_builder.add_json_field("json", json_option);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"age": 32})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"age": "NEW"})))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
let searcher = index.reader().unwrap().searcher();
|
||||||
|
let fast_fields = searcher.segment_reader(0u32).fast_fields();
|
||||||
|
|
||||||
|
let ff_str = fast_fields.str("json.age").unwrap().unwrap();
|
||||||
|
let mut output = String::new();
|
||||||
|
ff_str.ord_to_str(0, &mut output).unwrap();
|
||||||
|
assert_eq!(output, "new");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_fast_field_in_json_field_expand_dots_enabled() {
|
fn test_fast_field_in_json_field_expand_dots_enabled() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let json_option = JsonObjectOptions::default()
|
let json_option = JsonObjectOptions::default()
|
||||||
.set_fast()
|
.set_fast(None)
|
||||||
.set_expand_dots_enabled();
|
.set_expand_dots_enabled();
|
||||||
let json = schema_builder.add_json_field("json", json_option);
|
let json = schema_builder.add_json_field("json", json_option);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
@@ -1246,7 +1271,7 @@ mod tests {
|
|||||||
fn test_shadowing_fast_field_with_expand_dots() {
|
fn test_shadowing_fast_field_with_expand_dots() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let json_option = JsonObjectOptions::default()
|
let json_option = JsonObjectOptions::default()
|
||||||
.set_fast()
|
.set_fast(None)
|
||||||
.set_expand_dots_enabled();
|
.set_expand_dots_enabled();
|
||||||
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
|
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
|
||||||
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
|
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
|
||||||
|
|||||||
@@ -346,7 +346,7 @@ mod tests {
|
|||||||
schema_builder.add_json_field(
|
schema_builder.add_json_field(
|
||||||
"json_expand_dots_enabled",
|
"json_expand_dots_enabled",
|
||||||
JsonObjectOptions::default()
|
JsonObjectOptions::default()
|
||||||
.set_fast()
|
.set_fast(None)
|
||||||
.set_expand_dots_enabled(),
|
.set_expand_dots_enabled(),
|
||||||
);
|
);
|
||||||
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
|
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ impl FastFieldsWriter {
|
|||||||
.take(schema.num_fields())
|
.take(schema.num_fields())
|
||||||
.collect();
|
.collect();
|
||||||
let mut expand_dots = vec![false; schema.num_fields()];
|
let mut expand_dots = vec![false; schema.num_fields()];
|
||||||
let mut per_field_tokenizer = vec![None; schema.num_fields()];
|
let mut per_field_tokenizer: Vec<Option<TextAnalyzer>> = vec![None; schema.num_fields()];
|
||||||
// TODO see other types
|
// TODO see other types
|
||||||
for (field_id, field_entry) in schema.fields() {
|
for (field_id, field_entry) in schema.fields() {
|
||||||
if !field_entry.field_type().is_fast() {
|
if !field_entry.field_type().is_fast() {
|
||||||
@@ -58,6 +58,15 @@ impl FastFieldsWriter {
|
|||||||
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
|
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
|
||||||
}
|
}
|
||||||
if let FieldType::JsonObject(json_object_options) = field_entry.field_type() {
|
if let FieldType::JsonObject(json_object_options) = field_entry.field_type() {
|
||||||
|
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
|
||||||
|
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
|
||||||
|
TantivyError::InvalidArgument(format!(
|
||||||
|
"Tokenizer {tokenizer_name:?} not found"
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
|
||||||
|
}
|
||||||
|
|
||||||
expand_dots[field_id.field_id() as usize] =
|
expand_dots[field_id.field_id() as usize] =
|
||||||
json_object_options.is_expand_dots_enabled();
|
json_object_options.is_expand_dots_enabled();
|
||||||
}
|
}
|
||||||
@@ -137,10 +146,10 @@ impl FastFieldsWriter {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
Value::Str(text_val) => {
|
Value::Str(text_val) => {
|
||||||
if let Some(text_analyzer) =
|
if let Some(tokenizer) =
|
||||||
&self.per_field_tokenizer[field_value.field().field_id() as usize]
|
&self.per_field_tokenizer[field_value.field().field_id() as usize]
|
||||||
{
|
{
|
||||||
let mut token_stream = text_analyzer.token_stream(text_val);
|
let mut token_stream = tokenizer.token_stream(text_val);
|
||||||
token_stream.process(&mut |token: &Token| {
|
token_stream.process(&mut |token: &Token| {
|
||||||
self.columnar_writer.record_str(
|
self.columnar_writer.record_str(
|
||||||
doc_id,
|
doc_id,
|
||||||
@@ -191,6 +200,10 @@ impl FastFieldsWriter {
|
|||||||
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
|
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
|
||||||
self.json_path_buffer.clear();
|
self.json_path_buffer.clear();
|
||||||
self.json_path_buffer.push_str(field_name);
|
self.json_path_buffer.push_str(field_name);
|
||||||
|
|
||||||
|
let text_analyzer =
|
||||||
|
&self.per_field_tokenizer[field_value.field().field_id() as usize];
|
||||||
|
|
||||||
record_json_obj_to_columnar_writer(
|
record_json_obj_to_columnar_writer(
|
||||||
doc_id,
|
doc_id,
|
||||||
json_obj,
|
json_obj,
|
||||||
@@ -198,6 +211,7 @@ impl FastFieldsWriter {
|
|||||||
JSON_DEPTH_LIMIT,
|
JSON_DEPTH_LIMIT,
|
||||||
&mut self.json_path_buffer,
|
&mut self.json_path_buffer,
|
||||||
&mut self.columnar_writer,
|
&mut self.columnar_writer,
|
||||||
|
text_analyzer,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
Value::IpAddr(ip_addr) => {
|
Value::IpAddr(ip_addr) => {
|
||||||
@@ -249,6 +263,7 @@ fn record_json_obj_to_columnar_writer(
|
|||||||
remaining_depth_limit: usize,
|
remaining_depth_limit: usize,
|
||||||
json_path_buffer: &mut String,
|
json_path_buffer: &mut String,
|
||||||
columnar_writer: &mut columnar::ColumnarWriter,
|
columnar_writer: &mut columnar::ColumnarWriter,
|
||||||
|
tokenizer: &Option<TextAnalyzer>,
|
||||||
) {
|
) {
|
||||||
for (key, child) in json_obj {
|
for (key, child) in json_obj {
|
||||||
let len_path = json_path_buffer.len();
|
let len_path = json_path_buffer.len();
|
||||||
@@ -273,6 +288,7 @@ fn record_json_obj_to_columnar_writer(
|
|||||||
remaining_depth_limit,
|
remaining_depth_limit,
|
||||||
json_path_buffer,
|
json_path_buffer,
|
||||||
columnar_writer,
|
columnar_writer,
|
||||||
|
tokenizer,
|
||||||
);
|
);
|
||||||
// popping our sub path.
|
// popping our sub path.
|
||||||
json_path_buffer.truncate(len_path);
|
json_path_buffer.truncate(len_path);
|
||||||
@@ -286,6 +302,7 @@ fn record_json_value_to_columnar_writer(
|
|||||||
mut remaining_depth_limit: usize,
|
mut remaining_depth_limit: usize,
|
||||||
json_path_writer: &mut String,
|
json_path_writer: &mut String,
|
||||||
columnar_writer: &mut columnar::ColumnarWriter,
|
columnar_writer: &mut columnar::ColumnarWriter,
|
||||||
|
tokenizer: &Option<TextAnalyzer>,
|
||||||
) {
|
) {
|
||||||
if remaining_depth_limit == 0 {
|
if remaining_depth_limit == 0 {
|
||||||
return;
|
return;
|
||||||
@@ -304,7 +321,14 @@ fn record_json_value_to_columnar_writer(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
serde_json::Value::String(text) => {
|
serde_json::Value::String(text) => {
|
||||||
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
|
if let Some(text_analyzer) = tokenizer {
|
||||||
|
let mut token_stream = text_analyzer.token_stream(text);
|
||||||
|
token_stream.process(&mut |token| {
|
||||||
|
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
serde_json::Value::Array(arr) => {
|
serde_json::Value::Array(arr) => {
|
||||||
for el in arr {
|
for el in arr {
|
||||||
@@ -315,6 +339,7 @@ fn record_json_value_to_columnar_writer(
|
|||||||
remaining_depth_limit,
|
remaining_depth_limit,
|
||||||
json_path_writer,
|
json_path_writer,
|
||||||
columnar_writer,
|
columnar_writer,
|
||||||
|
tokenizer,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -326,6 +351,7 @@ fn record_json_value_to_columnar_writer(
|
|||||||
remaining_depth_limit,
|
remaining_depth_limit,
|
||||||
json_path_writer,
|
json_path_writer,
|
||||||
columnar_writer,
|
columnar_writer,
|
||||||
|
tokenizer,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -353,6 +379,7 @@ mod tests {
|
|||||||
JSON_DEPTH_LIMIT,
|
JSON_DEPTH_LIMIT,
|
||||||
&mut json_path,
|
&mut json_path,
|
||||||
&mut columnar_writer,
|
&mut columnar_writer,
|
||||||
|
&None,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
|
|||||||
@@ -2,19 +2,20 @@ use std::ops::BitOr;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use super::text_options::{FastFieldTextOptions, TokenizerName};
|
||||||
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
|
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
|
||||||
use crate::schema::{TextFieldIndexing, TextOptions};
|
use crate::schema::{TextFieldIndexing, TextOptions};
|
||||||
|
|
||||||
/// The `JsonObjectOptions` make it possible to
|
/// The `JsonObjectOptions` make it possible to
|
||||||
/// configure how a json object field should be indexed and stored.
|
/// configure how a json object field should be indexed and stored.
|
||||||
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct JsonObjectOptions {
|
pub struct JsonObjectOptions {
|
||||||
stored: bool,
|
stored: bool,
|
||||||
// If set to some, int, date, f64 and text will be indexed.
|
// If set to some, int, date, f64 and text will be indexed.
|
||||||
// Text will use the TextFieldIndexing setting for indexing.
|
// Text will use the TextFieldIndexing setting for indexing.
|
||||||
indexing: Option<TextFieldIndexing>,
|
indexing: Option<TextFieldIndexing>,
|
||||||
// Store all fields as fast fields.
|
// Store all fields as fast fields with an optional tokenizer for text.
|
||||||
fast: bool,
|
fast: FastFieldTextOptions,
|
||||||
/// tantivy will generate paths to the different nodes of the json object
|
/// tantivy will generate paths to the different nodes of the json object
|
||||||
/// both in:
|
/// both in:
|
||||||
/// - the inverted index (for the terms)
|
/// - the inverted index (for the terms)
|
||||||
@@ -57,7 +58,21 @@ impl JsonObjectOptions {
|
|||||||
/// Returns true if and only if the json object fields are
|
/// Returns true if and only if the json object fields are
|
||||||
/// to be treated as fast fields.
|
/// to be treated as fast fields.
|
||||||
pub fn is_fast(&self) -> bool {
|
pub fn is_fast(&self) -> bool {
|
||||||
self.fast
|
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|
||||||
|
|| matches!(
|
||||||
|
&self.fast,
|
||||||
|
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the name of the fast field tokenizer, or `None` if no tokenizer is configured.
|
||||||
|
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
||||||
|
match &self.fast {
|
||||||
|
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
|
||||||
|
FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
|
with_tokenizer: tokenizer,
|
||||||
|
} => Some(tokenizer.name()),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` iff dots in json keys should be expanded.
|
/// Returns `true` iff dots in json keys should be expanded.
|
||||||
@@ -99,10 +114,31 @@ impl JsonObjectOptions {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sets the field as a fast field
|
/// Set the field as a fast field.
|
||||||
|
///
|
||||||
|
/// Fast fields are designed for random access.
|
||||||
|
/// Access times are similar to a random lookup in an array.
|
||||||
|
/// Text fast fields will have the term ids stored in the fast field.
|
||||||
|
///
|
||||||
|
/// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be
|
||||||
|
/// stored as is, which is equivalent to the "raw" tokenizer. The tokenizer can be used to apply
|
||||||
|
/// normalization like lower case.
|
||||||
|
/// The passed tokenizer_name must be available on the fast field tokenizer manager
|
||||||
|
/// (see `Index::fast_field_tokenizer`).
|
||||||
|
///
|
||||||
|
/// The original text can be retrieved via
|
||||||
|
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
|
||||||
|
/// from the dictionary.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn set_fast(mut self) -> Self {
|
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
|
||||||
self.fast = true;
|
if let Some(tokenizer) = tokenizer_name {
|
||||||
|
let tokenizer = TokenizerName::from_name(tokenizer);
|
||||||
|
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
|
with_tokenizer: tokenizer,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
self.fast = FastFieldTextOptions::IsEnabled(true);
|
||||||
|
}
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -119,7 +155,7 @@ impl From<StoredFlag> for JsonObjectOptions {
|
|||||||
JsonObjectOptions {
|
JsonObjectOptions {
|
||||||
stored: true,
|
stored: true,
|
||||||
indexing: None,
|
indexing: None,
|
||||||
fast: false,
|
fast: FastFieldTextOptions::default(),
|
||||||
expand_dots_enabled: false,
|
expand_dots_enabled: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -130,7 +166,7 @@ impl From<FastFlag> for JsonObjectOptions {
|
|||||||
JsonObjectOptions {
|
JsonObjectOptions {
|
||||||
stored: false,
|
stored: false,
|
||||||
indexing: None,
|
indexing: None,
|
||||||
fast: true,
|
fast: FastFieldTextOptions::IsEnabled(true),
|
||||||
expand_dots_enabled: false,
|
expand_dots_enabled: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -172,7 +208,7 @@ impl From<TextOptions> for JsonObjectOptions {
|
|||||||
JsonObjectOptions {
|
JsonObjectOptions {
|
||||||
stored: text_options.is_stored(),
|
stored: text_options.is_stored(),
|
||||||
indexing: text_options.get_indexing_options().cloned(),
|
indexing: text_options.get_indexing_options().cloned(),
|
||||||
fast: text_options.is_fast(),
|
fast: text_options.fast,
|
||||||
expand_dots_enabled: false,
|
expand_dots_enabled: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ pub struct TextOptions {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
stored: bool,
|
stored: bool,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
fast: FastFieldOptions,
|
pub(crate) fast: FastFieldTextOptions,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
#[serde(skip_serializing_if = "is_false")]
|
#[serde(skip_serializing_if = "is_false")]
|
||||||
/// coerce values into string if they are not of type string
|
/// coerce values into string if they are not of type string
|
||||||
@@ -26,7 +26,7 @@ pub struct TextOptions {
|
|||||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
/// Enum to control the fast field setting of a text field.
|
/// Enum to control the fast field setting of a text field.
|
||||||
enum FastFieldOptions {
|
pub(crate) enum FastFieldTextOptions {
|
||||||
/// Flag to enable/disable
|
/// Flag to enable/disable
|
||||||
IsEnabled(bool),
|
IsEnabled(bool),
|
||||||
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
|
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
|
||||||
@@ -34,35 +34,34 @@ enum FastFieldOptions {
|
|||||||
EnabledWithTokenizer { with_tokenizer: TokenizerName },
|
EnabledWithTokenizer { with_tokenizer: TokenizerName },
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for FastFieldOptions {
|
impl Default for FastFieldTextOptions {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
FastFieldOptions::IsEnabled(false)
|
FastFieldTextOptions::IsEnabled(false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BitOr<FastFieldOptions> for FastFieldOptions {
|
impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
|
||||||
type Output = FastFieldOptions;
|
type Output = FastFieldTextOptions;
|
||||||
|
|
||||||
fn bitor(self, other: FastFieldOptions) -> FastFieldOptions {
|
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
|
||||||
match (self, other) {
|
match (self, other) {
|
||||||
(
|
(
|
||||||
FastFieldOptions::EnabledWithTokenizer {
|
FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
with_tokenizer: tokenizer,
|
with_tokenizer: tokenizer,
|
||||||
},
|
},
|
||||||
_,
|
_,
|
||||||
)
|
)
|
||||||
| (
|
| (
|
||||||
_,
|
_,
|
||||||
FastFieldOptions::EnabledWithTokenizer {
|
FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
with_tokenizer: tokenizer,
|
with_tokenizer: tokenizer,
|
||||||
},
|
},
|
||||||
) => FastFieldOptions::EnabledWithTokenizer {
|
) => FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
with_tokenizer: tokenizer,
|
with_tokenizer: tokenizer,
|
||||||
},
|
},
|
||||||
(FastFieldOptions::IsEnabled(true), _) | (_, FastFieldOptions::IsEnabled(true)) => {
|
(FastFieldTextOptions::IsEnabled(true), _)
|
||||||
FastFieldOptions::IsEnabled(true)
|
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
|
||||||
}
|
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
|
||||||
(_, FastFieldOptions::IsEnabled(false)) => FastFieldOptions::IsEnabled(false),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -84,18 +83,18 @@ impl TextOptions {
|
|||||||
|
|
||||||
/// Returns true if and only if the value is a fast field.
|
/// Returns true if and only if the value is a fast field.
|
||||||
pub fn is_fast(&self) -> bool {
|
pub fn is_fast(&self) -> bool {
|
||||||
matches!(self.fast, FastFieldOptions::IsEnabled(true))
|
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|
||||||
|| matches!(
|
|| matches!(
|
||||||
&self.fast,
|
&self.fast,
|
||||||
FastFieldOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the name of the fast field tokenizer, or `None` if no tokenizer is configured.
|
/// Returns the name of the fast field tokenizer, or `None` if no tokenizer is configured.
|
||||||
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
||||||
match &self.fast {
|
match &self.fast {
|
||||||
FastFieldOptions::IsEnabled(true) | FastFieldOptions::IsEnabled(false) => None,
|
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
|
||||||
FastFieldOptions::EnabledWithTokenizer {
|
FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
with_tokenizer: tokenizer,
|
with_tokenizer: tokenizer,
|
||||||
} => Some(tokenizer.name()),
|
} => Some(tokenizer.name()),
|
||||||
}
|
}
|
||||||
@@ -125,11 +124,11 @@ impl TextOptions {
|
|||||||
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
|
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
|
||||||
if let Some(tokenizer) = tokenizer_name {
|
if let Some(tokenizer) = tokenizer_name {
|
||||||
let tokenizer = TokenizerName::from_name(tokenizer);
|
let tokenizer = TokenizerName::from_name(tokenizer);
|
||||||
self.fast = FastFieldOptions::EnabledWithTokenizer {
|
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
with_tokenizer: tokenizer,
|
with_tokenizer: tokenizer,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
self.fast = FastFieldOptions::IsEnabled(true);
|
self.fast = FastFieldTextOptions::IsEnabled(true);
|
||||||
}
|
}
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
@@ -173,10 +172,10 @@ impl TokenizerName {
|
|||||||
pub const fn from_static(name: &'static str) -> Self {
|
pub const fn from_static(name: &'static str) -> Self {
|
||||||
TokenizerName(Cow::Borrowed(name))
|
TokenizerName(Cow::Borrowed(name))
|
||||||
}
|
}
|
||||||
fn from_name(name: &str) -> Self {
|
pub(crate) fn from_name(name: &str) -> Self {
|
||||||
TokenizerName(Cow::Owned(name.to_string()))
|
TokenizerName(Cow::Owned(name.to_string()))
|
||||||
}
|
}
|
||||||
fn name(&self) -> &str {
|
pub(crate) fn name(&self) -> &str {
|
||||||
&self.0
|
&self.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -264,7 +263,7 @@ pub const STRING: TextOptions = TextOptions {
|
|||||||
record: IndexRecordOption::Basic,
|
record: IndexRecordOption::Basic,
|
||||||
}),
|
}),
|
||||||
stored: false,
|
stored: false,
|
||||||
fast: FastFieldOptions::IsEnabled(false),
|
fast: FastFieldTextOptions::IsEnabled(false),
|
||||||
coerce: false,
|
coerce: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -277,7 +276,7 @@ pub const TEXT: TextOptions = TextOptions {
|
|||||||
}),
|
}),
|
||||||
stored: false,
|
stored: false,
|
||||||
coerce: false,
|
coerce: false,
|
||||||
fast: FastFieldOptions::IsEnabled(false),
|
fast: FastFieldTextOptions::IsEnabled(false),
|
||||||
};
|
};
|
||||||
|
|
||||||
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
||||||
@@ -305,7 +304,7 @@ impl From<StoredFlag> for TextOptions {
|
|||||||
TextOptions {
|
TextOptions {
|
||||||
indexing: None,
|
indexing: None,
|
||||||
stored: true,
|
stored: true,
|
||||||
fast: FastFieldOptions::IsEnabled(false),
|
fast: FastFieldTextOptions::default(),
|
||||||
coerce: false,
|
coerce: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -316,7 +315,7 @@ impl From<CoerceFlag> for TextOptions {
|
|||||||
TextOptions {
|
TextOptions {
|
||||||
indexing: None,
|
indexing: None,
|
||||||
stored: false,
|
stored: false,
|
||||||
fast: FastFieldOptions::IsEnabled(false),
|
fast: FastFieldTextOptions::default(),
|
||||||
coerce: true,
|
coerce: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -327,7 +326,7 @@ impl From<FastFlag> for TextOptions {
|
|||||||
TextOptions {
|
TextOptions {
|
||||||
indexing: None,
|
indexing: None,
|
||||||
stored: false,
|
stored: false,
|
||||||
fast: FastFieldOptions::IsEnabled(true),
|
fast: FastFieldTextOptions::IsEnabled(true),
|
||||||
coerce: false,
|
coerce: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -346,7 +345,7 @@ where
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use crate::schema::text_options::{FastFieldOptions, TokenizerName};
|
use crate::schema::text_options::{FastFieldTextOptions, TokenizerName};
|
||||||
use crate::schema::*;
|
use crate::schema::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -398,7 +397,7 @@ mod tests {
|
|||||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
options.fast,
|
options.fast,
|
||||||
FastFieldOptions::EnabledWithTokenizer {
|
FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
with_tokenizer: TokenizerName::from_static("default")
|
with_tokenizer: TokenizerName::from_static("default")
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
@@ -406,7 +405,7 @@ mod tests {
|
|||||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
options.fast,
|
options.fast,
|
||||||
FastFieldOptions::EnabledWithTokenizer {
|
FastFieldTextOptions::EnabledWithTokenizer {
|
||||||
with_tokenizer: TokenizerName::from_static("default")
|
with_tokenizer: TokenizerName::from_static("default")
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
@@ -415,18 +414,18 @@ mod tests {
|
|||||||
"fast": true
|
"fast": true
|
||||||
} "#;
|
} "#;
|
||||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true));
|
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||||
let options: TextOptions =
|
let options: TextOptions =
|
||||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true));
|
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||||
|
|
||||||
let json = r#" {
|
let json = r#" {
|
||||||
"fast": false
|
"fast": false
|
||||||
} "#;
|
} "#;
|
||||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false));
|
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||||
let options: TextOptions =
|
let options: TextOptions =
|
||||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||||
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false));
|
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
/// The tokenizer module contains all of the tools used to process
|
/// The tokenizer module contains all of the tools used to process
|
||||||
/// text in `tantivy`.
|
/// text in `tantivy`.
|
||||||
use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer};
|
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
|
||||||
|
|
||||||
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
||||||
|
|
||||||
@@ -9,6 +9,31 @@ pub struct TextAnalyzer {
|
|||||||
tokenizer: Box<dyn BoxableTokenizer>,
|
tokenizer: Box<dyn BoxableTokenizer>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
||||||
|
trait BoxableTokenizer: 'static + Send + Sync {
|
||||||
|
/// Creates a boxed token stream for a given `str`.
|
||||||
|
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
|
||||||
|
/// Clone this tokenizer.
|
||||||
|
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Tokenizer> BoxableTokenizer for T {
|
||||||
|
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||||
|
self.token_stream(text).into()
|
||||||
|
}
|
||||||
|
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
||||||
|
Box::new(self.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Clone for TextAnalyzer {
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
TextAnalyzer {
|
||||||
|
tokenizer: self.tokenizer.box_clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for TextAnalyzer {
|
impl Default for TextAnalyzer {
|
||||||
fn default() -> TextAnalyzer {
|
fn default() -> TextAnalyzer {
|
||||||
TextAnalyzer::from(EmptyTokenizer)
|
TextAnalyzer::from(EmptyTokenizer)
|
||||||
@@ -33,14 +58,6 @@ impl TextAnalyzer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for TextAnalyzer {
|
|
||||||
fn clone(&self) -> Self {
|
|
||||||
TextAnalyzer {
|
|
||||||
tokenizer: self.tokenizer.box_clone(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Builder helper for [`TextAnalyzer`]
|
/// Builder helper for [`TextAnalyzer`]
|
||||||
pub struct TextAnalyzerBuilder<T> {
|
pub struct TextAnalyzerBuilder<T> {
|
||||||
tokenizer: T,
|
tokenizer: T,
|
||||||
|
|||||||
@@ -49,23 +49,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
|
|||||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
|
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
|
||||||
pub trait BoxableTokenizer: 'static + Send + Sync {
|
|
||||||
/// Creates a boxed token stream for a given `str`.
|
|
||||||
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
|
|
||||||
/// Clone this tokenizer.
|
|
||||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Tokenizer> BoxableTokenizer for T {
|
|
||||||
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
|
||||||
self.token_stream(text).into()
|
|
||||||
}
|
|
||||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
|
||||||
Box::new(self.clone())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
|
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
|
||||||
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
|
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user