enable tokenizer on json fields (#2053)

* enable tokenizer on json fields

enable tokenizer on json fields for type text

* Avoid making the tokenizer within the TextAnalyzer pub(crate)

* Moving BoxableTokenizer to tantivy.

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
This commit is contained in:
PSeitz
2023-05-24 16:47:39 +08:00
committed by GitHub
parent 4be6f83b0a
commit e56addc63e
7 changed files with 165 additions and 78 deletions

View File

@@ -1082,7 +1082,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_expand_dots_disabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast();
let json_option = JsonObjectOptions::default().set_fast(None);
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1105,11 +1105,36 @@ mod tests {
assert_eq!(&vals, &[32])
}
/// Checks that a JSON field declared as fast with the `"default"` tokenizer
/// stores tokenized text: the input `"NEW"` is read back as `"new"`.
#[test]
fn test_fast_field_in_json_field_with_tokenizer() {
let mut schema_builder = Schema::builder();
// Enable the fast field on the JSON object and attach the tokenizer named "default".
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// The same JSON path `age` holds a number in the first document...
index_writer
.add_document(doc!(json => json!({"age": 32})))
.unwrap();
// ...and an upper-case string in the second one.
index_writer
.add_document(doc!(json => json!({"age": "NEW"})))
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fast_fields = searcher.segment_reader(0u32).fast_fields();
// Look up the string column stored under the path `json.age`.
let ff_str = fast_fields.str("json.age").unwrap().unwrap();
let mut output = String::new();
// Ordinal 0 is the only term recorded for the string column in this test.
ff_str.ord_to_str(0, &mut output).unwrap();
// The stored term is lower-cased, i.e. the tokenizer was applied before recording.
assert_eq!(output, "new");
}
#[test]
fn test_fast_field_in_json_field_expand_dots_enabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast()
.set_fast(None)
.set_expand_dots_enabled();
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
@@ -1246,7 +1271,7 @@ mod tests {
fn test_shadowing_fast_field_with_expand_dots() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast()
.set_fast(None)
.set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);

View File

@@ -346,7 +346,7 @@ mod tests {
schema_builder.add_json_field(
"json_expand_dots_enabled",
JsonObjectOptions::default()
.set_fast()
.set_fast(None)
.set_expand_dots_enabled(),
);
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);

View File

@@ -46,7 +46,7 @@ impl FastFieldsWriter {
.take(schema.num_fields())
.collect();
let mut expand_dots = vec![false; schema.num_fields()];
let mut per_field_tokenizer = vec![None; schema.num_fields()];
let mut per_field_tokenizer: Vec<Option<TextAnalyzer>> = vec![None; schema.num_fields()];
// TODO see other types
for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() {
@@ -58,6 +58,15 @@ impl FastFieldsWriter {
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
}
if let FieldType::JsonObject(json_object_options) = field_entry.field_type() {
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer {tokenizer_name:?} not found"
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
}
expand_dots[field_id.field_id() as usize] =
json_object_options.is_expand_dots_enabled();
}
@@ -137,10 +146,10 @@ impl FastFieldsWriter {
);
}
Value::Str(text_val) => {
if let Some(text_analyzer) =
if let Some(tokenizer) =
&self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = text_analyzer.token_stream(text_val);
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str(
doc_id,
@@ -191,6 +200,10 @@ impl FastFieldsWriter {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
let text_analyzer =
&self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
@@ -198,6 +211,7 @@ impl FastFieldsWriter {
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
@@ -249,6 +263,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
) {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
@@ -273,6 +288,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit,
json_path_buffer,
columnar_writer,
tokenizer,
);
// popping our sub path.
json_path_buffer.truncate(len_path);
@@ -286,6 +302,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
) {
if remaining_depth_limit == 0 {
return;
@@ -304,7 +321,14 @@ fn record_json_value_to_columnar_writer(
}
}
serde_json::Value::String(text) => {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
if let Some(text_analyzer) = tokenizer {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
}
serde_json::Value::Array(arr) => {
for el in arr {
@@ -315,6 +339,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
tokenizer,
);
}
}
@@ -326,6 +351,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
tokenizer,
);
}
}
@@ -353,6 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
&None,
);
}
let mut buffer = Vec::new();

View File

@@ -2,19 +2,20 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::text_options::{FastFieldTextOptions, TokenizerName};
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
use crate::schema::{TextFieldIndexing, TextOptions};
/// The `JsonObjectOptions` make it possible to
/// configure how a json object field should be indexed and stored.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct JsonObjectOptions {
stored: bool,
// If set to some, int, date, f64 and text will be indexed.
// Text will use the TextFieldIndexing setting for indexing.
indexing: Option<TextFieldIndexing>,
// Store all field as fast fields.
fast: bool,
// Store all fields as fast fields, with an optional tokenizer for text.
fast: FastFieldTextOptions,
/// tantivy will generate paths to the different nodes of the json object
/// both in:
/// - the inverted index (for the terms)
@@ -57,7 +58,21 @@ impl JsonObjectOptions {
/// Returns true if and only if the json object fields are
/// to be treated as fast fields.
pub fn is_fast(&self) -> bool {
self.fast
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the name of the tokenizer configured for the fast field, if any.
///
/// Returns `None` when the fast field is disabled, or enabled without a tokenizer.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
// Plain on/off flag: no tokenizer name is configured in either case.
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
}
}
/// Returns `true` iff dots in json keys should be expanded.
@@ -99,10 +114,31 @@ impl JsonObjectOptions {
self
}
/// Sets the field as a fast field
/// Set the field as a fast field.
///
/// Fast fields are designed for random access.
/// Access times are similar to a random lookup in an array.
/// Text fast fields will have the term ids stored in the fast field.
///
/// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be
/// stored as is, which is equivalent to using the "raw" tokenizer. The tokenizer can be used
/// to apply normalization like lower case.
/// The passed `tokenizer_name` must be available on the fast field tokenizer manager
/// (`Index::fast_field_tokenizer`).
///
/// The original text can be retrieved via
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self) -> Self {
self.fast = true;
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
self
}
@@ -119,7 +155,7 @@ impl From<StoredFlag> for JsonObjectOptions {
JsonObjectOptions {
stored: true,
indexing: None,
fast: false,
fast: FastFieldTextOptions::default(),
expand_dots_enabled: false,
}
}
@@ -130,7 +166,7 @@ impl From<FastFlag> for JsonObjectOptions {
JsonObjectOptions {
stored: false,
indexing: None,
fast: true,
fast: FastFieldTextOptions::IsEnabled(true),
expand_dots_enabled: false,
}
}
@@ -172,7 +208,7 @@ impl From<TextOptions> for JsonObjectOptions {
JsonObjectOptions {
stored: text_options.is_stored(),
indexing: text_options.get_indexing_options().cloned(),
fast: text_options.is_fast(),
fast: text_options.fast,
expand_dots_enabled: false,
}
}

View File

@@ -16,7 +16,7 @@ pub struct TextOptions {
#[serde(default)]
stored: bool,
#[serde(default)]
fast: FastFieldOptions,
pub(crate) fast: FastFieldTextOptions,
#[serde(default)]
#[serde(skip_serializing_if = "is_false")]
/// coerce values into string if they are not of type string
@@ -26,7 +26,7 @@ pub struct TextOptions {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// Enum to control the fast field setting of a text field.
enum FastFieldOptions {
pub(crate) enum FastFieldTextOptions {
/// Flag to enable/disable
IsEnabled(bool),
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
@@ -34,35 +34,34 @@ enum FastFieldOptions {
EnabledWithTokenizer { with_tokenizer: TokenizerName },
}
impl Default for FastFieldOptions {
impl Default for FastFieldTextOptions {
fn default() -> Self {
FastFieldOptions::IsEnabled(false)
FastFieldTextOptions::IsEnabled(false)
}
}
impl BitOr<FastFieldOptions> for FastFieldOptions {
type Output = FastFieldOptions;
impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
type Output = FastFieldTextOptions;
fn bitor(self, other: FastFieldOptions) -> FastFieldOptions {
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
match (self, other) {
(
FastFieldOptions::EnabledWithTokenizer {
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
_,
)
| (
_,
FastFieldOptions::EnabledWithTokenizer {
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
) => FastFieldOptions::EnabledWithTokenizer {
) => FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
(FastFieldOptions::IsEnabled(true), _) | (_, FastFieldOptions::IsEnabled(true)) => {
FastFieldOptions::IsEnabled(true)
}
(_, FastFieldOptions::IsEnabled(false)) => FastFieldOptions::IsEnabled(false),
(FastFieldTextOptions::IsEnabled(true), _)
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
}
}
}
@@ -84,18 +83,18 @@ impl TextOptions {
/// Returns true if and only if the value is a fast field.
pub fn is_fast(&self) -> bool {
matches!(self.fast, FastFieldOptions::IsEnabled(true))
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldOptions::EnabledWithTokenizer { with_tokenizer: _ }
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the name of the fast field tokenizer, or `None` if no tokenizer is configured.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldOptions::IsEnabled(true) | FastFieldOptions::IsEnabled(false) => None,
FastFieldOptions::EnabledWithTokenizer {
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
}
@@ -125,11 +124,11 @@ impl TextOptions {
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldOptions::EnabledWithTokenizer {
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldOptions::IsEnabled(true);
self.fast = FastFieldTextOptions::IsEnabled(true);
}
self
}
@@ -173,10 +172,10 @@ impl TokenizerName {
pub const fn from_static(name: &'static str) -> Self {
TokenizerName(Cow::Borrowed(name))
}
fn from_name(name: &str) -> Self {
pub(crate) fn from_name(name: &str) -> Self {
TokenizerName(Cow::Owned(name.to_string()))
}
fn name(&self) -> &str {
pub(crate) fn name(&self) -> &str {
&self.0
}
}
@@ -264,7 +263,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
fast: FastFieldOptions::IsEnabled(false),
fast: FastFieldTextOptions::IsEnabled(false),
coerce: false,
};
@@ -277,7 +276,7 @@ pub const TEXT: TextOptions = TextOptions {
}),
stored: false,
coerce: false,
fast: FastFieldOptions::IsEnabled(false),
fast: FastFieldTextOptions::IsEnabled(false),
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -305,7 +304,7 @@ impl From<StoredFlag> for TextOptions {
TextOptions {
indexing: None,
stored: true,
fast: FastFieldOptions::IsEnabled(false),
fast: FastFieldTextOptions::default(),
coerce: false,
}
}
@@ -316,7 +315,7 @@ impl From<CoerceFlag> for TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: FastFieldOptions::IsEnabled(false),
fast: FastFieldTextOptions::default(),
coerce: true,
}
}
@@ -327,7 +326,7 @@ impl From<FastFlag> for TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: FastFieldOptions::IsEnabled(true),
fast: FastFieldTextOptions::IsEnabled(true),
coerce: false,
}
}
@@ -346,7 +345,7 @@ where
#[cfg(test)]
mod tests {
use crate::schema::text_options::{FastFieldOptions, TokenizerName};
use crate::schema::text_options::{FastFieldTextOptions, TokenizerName};
use crate::schema::*;
#[test]
@@ -398,7 +397,7 @@ mod tests {
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(
options.fast,
FastFieldOptions::EnabledWithTokenizer {
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
}
);
@@ -406,7 +405,7 @@ mod tests {
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(
options.fast,
FastFieldOptions::EnabledWithTokenizer {
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
}
);
@@ -415,18 +414,18 @@ mod tests {
"fast": true
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true));
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(true));
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let json = r#" {
"fast": false
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false));
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldOptions::IsEnabled(false));
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
}
}

View File

@@ -1,6 +1,6 @@
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer};
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
@@ -9,6 +9,31 @@ pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>,
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
///
/// This trait is private to the module; `TextAnalyzer` stores its tokenizer as a
/// `Box<dyn BoxableTokenizer>`.
trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
/// Clones this tokenizer into a new boxed trait object.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}
/// Blanket implementation: every type implementing `Tokenizer` is also a `BoxableTokenizer`.
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
// Convert the tokenizer's concrete token stream into the type-erased `BoxTokenStream`.
self.token_stream(text).into()
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
// Clone the concrete tokenizer and box the copy.
Box::new(self.clone())
}
}
/// Cloning a `TextAnalyzer` clones its boxed tokenizer through `BoxableTokenizer::box_clone`.
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
// Delegate to `box_clone` to duplicate the type-erased tokenizer.
tokenizer: self.tokenizer.box_clone(),
}
}
}
impl Default for TextAnalyzer {
fn default() -> TextAnalyzer {
TextAnalyzer::from(EmptyTokenizer)
@@ -33,14 +58,6 @@ impl TextAnalyzer {
}
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
}
}
}
/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T> {
tokenizer: T,

View File

@@ -49,23 +49,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
pub trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
self.token_stream(text).into()
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);