Compare commits


1 Commit

Author: Paul Masurel
SHA1:   61422d7cd5
Date:   2023-07-18 19:20:10 +09:00

Change in the default fast field tokenizer manager.

`{ fast: true }` now results in the use of the default fast field tokenizer
(instead of no tokenizer). The default tokenizer lowercases.

Fast fields get a default tokenizer manager distinct from the one used for
regular indexing.

The serialization of the fast field options is unchanged.
13 changed files with 209 additions and 137 deletions
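To make the change concrete before diving into the diffs, here is a minimal schema-level sketch (field names are invented for illustration; the API calls follow the diffs below):

use tantivy::schema::{Schema, TextOptions, FAST, TEXT};

fn example_schema() {
    // After this change, the two fields below are configured identically:
    // the FAST flag now implies the "default" fast field tokenizer
    // (raw, i.e. one token per value, plus lowercasing).
    let mut schema_builder = Schema::builder();
    let explicit: TextOptions = TextOptions::from(TEXT).set_fast("default");
    schema_builder.add_text_field("category", explicit);
    schema_builder.add_text_field("tags", TEXT | FAST);
    let _schema = schema_builder.build();
}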

View File

@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
                 .set_index_option(IndexRecordOption::WithFreqs)
                 .set_tokenizer("raw"),
         )
-        .set_fast(None)
+        .set_fast("default")
         .set_stored();
     schema_builder.add_text_field("category", text_fieldtype);
     schema_builder.add_f64_field("stock", FAST);

View File

@@ -1293,13 +1293,13 @@ mod tests {
         // searching for terma, but min_doc_count will return all terms
         let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;
-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
         assert_eq!(
             res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
             json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
         );
-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
         assert_eq!(
             res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
@@ -1421,10 +1421,10 @@ mod tests {
         let res = exec_request_with_query(agg_req, &index, None).unwrap();
         println!("{}", serde_json::to_string_pretty(&res).unwrap());
-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
         Ok(())

View File

@@ -411,7 +411,7 @@ mod tests {
                     .set_index_option(IndexRecordOption::Basic)
                     .set_fieldnorms(false),
             )
-            .set_fast(None)
+            .set_fast("default")
             .set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
         let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
@@ -466,7 +466,7 @@ mod tests {
             .set_indexing_options(
                 TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
             )
-            .set_fast(None)
+            .set_fast("default")
             .set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype);
         let date_field = schema_builder.add_date_field("date", FAST);

View File

@@ -120,8 +120,8 @@ impl IndexBuilder {
         Self {
             schema: None,
             index_settings: IndexSettings::default(),
-            tokenizer_manager: TokenizerManager::default(),
-            fast_field_tokenizer_manager: TokenizerManager::default(),
+            tokenizer_manager: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
         }
     }
@@ -400,8 +400,8 @@ impl Index {
             settings: metas.index_settings.clone(),
             directory,
             schema,
-            tokenizers: TokenizerManager::default(),
-            fast_field_tokenizers: TokenizerManager::default(),
+            tokenizers: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
             executor: Arc::new(Executor::single_thread()),
             inventory,
         }
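
As a quick sketch of the split introduced here (registrations per the tokenizer manager diff at the bottom of this page): the indexing manager keeps full analyzers such as stemmers, while the fast field manager only carries normalizing analyzers.

use tantivy::tokenizer::TokenizerManager;

fn example_managers() {
    let indexing = TokenizerManager::default_for_indexing();
    let fast_fields = TokenizerManager::default_for_fast_fields();
    assert!(indexing.get("en_stem").is_some()); // stemming is an indexing concern
    assert!(fast_fields.get("en_stem").is_none()); // fast fields only normalize
    assert!(fast_fields.get("lower").is_some());
}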

View File

@@ -446,7 +446,8 @@ mod tests {
     #[test]
     fn test_text_fastfield() {
         let mut schema_builder = Schema::builder();
-        let text_field = schema_builder.add_text_field("text", TEXT | FAST);
+        let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
+        let text_field = schema_builder.add_text_field("text", text_options);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1082,7 +1083,7 @@ mod tests {
     #[test]
     fn test_fast_field_in_json_field_expand_dots_disabled() {
         let mut schema_builder = Schema::builder();
-        let json_option = JsonObjectOptions::default().set_fast(None);
+        let json_option = JsonObjectOptions::default().set_fast("default");
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1108,7 +1109,7 @@ mod tests {
     #[test]
     fn test_fast_field_in_json_field_with_tokenizer() {
         let mut schema_builder = Schema::builder();
-        let json_option = JsonObjectOptions::default().set_fast(Some("default"));
+        let json_option = JsonObjectOptions::default().set_fast("default");
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1134,7 +1135,7 @@ mod tests {
     fn test_fast_field_in_json_field_expand_dots_enabled() {
         let mut schema_builder = Schema::builder();
         let json_option = JsonObjectOptions::default()
-            .set_fast(None)
+            .set_fast("default")
             .set_expand_dots_enabled();
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
@@ -1202,10 +1203,10 @@ mod tests {
     #[test]
     fn test_fast_field_tokenizer() {
         let mut schema_builder = Schema::builder();
-        let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
+        let opt = TextOptions::default().set_fast("custom_lowercase");
         let text_field = schema_builder.add_text_field("text", opt);
         let schema = schema_builder.build();
-        let ff_tokenizer_manager = TokenizerManager::default();
+        let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
         ff_tokenizer_manager.register(
             "custom_lowercase",
             TextAnalyzer::builder(RawTokenizer::default())
@@ -1238,7 +1239,7 @@ mod tests {
                 .set_index_option(crate::schema::IndexRecordOption::WithFreqs)
                 .set_tokenizer("raw"),
         )
-        .set_fast(Some("default"))
+        .set_fast("default")
         .set_stored();
     let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
@@ -1271,7 +1272,7 @@ mod tests {
     fn test_shadowing_fast_field_with_expand_dots() {
         let mut schema_builder = Schema::builder();
         let json_option = JsonObjectOptions::default()
-            .set_fast(None)
+            .set_fast("default")
             .set_expand_dots_enabled();
         let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
         let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);

View File

@@ -349,7 +349,7 @@ mod tests {
         schema_builder.add_json_field(
             "json_expand_dots_enabled",
             JsonObjectOptions::default()
-                .set_fast(None)
+                .set_fast("default")
                 .set_expand_dots_enabled(),
         );
         let dynamic_field = schema_builder.add_json_field("_dyna", FAST);

View File

@@ -18,6 +18,8 @@ const JSON_DEPTH_LIMIT: usize = 20;
 pub struct FastFieldsWriter {
     columnar_writer: ColumnarWriter,
     fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
+    // Field -> fast field tokenizer mapping.
+    // All text fast fields should have a tokenizer.
     per_field_tokenizer: Vec<Option<TextAnalyzer>>,
     date_precisions: Vec<DateTimePrecision>,
     expand_dots: Vec<bool>,
@@ -61,7 +63,7 @@ impl FastFieldsWriter {
         if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
             let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
                 TantivyError::InvalidArgument(format!(
-                    "Tokenizer {tokenizer_name:?} not found"
+                    "Tokenizer `{tokenizer_name}` not found"
                 ))
             })?;
             per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
@@ -157,9 +159,6 @@ impl FastFieldsWriter {
                             &token.text,
                         );
                     })
-                } else {
-                    self.columnar_writer
-                        .record_str(doc_id, field_name.as_str(), text_val);
                 }
             }
             Value::Bytes(bytes_val) => {
@@ -201,18 +200,20 @@ impl FastFieldsWriter {
                 self.json_path_buffer.clear();
                 self.json_path_buffer.push_str(field_name);
-                let text_analyzer =
+                let text_analyzer_opt =
                     &mut self.per_field_tokenizer[field_value.field().field_id() as usize];
-                record_json_obj_to_columnar_writer(
-                    doc_id,
-                    json_obj,
-                    expand_dots,
-                    JSON_DEPTH_LIMIT,
-                    &mut self.json_path_buffer,
-                    &mut self.columnar_writer,
-                    text_analyzer,
-                );
+                if let Some(text_analyzer) = text_analyzer_opt {
+                    record_json_obj_to_columnar_writer(
+                        doc_id,
+                        json_obj,
+                        expand_dots,
+                        JSON_DEPTH_LIMIT,
+                        &mut self.json_path_buffer,
+                        &mut self.columnar_writer,
+                        text_analyzer,
+                    );
+                }
             }
             Value::IpAddr(ip_addr) => {
                 self.columnar_writer
@@ -263,7 +264,7 @@ fn record_json_obj_to_columnar_writer(
     remaining_depth_limit: usize,
     json_path_buffer: &mut String,
     columnar_writer: &mut columnar::ColumnarWriter,
-    tokenizer: &mut Option<TextAnalyzer>,
+    text_analyzer: &mut TextAnalyzer,
 ) {
     for (key, child) in json_obj {
         let len_path = json_path_buffer.len();
@@ -288,7 +289,7 @@ fn record_json_obj_to_columnar_writer(
             remaining_depth_limit,
             json_path_buffer,
             columnar_writer,
-            tokenizer,
+            text_analyzer,
         );
         // popping our sub path.
         json_path_buffer.truncate(len_path);
@@ -302,7 +303,7 @@ fn record_json_value_to_columnar_writer(
     mut remaining_depth_limit: usize,
     json_path_writer: &mut String,
     columnar_writer: &mut columnar::ColumnarWriter,
-    tokenizer: &mut Option<TextAnalyzer>,
+    text_analyzer: &mut TextAnalyzer,
 ) {
     if remaining_depth_limit == 0 {
         return;
@@ -321,14 +322,10 @@ fn record_json_value_to_columnar_writer(
             }
         }
         serde_json::Value::String(text) => {
-            if let Some(text_analyzer) = tokenizer.as_mut() {
-                let mut token_stream = text_analyzer.token_stream(text);
-                token_stream.process(&mut |token| {
-                    columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
-                })
-            } else {
-                columnar_writer.record_str(doc, json_path_writer.as_str(), text);
-            }
+            let mut token_stream = text_analyzer.token_stream(text);
+            token_stream.process(&mut |token| {
+                columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
+            });
         }
         serde_json::Value::Array(arr) => {
             for el in arr {
@@ -339,7 +336,7 @@ fn record_json_value_to_columnar_writer(
                     remaining_depth_limit,
                     json_path_writer,
                     columnar_writer,
-                    tokenizer,
+                    text_analyzer,
                 );
             }
         }
@@ -351,7 +348,7 @@ fn record_json_value_to_columnar_writer(
                 remaining_depth_limit,
                 json_path_writer,
                 columnar_writer,
-                tokenizer,
+                text_analyzer,
             );
         }
     }
@@ -371,6 +368,9 @@ mod tests {
     ) -> ColumnarReader {
         let mut columnar_writer = ColumnarWriter::default();
         let mut json_path = String::new();
+        let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
+            .get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
+            .unwrap();
         for (doc, json_doc) in json_docs.iter().enumerate() {
             record_json_value_to_columnar_writer(
                 doc as u32,
@@ -379,7 +379,7 @@ mod tests {
                 JSON_DEPTH_LIMIT,
                 &mut json_path,
                 &mut columnar_writer,
-                &mut None,
+                &mut text_analyzer,
             );
         }
         let mut buffer = Vec::new();
@@ -399,6 +399,7 @@ mod tests {
         });
         let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
         let columns = columnar_reader.list_columns().unwrap();
+        assert_eq!(columns.len(), 5);
         {
             assert_eq!(columns[0].0, "arr");
             let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
@@ -434,7 +435,9 @@ mod tests {
         {
             assert_eq!(columns[4].0, "text");
             let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
-            assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
+            let column_text = column_text_opt.unwrap();
+            let term_ords: Vec<u64> = column_text.term_ords(0).collect();
+            assert_eq!(&term_ords[..], &[0]);
         }
     }

View File

@@ -956,7 +956,7 @@ mod test {
         .iter()
         .flat_map(|field_name| schema.get_field(field_name))
         .collect();
-    let tokenizer_manager = TokenizerManager::default();
+    let tokenizer_manager = TokenizerManager::default_for_indexing();
     tokenizer_manager.register(
         "en_with_stop_words",
         TextAnalyzer::builder(SimpleTokenizer::default())
@@ -1447,7 +1447,7 @@ mod test {
     let title = schema_builder.add_text_field("title", text_options);
     let schema = schema_builder.build();
     let default_fields = vec![title];
-    let tokenizer_manager = TokenizerManager::default();
+    let tokenizer_manager = TokenizerManager::default_for_indexing();
     let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
     assert_matches!(
@@ -1622,7 +1622,8 @@ mod test {
     let mut schema_builder = Schema::builder();
     schema_builder.add_text_field(r#"a\.b"#, STRING);
     let schema = schema_builder.build();
-    let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
+    let query_parser =
+        QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
     let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
     assert_eq!(
         format!("{query:?}"),
@@ -1639,8 +1640,11 @@ mod test {
     schema_builder.add_text_field("first.toto.titi", STRING);
     schema_builder.add_text_field("third.a.b.c", STRING);
     let schema = schema_builder.build();
-    let query_parser =
-        QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
+    let query_parser = QueryParser::new(
+        schema.clone(),
+        Vec::new(),
+        TokenizerManager::default_for_indexing(),
+    );
     assert_eq!(
         query_parser.split_full_path("first.toto"),
         Some((schema.get_field("first.toto").unwrap(), ""))

View File

@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
 use super::text_options::{FastFieldTextOptions, TokenizerName};
 use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
-use crate::schema::{TextFieldIndexing, TextOptions};
+use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};

 /// The `JsonObjectOptions` make it possible to
 /// configure how a json object field should be indexed and stored.
@@ -58,20 +58,19 @@ impl JsonObjectOptions {
     /// Returns true if and only if the json object fields are
     /// to be treated as fast fields.
     pub fn is_fast(&self) -> bool {
-        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
-            || matches!(
-                &self.fast,
-                FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
-            )
+        match self.fast {
+            FastFieldTextOptions::Disabled => false,
+            FastFieldTextOptions::Enabled { .. } => true,
+        }
     }

     /// Returns the fast field tokenizer name, if a fast field tokenizer is set.
     pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
         match &self.fast {
-            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            } => Some(tokenizer.name()),
+            FastFieldTextOptions::Disabled => None,
+            FastFieldTextOptions::Enabled {
+                tokenizer: with_tokenizer,
+            } => Some(with_tokenizer.name()),
         }
     }
@@ -130,15 +129,11 @@ impl JsonObjectOptions {
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
     /// from the dictionary.
     #[must_use]
-    pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
-        if let Some(tokenizer) = tokenizer_name {
-            let tokenizer = TokenizerName::from_name(tokenizer);
-            self.fast = FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            }
-        } else {
-            self.fast = FastFieldTextOptions::IsEnabled(true);
-        }
+    pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
+        let with_tokenizer = TokenizerName::from_name(tokenizer_name);
+        self.fast = FastFieldTextOptions::Enabled {
+            tokenizer: with_tokenizer,
+        };
         self
     }
@@ -166,7 +161,9 @@ impl From<FastFlag> for JsonObjectOptions {
         JsonObjectOptions {
             stored: false,
             indexing: None,
-            fast: FastFieldTextOptions::IsEnabled(true),
+            fast: FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
+            },
             expand_dots_enabled: false,
         }
     }
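
A short sketch of what the `From<FastFlag>` conversion above yields (the assertions are mine, following the getters in this file):

use tantivy::schema::{JsonObjectOptions, FAST};

fn example_json_fast_flag() {
    // The FAST flag on a JSON field now implies the default (lowercasing)
    // fast field tokenizer rather than "no tokenizer".
    let opts = JsonObjectOptions::from(FAST);
    assert!(opts.is_fast());
    assert_eq!(opts.get_fast_field_tokenizer_name(), Some("default"));
}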

View File

@@ -1,7 +1,7 @@
//! Schema definition for tantivy's indices. //! Schema definition for tantivy's indices.
//!
//! # Setting your schema in Tantivy //! # Setting your schema in Tantivy
//! //!
//!
//! Tantivy has a very strict schema. //! Tantivy has a very strict schema.
//! The schema defines information about the fields your index contains, that is, for each field: //! The schema defines information about the fields your index contains, that is, for each field:
//! //!
@@ -153,6 +153,8 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
 pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
 pub use self::value::Value;

+pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
+
 /// Validator for a potential `field_name`.
 /// Returns true if the name can be used for a field name.
 ///

View File

@@ -24,19 +24,68 @@ pub struct TextOptions {
 }

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
-#[serde(untagged)]
+#[serde(
+    into = "FastFieldTextOptionsForSerialization",
+    from = "FastFieldTextOptionsForSerialization"
+)]
 /// Enum to control the fast field setting of a text field.
+#[derive(Default)]
 pub(crate) enum FastFieldTextOptions {
-    /// Flag to enable/disable
-    IsEnabled(bool),
+    /// Fast field disabled.
+    #[default]
+    Disabled,
     /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
     /// `Index::fast_field_tokenizer`.
-    EnabledWithTokenizer { with_tokenizer: TokenizerName },
+    Enabled { tokenizer: TokenizerName },
 }

-impl Default for FastFieldTextOptions {
-    fn default() -> Self {
-        FastFieldTextOptions::IsEnabled(false)
+/// Enum used to control the way we serialize fast field text options.
+///
+/// For backward compatibility reasons, we follow the format introduced in tantivy 0.19:
+/// `false` -> Disabled
+/// `true` -> Enabled with the default tokenizer
+/// `{ "tokenizer": "something" }` -> Enabled with a specific tokenizer.
+#[derive(Serialize, Deserialize)]
+#[serde(untagged)]
+enum FastFieldTextOptionsForSerialization {
+    IsEnabled(bool),
+    EnabledWithTokenizer {
+        #[serde(alias = "with_tokenizer")]
+        tokenizer: TokenizerName,
+    },
+}
+
+impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
+    fn from(value: FastFieldTextOptionsForSerialization) -> Self {
+        match value {
+            FastFieldTextOptionsForSerialization::IsEnabled(enabled) => {
+                if enabled {
+                    FastFieldTextOptions::Enabled {
+                        tokenizer: TokenizerName::from_static(
+                            crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
+                        ),
+                    }
+                } else {
+                    FastFieldTextOptions::Disabled
+                }
+            }
+            FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
+                FastFieldTextOptions::Enabled { tokenizer }
+            }
+        }
+    }
+}
+
+impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
+    fn from(value: FastFieldTextOptions) -> Self {
+        match value {
+            FastFieldTextOptions::Disabled => {
+                FastFieldTextOptionsForSerialization::IsEnabled(false)
+            }
+            FastFieldTextOptions::Enabled { tokenizer } => {
+                FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
+            }
+        }
     }
 }
@@ -45,23 +94,13 @@ impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
     fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
         match (self, other) {
-            (
-                FastFieldTextOptions::EnabledWithTokenizer {
-                    with_tokenizer: tokenizer,
-                },
-                _,
-            )
-            | (
-                _,
-                FastFieldTextOptions::EnabledWithTokenizer {
-                    with_tokenizer: tokenizer,
-                },
-            ) => FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            },
-            (FastFieldTextOptions::IsEnabled(true), _)
-            | (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
-            (_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
+            (FastFieldTextOptions::Enabled { tokenizer }, _)
+            | (_, FastFieldTextOptions::Enabled { tokenizer }) => {
+                FastFieldTextOptions::Enabled { tokenizer }
+            }
+            (FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
+                FastFieldTextOptions::Disabled
+            }
         }
     }
 }
@@ -83,20 +122,17 @@ impl TextOptions {
     /// Returns true if and only if the value is a fast field.
     pub fn is_fast(&self) -> bool {
-        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
-            || matches!(
-                &self.fast,
-                FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
-            )
+        match &self.fast {
+            FastFieldTextOptions::Disabled => false,
+            FastFieldTextOptions::Enabled { .. } => true,
+        }
     }

     /// Returns the fast field tokenizer name, if a fast field tokenizer is set.
     pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
         match &self.fast {
-            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            } => Some(tokenizer.name()),
+            FastFieldTextOptions::Disabled => None,
+            FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
         }
     }
@@ -121,15 +157,9 @@ impl TextOptions {
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
     /// from the dictionary.
     #[must_use]
-    pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
-        if let Some(tokenizer) = tokenizer_name {
-            let tokenizer = TokenizerName::from_name(tokenizer);
-            self.fast = FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            }
-        } else {
-            self.fast = FastFieldTextOptions::IsEnabled(true);
-        }
+    pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
+        let tokenizer = TokenizerName::from_name(tokenizer_name);
+        self.fast = FastFieldTextOptions::Enabled { tokenizer };
         self
     }
@@ -263,7 +293,7 @@ pub const STRING: TextOptions = TextOptions {
         record: IndexRecordOption::Basic,
     }),
     stored: false,
-    fast: FastFieldTextOptions::IsEnabled(false),
+    fast: FastFieldTextOptions::Disabled,
     coerce: false,
 };
@@ -276,7 +306,7 @@ pub const TEXT: TextOptions = TextOptions {
     }),
     stored: false,
     coerce: false,
-    fast: FastFieldTextOptions::IsEnabled(false),
+    fast: FastFieldTextOptions::Disabled,
 };

 impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -326,7 +356,9 @@ impl From<FastFlag> for TextOptions {
         TextOptions {
             indexing: None,
             stored: false,
-            fast: FastFieldTextOptions::IsEnabled(true),
+            fast: FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
+            },
             coerce: false,
         }
     }
@@ -392,21 +424,21 @@ mod tests {
     #[test]
     fn serde_fast_field_tokenizer() {
         let json = r#" {
-            "fast": { "with_tokenizer": "default" }
+            "fast": { "tokenizer": "default" }
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
         assert_eq!(
             options.fast,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: TokenizerName::from_static("default")
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static("default")
             }
         );
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
         assert_eq!(
             options.fast,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: TokenizerName::from_static("default")
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static("default")
             }
         );
@@ -414,18 +446,28 @@ mod tests {
             "fast": true
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
+        assert_eq!(
+            options.fast,
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
+            }
+        );
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
+        assert_eq!(
+            options.fast,
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
+            }
+        );
         let json = r#" {
             "fast": false
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
+        assert_eq!(options.fast, FastFieldTextOptions::Disabled);
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
+        assert_eq!(options.fast, FastFieldTextOptions::Disabled);
     }
 }

View File

@@ -189,7 +189,7 @@ pub mod tests {
     #[test]
     fn test_raw_tokenizer2() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
@@ -206,7 +206,7 @@ pub mod tests {
     #[test]
     fn test_en_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         assert!(tokenizer_manager.get("en_doesnotexist").is_none());
         let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
@@ -228,7 +228,7 @@ pub mod tests {
     #[test]
     fn test_non_en_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         tokenizer_manager.register(
             "el_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
@@ -256,7 +256,7 @@ pub mod tests {
     #[test]
     fn test_tokenizer_empty() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         {
             let mut tokens: Vec<Token> = vec![];
@@ -282,7 +282,7 @@ pub mod tests {
     #[test]
     fn test_whitespace_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {

View File

@@ -27,6 +27,7 @@ pub struct TokenizerManager {
 impl TokenizerManager {
     /// Creates an empty tokenizer manager.
+    #[allow(clippy::new_without_default)]
     pub fn new() -> Self {
         Self {
             tokenizers: Arc::new(RwLock::new(HashMap::new())),
@@ -51,12 +52,10 @@ impl TokenizerManager {
             .get(tokenizer_name)
             .cloned()
     }
-}

-impl Default for TokenizerManager {
     /// Creates a `TokenizerManager` prepopulated with
     /// the default pre-configured tokenizers of `tantivy`.
-    fn default() -> TokenizerManager {
+    pub fn default_for_indexing() -> TokenizerManager {
         let manager = TokenizerManager::new();
         manager.register("raw", RawTokenizer::default());
         manager.register(
@@ -77,4 +76,28 @@ impl TokenizerManager {
         manager.register("whitespace", WhitespaceTokenizer::default());
         manager
     }
+
+    /// Creates a `TokenizerManager` prepopulated with
+    /// the default pre-configured tokenizers of `tantivy`
+    /// for fast fields.
+    ///
+    /// Fast fields usually do not really tokenize the text.
+    /// The analyzer is, however, very useful to filter / normalize it.
+    pub fn default_for_fast_fields() -> TokenizerManager {
+        let manager = TokenizerManager::new();
+        let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+            .filter(RemoveLongFilter::limit(255))
+            .build();
+        let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+            .filter(RemoveLongFilter::limit(255))
+            .filter(LowerCaser)
+            .build();
+        manager.register(
+            crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
+            lower_tokenizer.clone(),
+        );
+        manager.register("raw", raw_tokenizer);
+        manager.register("lower", lower_tokenizer);
+        manager
+    }
 }
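
Finally, an end-to-end sketch of the new fast field default (the expected token text is my inference from the RawTokenizer + LowerCaser chain registered above):

use tantivy::tokenizer::TokenizerManager;

fn example_default_fast_field_analyzer() {
    // The "default" fast field analyzer keeps the whole value as a single
    // token and lowercases it; it does not split on whitespace.
    let manager = TokenizerManager::default_for_fast_fields();
    let mut analyzer = manager.get("default").unwrap();
    let mut tokens: Vec<String> = Vec::new();
    analyzer
        .token_stream("Hello, World!")
        .process(&mut |token| tokens.push(token.text.clone()));
    assert_eq!(tokens, vec!["hello, world!"]);
}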