Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-04 16:22:55 +00:00

Compare commits: low_card_o ... default_fa (1 commit)

Commit 61422d7cd5
@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
                 .set_index_option(IndexRecordOption::WithFreqs)
                 .set_tokenizer("raw"),
         )
-        .set_fast(None)
+        .set_fast("default")
         .set_stored();
     schema_builder.add_text_field("category", text_fieldtype);
     schema_builder.add_f64_field("stock", FAST);
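For orientation, a minimal sketch of the updated schema setup in the example above, assuming the API exactly as this diff introduces it (`set_fast` now takes the name of a fast-field tokenizer instead of an `Option`):

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};

fn main() {
    // "default" names the lowercasing fast-field tokenizer registered by
    // TokenizerManager::default_for_fast_fields() (see the tokenizer hunks below).
    let text_fieldtype = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_index_option(IndexRecordOption::WithFreqs)
                .set_tokenizer("raw"),
        )
        .set_fast("default")
        .set_stored();
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("category", text_fieldtype);
    schema_builder.add_f64_field("stock", FAST);
    let _schema = schema_builder.build();
}
```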
@@ -1293,13 +1293,13 @@ mod tests {
         // searching for terma, but min_doc_count will return all terms
         let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;

-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
         assert_eq!(
             res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
             json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
         );
-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
         assert_eq!(
             res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
@@ -1421,10 +1421,10 @@ mod tests {
         let res = exec_request_with_query(agg_req, &index, None).unwrap();
         println!("{}", serde_json::to_string_pretty(&res).unwrap());

-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);

-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);

         Ok(())
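Note: the expected bucket keys in these tests change from "A"/"B" and "Hallo Hallo"/"Hello Hello" to their lowercase forms because text fast fields are now normalized by the default fast-field tokenizer, a raw tokenizer followed by a lowercaser (see `default_for_fast_fields` further down).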
@@ -411,7 +411,7 @@ mod tests {
                 .set_index_option(IndexRecordOption::Basic)
                 .set_fieldnorms(false),
         )
-        .set_fast(None)
+        .set_fast("default")
         .set_stored();
     let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
     let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
@@ -466,7 +466,7 @@ mod tests {
         .set_indexing_options(
             TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
         )
-        .set_fast(None)
+        .set_fast("default")
         .set_stored();
     let text_field = schema_builder.add_text_field("text", text_fieldtype);
     let date_field = schema_builder.add_date_field("date", FAST);
@@ -120,8 +120,8 @@ impl IndexBuilder {
         Self {
             schema: None,
             index_settings: IndexSettings::default(),
-            tokenizer_manager: TokenizerManager::default(),
-            fast_field_tokenizer_manager: TokenizerManager::default(),
+            tokenizer_manager: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
         }
     }

@@ -400,8 +400,8 @@ impl Index {
             settings: metas.index_settings.clone(),
             directory,
             schema,
-            tokenizers: TokenizerManager::default(),
-            fast_field_tokenizers: TokenizerManager::default(),
+            tokenizers: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
             executor: Arc::new(Executor::single_thread()),
             inventory,
         }
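The two constructors replace the former `TokenizerManager::default()`, so the indexing side and the fast-field side can carry different tokenizer sets. A small usage sketch, assuming the registrations exactly as this diff sets them up:

```rust
use tantivy::tokenizer::TokenizerManager;

fn main() {
    let indexing = TokenizerManager::default_for_indexing();
    let fast_fields = TokenizerManager::default_for_fast_fields();
    // The indexing manager keeps the classic analyzers ("raw", "en_stem",
    // "whitespace", ...).
    assert!(indexing.get("en_stem").is_some());
    // The fast-field manager only ships "raw", "lower" and "default".
    assert!(fast_fields.get("default").is_some());
    assert!(fast_fields.get("en_stem").is_none());
}
```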
@@ -446,7 +446,8 @@ mod tests {
     #[test]
     fn test_text_fastfield() {
         let mut schema_builder = Schema::builder();
-        let text_field = schema_builder.add_text_field("text", TEXT | FAST);
+        let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
+        let text_field = schema_builder.add_text_field("text", text_options);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);

@@ -1082,7 +1083,7 @@ mod tests {
     #[test]
     fn test_fast_field_in_json_field_expand_dots_disabled() {
         let mut schema_builder = Schema::builder();
-        let json_option = JsonObjectOptions::default().set_fast(None);
+        let json_option = JsonObjectOptions::default().set_fast("default");
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1108,7 +1109,7 @@ mod tests {
     #[test]
     fn test_fast_field_in_json_field_with_tokenizer() {
         let mut schema_builder = Schema::builder();
-        let json_option = JsonObjectOptions::default().set_fast(Some("default"));
+        let json_option = JsonObjectOptions::default().set_fast("default");
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1134,7 +1135,7 @@ mod tests {
     fn test_fast_field_in_json_field_expand_dots_enabled() {
         let mut schema_builder = Schema::builder();
         let json_option = JsonObjectOptions::default()
-            .set_fast(None)
+            .set_fast("default")
             .set_expand_dots_enabled();
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
@@ -1202,10 +1203,10 @@ mod tests {
     #[test]
     fn test_fast_field_tokenizer() {
         let mut schema_builder = Schema::builder();
-        let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
+        let opt = TextOptions::default().set_fast("custom_lowercase");
         let text_field = schema_builder.add_text_field("text", opt);
         let schema = schema_builder.build();
-        let ff_tokenizer_manager = TokenizerManager::default();
+        let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
         ff_tokenizer_manager.register(
             "custom_lowercase",
             TextAnalyzer::builder(RawTokenizer::default())
@@ -1238,7 +1239,7 @@ mod tests {
                 .set_index_option(crate::schema::IndexRecordOption::WithFreqs)
                 .set_tokenizer("raw"),
         )
-        .set_fast(Some("default"))
+        .set_fast("default")
         .set_stored();

     let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
@@ -1271,7 +1272,7 @@ mod tests {
     fn test_shadowing_fast_field_with_expand_dots() {
         let mut schema_builder = Schema::builder();
         let json_option = JsonObjectOptions::default()
-            .set_fast(None)
+            .set_fast("default")
             .set_expand_dots_enabled();
         let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
         let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
@@ -349,7 +349,7 @@ mod tests {
     schema_builder.add_json_field(
         "json_expand_dots_enabled",
         JsonObjectOptions::default()
-            .set_fast(None)
+            .set_fast("default")
             .set_expand_dots_enabled(),
     );
     let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
@@ -18,6 +18,8 @@ const JSON_DEPTH_LIMIT: usize = 20;
 pub struct FastFieldsWriter {
     columnar_writer: ColumnarWriter,
     fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
+    // Field -> Fast field tokenizer mapping.
+    // All text fast fields should have a tokenizer.
     per_field_tokenizer: Vec<Option<TextAnalyzer>>,
     date_precisions: Vec<DateTimePrecision>,
     expand_dots: Vec<bool>,
@@ -61,7 +63,7 @@ impl FastFieldsWriter {
         if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
             let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
                 TantivyError::InvalidArgument(format!(
-                    "Tokenizer {tokenizer_name:?} not found"
+                    "Tokenizer `{tokenizer_name}` not found"
                 ))
             })?;
             per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
@@ -157,9 +159,6 @@ impl FastFieldsWriter {
                         &token.text,
                     );
                 })
-            } else {
-                self.columnar_writer
-                    .record_str(doc_id, field_name.as_str(), text_val);
             }
         }
         Value::Bytes(bytes_val) => {
@@ -201,18 +200,20 @@ impl FastFieldsWriter {
                 self.json_path_buffer.clear();
                 self.json_path_buffer.push_str(field_name);

-                let text_analyzer =
+                let text_analyzer_opt =
                     &mut self.per_field_tokenizer[field_value.field().field_id() as usize];

-                record_json_obj_to_columnar_writer(
-                    doc_id,
-                    json_obj,
-                    expand_dots,
-                    JSON_DEPTH_LIMIT,
-                    &mut self.json_path_buffer,
-                    &mut self.columnar_writer,
-                    text_analyzer,
-                );
+                if let Some(text_analyzer) = text_analyzer_opt {
+                    record_json_obj_to_columnar_writer(
+                        doc_id,
+                        json_obj,
+                        expand_dots,
+                        JSON_DEPTH_LIMIT,
+                        &mut self.json_path_buffer,
+                        &mut self.columnar_writer,
+                        text_analyzer,
+                    );
+                }
             }
             Value::IpAddr(ip_addr) => {
                 self.columnar_writer
@@ -263,7 +264,7 @@ fn record_json_obj_to_columnar_writer(
     remaining_depth_limit: usize,
     json_path_buffer: &mut String,
     columnar_writer: &mut columnar::ColumnarWriter,
-    tokenizer: &mut Option<TextAnalyzer>,
+    text_analyzer: &mut TextAnalyzer,
 ) {
     for (key, child) in json_obj {
         let len_path = json_path_buffer.len();
@@ -288,7 +289,7 @@ fn record_json_obj_to_columnar_writer(
             remaining_depth_limit,
             json_path_buffer,
             columnar_writer,
-            tokenizer,
+            text_analyzer,
         );
         // popping our sub path.
         json_path_buffer.truncate(len_path);
@@ -302,7 +303,7 @@ fn record_json_value_to_columnar_writer(
     mut remaining_depth_limit: usize,
     json_path_writer: &mut String,
     columnar_writer: &mut columnar::ColumnarWriter,
-    tokenizer: &mut Option<TextAnalyzer>,
+    text_analyzer: &mut TextAnalyzer,
 ) {
     if remaining_depth_limit == 0 {
         return;
@@ -321,14 +322,10 @@ fn record_json_value_to_columnar_writer(
             }
         }
         serde_json::Value::String(text) => {
-            if let Some(text_analyzer) = tokenizer.as_mut() {
-                let mut token_stream = text_analyzer.token_stream(text);
-                token_stream.process(&mut |token| {
-                    columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
-                })
-            } else {
-                columnar_writer.record_str(doc, json_path_writer.as_str(), text);
-            }
+            let mut token_stream = text_analyzer.token_stream(text);
+            token_stream.process(&mut |token| {
+                columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
+            });
         }
         serde_json::Value::Array(arr) => {
             for el in arr {
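With this and the surrounding hunks, the untokenized `record_str` fallback is gone: a JSON string value is only recorded into the columnar writer through a tokenizer, and JSON fields without a configured fast-field tokenizer are skipped entirely, matching the new invariant that every text fast field has a tokenizer.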
@@ -339,7 +336,7 @@ fn record_json_value_to_columnar_writer(
                     remaining_depth_limit,
                     json_path_writer,
                     columnar_writer,
-                    tokenizer,
+                    text_analyzer,
                 );
             }
         }
@@ -351,7 +348,7 @@ fn record_json_value_to_columnar_writer(
             remaining_depth_limit,
             json_path_writer,
             columnar_writer,
-            tokenizer,
+            text_analyzer,
         );
     }
 }
@@ -371,6 +368,9 @@ mod tests {
     ) -> ColumnarReader {
         let mut columnar_writer = ColumnarWriter::default();
         let mut json_path = String::new();
+        let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
+            .get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
+            .unwrap();
         for (doc, json_doc) in json_docs.iter().enumerate() {
             record_json_value_to_columnar_writer(
                 doc as u32,
@@ -379,7 +379,7 @@ mod tests {
                 JSON_DEPTH_LIMIT,
                 &mut json_path,
                 &mut columnar_writer,
-                &mut None,
+                &mut text_analyzer,
             );
         }
         let mut buffer = Vec::new();
@@ -399,6 +399,7 @@ mod tests {
         });
         let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
         let columns = columnar_reader.list_columns().unwrap();
+        assert_eq!(columns.len(), 5);
         {
             assert_eq!(columns[0].0, "arr");
             let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
@@ -434,7 +435,9 @@ mod tests {
         {
             assert_eq!(columns[4].0, "text");
             let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
-            assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
+            let column_text = column_text_opt.unwrap();
+            let term_ords: Vec<u64> = column_text.term_ords(0).collect();
+            assert_eq!(&term_ords[..], &[0]);
         }
     }

@@ -956,7 +956,7 @@ mod test {
         .iter()
         .flat_map(|field_name| schema.get_field(field_name))
         .collect();
-    let tokenizer_manager = TokenizerManager::default();
+    let tokenizer_manager = TokenizerManager::default_for_indexing();
     tokenizer_manager.register(
         "en_with_stop_words",
         TextAnalyzer::builder(SimpleTokenizer::default())
@@ -1447,7 +1447,7 @@ mod test {
     let title = schema_builder.add_text_field("title", text_options);
     let schema = schema_builder.build();
     let default_fields = vec![title];
-    let tokenizer_manager = TokenizerManager::default();
+    let tokenizer_manager = TokenizerManager::default_for_indexing();
     let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);

    assert_matches!(
@@ -1622,7 +1622,8 @@ mod test {
     let mut schema_builder = Schema::builder();
     schema_builder.add_text_field(r#"a\.b"#, STRING);
     let schema = schema_builder.build();
-    let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
+    let query_parser =
+        QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
     let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
     assert_eq!(
         format!("{query:?}"),
@@ -1639,8 +1640,11 @@ mod test {
     schema_builder.add_text_field("first.toto.titi", STRING);
     schema_builder.add_text_field("third.a.b.c", STRING);
     let schema = schema_builder.build();
-    let query_parser =
-        QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
+    let query_parser = QueryParser::new(
+        schema.clone(),
+        Vec::new(),
+        TokenizerManager::default_for_indexing(),
+    );
     assert_eq!(
         query_parser.split_full_path("first.toto"),
         Some((schema.get_field("first.toto").unwrap(), ""))
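These query-parser call sites now name the indexing manager explicitly: as the tokenizer hunks further down show, the `Default` impl on `TokenizerManager` is dropped in favor of `default_for_indexing()` and `default_for_fast_fields()`.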
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};

 use super::text_options::{FastFieldTextOptions, TokenizerName};
 use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
-use crate::schema::{TextFieldIndexing, TextOptions};
+use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};

 /// The `JsonObjectOptions` make it possible to
 /// configure how a json object field should be indexed and stored.
@@ -58,20 +58,19 @@ impl JsonObjectOptions {
     /// Returns true if and only if the json object fields are
     /// to be treated as fast fields.
     pub fn is_fast(&self) -> bool {
-        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
-            || matches!(
-                &self.fast,
-                FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
-            )
+        match self.fast {
+            FastFieldTextOptions::Disabled => false,
+            FastFieldTextOptions::Enabled { .. } => true,
+        }
     }

     /// Returns true if and only if the value is a fast field.
     pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
         match &self.fast {
-            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            } => Some(tokenizer.name()),
+            FastFieldTextOptions::Disabled => None,
+            FastFieldTextOptions::Enabled {
+                tokenizer: with_tokenizer,
+            } => Some(with_tokenizer.name()),
         }
     }

@@ -130,15 +129,11 @@ impl JsonObjectOptions {
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
     /// from the dictionary.
     #[must_use]
-    pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
-        if let Some(tokenizer) = tokenizer_name {
-            let tokenizer = TokenizerName::from_name(tokenizer);
-            self.fast = FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            }
-        } else {
-            self.fast = FastFieldTextOptions::IsEnabled(true);
-        }
+    pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
+        let with_tokenizer = TokenizerName::from_name(tokenizer_name);
+        self.fast = FastFieldTextOptions::Enabled {
+            tokenizer: with_tokenizer,
+        };
         self
     }

@@ -166,7 +161,9 @@ impl From<FastFlag> for JsonObjectOptions {
         JsonObjectOptions {
             stored: false,
             indexing: None,
-            fast: FastFieldTextOptions::IsEnabled(true),
+            fast: FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
+            },
             expand_dots_enabled: false,
         }
     }
@@ -1,7 +1,7 @@
 //! Schema definition for tantivy's indices.
-//!
 //! # Setting your schema in Tantivy
 //!
+//!
 //! Tantivy has a very strict schema.
 //! The schema defines information about the fields your index contains, that is, for each field:
 //!
@@ -153,6 +153,8 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
 pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
 pub use self::value::Value;

+pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
+
 /// Validator for a potential `field_name`.
 /// Returns true if the name can be use for a field name.
 ///
@@ -24,19 +24,68 @@ pub struct TextOptions {
 }

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
-#[serde(untagged)]
+#[serde(
+    into = "FastFieldTextOptionsForSerialization",
+    from = "FastFieldTextOptionsForSerialization"
+)]
 /// Enum to control how the fast field setting of a text field.
+#[derive(Default)]
 pub(crate) enum FastFieldTextOptions {
-    /// Flag to enable/disable
-    IsEnabled(bool),
+    /// Fastfield disabled
+    #[default]
+    Disabled,
     /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
     /// `Index::fast_field_tokenizer`.
-    EnabledWithTokenizer { with_tokenizer: TokenizerName },
+    Enabled { tokenizer: TokenizerName },
 }

-impl Default for FastFieldTextOptions {
-    fn default() -> Self {
-        FastFieldTextOptions::IsEnabled(false)
+/// Enum used to control the way we serialize fast field text options.
+///
+/// For backward compatibility reasons, we follow the format introduced in tantivy 0.19.
+/// `false` -> Disabled
+/// `true` -> Enabled with default tokenizer
+/// `{ tokenizer: "something" }` -> Enabled with a specific tokenizer.
+#[derive(Serialize, Deserialize)]
+#[serde(untagged)]
+enum FastFieldTextOptionsForSerialization {
+    IsEnabled(bool),
+    EnabledWithTokenizer {
+        #[serde(alias = "with_tokenizer")]
+        tokenizer: TokenizerName,
+    },
+}
+
+impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
+    fn from(value: FastFieldTextOptionsForSerialization) -> Self {
+        match value {
+            FastFieldTextOptionsForSerialization::IsEnabled(enabled) => {
+                if enabled {
+                    FastFieldTextOptions::Enabled {
+                        tokenizer: TokenizerName::from_static(
+                            crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
+                        ),
+                    }
+                } else {
+                    FastFieldTextOptions::Disabled
+                }
+            }
+            FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
+                FastFieldTextOptions::Enabled { tokenizer }
+            }
+        }
+    }
+}
+
+impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
+    fn from(value: FastFieldTextOptions) -> Self {
+        match value {
+            FastFieldTextOptions::Disabled => {
+                FastFieldTextOptionsForSerialization::IsEnabled(false)
+            }
+            FastFieldTextOptions::Enabled { tokenizer } => {
+                FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
+            }
+        }
     }
 }

@@ -45,23 +94,13 @@ impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {

     fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
         match (self, other) {
-            (
-                FastFieldTextOptions::EnabledWithTokenizer {
-                    with_tokenizer: tokenizer,
-                },
-                _,
-            )
-            | (
-                _,
-                FastFieldTextOptions::EnabledWithTokenizer {
-                    with_tokenizer: tokenizer,
-                },
-            ) => FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            },
-            (FastFieldTextOptions::IsEnabled(true), _)
-            | (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
-            (_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
+            (FastFieldTextOptions::Enabled { tokenizer }, _)
+            | (_, FastFieldTextOptions::Enabled { tokenizer }) => {
+                FastFieldTextOptions::Enabled { tokenizer }
+            }
+            (FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
+                FastFieldTextOptions::Disabled
+            }
         }
     }
 }
@@ -83,20 +122,17 @@ impl TextOptions {

     /// Returns true if and only if the value is a fast field.
     pub fn is_fast(&self) -> bool {
-        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
-            || matches!(
-                &self.fast,
-                FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
-            )
+        match &self.fast {
+            FastFieldTextOptions::Disabled => false,
+            FastFieldTextOptions::Enabled { .. } => true,
+        }
     }

     /// Returns true if and only if the value is a fast field.
     pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
         match &self.fast {
-            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            } => Some(tokenizer.name()),
+            FastFieldTextOptions::Disabled => None,
+            FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
         }
     }

@@ -121,15 +157,9 @@ impl TextOptions {
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
     /// from the dictionary.
     #[must_use]
-    pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
-        if let Some(tokenizer) = tokenizer_name {
-            let tokenizer = TokenizerName::from_name(tokenizer);
-            self.fast = FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            }
-        } else {
-            self.fast = FastFieldTextOptions::IsEnabled(true);
-        }
+    pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
+        let tokenizer = TokenizerName::from_name(tokenizer_name);
+        self.fast = FastFieldTextOptions::Enabled { tokenizer };
         self
     }

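A minimal sketch of the new call shape, assuming the accessors shown elsewhere in this diff:

```rust
use tantivy::schema::TextOptions;

fn main() {
    // There is no `set_fast(None)` form anymore: enabling a text fast field
    // always names its fast-field tokenizer.
    let opts = TextOptions::default().set_fast("default");
    assert!(opts.is_fast());
    assert_eq!(opts.get_fast_field_tokenizer_name(), Some("default"));
}
```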
@@ -263,7 +293,7 @@ pub const STRING: TextOptions = TextOptions {
         record: IndexRecordOption::Basic,
     }),
     stored: false,
-    fast: FastFieldTextOptions::IsEnabled(false),
+    fast: FastFieldTextOptions::Disabled,
     coerce: false,
 };

@@ -276,7 +306,7 @@ pub const TEXT: TextOptions = TextOptions {
     }),
     stored: false,
     coerce: false,
-    fast: FastFieldTextOptions::IsEnabled(false),
+    fast: FastFieldTextOptions::Disabled,
 };

 impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -326,7 +356,9 @@ impl From<FastFlag> for TextOptions {
         TextOptions {
             indexing: None,
             stored: false,
-            fast: FastFieldTextOptions::IsEnabled(true),
+            fast: FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
+            },
             coerce: false,
         }
     }
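Together with the `BitOr` rules above, the familiar flag syntax keeps working; a sketch assuming the `TEXT` and `FAST` constants as defined in this file:

```rust
use tantivy::schema::{TextOptions, FAST, TEXT};

fn main() {
    // FAST alone now means "fast field with the default tokenizer", and
    // OR-ing options keeps that choice, since Enabled wins over Disabled.
    let opts: TextOptions = TEXT | FAST;
    assert!(opts.is_fast());
    assert_eq!(opts.get_fast_field_tokenizer_name(), Some("default"));
}
```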
@@ -392,21 +424,21 @@ mod tests {
     #[test]
     fn serde_fast_field_tokenizer() {
         let json = r#" {
-            "fast": { "with_tokenizer": "default" }
+            "fast": { "tokenizer": "default" }
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
         assert_eq!(
             options.fast,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: TokenizerName::from_static("default")
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static("default")
             }
         );
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
         assert_eq!(
             options.fast,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: TokenizerName::from_static("default")
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static("default")
             }
         );

@@ -414,18 +446,28 @@ mod tests {
             "fast": true
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
+        assert_eq!(
+            options.fast,
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
+            }
+        );
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
+        assert_eq!(
+            options.fast,
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
+            }
+        );

         let json = r#" {
             "fast": false
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
+        assert_eq!(options.fast, FastFieldTextOptions::Disabled);
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
+        assert_eq!(options.fast, FastFieldTextOptions::Disabled);
     }
 }
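Reading the two `From` impls together: `"fast": true` still deserializes (to `Enabled` with the default tokenizer), but it appears to serialize back as `{ "tokenizer": "default" }` rather than `true`, while `"fast": false` round-trips unchanged and the old `with_tokenizer` key is accepted as an alias.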
@@ -189,7 +189,7 @@ pub mod tests {

     #[test]
     fn test_raw_tokenizer2() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
@@ -206,7 +206,7 @@ pub mod tests {

     #[test]
     fn test_en_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         assert!(tokenizer_manager.get("en_doesnotexist").is_none());
         let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
@@ -228,7 +228,7 @@ pub mod tests {

     #[test]
     fn test_non_en_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         tokenizer_manager.register(
             "el_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
@@ -256,7 +256,7 @@ pub mod tests {

     #[test]
     fn test_tokenizer_empty() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         {
             let mut tokens: Vec<Token> = vec![];
@@ -282,7 +282,7 @@ pub mod tests {

     #[test]
     fn test_whitespace_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
@@ -27,6 +27,7 @@ pub struct TokenizerManager {

 impl TokenizerManager {
     /// Creates an empty tokenizer manager.
+    #[allow(clippy::new_without_default)]
     pub fn new() -> Self {
         Self {
             tokenizers: Arc::new(RwLock::new(HashMap::new())),
@@ -51,12 +52,10 @@ impl TokenizerManager {
             .get(tokenizer_name)
             .cloned()
     }
-}

-impl Default for TokenizerManager {
     /// Creates an `TokenizerManager` prepopulated with
     /// the default pre-configured tokenizers of `tantivy`.
-    fn default() -> TokenizerManager {
+    pub fn default_for_indexing() -> TokenizerManager {
         let manager = TokenizerManager::new();
         manager.register("raw", RawTokenizer::default());
         manager.register(
@@ -77,4 +76,28 @@ impl TokenizerManager {
         manager.register("whitespace", WhitespaceTokenizer::default());
         manager
     }
+
+    /// Creates an `TokenizerManager` prepopulated with
+    /// the default pre-configured tokenizers of `tantivy`
+    /// for fast fields.
+    ///
+    /// Fast fields usually do not really tokenize the text.
+    /// It is however very useful to filter / normalize the text.
+    pub fn default_for_fast_fields() -> TokenizerManager {
+        let manager = TokenizerManager::new();
+        let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+            .filter(RemoveLongFilter::limit(255))
+            .build();
+        let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+            .filter(RemoveLongFilter::limit(255))
+            .filter(LowerCaser)
+            .build();
+        manager.register(
+            crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
+            lower_tokenizer.clone(),
+        );
+        manager.register("raw", raw_tokenizer);
+        manager.register("lower", lower_tokenizer);
+        manager
+    }
 }
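To make the default concrete, a sketch assuming the registrations above (the "default" fast-field tokenizer is a raw tokenizer plus `LowerCaser`, so a value stays a single token but is lowercased):

```rust
use tantivy::tokenizer::TokenizerManager;

fn main() {
    let manager = TokenizerManager::default_for_fast_fields();
    let mut analyzer = manager.get("default").unwrap();
    let mut token_stream = analyzer.token_stream("Hello Hello");
    let mut tokens: Vec<String> = Vec::new();
    token_stream.process(&mut |token| tokens.push(token.text.clone()));
    // One token, lowercased: this is why the aggregation tests above now
    // expect "hello hello" rather than "Hello Hello".
    assert_eq!(tokens, vec!["hello hello".to_string()]);
}
```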