From 5c4ea6a708a0ef79419b473fd661bffd3bb063f9 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Fri, 31 Mar 2023 16:03:38 +0800 Subject: [PATCH] tokenizer option on text fastfield (#1945) * tokenizer option on text fastfield allow to set tokenizer option on text fastfield (fixes #1901) handle PreTokenized strings in fast field * change visibility * remove custom de/serialization --- examples/aggregation.rs | 2 +- src/aggregation/mod.rs | 4 +- src/fastfield/mod.rs | 63 +++++++++++++--- src/fastfield/writer.rs | 64 ++++++++++++++-- src/indexer/segment_writer.rs | 5 +- src/schema/text_options.rs | 134 +++++++++++++++++++++++++++++----- 6 files changed, 231 insertions(+), 41 deletions(-) diff --git a/examples/aggregation.rs b/examples/aggregation.rs index 24c11513c..7dc7d6754 100644 --- a/examples/aggregation.rs +++ b/examples/aggregation.rs @@ -42,7 +42,7 @@ fn main() -> tantivy::Result<()> { .set_index_option(IndexRecordOption::WithFreqs) .set_tokenizer("raw"), ) - .set_fast() + .set_fast(None) .set_stored(); schema_builder.add_text_field("category", text_fieldtype); schema_builder.add_f64_field("stock", FAST); diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index 76e969e2c..350b1e46d 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -445,7 +445,7 @@ mod tests { .set_index_option(IndexRecordOption::Basic) .set_fieldnorms(false), ) - .set_fast() + .set_fast(None) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype.clone()); let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype); @@ -500,7 +500,7 @@ mod tests { .set_indexing_options( TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), ) - .set_fast() + .set_fast(None) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let date_field = schema_builder.add_date_field("date", FAST); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index bbcae7fe6..238a89df1 100644 --- 
a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -115,7 +115,7 @@ mod tests { let directory: RamDirectory = RamDirectory::create(); { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); + let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap(); fast_field_writers .add_document(&doc!(*FIELD=>13u64)) .unwrap(); @@ -148,7 +148,7 @@ mod tests { let directory: RamDirectory = RamDirectory::create(); { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); + let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap(); fast_field_writers .add_document(&doc!(*FIELD=>4u64)) .unwrap(); @@ -203,7 +203,7 @@ mod tests { let directory: RamDirectory = RamDirectory::create(); { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); + let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap(); for _ in 0..10_000 { fast_field_writers .add_document(&doc!(*FIELD=>100_000u64)) @@ -231,7 +231,7 @@ mod tests { { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); + let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap(); // forcing the amplitude to be high fast_field_writers .add_document(&doc!(*FIELD=>0u64)) @@ -268,7 +268,7 @@ mod tests { let schema = schema_builder.build(); { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); for i in -100i64..10_000i64 { let mut doc = Document::default(); doc.add_i64(i64_field, i); @@ -310,7 +310,7 @@ mod tests { { let mut write: 
WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let doc = Document::default(); fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.serialize(&mut write, None).unwrap(); @@ -343,7 +343,7 @@ mod tests { let schema = schema_builder.build(); { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let doc = Document::default(); fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.serialize(&mut write, None).unwrap(); @@ -379,7 +379,7 @@ mod tests { let directory = RamDirectory::create(); { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); + let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap(); for &x in &permutation { fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap(); } @@ -759,7 +759,7 @@ mod tests { { let mut write: WritePtr = directory.open_write(path).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); fast_field_writers.add_document(&doc!(field=>true)).unwrap(); fast_field_writers .add_document(&doc!(field=>false)) @@ -793,7 +793,7 @@ mod tests { { let mut write: WritePtr = directory.open_write(path).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); for _ in 0..50 { fast_field_writers.add_document(&doc!(field=>true)).unwrap(); fast_field_writers @@ -822,7 +822,7 @@ mod tests { let schema = schema_builder.build(); { let mut write: WritePtr = 
directory.open_write(path).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let doc = Document::default(); fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.serialize(&mut write, None).unwrap(); @@ -849,7 +849,7 @@ mod tests { let directory: RamDirectory = RamDirectory::create(); { let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(schema); + let mut fast_field_writers = FastFieldsWriter::from_schema(schema).unwrap(); for doc in docs { fast_field_writers.add_document(doc).unwrap(); } @@ -1173,6 +1173,45 @@ mod tests { assert_eq!(&vals, &[33]); } + #[test] + fn test_text_fast_field_tokenizer() { + let mut schema_builder = Schema::builder(); + + let text_fieldtype = crate::schema::TextOptions::default() + .set_indexing_options( + crate::schema::TextFieldIndexing::default() + .set_index_option(crate::schema::IndexRecordOption::WithFreqs) + .set_tokenizer("raw"), + ) + .set_fast(Some("default")) + .set_stored(); + + let log_field = schema_builder.add_text_field("log_level", text_fieldtype); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests().unwrap(); + index_writer + .add_document(doc!(log_field => "info")) + .unwrap(); + index_writer + .add_document(doc!(log_field => "INFO")) + .unwrap(); + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let fast_field_reader = searcher.segment_reader(0u32).fast_fields(); + + let text_fast_field = fast_field_reader.str("log_level").unwrap().unwrap(); + let mut buffer = String::new(); + assert!(text_fast_field.ord_to_str(0, &mut buffer).unwrap()); + assert_eq!(buffer, "info"); + assert!(!text_fast_field.ord_to_str(1, &mut buffer).unwrap()); + + assert!(text_fast_field.term_ords(0).eq([0].into_iter())); + 
assert!(text_fast_field.term_ords(1).eq([0].into_iter())); + assert!(text_fast_field.ords().values_for_doc(0u32).eq([0])); + assert!(text_fast_field.ords().values_for_doc(1u32).eq([0])); + } + #[test] fn test_shadowing_fast_field_with_expand_dots() { let mut schema_builder = Schema::builder(); diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 69f881619..f17562b98 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -2,11 +2,13 @@ use std::io; use columnar::{ColumnarWriter, NumericalValue}; use common::replace_in_place; +use tokenizer_api::Token; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR}; use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value}; -use crate::{DatePrecision, DocId}; +use crate::tokenizer::{TextAnalyzer, TokenizerManager}; +use crate::{DatePrecision, DocId, TantivyError}; /// Only index JSON down to a depth of 20. /// This is mostly to guard us from a stack overflow triggered by malicious input. @@ -15,7 +17,8 @@ const JSON_DEPTH_LIMIT: usize = 20; /// The `FastFieldsWriter` groups all of the fast field writers. pub struct FastFieldsWriter { columnar_writer: ColumnarWriter, - fast_field_names: Vec>, //< TODO see if we can cash the field name hash too. + fast_field_names: Vec>, //< TODO see if we can cache the field name hash too. + per_field_tokenizer: Vec>, date_precisions: Vec, expand_dots: Vec, num_docs: DocId, @@ -25,14 +28,25 @@ pub struct FastFieldsWriter { impl FastFieldsWriter { /// Create all `FastFieldWriter` required by the schema. - pub fn from_schema(schema: &Schema) -> FastFieldsWriter { + #[cfg(test)] + pub fn from_schema(schema: &Schema) -> crate::Result { + FastFieldsWriter::from_schema_and_tokenizer_manager(&schema, TokenizerManager::new()) + } + + /// Create all `FastFieldWriter` required by the schema. 
+ pub fn from_schema_and_tokenizer_manager( + schema: &Schema, + tokenizer_manager: TokenizerManager, + ) -> crate::Result { let mut columnar_writer = ColumnarWriter::default(); + let mut fast_field_names: Vec> = vec![None; schema.num_fields()]; let mut date_precisions: Vec = std::iter::repeat_with(DatePrecision::default) .take(schema.num_fields()) .collect(); let mut expand_dots = vec![false; schema.num_fields()]; + let mut per_field_tokenizer = vec![None; schema.num_fields()]; // TODO see other types for (field_id, field_entry) in schema.fields() { if !field_entry.field_type().is_fast() { @@ -47,6 +61,18 @@ impl FastFieldsWriter { expand_dots[field_id.field_id() as usize] = json_object_options.is_expand_dots_enabled(); } + if let FieldType::Str(text_options) = field_entry.field_type() { + if let Some(tokenizer_name) = text_options.get_fast_field_tokenizer_name() { + let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| { + TantivyError::InvalidArgument(format!( + "Tokenizer {:?} not found", + tokenizer_name + )) + })?; + per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer); + } + } + let sort_values_within_row = value_type == Type::Facet; if let Some(column_type) = value_type_to_column_type(value_type) { columnar_writer.record_column_type( @@ -56,14 +82,15 @@ impl FastFieldsWriter { ); } } - FastFieldsWriter { + Ok(FastFieldsWriter { columnar_writer, fast_field_names, + per_field_tokenizer, num_docs: 0u32, date_precisions, expand_dots, json_path_buffer: String::new(), - } + }) } /// The memory used (inclusive childs) @@ -111,14 +138,35 @@ impl FastFieldsWriter { ); } Value::Str(text_val) => { - self.columnar_writer - .record_str(doc_id, field_name.as_str(), text_val); + if let Some(text_analyzer) = + &self.per_field_tokenizer[field_value.field().field_id() as usize] + { + let mut token_stream = text_analyzer.token_stream(text_val); + token_stream.process(&mut |token: &Token| { + self.columnar_writer.record_str( + doc_id, + 
field_name.as_str(), + &token.text, + ); + }) + } else { + self.columnar_writer + .record_str(doc_id, field_name.as_str(), text_val); + } } Value::Bytes(bytes_val) => { self.columnar_writer .record_bytes(doc_id, field_name.as_str(), bytes_val); } - Value::PreTokStr(_) => todo!(), + Value::PreTokStr(pre_tok) => { + for token in &pre_tok.tokens { + self.columnar_writer.record_str( + doc_id, + field_name.as_str(), + &token.text, + ); + } + } Value::Bool(bool_val) => { self.columnar_writer .record_bool(doc_id, field_name.as_str(), *bool_val); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index d7cfb4935..e4a8d9afc 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -111,7 +111,10 @@ impl SegmentWriter { per_field_postings_writers, fieldnorms_writer: FieldNormsWriter::for_schema(&schema), segment_serializer, - fast_field_writers: FastFieldsWriter::from_schema(&schema), + fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager( + &schema, + tokenizer_manager, + )?, doc_opstamps: Vec::with_capacity(1_000), per_field_text_analyzers, term_buffer: Term::with_capacity(16), diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 8eb520f87..f049ba8e1 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -16,13 +16,53 @@ pub struct TextOptions { #[serde(default)] stored: bool, #[serde(default)] - fast: bool, + fast: FastFieldOptions, #[serde(default)] #[serde(skip_serializing_if = "is_false")] - /// coerce values if they are not of type string + /// coerce values into string if they are not of type string coerce: bool, } +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(untagged)] +enum FastFieldOptions { + IsEnabled(bool), + EnabledWithTokenizer { with_tokenizer: TokenizerName }, +} + +impl Default for FastFieldOptions { + fn default() -> Self { + FastFieldOptions::IsEnabled(false) + } +} + +impl BitOr for FastFieldOptions { + type Output = 
FastFieldOptions; + + fn bitor(self, other: FastFieldOptions) -> FastFieldOptions { + match (self, other) { + ( + FastFieldOptions::EnabledWithTokenizer { + with_tokenizer: tokenizer, + }, + _, + ) + | ( + _, + FastFieldOptions::EnabledWithTokenizer { + with_tokenizer: tokenizer, + }, + ) => FastFieldOptions::EnabledWithTokenizer { + with_tokenizer: tokenizer, + }, + (FastFieldOptions::IsEnabled(true), _) | (_, FastFieldOptions::IsEnabled(true)) => { + FastFieldOptions::IsEnabled(true) + } + (_, FastFieldOptions::IsEnabled(false)) => FastFieldOptions::IsEnabled(false), + } + } +} + fn is_false(val: &bool) -> bool { !val } @@ -40,7 +80,21 @@ impl TextOptions { /// Returns true if and only if the value is a fast field. pub fn is_fast(&self) -> bool { - self.fast + matches!(self.fast, FastFieldOptions::IsEnabled(true)) + || matches!( + &self.fast, + FastFieldOptions::EnabledWithTokenizer { with_tokenizer: _ } + ) + } + + /// Returns the fast field tokenizer name, or `None` if no tokenizer is configured. + pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> { + match &self.fast { + FastFieldOptions::IsEnabled(true) | FastFieldOptions::IsEnabled(false) => None, + FastFieldOptions::EnabledWithTokenizer { + with_tokenizer: tokenizer, + } => Some(tokenizer.name()), + } } /// Returns true if values should be coerced to strings (numbers, null). @@ -53,19 +107,24 @@ impl TextOptions { /// Fast fields are designed for random access. /// Access time are similar to a random lookup in an array. /// Text fast fields will have the term ids stored in the fast field. - /// The fast field will be a multivalued fast field. /// - /// The effective cardinality depends on the tokenizer. When creating fast fields on text - /// fields it is recommended to use the "raw" tokenizer, since it will store the original text - /// unchanged. The "default" tokenizer will store the terms as lower case and this will be - /// reflected in the dictionary. + /// The effective cardinality depends on the tokenizer. 
Without a tokenizer, the text will be + /// stored as is, which equals to the "raw" tokenizer. The tokenizer can be used to apply + /// normalization like lower case. /// /// The original text can be retrieved via /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term) /// from the dictionary. #[must_use] - pub fn set_fast(mut self) -> TextOptions { - self.fast = true; + pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions { + if let Some(tokenizer) = tokenizer_name { + let tokenizer = TokenizerName::from_name(tokenizer); + self.fast = FastFieldOptions::EnabledWithTokenizer { + with_tokenizer: tokenizer, + } + } else { + self.fast = FastFieldOptions::IsEnabled(true); + } self } @@ -92,7 +151,7 @@ impl TextOptions { } #[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)] -struct TokenizerName(Cow<'static, str>); +pub(crate) struct TokenizerName(Cow<'static, str>); const DEFAULT_TOKENIZER_NAME: &str = "default"; @@ -105,7 +164,7 @@ impl Default for TokenizerName { } impl TokenizerName { - const fn from_static(name: &'static str) -> Self { + pub const fn from_static(name: &'static str) -> Self { TokenizerName(Cow::Borrowed(name)) } fn from_name(name: &str) -> Self { @@ -199,7 +258,7 @@ pub const STRING: TextOptions = TextOptions { record: IndexRecordOption::Basic, }), stored: false, - fast: false, + fast: FastFieldOptions::IsEnabled(false), coerce: false, }; @@ -212,7 +271,7 @@ pub const TEXT: TextOptions = TextOptions { }), stored: false, coerce: false, - fast: false, + fast: FastFieldOptions::IsEnabled(false), }; impl> BitOr for TextOptions { @@ -240,7 +299,7 @@ impl From for TextOptions { TextOptions { indexing: None, stored: true, - fast: false, + fast: FastFieldOptions::IsEnabled(false), coerce: false, } } @@ -251,7 +310,7 @@ impl From for TextOptions { TextOptions { indexing: None, stored: false, - fast: false, + fast: FastFieldOptions::IsEnabled(false), coerce: true, } } @@ -262,7 +321,7 @@ impl From for 
TextOptions { TextOptions { indexing: None, stored: false, - fast: true, + fast: FastFieldOptions::IsEnabled(true), coerce: false, } } @@ -281,6 +340,7 @@ where #[cfg(test)] mod tests { + use crate::schema::text_options::{FastFieldOptions, TokenizerName}; use crate::schema::*; #[test] @@ -323,4 +383,44 @@ mod tests { let options3: TextOptions = serde_json::from_str("{}").unwrap(); assert_eq!(options3.indexing, None); } + + #[test] + fn serde_fast_field_tokenizer() { + let json = r#" { + "fast": { "with_tokenizer": "default" } + } "#; + let options: TextOptions = serde_json::from_str(json).unwrap(); + assert_eq!( + options.fast, + FastFieldOptions::EnabledWithTokenizer { + with_tokenizer: TokenizerName::from_static("default") + } + ); + let options: TextOptions = + serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); + assert_eq!( + options.fast, + FastFieldOptions::EnabledWithTokenizer { + with_tokenizer: TokenizerName::from_static("default") + } + ); + + let json = r#" { + "fast": true + } "#; + let options: TextOptions = serde_json::from_str(json).unwrap(); + assert_eq!(options.fast, FastFieldOptions::IsEnabled(true)); + let options: TextOptions = + serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); + assert_eq!(options.fast, FastFieldOptions::IsEnabled(true)); + + let json = r#" { + "fast": false + } "#; + let options: TextOptions = serde_json::from_str(json).unwrap(); + assert_eq!(options.fast, FastFieldOptions::IsEnabled(false)); + let options: TextOptions = + serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap(); + assert_eq!(options.fast, FastFieldOptions::IsEnabled(false)); + } }