From fdecb79273f66c4d88bed334839720e899130900 Mon Sep 17 00:00:00 2001
From: PSeitz
Date: Thu, 8 Jun 2023 18:37:58 +0800
Subject: [PATCH] tokenizer-api: reduce Tokenizer overhead (#2062)

* tokenizer-api: reduce Tokenizer overhead

Previously, a new `Token` was created for each text encountered, which
contains a `String::with_capacity(200)`. In the new API the token_stream
gets mutable access to the tokenizer, which allows state to be shared
(in this PR the `Token` is shared).

Ideally the allocation for the BoxTokenStream would also be removed, but
this may require some lifetime tricks.

* simplify api

* move lowercase and ascii folding buffer to global

* empty Token text as default
---
 Cargo.toml | 2 +-
 benches/analyzer.rs | 2 +-
 examples/pre_tokenized_text.rs | 3 +-
 examples/stop_words.rs | 2 +-
 src/core/json_utils.rs | 8 ++--
 src/fastfield/mod.rs | 2 +-
 src/fastfield/writer.rs | 12 ++---
 src/indexer/segment_writer.rs | 8 ++--
 src/postings/mod.rs | 4 +-
 src/query/more_like_this/more_like_this.rs | 44 +++++++++--------
 src/query/query_parser/query_parser.rs | 16 +++----
 src/snippet/mod.rs | 55 +++++++++++++++++-----
 src/tokenizer/alphanum_only.rs | 8 ++--
 src/tokenizer/ascii_folding_filter.rs | 51 +++++++++++---------
 src/tokenizer/empty_tokenizer.rs | 4 +-
 src/tokenizer/facet_tokenizer.rs | 26 +++++-----
 src/tokenizer/lower_caser.rs | 48 +++++++++++--------
 src/tokenizer/mod.rs | 20 ++++----
 src/tokenizer/ngram_tokenizer.rs | 15 +++---
 src/tokenizer/raw_tokenizer.rs | 38 ++++++++-------
 src/tokenizer/regex_tokenizer.rs | 21 +++++----
 src/tokenizer/remove_long.rs | 6 +--
 src/tokenizer/simple_tokenizer.rs | 19 ++++----
 src/tokenizer/split_compound_words.rs | 22 ++++-----
 src/tokenizer/stemmer.rs | 2 +-
 src/tokenizer/stop_word_filter/mod.rs | 6 +--
 src/tokenizer/tokenizer.rs | 8 ++--
 src/tokenizer/tokenizer_manager.rs | 8 ++--
 src/tokenizer/whitespace_tokenizer.rs | 19 ++++----
 tokenizer-api/src/lib.rs | 15 +++++-
 30 files changed, 286 insertions(+), 208 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml index 0be3f600e..2e985b2c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,7 +77,7 @@ proptest = "1.0.0" criterion = "0.5" test-log = "0.2.10" env_logger = "0.10.0" -pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] } +pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5 futures = "0.3.21" paste = "1.0.11" more-asserts = "0.3.1" diff --git a/benches/analyzer.rs b/benches/analyzer.rs index caebc7153..7a96fa119 100644 --- a/benches/analyzer.rs +++ b/benches/analyzer.rs @@ -5,7 +5,7 @@ const ALICE_TXT: &str = include_str!("alice.txt"); pub fn criterion_benchmark(c: &mut Criterion) { let tokenizer_manager = TokenizerManager::default(); - let tokenizer = tokenizer_manager.get("default").unwrap(); + let mut tokenizer = tokenizer_manager.get("default").unwrap(); c.bench_function("default-tokenize-alice", |b| { b.iter(|| { let mut word_count = 0; diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs index c6595cef9..126b598ef 100644 --- a/examples/pre_tokenized_text.rs +++ b/examples/pre_tokenized_text.rs @@ -17,7 +17,8 @@ use tantivy::{doc, Index, ReloadPolicy}; use tempfile::TempDir; fn pre_tokenize_text(text: &str) -> Vec { - let mut token_stream = SimpleTokenizer.token_stream(text); + let mut tokenizer = SimpleTokenizer::default(); + let mut token_stream = tokenizer.token_stream(text); let mut tokens = vec![]; while token_stream.advance() { 
tokens.push(token_stream.token().clone()); diff --git a/examples/stop_words.rs b/examples/stop_words.rs index b1c8d7fbb..4b1f52a57 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> { // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus - let tokenizer = TextAnalyzer::builder(SimpleTokenizer) + let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(LowerCaser) .filter(StopWordFilter::remove(vec![ "the".to_string(), diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 9432bbc46..0dec432d8 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -67,7 +67,7 @@ impl IndexingPositionsPerPath { pub(crate) fn index_json_values<'a>( doc: DocId, json_values: impl Iterator>>, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, expand_dots_enabled: bool, term_buffer: &mut Term, postings_writer: &mut dyn PostingsWriter, @@ -93,7 +93,7 @@ pub(crate) fn index_json_values<'a>( fn index_json_object( doc: DocId, json_value: &serde_json::Map, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, json_term_writer: &mut JsonTermWriter, postings_writer: &mut dyn PostingsWriter, ctx: &mut IndexingContext, @@ -117,7 +117,7 @@ fn index_json_object( fn index_json_value( doc: DocId, json_value: &serde_json::Value, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, json_term_writer: &mut JsonTermWriter, postings_writer: &mut dyn PostingsWriter, ctx: &mut IndexingContext, @@ -239,7 +239,7 @@ pub(crate) fn set_fastvalue_and_get_term( pub(crate) fn set_string_and_get_terms( json_term_writer: &mut JsonTermWriter, value: &str, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, ) -> Vec<(usize, Term)> { let mut positions_and_terms = Vec::<(usize, Term)>::new(); json_term_writer.close_path_and_set_type(Type::Str); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 8046898f1..d450e3e59 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -1208,7 +1208,7 @@ mod tests { let ff_tokenizer_manager = TokenizerManager::default(); ff_tokenizer_manager.register( "custom_lowercase", - TextAnalyzer::builder(RawTokenizer) + TextAnalyzer::builder(RawTokenizer::default()) .filter(LowerCaser) .build(), ); diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index a1a97bdb4..6389ce8a2 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -147,7 +147,7 @@ impl FastFieldsWriter { } Value::Str(text_val) => { if let Some(tokenizer) = - &self.per_field_tokenizer[field_value.field().field_id() as usize] + &mut self.per_field_tokenizer[field_value.field().field_id() as usize] { let mut token_stream = tokenizer.token_stream(text_val); token_stream.process(&mut |token: &Token| { @@ -202,7 +202,7 @@ impl FastFieldsWriter { self.json_path_buffer.push_str(field_name); let text_analyzer = - &self.per_field_tokenizer[field_value.field().field_id() as usize]; + &mut self.per_field_tokenizer[field_value.field().field_id() as usize]; record_json_obj_to_columnar_writer( doc_id, @@ -263,7 +263,7 @@ fn record_json_obj_to_columnar_writer( remaining_depth_limit: usize, json_path_buffer: &mut String, columnar_writer: &mut columnar::ColumnarWriter, - tokenizer: &Option, + tokenizer: &mut Option, ) { for (key, child) in json_obj { let len_path = json_path_buffer.len(); @@ -302,7 +302,7 @@ fn record_json_value_to_columnar_writer( mut remaining_depth_limit: 
usize, json_path_writer: &mut String, columnar_writer: &mut columnar::ColumnarWriter, - tokenizer: &Option, + tokenizer: &mut Option, ) { if remaining_depth_limit == 0 { return; @@ -321,7 +321,7 @@ fn record_json_value_to_columnar_writer( } } serde_json::Value::String(text) => { - if let Some(text_analyzer) = tokenizer { + if let Some(text_analyzer) = tokenizer.as_mut() { let mut token_stream = text_analyzer.token_stream(text); token_stream.process(&mut |token| { columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text); @@ -379,7 +379,7 @@ mod tests { JSON_DEPTH_LIMIT, &mut json_path, &mut columnar_writer, - &None, + &mut None, ); } let mut buffer = Vec::new(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index de9f951f3..9ccb9810a 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -185,10 +185,11 @@ impl SegmentWriter { match field_entry.field_type() { FieldType::Facet(_) => { + let mut facet_tokenizer = FacetTokenizer::default(); // this can be global for value in values { let facet = value.as_facet().ok_or_else(make_schema_error)?; let facet_str = facet.encoded_str(); - let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str); + let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str); let mut indexing_position = IndexingPosition::default(); postings_writer.index_text( doc_id, @@ -208,7 +209,7 @@ impl SegmentWriter { } Value::Str(ref text) => { let text_analyzer = - &self.per_field_text_analyzers[field.field_id() as usize]; + &mut self.per_field_text_analyzers[field.field_id() as usize]; text_analyzer.token_stream(text) } _ => { @@ -304,7 +305,8 @@ impl SegmentWriter { } } FieldType::JsonObject(json_options) => { - let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize]; + let text_analyzer = + &mut self.per_field_text_analyzers[field.field_id() as usize]; let json_values_it = values.map(|value| value.as_json().ok_or_else(make_schema_error)); index_json_values( diff --git a/src/postings/mod.rs b/src/postings/mod.rs index dd3397445..09265b085 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -162,7 +162,7 @@ pub mod tests { let index = Index::create_in_ram(schema); index .tokenizers() - .register("simple_no_truncation", SimpleTokenizer); + .register("simple_no_truncation", SimpleTokenizer::default()); let reader = index.reader()?; let mut index_writer = index.writer_for_tests()?; @@ -194,7 +194,7 @@ pub mod tests { let index = Index::create_in_ram(schema); index .tokenizers() - .register("simple_no_truncation", SimpleTokenizer); + .register("simple_no_truncation", SimpleTokenizer::default()); let reader = index.reader()?; let mut index_writer = index.writer_for_tests()?; diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index d18e27275..994dd96c0 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -192,45 +192,49 @@ impl MoreLikeThis { }) .collect::>>()?; for fake_str in facets { - FacetTokenizer.token_stream(fake_str).process(&mut |token| { - if self.is_noise_word(token.text.clone()) { - let term = Term::from_field_text(field, &token.text); - *term_frequencies.entry(term).or_insert(0) += 1; - } - }); + FacetTokenizer::default() + .token_stream(fake_str) + .process(&mut |token| { + if self.is_noise_word(token.text.clone()) { + let term = Term::from_field_text(field, &token.text); + *term_frequencies.entry(term).or_insert(0) += 1; + } + }); } } 
FieldType::Str(text_options) => { - let mut token_streams: Vec = vec![]; - for value in values { match value { Value::PreTokStr(tok_str) => { - token_streams.push(PreTokenizedStream::from(tok_str.clone()).into()); + let mut token_stream: BoxTokenStream = + PreTokenizedStream::from(tok_str.clone()).into(); + token_stream.process(&mut |token| { + if !self.is_noise_word(token.text.clone()) { + let term = Term::from_field_text(field, &token.text); + *term_frequencies.entry(term).or_insert(0) += 1; + } + }); } Value::Str(ref text) => { - if let Some(tokenizer) = text_options + if let Some(mut tokenizer) = text_options .get_indexing_options() .map(|text_indexing_options| { text_indexing_options.tokenizer().to_string() }) .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)) { - token_streams.push(tokenizer.token_stream(text)); + let mut token_stream = tokenizer.token_stream(text); + token_stream.process(&mut |token| { + if !self.is_noise_word(token.text.clone()) { + let term = Term::from_field_text(field, &token.text); + *term_frequencies.entry(term).or_insert(0) += 1; + } + }); } } _ => (), } } - - for mut token_stream in token_streams { - token_stream.process(&mut |token| { - if !self.is_noise_word(token.text.clone()) { - let term = Term::from_field_text(field, &token.text); - *term_frequencies.entry(term).or_insert(0) += 1; - } - }); - } } FieldType::U64(_) => { for value in values { diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index cfb7cbd5b..72a735848 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -403,7 +403,7 @@ impl QueryParser { // This should have been seen earlier really. QueryParserError::FieldNotIndexed(field_entry.name().to_string()) })?; - let text_analyzer = + let mut text_analyzer = self.tokenizer_manager .get(option.tokenizer()) .ok_or_else(|| QueryParserError::UnknownTokenizer { @@ -497,7 +497,7 @@ impl QueryParser { // This should have been seen earlier really. QueryParserError::FieldNotIndexed(field_name.to_string()) })?; - let text_analyzer = self + let mut text_analyzer = self .tokenizer_manager .get(indexing_options.tokenizer()) .ok_or_else(|| QueryParserError::UnknownTokenizer { @@ -511,7 +511,7 @@ impl QueryParser { slop, prefix, indexing_options, - &text_analyzer, + &mut text_analyzer, )? .into_iter() .collect()) @@ -795,7 +795,7 @@ fn generate_literals_for_str( slop: u32, prefix: bool, indexing_options: &TextFieldIndexing, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, ) -> Result, QueryParserError> { let mut terms: Vec<(usize, Term)> = Vec::new(); let mut token_stream = text_analyzer.token_stream(phrase); @@ -840,7 +840,7 @@ fn generate_literals_for_json_object( // This should have been seen earlier really. 
QueryParserError::FieldNotIndexed(field_name.to_string()) })?; - let text_analyzer = tokenizer_manager + let mut text_analyzer = tokenizer_manager .get(text_options.tokenizer()) .ok_or_else(|| QueryParserError::UnknownTokenizer { field: field_name.to_string(), @@ -858,7 +858,7 @@ fn generate_literals_for_json_object( if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) { logical_literals.push(LogicalLiteral::Term(term)); } - let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer); + let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &mut text_analyzer); drop(json_term_writer); if terms.len() <= 1 { for (_, term) in terms { @@ -959,7 +959,7 @@ mod test { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "en_with_stop_words", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(LowerCaser) .filter(StopWordFilter::remove(vec!["the".to_string()])) .build(), @@ -1463,7 +1463,7 @@ mod test { let index = Index::create_in_ram(schema); index .tokenizers() - .register("customtokenizer", SimpleTokenizer); + .register("customtokenizer", SimpleTokenizer::default()); let query_parser = QueryParser::for_index(&index, vec![title]); assert_eq!( query_parser.parse_query("title:\"happy tax\"").unwrap_err(), diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 09fd9c8d6..bf3a1f82c 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -145,7 +145,7 @@ impl Snippet { /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. fn search_fragments( - tokenizer: &TextAnalyzer, + tokenizer: &mut TextAnalyzer, text: &str, terms: &BTreeMap, max_num_chars: usize, @@ -370,8 +370,12 @@ impl SnippetGenerator { /// Generates a snippet for the given text. 
pub fn snippet(&self, text: &str) -> Snippet { - let fragment_candidates = - search_fragments(&self.tokenizer, text, &self.terms_text, self.max_num_chars); + let fragment_candidates = search_fragments( + &mut self.tokenizer.clone(), + text, + &self.terms_text, + self.max_num_chars, + ); select_best_fragment_combination(&fragment_candidates[..], text) } } @@ -408,7 +412,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") => 1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 100, + ); assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -435,7 +444,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 20, + ); { let first = &fragments[0]; assert_eq!(first.score, 1.0); @@ -449,7 +463,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>0.9, String::from("language") => 1.0 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 20, + ); // assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -468,7 +487,8 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("c"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 1); { @@ -490,7 +510,8 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 2); { @@ -513,7 +534,8 @@ Survey in 2016, 2017, and 2018."#; terms.insert(String::from("f"), 1.0); terms.insert(String::from("a"), 0.9); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 7); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 7); assert_eq!(fragments.len(), 2); { @@ -535,7 +557,8 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("z"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 0); @@ -550,7 +573,8 @@ Survey in 2016, 2017, and 2018."#; let text = "a b c d"; let terms = BTreeMap::new(); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(&fragments[..], text); @@ -669,7 +693,7 @@ Survey in 2016, 2017, and 2018."#; terms.insert(String::from("bc"), 1.0); let fragments = search_fragments( - &From::from(NgramTokenizer::all_ngrams(2, 2)), + &mut From::from(NgramTokenizer::all_ngrams(2, 2)), text, &terms, 3, @@ -691,7 +715,12 @@ 
Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_generator_custom_highlighted_elements() { let terms = btreemap! { String::from("rust") => 1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 100, + ); let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); assert_eq!( snippet.to_html(), diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index c0175e736..b40731fd3 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::builder(RawTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default()) //! .filter(AlphaNumOnlyFilter) //! .build(); //! @@ -11,7 +11,7 @@ //! // contains a space //! assert!(stream.next().is_none()); //! -//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(AlphaNumOnlyFilter) //! .build(); //! @@ -52,7 +52,7 @@ pub struct AlphaNumOnlyFilterWrapper(T); impl Tokenizer for AlphaNumOnlyFilterWrapper { type TokenStream<'a> = AlphaNumOnlyFilterStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { AlphaNumOnlyFilterStream { tail: self.0.token_stream(text), } @@ -96,7 +96,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::builder(SimpleTokenizer) + let mut a = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(AlphaNumOnlyFilter) .build(); let mut token_stream = a.token_stream(text); diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index beef3ff31..da8039e17 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -12,38 +12,45 @@ impl TokenFilter for AsciiFoldingFilter { type Tokenizer = AsciiFoldingFilterWrapper; fn transform(self, tokenizer: T) -> AsciiFoldingFilterWrapper { - AsciiFoldingFilterWrapper(tokenizer) - } -} - -#[derive(Clone)] -pub struct AsciiFoldingFilterWrapper(T); - -impl Tokenizer for AsciiFoldingFilterWrapper { - type TokenStream<'a> = AsciiFoldingFilterTokenStream>; - - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { - AsciiFoldingFilterTokenStream { - buffer: String::with_capacity(100), - tail: self.0.token_stream(text), + AsciiFoldingFilterWrapper { + tokenizer, + buffer: String::new(), } } } -pub struct AsciiFoldingFilterTokenStream { +#[derive(Clone)] +pub struct AsciiFoldingFilterWrapper { + tokenizer: T, buffer: String, +} + +impl Tokenizer for AsciiFoldingFilterWrapper { + type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + self.buffer.clear(); + AsciiFoldingFilterTokenStream { + buffer: &mut self.buffer, + tail: self.tokenizer.token_stream(text), + } + } +} + +pub struct AsciiFoldingFilterTokenStream<'a, T> { + buffer: &'a mut String, tail: T, } -impl TokenStream for AsciiFoldingFilterTokenStream { +impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; } if !self.token_mut().text.is_ascii() { // ignore its already ascii - 
to_ascii(&self.tail.token().text, &mut self.buffer); - mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + to_ascii(&self.tail.token().text, self.buffer); + mem::swap(&mut self.tail.token_mut().text, self.buffer); } true } @@ -1573,7 +1580,7 @@ mod tests { fn folding_helper(text: &str) -> Vec { let mut tokens = Vec::new(); - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(AsciiFoldingFilter) .build() .token_stream(text) @@ -1584,10 +1591,10 @@ mod tests { } fn folding_using_raw_tokenizer_helper(text: &str) -> String { - let mut token_stream = TextAnalyzer::builder(RawTokenizer) + let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default()) .filter(AsciiFoldingFilter) - .build() - .token_stream(text); + .build(); + let mut token_stream = tokenizer.token_stream(text); token_stream.advance(); token_stream.token().text.clone() } diff --git a/src/tokenizer/empty_tokenizer.rs b/src/tokenizer/empty_tokenizer.rs index 4f4822206..46cb78c10 100644 --- a/src/tokenizer/empty_tokenizer.rs +++ b/src/tokenizer/empty_tokenizer.rs @@ -5,7 +5,7 @@ pub(crate) struct EmptyTokenizer; impl Tokenizer for EmptyTokenizer { type TokenStream<'a> = EmptyTokenStream; - fn token_stream(&self, _text: &str) -> EmptyTokenStream { + fn token_stream(&mut self, _text: &str) -> EmptyTokenStream { EmptyTokenStream::default() } } @@ -35,7 +35,7 @@ mod tests { #[test] fn test_empty_tokenizer() { - let tokenizer = super::EmptyTokenizer; + let mut tokenizer = super::EmptyTokenizer; let mut empty = tokenizer.token_stream("whatever string"); assert!(!empty.advance()); } diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 3f2f1df2f..568d60ae3 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -9,8 +9,10 @@ use crate::schema::FACET_SEP_BYTE; /// - `/america/north_america/canada` /// - `/america/north_america` /// - `/america` -#[derive(Clone)] -pub struct FacetTokenizer; +#[derive(Clone, Default)] +pub struct FacetTokenizer { + token: Token, +} #[derive(Debug)] enum State { @@ -22,20 +24,18 @@ enum State { pub struct FacetTokenStream<'a> { text: &'a str, state: State, - token: Token, + token: &'a mut Token, } impl Tokenizer for FacetTokenizer { type TokenStream<'a> = FacetTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> FacetTokenStream<'a> { - let token = Token { - position: 0, - ..Default::default() - }; + fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> { + self.token.reset(); + self.token.position = 0; FacetTokenStream { text, state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. 
- token, + token: &mut self.token, } } } @@ -74,11 +74,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -98,7 +98,7 @@ mod tests { let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); tokens.push(format!("{}", facet)); }; - FacetTokenizer + FacetTokenizer::default() .token_stream(facet.encoded_str()) .process(&mut add_token); } @@ -118,7 +118,7 @@ mod tests { let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test tokens.push(format!("{}", facet)); }; - FacetTokenizer + FacetTokenizer::default() .token_stream(facet.encoded_str()) // ok test .process(&mut add_token); } diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index dc10d3e27..56792ba82 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -10,26 +10,33 @@ impl TokenFilter for LowerCaser { type Tokenizer = LowerCaserFilter; fn transform(self, tokenizer: T) -> Self::Tokenizer { - LowerCaserFilter(tokenizer) - } -} - -#[derive(Clone)] -pub struct LowerCaserFilter(T); - -impl Tokenizer for LowerCaserFilter { - type TokenStream<'a> = LowerCaserTokenStream>; - - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { - LowerCaserTokenStream { - tail: self.0.token_stream(text), + LowerCaserFilter { + tokenizer, buffer: String::new(), } } } -pub struct LowerCaserTokenStream { +#[derive(Clone)] +pub struct LowerCaserFilter { + tokenizer: T, buffer: String, +} + +impl Tokenizer for LowerCaserFilter { + type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + self.buffer.clear(); + LowerCaserTokenStream { + tail: self.tokenizer.token_stream(text), + buffer: &mut self.buffer, + } + } +} + +pub struct LowerCaserTokenStream<'a, T> { + buffer: &'a mut String, tail: T, } @@ -44,7 +51,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) { } } -impl TokenStream for LowerCaserTokenStream { +impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; @@ -53,8 +60,8 @@ impl TokenStream for LowerCaserTokenStream { // fast track for ascii. self.token_mut().text.make_ascii_lowercase(); } else { - to_lowercase_unicode(&self.tail.token().text, &mut self.buffer); - mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + to_lowercase_unicode(&self.tail.token().text, self.buffer); + mem::swap(&mut self.tail.token_mut().text, self.buffer); } true } @@ -86,10 +93,11 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let mut token_stream = TextAnalyzer::builder(SimpleTokenizer) + let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(LowerCaser) - .build() - .token_stream(text); + .build(); + + let mut token_stream = token_stream.token_stream(text); let mut tokens = vec![]; let mut add_token = |token: &Token| { tokens.push(token.clone()); diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 8bd3fd465..dbc3dd867 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -66,7 +66,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let en_stem = TextAnalyzer::builder(SimpleTokenizer) +//! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(RemoveLongFilter::limit(40)) //! .filter(LowerCaser) //! 
.filter(Stemmer::new(Language::English)) @@ -81,7 +81,7 @@ //! # use tantivy::tokenizer::*; //! # use tantivy::Index; //! # -//! let custom_en_tokenizer = SimpleTokenizer; +//! let custom_en_tokenizer = SimpleTokenizer::default(); //! # let schema = Schema::builder().build(); //! let index = Index::create_in_ram(schema); //! index.tokenizers() @@ -113,7 +113,7 @@ //! let index = Index::create_in_ram(schema); //! //! // We need to register our tokenizer : -//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(RemoveLongFilter::limit(40)) //! .filter(LowerCaser) //! .build(); @@ -188,9 +188,9 @@ pub mod tests { } #[test] - fn test_raw_tokenizer() { + fn test_raw_tokenizer2() { let tokenizer_manager = TokenizerManager::default(); - let en_tokenizer = tokenizer_manager.get("raw").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("raw").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { @@ -208,7 +208,7 @@ pub mod tests { fn test_en_tokenizer() { let tokenizer_manager = TokenizerManager::default(); assert!(tokenizer_manager.get("en_doesnotexist").is_none()); - let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { @@ -231,13 +231,13 @@ pub mod tests { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "el_stem", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(Stemmer::new(Language::Greek)) .build(), ); - let en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { @@ -257,7 +257,7 @@ pub mod tests { #[test] fn test_tokenizer_empty() { let tokenizer_manager = TokenizerManager::default(); - let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); { let mut tokens: Vec = vec![]; { @@ -283,7 +283,7 @@ pub mod tests { #[test] fn test_whitespace_tokenizer() { let tokenizer_manager = TokenizerManager::default(); - let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap(); + let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index b3af1dd03..ae54cacf4 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -33,7 +33,7 @@ use super::{Token, TokenStream, Tokenizer}; /// ```rust /// use tantivy::tokenizer::*; /// -/// let tokenizer = NgramTokenizer::new(2, 3, false); +/// let mut tokenizer = NgramTokenizer::new(2, 3, false); /// let mut stream = tokenizer.token_stream("hello"); /// { /// let token = stream.next().unwrap(); @@ -87,6 +87,7 @@ pub struct NgramTokenizer { max_gram: usize, /// if true, will only parse the leading edge of the input prefix_only: bool, + token: Token, } impl NgramTokenizer { @@ -101,6 +102,7 @@ impl NgramTokenizer { min_gram, max_gram, prefix_only, + token: Token::default(), } } @@ -127,12 +129,13 @@ pub struct NgramTokenStream<'a> { /// input text: &'a str, /// output - token: Token, + token: &'a mut Token, } impl Tokenizer for NgramTokenizer { 
type TokenStream<'a> = NgramTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> NgramTokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> NgramTokenStream<'a> { + self.token.reset(); NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( CodepointFrontiers::for_str(text), @@ -141,7 +144,7 @@ impl Tokenizer for NgramTokenizer { ), prefix_only: self.prefix_only, text, - token: Token::default(), + token: &mut self.token, } } } @@ -164,10 +167,10 @@ impl<'a> TokenStream for NgramTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 901994915..9bf7ee22a 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,32 +1,34 @@ use super::{Token, TokenStream, Tokenizer}; /// For each value of the field, emit a single unprocessed token. -#[derive(Clone)] -pub struct RawTokenizer; - -pub struct RawTokenStream { +#[derive(Clone, Default)] +pub struct RawTokenizer { token: Token, +} + +pub struct RawTokenStream<'a> { + token: &'a mut Token, has_token: bool, } impl Tokenizer for RawTokenizer { - type TokenStream<'a> = RawTokenStream; - fn token_stream(&self, text: &str) -> RawTokenStream { - let token = Token { - offset_from: 0, - offset_to: text.len(), - position: 0, - text: text.to_string(), - position_length: 1, - }; + type TokenStream<'a> = RawTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &str) -> RawTokenStream<'a> { + self.token.reset(); + self.token.position = 0; + self.token.position_length = 1; + self.token.offset_from = 0; + self.token.offset_to = text.len(); + self.token.text.clear(); + self.token.text.push_str(text); RawTokenStream { - token, + token: &mut self.token, has_token: true, } } } -impl TokenStream for RawTokenStream { +impl<'a> TokenStream for RawTokenStream<'a> { fn advance(&mut self) -> bool { let result = self.has_token; self.has_token = false; @@ -34,11 +36,11 @@ impl TokenStream for RawTokenStream { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -55,7 +57,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(RawTokenizer); + let mut a = TextAnalyzer::from(RawTokenizer::default()); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/regex_tokenizer.rs b/src/tokenizer/regex_tokenizer.rs index f65a5cece..f9a10ad20 100644 --- a/src/tokenizer/regex_tokenizer.rs +++ b/src/tokenizer/regex_tokenizer.rs @@ -22,7 +22,7 @@ use crate::TantivyError; /// ```rust /// use tantivy::tokenizer::*; /// -/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap(); +/// let mut tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap(); /// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'"); /// { /// let token = stream.next().unwrap(); @@ -48,6 +48,7 @@ use crate::TantivyError; #[derive(Clone)] pub struct RegexTokenizer { regex: Regex, + token: Token, } impl RegexTokenizer { @@ -55,17 +56,21 @@ impl RegexTokenizer { pub fn new(regex_pattern: &str) -> crate::Result { Regex::new(regex_pattern) .map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned())) - .map(|regex| Self { regex }) + .map(|regex| Self { + regex, + token: Token::default(), + }) } } impl Tokenizer for RegexTokenizer { type TokenStream<'a> 
= RegexTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> RegexTokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> RegexTokenStream<'a> { + self.token.reset(); RegexTokenStream { regex: self.regex.clone(), text, - token: Token::default(), + token: &mut self.token, cursor: 0, } } @@ -74,7 +79,7 @@ impl Tokenizer for RegexTokenizer { pub struct RegexTokenStream<'a> { regex: Regex, text: &'a str, - token: Token, + token: &'a mut Token, cursor: usize, } @@ -100,11 +105,11 @@ impl<'a> TokenStream for RegexTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -147,7 +152,7 @@ mod tests { fn token_stream_helper(text: &str, pattern: &str) -> Vec { let r = RegexTokenizer::new(pattern).unwrap(); - let a = TextAnalyzer::from(r); + let mut a = TextAnalyzer::from(r); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index 933e98adb..78f3e731a 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(RemoveLongFilter::limit(5)) //! .build(); //! @@ -57,7 +57,7 @@ pub struct RemoveLongFilterWrapper { impl Tokenizer for RemoveLongFilterWrapper { type TokenStream<'a> = RemoveLongFilterStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { RemoveLongFilterStream { token_length_limit: self.length_limit, tail: self.inner.token_stream(text), @@ -103,7 +103,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::builder(SimpleTokenizer) + let mut a = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(6)) .build(); let mut token_stream = a.token_stream(text); diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index dc9a3b126..540dfac47 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -3,23 +3,26 @@ use std::str::CharIndices; use super::{Token, TokenStream, Tokenizer}; /// Tokenize the text by splitting on whitespaces and punctuation. -#[derive(Clone)] -pub struct SimpleTokenizer; +#[derive(Clone, Default)] +pub struct SimpleTokenizer { + token: Token, +} /// TokenStream produced by the `SimpleTokenizer`. 
pub struct SimpleTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, - token: Token, + token: &'a mut Token, } impl Tokenizer for SimpleTokenizer { type TokenStream<'a> = SimpleTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> SimpleTokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> SimpleTokenStream<'a> { + self.token.reset(); SimpleTokenStream { text, chars: text.char_indices(), - token: Token::default(), + token: &mut self.token, } } } @@ -52,11 +55,11 @@ impl<'a> TokenStream for SimpleTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -76,7 +79,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(SimpleTokenizer); + let mut a = TextAnalyzer::from(SimpleTokenizer::default()); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs index e80e6b31f..bcde161cc 100644 --- a/src/tokenizer/split_compound_words.rs +++ b/src/tokenizer/split_compound_words.rs @@ -20,8 +20,8 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// ```rust /// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer}; /// -/// let tokenizer = -/// TextAnalyzer::builder(SimpleTokenizer) +/// let mut tokenizer = +/// TextAnalyzer::builder(SimpleTokenizer::default()) /// .filter( /// SplitCompoundWords::from_dictionary([ /// "dampf", "schiff", "fahrt", "brot", "backen", "automat", @@ -29,13 +29,13 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// .unwrap() /// ) /// .build(); -/// -/// let mut stream = tokenizer.token_stream("dampfschifffahrt"); -/// assert_eq!(stream.next().unwrap().text, "dampf"); -/// assert_eq!(stream.next().unwrap().text, "schiff"); -/// assert_eq!(stream.next().unwrap().text, "fahrt"); -/// assert_eq!(stream.next(), None); -/// +/// { +/// let mut stream = tokenizer.token_stream("dampfschifffahrt"); +/// assert_eq!(stream.next().unwrap().text, "dampf"); +/// assert_eq!(stream.next().unwrap().text, "schiff"); +/// assert_eq!(stream.next().unwrap().text, "fahrt"); +/// assert_eq!(stream.next(), None); +/// } /// let mut stream = tokenizer.token_stream("brotbackautomat"); /// assert_eq!(stream.next().unwrap().text, "brotbackautomat"); /// assert_eq!(stream.next(), None); @@ -99,7 +99,7 @@ pub struct SplitCompoundWordsFilter { impl Tokenizer for SplitCompoundWordsFilter { type TokenStream<'a> = SplitCompoundWordsTokenStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { SplitCompoundWordsTokenStream { dict: self.dict.clone(), tail: self.inner.token_stream(text), @@ -188,7 +188,7 @@ mod tests { #[test] fn splitting_compound_words_works() { - let tokenizer = TextAnalyzer::builder(SimpleTokenizer) + let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap()) .build(); diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 3f8a3eead..4c43b609a 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -100,7 +100,7 @@ pub struct StemmerFilter { impl Tokenizer for StemmerFilter { type TokenStream<'a> = StemmerTokenStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a>(&'a mut self, 
text: &'a str) -> Self::TokenStream<'a> { let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); StemmerTokenStream { tail: self.inner.token_stream(text), diff --git a/src/tokenizer/stop_word_filter/mod.rs b/src/tokenizer/stop_word_filter/mod.rs index b4367ec45..3217af716 100644 --- a/src/tokenizer/stop_word_filter/mod.rs +++ b/src/tokenizer/stop_word_filter/mod.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])) //! .build(); //! @@ -91,7 +91,7 @@ pub struct StopWordFilterWrapper { impl Tokenizer for StopWordFilterWrapper { type TokenStream<'a> = StopWordFilterStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { StopWordFilterStream { words: self.words.clone(), tail: self.inner.token_stream(text), @@ -152,7 +152,7 @@ mod tests { "am".to_string(), "i".to_string(), ]; - let a = TextAnalyzer::builder(SimpleTokenizer) + let mut a = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(StopWordFilter::remove(stops)) .build(); let mut token_stream = a.token_stream(text); diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 65b7815c8..ccab6cda7 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -12,13 +12,13 @@ pub struct TextAnalyzer { /// A boxable `Tokenizer`, with its `TokenStream` type erased. trait BoxableTokenizer: 'static + Send + Sync { /// Creates a boxed token stream for a given `str`. - fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>; + fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>; /// Clone this tokenizer. fn box_clone(&self) -> Box; } impl BoxableTokenizer for T { - fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> { self.token_stream(text).into() } fn box_clone(&self) -> Box { @@ -53,7 +53,7 @@ impl TextAnalyzer { } /// Creates a token stream for a given `str`. - pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> { self.tokenizer.box_token_stream(text) } } @@ -71,7 +71,7 @@ impl TextAnalyzerBuilder { /// ```rust /// use tantivy::tokenizer::*; /// - /// let en_stem = TextAnalyzer::builder(SimpleTokenizer) + /// let en_stem = TextAnalyzer::builder(SimpleTokenizer::default()) /// .filter(RemoveLongFilter::limit(40)) /// .filter(LowerCaser) /// .filter(Stemmer::default()) diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index e849471bc..a2be12390 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -58,23 +58,23 @@ impl Default for TokenizerManager { /// the default pre-configured tokenizers of `tantivy`. 
fn default() -> TokenizerManager { let manager = TokenizerManager::new(); - manager.register("raw", RawTokenizer); + manager.register("raw", RawTokenizer::default()); manager.register( "default", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .build(), ); manager.register( "en_stem", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(Stemmer::new(Language::English)) .build(), ); - manager.register("whitespace", WhitespaceTokenizer); + manager.register("whitespace", WhitespaceTokenizer::default()); manager } } diff --git a/src/tokenizer/whitespace_tokenizer.rs b/src/tokenizer/whitespace_tokenizer.rs index 6de19ddd7..69a3b0550 100644 --- a/src/tokenizer/whitespace_tokenizer.rs +++ b/src/tokenizer/whitespace_tokenizer.rs @@ -3,22 +3,25 @@ use std::str::CharIndices; use super::{Token, TokenStream, Tokenizer}; /// Tokenize the text by splitting on whitespaces. -#[derive(Clone)] -pub struct WhitespaceTokenizer; +#[derive(Clone, Default)] +pub struct WhitespaceTokenizer { + token: Token, +} pub struct WhitespaceTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, - token: Token, + token: &'a mut Token, } impl Tokenizer for WhitespaceTokenizer { type TokenStream<'a> = WhitespaceTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> WhitespaceTokenStream<'a> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> WhitespaceTokenStream<'a> { + self.token.reset(); WhitespaceTokenStream { text, chars: text.char_indices(), - token: Token::default(), + token: &mut self.token, } } } @@ -51,11 +54,11 @@ impl<'a> TokenStream for WhitespaceTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -75,7 +78,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(WhitespaceTokenizer); + let mut a = TextAnalyzer::from(WhitespaceTokenizer::default()); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs index f43f8b1d6..adb37a0b4 100644 --- a/tokenizer-api/src/lib.rs +++ b/tokenizer-api/src/lib.rs @@ -34,19 +34,30 @@ impl Default for Token { offset_from: 0, offset_to: 0, position: usize::MAX, - text: String::with_capacity(200), + text: String::new(), position_length: 1, } } } +impl Token { + /// reset to default + pub fn reset(&mut self) { + self.offset_from = 0; + self.offset_to = 0; + self.position = usize::MAX; + self.text.clear(); + self.position_length = 1; + } +} + /// `Tokenizer` are in charge of splitting text into a stream of token /// before indexing. pub trait Tokenizer: 'static + Clone + Send + Sync { /// The token stream returned by this Tokenizer. type TokenStream<'a>: TokenStream; /// Creates a token stream for a given `str`. - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>; } /// Simple wrapper of `Box`.
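
A minimal usage sketch, not part of the patch, illustrating the API change described in the commit message under the assumption of a tantivy build that includes these changes: `Tokenizer::token_stream` and `TextAnalyzer::token_stream` now take `&mut self`, so analyzers must be bound mutably, and each stream borrows the analyzer's shared `Token` buffer instead of allocating a fresh `String::with_capacity(200)` per call. The types and builder calls are the ones appearing in the hunks above.

```rust
// Sketch only: demonstrates the post-patch tokenizer API as shown in the
// hunks above; it assumes a tantivy version containing this change.
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    // `token_stream` now takes `&mut self`, so the analyzer must be mutable.
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .build();

    {
        // The stream mutably borrows the analyzer (and its shared `Token`).
        let mut stream = analyzer.token_stream("Hello, Happy Tax Payer!");
        while stream.advance() {
            let token = stream.token();
            println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
        }
    } // the stream must be dropped before the analyzer can be borrowed again

    // The second call reuses the same `Token` allocation instead of building
    // a new `String::with_capacity(200)` as the old API did.
    let mut stream = analyzer.token_stream("another field value");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}
```

Filters that keep per-token scratch space (`LowerCaser`, `AsciiFoldingFilter`) follow the same pattern: their `String` buffers now live in the filter wrapper and are lent to the token stream for the duration of a single `token_stream` call.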