diff --git a/examples/stop_words.rs b/examples/stop_words.rs
index 4b1f52a57..d40ac9fd3 100644
--- a/examples/stop_words.rs
+++ b/examples/stop_words.rs
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
     let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(LowerCaser)
+        .filter(LowerCaser::default())
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),
diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs
index d450e3e59..578596dd7 100644
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -1209,7 +1209,7 @@ mod tests {
         ff_tokenizer_manager.register(
             "custom_lowercase",
             TextAnalyzer::builder(RawTokenizer::default())
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .build(),
         );
 
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 4a8b86469..1aa025105 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -960,7 +960,7 @@ mod test {
         tokenizer_manager.register(
             "en_with_stop_words",
             TextAnalyzer::builder(SimpleTokenizer::default())
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(StopWordFilter::remove(vec!["the".to_string()]))
                 .build(),
         );
diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs
index 592575d4d..f79e9cf0c 100644
--- a/src/tokenizer/alphanum_only.rs
+++ b/src/tokenizer/alphanum_only.rs
@@ -39,9 +39,9 @@ impl AlphaNumOnlyFilterStream {
 }
 
 impl TokenFilter for AlphaNumOnlyFilter {
-    type OutputTokenStream<T: TokenStream> = AlphaNumOnlyFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AlphaNumOnlyFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         AlphaNumOnlyFilterStream { tail: token_stream }
     }
 }
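Note on the call-site churn above: filters that now carry a scratch buffer (`LowerCaser`, `AsciiFoldingFilter`) are constructed via `::default()`, while unit-like filters such as `AlphaNumOnlyFilter` are still passed by value. A minimal sketch of the updated construction (illustrative only, using the types touched by this diff):

    use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

    fn lowercasing_analyzer() -> TextAnalyzer {
        TextAnalyzer::builder(SimpleTokenizer::default())
            // previously `.filter(LowerCaser)`; the filter now owns its scratch String
            .filter(LowerCaser::default())
            .build()
    }
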
diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs
index 981d09e27..b00e5904b 100644
--- a/src/tokenizer/ascii_folding_filter.rs
+++ b/src/tokenizer/ascii_folding_filter.rs
@@ -5,34 +5,35 @@ use super::{Token, TokenFilter, TokenStream};
 /// This class converts alphabetic, numeric, and symbolic Unicode characters
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
-#[derive(Clone)]
-pub struct AsciiFoldingFilter;
+#[derive(Clone, Default)]
+pub struct AsciiFoldingFilter(String);
 
 impl TokenFilter for AsciiFoldingFilter {
-    type OutputTokenStream<T: TokenStream> = AsciiFoldingFilterTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AsciiFoldingFilterTokenStream<'a, T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         AsciiFoldingFilterTokenStream {
-            buffer: String::new(),
+            buffer: &mut self.0,
             tail: token_stream,
         }
     }
 }
 
-pub struct AsciiFoldingFilterTokenStream<T> {
-    buffer: String,
+pub struct AsciiFoldingFilterTokenStream<'a, T> {
+    buffer: &'a mut String,
     tail: T,
 }
 
-impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
         }
         if !self.token_mut().text.is_ascii() {
             // ignore its already ascii
-            to_ascii(&self.tail.token().text, &mut self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            to_ascii(&self.tail.token().text, self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
         }
         true
     }
@@ -1563,7 +1564,7 @@ mod tests {
     fn folding_helper(text: &str) -> Vec<String> {
         let mut tokens = Vec::new();
         TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
             .build()
             .token_stream(text)
             .process(&mut |token| {
@@ -1574,7 +1575,7 @@
 
     fn folding_using_raw_tokenizer_helper(text: &str) -> String {
         let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
            .build();
         let mut token_stream = tokenizer.token_stream(text);
         token_stream.advance();
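The `buffer: &'a mut String` plus `mem::swap` change above is the heart of the patch: the filter owns a single `String`, lends it to its token stream, and each token's old allocation becomes the scratch space for the next one, so folding no longer allocates per token. A free-standing sketch of that pattern (helper names are illustrative, not tantivy API):

    use std::mem;

    // Stand-in for `to_ascii` / `to_lowercase_unicode`: write the transformed
    // text into the reusable scratch buffer.
    fn lowercase_into(src: &str, buffer: &mut String) {
        buffer.clear();
        for c in src.chars() {
            buffer.extend(c.to_lowercase());
        }
    }

    fn rewrite_token(token_text: &mut String, scratch: &mut String) {
        lowercase_into(token_text, scratch);
        // The token takes the transformed text; its previous allocation
        // becomes the scratch buffer for the next token.
        mem::swap(token_text, scratch);
    }

    fn main() {
        let mut scratch = String::new();
        let mut token = String::from("Wörld");
        rewrite_token(&mut token, &mut scratch);
        assert_eq!(token, "wörld");
    }
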
diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs
index ab3b3533c..18375c711 100644
--- a/src/tokenizer/lower_caser.rs
+++ b/src/tokenizer/lower_caser.rs
@@ -3,22 +3,23 @@ use std::mem;
 use super::{Token, TokenFilter, TokenStream};
 
 /// Token filter that lowercase terms.
-#[derive(Clone)]
-pub struct LowerCaser;
+#[derive(Clone, Default)]
+pub struct LowerCaser(String);
 
 impl TokenFilter for LowerCaser {
-    type OutputTokenStream<T: TokenStream> = LowerCaserTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = LowerCaserTokenStream<'a, T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         LowerCaserTokenStream {
             tail: token_stream,
-            buffer: String::new(),
+            buffer: &mut self.0,
         }
     }
 }
 
-pub struct LowerCaserTokenStream<T> {
-    buffer: String,
+pub struct LowerCaserTokenStream<'a, T> {
+    buffer: &'a mut String,
     tail: T,
 }
 
@@ -33,7 +34,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
     }
 }
 
-impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
@@ -42,8 +43,8 @@
             // fast track for ascii.
             self.token_mut().text.make_ascii_lowercase();
         } else {
-            to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            to_lowercase_unicode(&self.tail.token().text, self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
         }
         true
     }
@@ -76,7 +77,7 @@ mod tests {
     fn token_stream_helper(text: &str) -> Vec<Token> {
         let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .build();
 
         let mut token_stream = token_stream.token_stream(text);
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 42d98e90a..4e7253964 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -68,7 +68,7 @@
 //!
 //! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .filter(Stemmer::new(Language::English))
 //!     .build();
 //! ```
@@ -115,7 +115,7 @@
 //! // We need to register our tokenizer :
 //! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .build();
 //! index
 //!     .tokenizers()
@@ -233,7 +233,7 @@ pub mod tests {
             "el_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(Stemmer::new(Language::Greek))
                 .build(),
         );
diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs
index 5342e89e9..5d562e353 100644
--- a/src/tokenizer/remove_long.rs
+++ b/src/tokenizer/remove_long.rs
@@ -38,9 +38,9 @@ impl RemoveLongFilterStream {
 }
 
 impl TokenFilter for RemoveLongFilter {
-    type OutputTokenStream<T: TokenStream> = RemoveLongFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = RemoveLongFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         RemoveLongFilterStream {
             token_length_limit: self.length_limit,
             tail: token_stream,
diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs
index 678a204fc..96ce7a888 100644
--- a/src/tokenizer/split_compound_words.rs
+++ b/src/tokenizer/split_compound_words.rs
@@ -80,9 +80,9 @@ impl SplitCompoundWords {
 }
 
 impl TokenFilter for SplitCompoundWords {
-    type OutputTokenStream<T: TokenStream> = SplitCompoundWordsTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = SplitCompoundWordsTokenStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         SplitCompoundWordsTokenStream {
             dict: self.dict.clone(),
             tail: token_stream,
diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs
index 8d7e68776..4bf7f905b 100644
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -81,9 +81,9 @@ impl Default for Stemmer {
 }
 
 impl TokenFilter for Stemmer {
-    type OutputTokenStream<T: TokenStream> = StemmerTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StemmerTokenStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream {
             tail: token_stream,
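The filters above (`RemoveLongFilter`, `SplitCompoundWords`, `Stemmer`) keep no borrow into `self`, so they only pick up the extra `'a` parameter. For reference, a usage sketch of the `en_stem`-style pipeline these hunks keep rebuilding (the token texts in the comment are an assumption about the snowball stemmer's output, not something this diff asserts):

    use tantivy::tokenizer::{
        Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer,
    };

    fn main() {
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser::default())
            .filter(Stemmer::new(Language::English))
            .build();
        let mut stream = analyzer.token_stream("Running runners ran");
        while stream.advance() {
            // roughly: "run", "runner", "ran"
            println!("{}", stream.token().text);
        }
    }
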
diff --git a/src/tokenizer/stop_word_filter/mod.rs b/src/tokenizer/stop_word_filter/mod.rs
index bd5ee6425..b5cc559e5 100644
--- a/src/tokenizer/stop_word_filter/mod.rs
+++ b/src/tokenizer/stop_word_filter/mod.rs
@@ -72,9 +72,9 @@ impl StopWordFilter {
 }
 
 impl TokenFilter for StopWordFilter {
-    type OutputTokenStream<T: TokenStream> = StopWordFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StopWordFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         StopWordFilterStream {
             words: self.words.clone(),
             tail: token_stream,
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index 4b34821ee..de6fde8d7 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -29,11 +29,17 @@ dyn_clone::clone_trait_object!(BoxableTokenizer);
 /// A boxable `TokenFilter`, with its `Tokenizer` type erased.
 trait BoxableTokenFilter: 'static + Send + Sync + DynClone {
     /// Transforms a boxed token stream into a new one.
-    fn box_filter<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a>;
 }
 
 impl<T: TokenFilter> BoxableTokenFilter for T {
-    fn box_filter<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a> {
         Box::new(self.filter(token_stream))
     }
 }
@@ -87,7 +93,7 @@ impl TextAnalyzer {
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
         let mut token_stream = self.tokenizer.box_token_stream(text);
-        for token_filter in &self.token_filters {
+        for token_filter in self.token_filters.iter_mut() {
             token_stream = token_filter.0.box_filter(token_stream);
         }
         token_stream
@@ -154,7 +160,7 @@ mod tests {
         let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
             .filter(AlphaNumOnlyFilter)
             .filter(RemoveLongFilter::limit(6))
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .build();
         let mut stream = analyzer.token_stream("- first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
@@ -167,7 +173,7 @@
         WhitespaceTokenizer::default(),
         vec![
             BoxTokenFilter::from(AlphaNumOnlyFilter),
-            BoxTokenFilter::from(LowerCaser),
+            BoxTokenFilter::from(LowerCaser::default()),
             BoxTokenFilter::from(RemoveLongFilter::limit(6)),
         ],
     );
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index a2be12390..44b05caa6 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -63,14 +63,14 @@ impl Default for TokenizerManager {
             "default",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .build(),
         );
         manager.register(
             "en_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(Stemmer::new(Language::English))
                 .build(),
         );
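The `iter_mut()` change in `TextAnalyzer::token_stream` above follows directly from `box_filter` now taking `&'a mut self`: each erased filter must stay mutably borrowed for as long as the stream it produced is alive, and a shared `&self.token_filters` iteration cannot hand out such borrows. A self-contained illustration of that borrowing shape (plain iterators stand in for token streams; none of these names are tantivy API):

    trait ErasedFilter {
        fn apply<'a>(
            &'a mut self,
            input: Box<dyn Iterator<Item = String> + 'a>,
        ) -> Box<dyn Iterator<Item = String> + 'a>;
    }

    struct Lowercase;

    impl ErasedFilter for Lowercase {
        fn apply<'a>(
            &'a mut self,
            input: Box<dyn Iterator<Item = String> + 'a>,
        ) -> Box<dyn Iterator<Item = String> + 'a> {
            Box::new(input.map(|s| s.to_lowercase()))
        }
    }

    fn run_chain<'a>(
        filters: &'a mut [Box<dyn ErasedFilter>],
        mut stream: Box<dyn Iterator<Item = String> + 'a>,
    ) -> Box<dyn Iterator<Item = String> + 'a> {
        // `iter_mut()` yields one disjoint `&mut` per filter, so each produced
        // stream may keep borrowing its filter until the final stream is dropped.
        for filter in filters.iter_mut() {
            stream = filter.apply(stream);
        }
        stream
    }
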
diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs
index 179cbe39e..8f83bc2f1 100644
--- a/tokenizer-api/src/lib.rs
+++ b/tokenizer-api/src/lib.rs
@@ -115,9 +115,9 @@ pub trait TokenStream {
 pub trait TokenFilter: 'static + Send + Sync + Clone {
     /// The Tokenizer type returned by this filter, typically parametrized by the underlying
     /// Tokenizer.
-    type OutputTokenStream<T: TokenStream>: TokenStream;
+    type OutputTokenStream<'a, T: TokenStream>: TokenStream;
     /// Filter a token stream and returns a new one.
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T>;
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T>;
     /// Wraps a Tokenizer and returns a new one.
     fn transform<T: Tokenizer>(self, tokenizer: T) -> FilteredTokenizer<Self, T> {
         FilteredTokenizer {
@@ -134,7 +134,7 @@ pub struct FilteredTokenizer<F, T> {
 }
 
 impl<F: TokenFilter, T: Tokenizer> Tokenizer for FilteredTokenizer<F, T> {
-    type TokenStream<'a> = F::OutputTokenStream<T::TokenStream<'a>>;
+    type TokenStream<'a> = F::OutputTokenStream<'a, T::TokenStream<'a>>;
 
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         let token_stream = self.tokenizer.token_stream(text);
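For downstream implementors, a custom filter against the revised trait would look roughly like the sketch below. It is modeled on the `AlphaNumOnlyFilter` hunk above and assumes `Token`, `TokenFilter`, and `TokenStream` stay re-exported from `tantivy::tokenizer`; `ShortTokenFilter` is a made-up example, not part of the crate:

    use tantivy::tokenizer::{Token, TokenFilter, TokenStream};

    /// Example filter: drop every token shorter than `min_len` bytes.
    #[derive(Clone)]
    pub struct ShortTokenFilter {
        min_len: usize,
    }

    pub struct ShortTokenFilterStream<T> {
        min_len: usize,
        tail: T,
    }

    impl TokenFilter for ShortTokenFilter {
        // No borrow of `self` is kept, so `'a` goes unused here, exactly as in
        // the stateless filters in this diff.
        type OutputTokenStream<'a, T: TokenStream> = ShortTokenFilterStream<T>;

        fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
            ShortTokenFilterStream {
                min_len: self.min_len,
                tail: token_stream,
            }
        }
    }

    impl<T: TokenStream> TokenStream for ShortTokenFilterStream<T> {
        fn advance(&mut self) -> bool {
            // Skip tokens that are too short, keep the rest untouched.
            while self.tail.advance() {
                if self.tail.token().text.len() >= self.min_len {
                    return true;
                }
            }
            false
        }

        fn token(&self) -> &Token {
            self.tail.token()
        }

        fn token_mut(&mut self) -> &mut Token {
            self.tail.token_mut()
        }
    }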