diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 23f7893d2..9e53bbba8 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
+pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
 pub use self::tokenizer_manager::TokenizerManager;
 pub use self::whitespace_tokenizer::WhitespaceTokenizer;

diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index 011978fb2..e7e4d6081 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -1,5 +1,3 @@
-use std::ops::Deref;
-
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
 use tokenizer_api::{BoxTokenStream, TokenFilter, TokenStream, Tokenizer};
@@ -12,7 +10,7 @@ pub struct TextAnalyzer {
 }

 /// A boxable `Tokenizer`, with its `TokenStream` type erased.
-pub trait BoxableTokenizer: 'static + Send + Sync {
+trait BoxableTokenizer: 'static + Send + Sync {
     /// Creates a boxed token stream for a given `str`.
     fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
     /// Clone this tokenizer.
@@ -28,15 +26,16 @@ impl<T: Tokenizer> BoxableTokenizer for T {
     }
 }

-pub struct BoxedTokenizer(Box<dyn BoxableTokenizer>);
+/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
+struct BoxTokenizer(Box<dyn BoxableTokenizer>);

-impl Clone for BoxedTokenizer {
-    fn clone(&self) -> BoxedTokenizer {
+impl Clone for BoxTokenizer {
+    fn clone(&self) -> BoxTokenizer {
         Self(self.0.box_clone())
     }
 }

-impl Tokenizer for BoxedTokenizer {
+impl Tokenizer for BoxTokenizer {
     type TokenStream<'a> = Box<dyn TokenStream + 'a>;

     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
@@ -44,29 +43,22 @@ impl Tokenizer for BoxedTokenizer {
     }
 }

-/// Trait for the pluggable components of `Tokenizer`s.
-pub trait BoxableTokenFilter: 'static + Send + Sync {
-    /// Wraps a Tokenizer and returns a new one.
-    fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer>;
+/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
+trait BoxableTokenFilter: 'static + Send + Sync {
+    /// Wraps a `BoxTokenizer` and returns a new one.
+    fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
 }

 impl<T: TokenFilter> BoxableTokenFilter for T {
-    fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer> {
+    fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
         let tokenizer = self.clone().transform(tokenizer);
-        tokenizer.box_clone()
+        BoxTokenizer(Box::new(tokenizer))
     }
 }

+/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
 pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);

-impl Deref for BoxTokenFilter {
-    type Target = dyn BoxableTokenFilter;
-
-    fn deref(&self) -> &dyn BoxableTokenFilter {
-        &*self.0
-    }
-}
-
 impl<T: TokenFilter> From<T> for BoxTokenFilter {
     fn from(tokenizer: T) -> BoxTokenFilter {
         BoxTokenFilter(Box::new(tokenizer))
@@ -76,18 +68,31 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
 impl TextAnalyzer {
     /// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
     ///
-    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
-    /// `TextAnalyzer::from(tokenizer)`.
     /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
-    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()`.
+    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
+    /// will be more performant and only create one `Box<dyn BoxableTokenizer>` instead of
+    /// one per `TokenFilter`.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let en_stem = TextAnalyzer::build(
+    ///   SimpleTokenizer::default(),
+    ///   vec![
+    ///     BoxTokenFilter::from(RemoveLongFilter::limit(40)),
+    ///     BoxTokenFilter::from(LowerCaser),
+    ///     BoxTokenFilter::from(Stemmer::default()),
+    ///   ]);
+    /// ```
     pub fn build<T: Tokenizer>(
         tokenizer: T,
         boxed_token_filters: Vec<BoxTokenFilter>,
     ) -> TextAnalyzer {
-        let mut boxed_tokenizer = BoxedTokenizer(Box::new(tokenizer));
+        let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
         for filter in boxed_token_filters.into_iter() {
-            let filtered_boxed_tokenizer = filter.box_transform(boxed_tokenizer);
-            boxed_tokenizer = BoxedTokenizer(filtered_boxed_tokenizer);
+            boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
         }
         TextAnalyzer {
             tokenizer: boxed_tokenizer.0,

diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs
index 41312a61c..eca0d9566 100644
--- a/tokenizer-api/src/lib.rs
+++ b/tokenizer-api/src/lib.rs
@@ -148,7 +148,7 @@ pub trait TokenFilter: 'static + Send + Sync + Clone {
     /// The Tokenizer type returned by this filter, typically parametrized by the underlying
     /// Tokenizer.
     type Tokenizer<T: Tokenizer>: Tokenizer;
-    /// Wraps a Tokenizer and returns a new onex .
+    /// Wraps a Tokenizer and returns a new one.
     fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
 }
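
For context, a minimal usage sketch of the two construction paths touched by this diff: the dynamic `TextAnalyzer::build` path with `BoxTokenFilter`s, and the static `TextAnalyzer::builder(...).filter(...).build()` path the new doc comment recommends. This is a sketch, not part of the diff, and it assumes the rest of tantivy's public tokenizer API (`SimpleTokenizer`, `RemoveLongFilter`, `LowerCaser`, `Stemmer`, and `TokenStream::advance`/`token`).

```rust
use tantivy::tokenizer::{
    BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer,
};

fn main() {
    // Dynamic path: the filter set is decided at runtime, so each filter is
    // type-erased behind a BoxTokenFilter.
    let mut dynamic = TextAnalyzer::build(
        SimpleTokenizer::default(),
        vec![
            BoxTokenFilter::from(RemoveLongFilter::limit(40)),
            BoxTokenFilter::from(LowerCaser),
            BoxTokenFilter::from(Stemmer::default()),
        ],
    );

    // Static path recommended by the doc comment: filters are composed at
    // compile time and only the final analyzer is boxed.
    let mut chained = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(Stemmer::default())
        .build();

    // Both analyzers produce the same tokens.
    for analyzer in [&mut dynamic, &mut chained] {
        let mut stream = analyzer.token_stream("Boxing tokenizers erases their concrete types");
        while stream.advance() {
            print!("{} ", stream.token().text);
        }
        println!();
    }
}
```

The dynamic path re-boxes the tokenizer once per filter (each `box_transform` call wraps the result in a fresh `BoxTokenizer`), which is why the doc comment steers static filter sets toward the builder.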